class SectionHeading(SettingBase):
    value = Constant('')

    def __init__(self, name):
        self.name = name
class BleedthroughPiecewiseOp(HasStrictTraits):
    """
    *THIS OPERATION IS DEPRECATED.*

    Apply bleedthrough correction to a set of fluorescence channels.

    This is not traditional matrix-based bleedthrough compensation; it uses
    a similar set of single-color controls, but instead of computing a
    compensation matrix, it fits a piecewise-linear spline to the
    untransformed data and uses those splines to compute the correction
    factor at each point in a mesh across the color space.  The experimental
    data is corrected using a linear interpolation along that mesh: this is
    much faster than computing the correction factor for each cell
    individually (an operation that takes 5 msec each.)

    To use, set up the `controls` dict with the single-color controls;
    call `estimate()` to parameterize the operation; check that the
    bleedthrough plots look good with `default_view().plot()`; and then
    `apply()` to an Experiment.

    *THIS OPERATION IS DEPRECATED AND WILL BE REMOVED IN A FUTURE RELEASE.
    TO USE IT, SET `ignore_deprecated` TO `True`.  IF YOU HAVE A USE CASE
    WHERE THIS WORKS BETTER THAN THE LINEAR BLEEDTHROUGH CORRECTION, PLEASE
    EMAIL ME OR FILE A BUG.*

    Attributes
    ----------
    name : Str
        The operation name (for UI representation; optional for interactive
        use)

    controls : Dict(Str, File)
        The channel names to correct, and corresponding single-color control
        FCS files to estimate the correction splines with.  Must be set to
        use `estimate()`.

    num_knots : Int (default = 12)
        The number of internal control points to estimate, spaced log-evenly
        from 0 to the range of the channel.  Must be set to use `estimate()`.

    mesh_size : Int (default = 32)
        The size of each axis in the mesh used to interpolate corrected
        values.

    ignore_deprecated : Bool (default = False)

    Metadata
    --------
    bleedthrough_channels : List(Str)
        The channels that were used to correct this one.

    bleedthrough_fn : Callable (Tuple(Float) --> Float)
        The function that will correct one event in this channel.  Pass it
        the values specified in `bleedthrough_channels` and it will return
        the corrected value for this channel.

    Notes
    -----
    We use an interpolation-based scheme to estimate corrected bleedthrough.
    The algorithm is as follows:

    - Fit a piecewise-linear spline to each single-color control's
      bleedthrough into other channels.  Because we want to fit the spline
      to untransformed data, but capture both the negative, positive-linear
      and positive-log portions of a traditional flow data set, we
      distribute the spline knots evenly on a logicle-transformed axis for
      each color we're correcting.

    - At each point on a regular mesh spanning the entire range of the
      instrument, estimate the mapping from (raw colors) --> (actual
      colors).  The mesh points are also distributed evenly along the
      logicle-transformed color axes; this captures negative data as well
      as positive.  This is quite slow: ~30 seconds for a mesh size of 32
      in 3-space.  Remember that additional channels expand the number of
      mesh points exponentially!

    - Use these estimates to parameterize a linear interpolator (in linear
      space, this time).  There's one interpolator per output channel (so
      for a 3-channel correction, each interpolator is R^3 --> R).  For
      each measured cell, run each interpolator to give the corrected
      output.

    Examples
    --------
    >>> bl_op = flow.BleedthroughPiecewiseOp()
    >>> bl_op.controls = {'Pacific Blue-A' : 'ebfp.fcs',
    ...                   'FITC-A' : 'eyfp.fcs',
    ...                   'PE-Tx-Red-YG-A' : 'mkate.fcs'}
    >>>
    >>> bl_op.estimate(ex2)
    >>> bl_op.default_view().plot(ex2)
    >>>
    >>> %time ex3 = bl_op.apply(ex2) # 410,000 cells
    CPU times: user 577 ms, sys: 27.7 ms, total: 605 ms
    Wall time: 607 ms
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.bleedthrough_piecewise')
    friendly_id = Constant("Piecewise Bleedthrough Correction")

    name = Constant("Bleedthrough")

    controls = Dict(Str, File)
    num_knots = Int(12)
    mesh_size = Int(32)
    ignore_deprecated = Bool(False)

    _splines = Dict(Str, Dict(Str, Python), transient=True)
    _interpolators = Dict(Str, Python, transient=True)

    # because the order of the channels is important, we can't just call
    # _interpolators.keys()
    # TODO - this is ugly and unpythonic.  :-/
    _channels = List(Str, transient=True)

    def estimate(self, experiment, subset=None):
        """
        Estimate the bleedthrough from the single-channel controls in
        `controls`
        """
        if not self.ignore_deprecated:
            raise util.CytoflowOpError("BleedthroughPiecewiseOp is DEPRECATED. "
                                       "To use it anyway, set ignore_deprecated "
                                       "to True.")

        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        if self.num_knots < 3:
            raise util.CytoflowOpError("Need to allow at least 3 knots in the spline")

        self._channels = list(self.controls.keys())

        if len(self._channels) < 2:
            raise util.CytoflowOpError("Need at least two channels to correct bleedthrough.")

        for channel in list(self.controls.keys()):
            if 'range' not in experiment.metadata[channel]:
                raise util.CytoflowOpError("Can't find range for channel {}"
                                           .format(channel))

        self._splines = {}
        mesh_axes = []

        for channel in self._channels:
            self._splines[channel] = {}

            # make a little Experiment
            check_tube(self.controls[channel], experiment)
            tube_exp = ImportOp(tubes=[Tube(file=self.controls[channel])],
                                channels={experiment.metadata[c]["fcs_name"]: c
                                          for c in experiment.channels},
                                name_metadata=experiment.metadata['name_metadata']).apply()

            # apply previous operations
            for op in experiment.history:
                tube_exp = op.apply(tube_exp)

            # subset it
            if subset:
                try:
                    tube_exp = tube_exp.query(subset)
                except Exception as e:
                    raise util.CytoflowOpError("Subset string '{0}' isn't valid"
                                               .format(subset)) from e

                if len(tube_exp.data) == 0:
                    raise util.CytoflowOpError("Subset string '{0}' returned no events"
                                               .format(subset))

            tube_data = tube_exp.data

            # polyfit requires sorted data
            tube_data.sort_values(by=channel, inplace=True)

            channel_min = tube_data[channel].min()
            channel_max = tube_data[channel].max()

            # we're going to set the knots and splines evenly across the
            # logicle-transformed data, so as to capture both the "linear"
            # aspect of the near-0 and negative values, and the "log"
            # aspect of large values.
            scale = util.scale_factory("logicle", experiment, channel=channel)

            # the splines' knots
            knot_min = channel_min
            knot_max = channel_max

            lg_knot_min = scale(knot_min)
            lg_knot_max = scale(knot_max)
            lg_knots = np.linspace(lg_knot_min, lg_knot_max, self.num_knots)
            knots = scale.inverse(lg_knots)

            # only keep the interior knots
            knots = knots[1:-1]

            # the interpolators' mesh
            if 'af_median' in experiment.metadata[channel] and \
               'af_stdev' in experiment.metadata[channel]:
                mesh_min = experiment.metadata[channel]['af_median'] - \
                           3 * experiment.metadata[channel]['af_stdev']
            elif 'range' in experiment.metadata[channel]:
                mesh_min = -0.01 * experiment.metadata[channel]['range']  # TODO - does this even work?
                warn("This works best if you apply AutofluorescenceOp before "
                     "computing bleedthrough", util.CytoflowOpWarning)

            mesh_max = experiment.metadata[channel]['range']

            lg_mesh_min = scale(mesh_min)
            lg_mesh_max = scale(mesh_max)
            lg_mesh_axis = np.linspace(lg_mesh_min, lg_mesh_max, self.mesh_size)

            mesh_axis = scale.inverse(lg_mesh_axis)
            mesh_axes.append(mesh_axis)

            for to_channel in self._channels:
                from_channel = channel
                if from_channel == to_channel:
                    continue

                self._splines[from_channel][to_channel] = \
                    scipy.interpolate.LSQUnivariateSpline(tube_data[from_channel].values,
                                                          tube_data[to_channel].values,
                                                          t=knots,
                                                          k=1)

        mesh = pd.DataFrame(util.cartesian(mesh_axes),
                            columns=[x for x in self._channels])

        mesh_corrected = mesh.apply(_correct_bleedthrough,
                                    axis=1,
                                    args=([[x for x in self._channels],
                                           self._splines]))

        for channel in self._channels:
            chan_values = mesh_corrected[channel].values.reshape([len(x) for x in mesh_axes])
            self._interpolators[channel] = \
                scipy.interpolate.RegularGridInterpolator(points=mesh_axes,
                                                          values=chan_values,
                                                          bounds_error=False,
                                                          fill_value=0.0)

        # TODO - some sort of validity checking.

    def apply(self, experiment):
        """Applies the bleedthrough correction to an experiment.

        Parameters
        ----------
        experiment : Experiment
            the experiment to which this op is applied

        Returns
        -------
            a new experiment with the bleedthrough subtracted out.
        """
        if not self.ignore_deprecated:
            raise util.CytoflowOpError("BleedthroughPiecewiseOp is DEPRECATED. "
                                       "To use it anyway, set ignore_deprecated "
                                       "to True.")

        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        if not self._interpolators:
            raise util.CytoflowOpError("Module interpolators aren't set. "
                                       "Did you run estimate()?")

        if not set(self._interpolators.keys()) <= set(experiment.channels):
            raise util.CytoflowOpError("Module parameters don't match experiment channels")

        new_experiment = experiment.clone()

        # get rid of data outside of the interpolators' mesh
        # (-3 * autofluorescence sigma)
        for channel in self._channels:
            # if you update the mesh calculation above, update it here too!
            if 'af_median' in experiment.metadata[channel] and \
               'af_stdev' in experiment.metadata[channel]:
                mesh_min = experiment.metadata[channel]['af_median'] - \
                           3 * experiment.metadata[channel]['af_stdev']
            else:
                mesh_min = -0.01 * experiment.metadata[channel]['range']  # TODO - does this even work?

            new_experiment.data = \
                new_experiment.data[new_experiment.data[channel] > mesh_min]

        new_experiment.data.reset_index(drop=True, inplace=True)

        old_data = new_experiment.data[self._channels]

        for channel in self._channels:
            new_experiment[channel] = self._interpolators[channel](old_data)
            new_experiment.metadata[channel]['bleedthrough_channels'] = self._channels
            new_experiment.metadata[channel]['bleedthrough_fn'] = self._interpolators[channel]

        new_experiment.history.append(self.clone_traits(transient=lambda _: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot to see if the bleedthrough spline
        estimation is working.

        Returns
        -------
        IView : An IView, call plot() to see the diagnostic plots
        """
        if not self.ignore_deprecated:
            raise util.CytoflowOpError("BleedthroughPiecewiseOp is DEPRECATED. "
                                       "To use it anyway, set ignore_deprecated "
                                       "to True.")

        if set(self.controls.keys()) != set(self._splines.keys()):
            raise util.CytoflowOpError("Must have both the controls and bleedthrough to plot")

        return BleedthroughPiecewiseDiagnostic(op=self, **kwargs)
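# A minimal, self-contained sketch of the spline-then-interpolate scheme the
# operation above describes, on synthetic single-color data.  All names,
# sizes, and the toy "correction" are illustrative, not part of the
# operation's API.
def _sketch_spline_then_interpolate():
    import numpy as np
    import scipy.interpolate

    rng = np.random.default_rng(0)
    donor = rng.lognormal(mean=6, sigma=1, size=10000)    # single-color signal
    bleed = 0.15 * donor + rng.normal(0, 10, size=10000)  # spillover channel

    # fit a piecewise-linear spline (k = 1) with a few interior knots;
    # LSQUnivariateSpline wants sorted x values
    order = np.argsort(donor)
    knots = np.quantile(donor, np.linspace(0.05, 0.95, 10))
    spline = scipy.interpolate.LSQUnivariateSpline(
        donor[order], bleed[order], t=knots, k=1)

    # evaluate a correction on a coarse mesh once, then interpolate cheaply
    # for every event instead of re-running the spline math per cell
    mesh = np.linspace(donor.min(), donor.max(), 32)
    corrected_on_mesh = mesh - spline(mesh)
    interp = scipy.interpolate.RegularGridInterpolator(
        (mesh,), corrected_on_mesh, bounds_error=False, fill_value=0.0)
    events = rng.lognormal(mean=6, sigma=1, size=5)
    return interp(events[:, np.newaxis])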
class AutofluorescenceOp(HasStrictTraits):
    """
    Apply autofluorescence correction to a set of fluorescence channels.

    The :meth:`estimate` function loads a separate FCS file (not part of the
    input :class:`.Experiment`) and computes the untransformed median and
    standard deviation of the blank cells.  Then, :meth:`apply` subtracts
    the median from the experiment data.

    To use, set the :attr:`blank_file` property to point to an FCS file with
    unstained or nonfluorescing cells in it; set the :attr:`channels`
    property to a list of channels to correct.

    :meth:`apply` also adds the ``af_median`` and ``af_stdev`` metadata to
    the corrected channels, representing the median and standard deviation
    of the measured blank distributions.

    Attributes
    ----------
    channels : List(Str)
        The channels to correct.

    blank_file : File
        The filename of a file with "blank" cells (not fluorescent).  Used
        to :meth:`estimate` the autofluorescence.

    Examples
    --------
    Create a small experiment:

    .. plot::
        :context: close-figs

        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "tasbe/rby.fcs")]
        >>> ex = import_op.apply()

    Create and parameterize the operation

    .. plot::
        :context: close-figs

        >>> af_op = flow.AutofluorescenceOp()
        >>> af_op.channels = ["Pacific Blue-A", "FITC-A", "PE-Tx-Red-YG-A"]
        >>> af_op.blank_file = "tasbe/blank.fcs"

    Estimate the model parameters

    .. plot::
        :context: close-figs

        >>> af_op.estimate(ex)

    Plot the diagnostic plot

    .. plot::
        :context: close-figs

        >>> af_op.default_view().plot(ex)

    Apply the operation to the experiment

    .. plot::
        :context: close-figs

        >>> ex2 = af_op.apply(ex)
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.autofluorescence')
    friendly_id = Constant("Autofluorescence correction")

    name = Constant("Autofluorescence")
    channels = List(Str)
    blank_file = File(exists=True)

    _af_median = Dict(Str, CFloat, transient=True)
    _af_stdev = Dict(Str, CFloat, transient=True)

    def estimate(self, experiment, subset=None):
        """
        Estimate the autofluorescence from :attr:`blank_file` in channels
        specified in :attr:`channels`.

        Parameters
        ----------
        experiment : Experiment
            The experiment to which this operation is applied

        subset : str (default = "")
            An expression that specifies the events used to compute the
            autofluorescence
        """
        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if not self.channels:
            raise util.CytoflowOpError('channels', "No channels specified")

        if not set(self.channels) <= set(experiment.channels):
            raise util.CytoflowOpError('channels',
                                       "Specified channels that weren't found "
                                       "in the experiment.")

        # don't have to validate that blank_file exists; should crap out on
        # trying to set a bad value

        # make a little Experiment
        check_tube(self.blank_file, experiment)
        blank_exp = ImportOp(tubes=[Tube(file=self.blank_file)],
                             channels={experiment.metadata[c]["fcs_name"]: c
                                       for c in experiment.channels},
                             name_metadata=experiment.metadata['name_metadata']).apply()

        # apply previous operations
        for op in experiment.history:
            blank_exp = op.apply(blank_exp)

        # subset it
        if subset:
            try:
                blank_exp = blank_exp.query(subset)
            except Exception as exc:
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' isn't valid"
                                           .format(subset)) from exc

            if len(blank_exp.data) == 0:
                raise util.CytoflowOpError('subset',
                                           "Subset string '{0}' returned no events"
                                           .format(subset))

        for channel in self.channels:
            channel_min = blank_exp[channel].quantile(0.025)
            channel_max = blank_exp[channel].quantile(0.975)
            blank_exp[channel] = blank_exp[channel].clip(channel_min,
                                                         channel_max)
            self._af_median[channel] = np.median(blank_exp[channel])
            self._af_stdev[channel] = np.std(blank_exp[channel])

    def apply(self, experiment):
        """
        Applies the autofluorescence correction to channels in an experiment.

        Parameters
        ----------
        experiment : Experiment
            the experiment to which this op is applied

        Returns
        -------
        Experiment
            a new experiment with the autofluorescence median subtracted.
            The corrected channels have the following metadata added to
            them:

            - **af_median** : Float
              The median of the non-fluorescent distribution

            - **af_stdev** : Float
              The standard deviation of the non-fluorescent distribution
        """
        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if not self.channels:
            raise util.CytoflowOpError('channels', "No channels specified")

        if not self._af_median:
            raise util.CytoflowOpError(None,
                                       "Autofluorescence values aren't set. Did "
                                       "you forget to run estimate()?")

        if not set(self._af_median.keys()) <= set(experiment.channels) or \
           not set(self._af_stdev.keys()) <= set(experiment.channels):
            raise util.CytoflowOpError(None,
                                       "Autofluorescence estimates aren't set, or are "
                                       "different than those in the experiment "
                                       "parameter. Did you forget to run estimate()?")

        if not set(self._af_median.keys()) == set(self._af_stdev.keys()):
            raise util.CytoflowOpError(None,
                                       "Median and stdev keys are different! "
                                       "What the hell happened?!")

        if not set(self.channels) == set(self._af_median.keys()):
            raise util.CytoflowOpError('channels',
                                       "Estimated channels differ from the channels "
                                       "parameter. Did you forget to (re)run estimate()?")

        new_experiment = experiment.clone()

        for channel in self.channels:
            new_experiment[channel] = \
                experiment[channel] - self._af_median[channel]
            new_experiment.metadata[channel]['af_median'] = self._af_median[channel]
            new_experiment.metadata[channel]['af_stdev'] = self._af_stdev[channel]

        new_experiment.history.append(self.clone_traits(transient=lambda t: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot to see if the autofluorescence estimation
        is working.

        Returns
        -------
        IView
            A diagnostic view, call
            :meth:`~AutofluorescenceDiagnosticView.plot` to see the
            diagnostic plots
        """
        return AutofluorescenceDiagnosticView(op=self, **kwargs)
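# The arithmetic inside estimate() and apply() above, in miniature on a
# synthetic "blank" sample; the variable names and values are illustrative
# only.
def _sketch_autofluorescence_math():
    import numpy as np

    blank = np.random.default_rng(1).normal(loc=120.0, scale=15.0, size=20000)

    # estimate(): clip the blank distribution to its [2.5%, 97.5%]
    # quantiles, then record the median and standard deviation
    lo, hi = np.quantile(blank, [0.025, 0.975])
    clipped = np.clip(blank, lo, hi)
    af_median, af_stdev = np.median(clipped), np.std(clipped)

    # apply(): subtract the median from the experimental data
    data = np.random.default_rng(2).normal(loc=500.0, scale=80.0, size=5)
    return data - af_median, af_stdev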
class LogicleScale(HasStrictTraits):
    """
    A scale that transforms the data using the `logicle` function.

    This scaling method implements a "linear-like" region around 0, and a
    "log-like" region for large values, with a very smooth transition
    between them.  It's particularly good for compensated data, and data
    where you have "negative" events (events with a fluorescence of ~0.)

    If you don't have any data around 0, you might be better off with a more
    traditional log scale.

    The transformation has one parameter, `W`, which specifies the width of
    the "linear" range in log10 decades.  By default, the optimal value is
    estimated from the data; but if you assign a value to `W` it will be
    used.  `0.5` is usually a good start.

    Attributes
    ----------
    experiment : Instance(cytoflow.Experiment)
        the `cytoflow.Experiment` used to estimate the scale parameters.

    channel : Str
        If set, choose scale parameters from this channel in `experiment`.
        One of `channel`, `condition` or `statistic` must be set.

    condition : Str
        If set, choose scale parameters from this condition in `experiment`.
        One of `channel`, `condition` or `statistic` must be set.

    statistic : Str
        If set, choose scale parameters from this statistic in `experiment`.
        One of `channel`, `condition` or `statistic` must be set.

    quantiles : Tuple(Float, Float) (default = (0.001, 0.999))
        If there are a few very large or very small values, this can throw
        off matplotlib's choice of default axis ranges.  Set `quantiles` to
        choose what part of the data to consider when choosing axis ranges.

    W : Float (default = estimated from data)
        The width of the linear range, in log10 decades.  Estimated from
        the data by default; a fixed value like 0.5 also works well.

    M : Float (default = 4.5)
        The width of the log portion of the display, in log10 decades.

    A : Float (default = 0.0)
        Additional decades of negative data to include.  The default
        display usually captures all the data, so 0 is fine to start.

    r : Float (default = 0.05)
        Quantile used to estimate `W`.

    References
    ----------
    [1] A new "Logicle" display method avoids deceptive effects of
        logarithmic scaling for low signals and compensated data.
        Parks DR, Roederer M, Moore WA.
        Cytometry A. 2006 Jun;69(6):541-51.  PMID: 16604519
        http://onlinelibrary.wiley.com/doi/10.1002/cyto.a.20258/full

    [2] Update for the logicle data scale including operational code
        implementations.
        Moore WA, Parks DR.
        Cytometry A. 2012 Apr;81(4):273-7.  doi: 10.1002/cyto.a.22030
        PMID: 22411901
        http://onlinelibrary.wiley.com/doi/10.1002/cyto.a.22030/full
    """

    id = Constant("edu.mit.synbio.cytoflow.utility.logicle_scale")
    name = "logicle"

    experiment = Instance("cytoflow.Experiment")

    # what data do we use to compute scale parameters?  set one.
    channel = Str
    condition = Str
    statistic = Tuple(Str, Str)
    error_statistic = Tuple(Str, Str)
    data = Array

    W = Property(Float, depends_on="[experiment, channel, M, _T, r]")
    M = Float(4.5, desc="the width of the display in log10 decades")
    A = Float(0.0, desc="additional decades of negative data to include.")
    r = Float(0.05, desc="quantile to use for estimating the W parameter.")

    _W = Float(Undefined)
    _T = Property(Float, depends_on="[experiment, condition, channel]")
    _logicle = Property(Instance(FastLogicle), depends_on="[_T, W, M, A]")

    mpl_params = Property(Dict, depends_on="_logicle")

    def __call__(self, data):
        """
        Transforms `data` using this scale.

        Careful!  May return `NaN` if the scale domain doesn't match the
        data (ie, applying a log10 scale to negative numbers.)
""" try: logicle_min = self._logicle.inverse(0.0) logicle_max = self._logicle.inverse(1.0 - sys.float_info.epsilon) if isinstance(data, pd.Series): data = data.clip(logicle_min, logicle_max) return data.apply(self._logicle.scale) elif isinstance(data, np.ndarray): data = np.clip(data, logicle_min, logicle_max) scale = np.vectorize(self._logicle.scale) return scale(data) elif isinstance(data, float): data = max(min(data, logicle_max), logicle_min) return self._logicle.scale(data) else: try: return list(map(self._logicle.scale, data)) except TypeError as e: raise CytoflowError("Unknown data type") from e except ValueError as e: raise CytoflowError(e.strerror) def inverse(self, data): """ Transforms 'data' using the inverse of this scale. """ try: if isinstance(data, pd.Series): data = data.clip(0, 1.0 - sys.float_info.epsilon) return data.apply(self._logicle.inverse) elif isinstance(data, np.ndarray): data = np.clip(data, 0, 1.0 - sys.float_info.epsilon) inverse = np.vectorize(self._logicle.inverse) return inverse(data) elif isinstance(data, float): data = max(min(data, 1.0 - sys.float_info.epsilon), 0.0) return self._logicle.inverse(data) else: try: return list(map(self._logicle.inverse, data)) except TypeError as e: raise CytoflowError("Unknown data type") from e except ValueError as e: raise CytoflowError(str(e)) def clip(self, data): try: logicle_min = self._logicle.inverse(0.0) logicle_max = self._logicle.inverse(1.0 - sys.float_info.epsilon) if isinstance(data, pd.Series): return data.clip(logicle_min, logicle_max) elif isinstance(data, np.ndarray): return np.clip(data, logicle_min, logicle_max) elif isinstance(data, float): return max(min(data, logicle_max), logicle_min) else: try: return [max(min(x, logicle_max), logicle_min) for x in data] except TypeError as e: raise CytoflowError("Unknown data type") from e except ValueError as e: raise CytoflowError(e.strerror) def color_norm(self): # it turns out that Logicle is already defined as a normalization to # [0, 1]. class LogicleNormalize(matplotlib.colors.Normalize): def __init__(self, scale = None): self._scale = scale self.vmin = scale.inverse(0.0) self.vmax = scale.inverse(1.0 - sys.float_info.epsilon) def __call__(self, data, clip = None): # it turns out that Logicle is already defined as a # normalization to [0, 1]. 
                ret = self._scale(data)
                return np.ma.masked_array(ret)

        return LogicleNormalize(scale=self)

    @cached_property
    def _get__T(self):
        "The range of possible data values"
        if self.experiment:
            if self.channel and self.channel in self.experiment.channels:
                if "range" in self.experiment.metadata[self.channel]:
                    return self.experiment.metadata[self.channel]["range"]
                else:
                    return self.experiment.data[self.channel].max()
            elif self.condition and self.condition in self.experiment.conditions:
                return self.experiment.data[self.condition].max()
            elif self.statistic in self.experiment.statistics \
                 and self.error_statistic not in self.experiment.statistics:
                stat = self.experiment.statistics[self.statistic]
                assert is_numeric(stat)
                return stat.max()
            elif self.statistic in self.experiment.statistics and \
                 self.error_statistic in self.experiment.statistics:
                stat = self.experiment.statistics[self.statistic]
                err_stat = self.experiment.statistics[self.error_statistic]
                try:
                    err_max = max([max(x) for x in err_stat])
                except (TypeError, IndexError):
                    err_max = err_stat.max()
                stat_max = stat.max()
                return stat_max + err_max
            elif self.data.size > 0:
                return self.data.max()
            else:
                return Undefined
        else:
            return Undefined

    @cached_property
    def _get_W(self):
        if not self.experiment:
            return Undefined

        if self._W is not Undefined:
            return self._W

        if self.channel and self.channel in self.experiment.channels:
            data = self.experiment[self.channel]

            if self.r <= 0 or self.r >= 1:
                raise CytoflowError("r must be between 0 and 1")

            # get the range by finding the rth quantile of the negative
            # values
            neg_values = data[data < 0]
            if not neg_values.empty:
                r_value = neg_values.quantile(self.r)
                W = (self.M - math.log10(self._T / math.fabs(r_value))) / 2
                if W <= 0:
                    warn("Channel {0} doesn't have enough negative data. "
                         "Try a log transform instead."
                         .format(self.channel),
                         CytoflowWarning)
                    return 0.5
                else:
                    return W
            else:
                # ... unless there aren't any negative values, in which case
                # you probably shouldn't use this transform
                warn("Channel {0} doesn't have any negative data. "
                     "Try a log transform instead."
                     .format(self.channel),
                     CytoflowWarning)
                return 0.5
        else:
            return 0.5  # a reasonable default for non-channel scales

    def _set_W(self, value):
        self._W = value

    @cached_property
    def _get__logicle(self):
        if self.W is Undefined or self._T is Undefined:
            return Undefined

        if self._T <= 0:
            raise CytoflowError("Logicle range must be > 0")

        if self.W < 0:
            raise CytoflowError("Logicle param W must be >= 0")

        if self.M <= 0:
            raise CytoflowError("Logicle param M must be > 0")

        if 2 * self.W > self.M:
            raise CytoflowError("Logicle param W is too large; it must be "
                                "less than half of param M.")

        if -self.A > self.W or self.A + self.W > self.M - self.W:
            raise CytoflowError("Logicle param A is too large.")

        return FastLogicle(self._T, self.W, self.M, self.A)

    @cached_property
    def _get_mpl_params(self):
        return {"logicle": self._logicle}
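# The W estimate in _get_W above, worked as a standalone calculation.  The
# numbers (top-of-scale T, display width M, and the r-th quantile of the
# negative events) are made up for illustration.
def _sketch_logicle_w_estimate():
    import math

    T, M = 262144.0, 4.5   # instrument range, display width in decades
    r_value = -150.0       # r-th quantile of the negative events
    W = (M - math.log10(T / abs(r_value))) / 2
    return W               # ~0.63 decades of linear range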
class BeadCalibrationDiagnostic(HasStrictTraits):
    """
    A diagnostic view for `BeadCalibrationOp`.

    Plots the smoothed histogram of the bead data; the peak locations; a
    scatter plot of the raw bead fluorescence values vs the calibrated unit
    values; and a line plot of the model that was computed.  Make sure that
    the relationship is linear; if it's not, it likely isn't a good
    calibration!

    Attributes
    ----------
    op : Instance(BeadCalibrationOp)
        The operation instance whose parameters we're plotting.  Set
        automatically if you created the instance using
        :meth:`BeadCalibrationOp.default_view`.
    """

    # traits
    id = Constant("edu.mit.synbio.cytoflow.view.beadcalibrationdiagnosticview")
    friendly_id = Constant("Bead Calibration Diagnostic")

    op = Instance(BeadCalibrationOp)

    def plot(self, experiment):
        """
        Plots the diagnostic view.

        Parameters
        ----------
        experiment : Experiment
            The experiment used to create the diagnostic plot.
        """
        if experiment is None:
            raise util.CytoflowViewError('experiment', "No experiment specified")

        channels = list(self.op.units.keys())

        if not channels:
            raise util.CytoflowViewError(None, "No channels to plot")

        if set(channels) != set(self.op._histograms.keys()):
            raise util.CytoflowViewError(None,
                                         "You must estimate the parameters "
                                         "before plotting")

        plt.figure()

        for idx, channel in enumerate(channels):
            _, hist_bins, hist_smooth = self.op._histograms[channel]

            plt.subplot(len(channels), 2, 2 * idx + 1)
            plt.xscale('log')
            plt.xlabel(channel)
            plt.plot(hist_bins[1:], hist_smooth)

            plt.axvline(self.op.bead_brightness_threshold,
                        color='blue', linestyle='--')
            if self.op.bead_brightness_cutoff:
                plt.axvline(self.op.bead_brightness_cutoff,
                            color='blue', linestyle='--')
            else:
                plt.axvline(experiment.metadata[channel]['range'] * 0.7,
                            color='blue', linestyle='--')

            if channel in self.op._peaks:
                for peak in self.op._peaks[channel]:
                    plt.axvline(peak, color='r')

            if channel in self.op._peaks and channel in self.op._mefs:
                plt.subplot(len(channels), 2, 2 * idx + 2)
                plt.xscale('log')
                plt.yscale('log')
                plt.xlabel(channel)
                plt.ylabel(self.op.units[channel])
                plt.plot(self.op._peaks[channel],
                         self.op._mefs[channel],
                         marker='o')

                xmin, xmax = plt.xlim()
                x = np.logspace(np.log10(xmin), np.log10(xmax))
                plt.plot(x, self.op._calibration_functions[channel](x),
                         color='r', linestyle=':')

        plt.tight_layout(pad=0.8)
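# Hypothetical usage of the diagnostic above (bead_op and ex are
# illustrative names for a parameterized BeadCalibrationOp and an
# Experiment):
#
#     >>> bead_op.estimate(ex)
#     >>> bead_op.default_view().plot(ex)  # peaks vs. MEF should be linear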
class TasbeCalibrationView(PluginViewMixin):
    handler_factory = Callable(TasbeViewHandler)
    op = Instance(TasbeCalibrationOp)

    id = "edu.mit.synbio.cytoflowgui.op_plugins.tasbe"
    friendly_id = "TASBE Calibration"

    name = Constant("TASBE Calibration")

    fsc_channel = DelegatesTo('op')
    ssc_channel = DelegatesTo('op')

    _polygon_view = Instance(PolygonSelection, transient=True)
    interactive = Property(Bool)

    def _get_interactive(self):
        if self._polygon_view:
            return self._polygon_view.interactive
        else:
            return False

    def _set_interactive(self, val):
        if self._polygon_view:
            self._polygon_view.interactive = val

    def plot_wi(self, wi):
        self.plot(None, plot_name=self.current_plot)

    def enum_plots(self, experiment):
        return iter(["Morphology",
                     "Autofluorescence",
                     "Bleedthrough",
                     "Bead Calibration",
                     "Color Translation"])

    def enum_plots_wi(self, wi):
        return iter(["Morphology",
                     "Autofluorescence",
                     "Bleedthrough",
                     "Bead Calibration",
                     "Color Translation"])

    def should_plot(self, changed, payload):
        """
        Should the owning WorkflowItem refresh the plot when certain things
        change?  `changed` can be:

        - Changed.VIEW -- the view's parameters changed
        - Changed.RESULT -- this WorkflowItem's result changed
        - Changed.PREV_RESULT -- the previous WorkflowItem's result changed
        - Changed.ESTIMATE_RESULT -- the results of calling "estimate"
          changed
        """
        if changed == Changed.VIEW:
            _, name, _ = payload
            if self.current_plot == 'Morphology' and (name == 'fsc_channel'
                                                      or name == 'ssc_channel'):
                return True
            elif name == 'current_plot':
                return True
        elif changed == Changed.PREV_RESULT:
            if self.current_plot == payload:
                return True

        return False

    def plot(self, experiment, plot_name=None, **kwargs):
        if plot_name not in ["Morphology",
                             "Autofluorescence",
                             "Bleedthrough",
                             "Bead Calibration",
                             "Color Translation"]:
            raise util.CytoflowViewError("Which plot do you want?  Must be one "
                                         "of \"Morphology\", \"Autofluorescence\", "
                                         "\"Bleedthrough\", \"Bead Calibration\", "
                                         "or \"Color Translation\"")

        if not self.op._blank_exp:
            raise util.CytoflowViewError("Must set at least the blank control file!")

        new_ex = self.op._blank_exp.clone()

        if plot_name == "Morphology":
            if not self._polygon_view:
                self._polygon_view = self.op._polygon_op.default_view()
            self._polygon_view.plot(new_ex, **kwargs)
            return
        else:
            new_ex = self.op._polygon_op.apply(new_ex)

        if plot_name == "Autofluorescence":
            self.op._af_op.default_view().plot(new_ex, **kwargs)
            return
        else:
            new_ex = self.op._af_op.apply(new_ex)

        if plot_name == "Bleedthrough":
            self.op._bleedthrough_op.default_view().plot(new_ex, **kwargs)
            return
        else:
            new_ex = self.op._bleedthrough_op.apply(new_ex)

        if plot_name == "Bead Calibration":
            self.op._bead_calibration_op.default_view().plot(new_ex, **kwargs)
            return
        else:
            new_ex = self.op._bead_calibration_op.apply(new_ex)

        if plot_name == "Color Translation":
            self.op._color_translation_op.default_view().plot(new_ex, **kwargs)
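# The stage-chaining pattern plot() uses above, in miniature: apply every
# calibration stage before the requested one, then show that stage's
# diagnostic.  The stages list and function name are illustrative, not part
# of the class.
def _sketch_stage_chaining(blank_exp, stages, plot_name, **kwargs):
    ex = blank_exp.clone()
    for name, op in stages:  # e.g. [("Morphology", polygon_op), ...]
        if name == plot_name:
            op.default_view().plot(ex, **kwargs)
            return
        ex = op.apply(ex)    # feed this stage's output to the next stage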
class PolygonSelection(Op2DView, ScatterplotView):
    """
    Plots, and lets the user interact with, a 2D polygon selection.

    Attributes
    ----------
    interactive : bool
        is this view interactive?  Ie, can the user set the polygon
        vertices with mouse clicks?

    Examples
    --------
    In a Jupyter notebook with `%matplotlib notebook`

    >>> s = flow.PolygonOp(xchannel = "V2-A",
    ...                    ychannel = "Y2-A")
    >>> poly = s.default_view()
    >>> poly.plot(ex2)
    >>> poly.interactive = True
    """

    id = Constant('edu.mit.synbio.cytoflow.views.polygon')
    friendly_id = Constant("Polygon Selection")

    xfacet = Constant(None)
    yfacet = Constant(None)

    interactive = Bool(False, transient=True)

    # internal state.
    _ax = Any(transient=True)
    _widget = Instance(util.PolygonSelector, transient=True)
    _patch = Instance(mpl.patches.PathPatch, transient=True)

    def plot(self, experiment, **kwargs):
        """
        Plot the scatter plot, and then plot the selection on top of it.

        Parameters
        ----------
        """
        super(PolygonSelection, self).plot(experiment, **kwargs)
        self._ax = plt.gca()
        self._draw_poly()
        self._interactive()

    @on_trait_change('op.vertices', post_init=True)
    def _draw_poly(self):
        if not self._ax:
            return

        if self._patch and self._patch in self._ax.patches:
            self._patch.remove()

        if not self.op.vertices or len(self.op.vertices) < 3:
            return

        patch_vert = np.concatenate((np.array(self.op.vertices),
                                     np.array((0, 0), ndmin=2)))

        self._patch = \
            mpl.patches.PathPatch(mpl.path.Path(patch_vert, closed=True),
                                  edgecolor="black",
                                  linewidth=2,
                                  fill=False)

        self._ax.add_patch(self._patch)
        plt.draw()

    @on_trait_change('interactive', post_init=True)
    def _interactive(self):
        if self._ax and self.interactive:
            self._widget = util.PolygonSelector(self._ax,
                                                self._onselect,
                                                useblit=True)
        elif self._widget:
            self._widget = None

    def _onselect(self, vertices):
        self.op.vertices = vertices
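# The closed-polygon construction in _draw_poly above, reduced to its
# essentials: append a dummy closing vertex, then let Path(closed=True)
# join the last vertex back to the first.  The vertex values here are
# illustrative.
def _sketch_polygon_patch():
    import numpy as np
    import matplotlib as mpl
    import matplotlib.pyplot as plt

    vertices = np.array([[0, 0], [1, 0], [1, 1], [0, 1]], dtype=float)
    patch_vert = np.concatenate((vertices, np.array((0, 0), ndmin=2)))
    patch = mpl.patches.PathPatch(mpl.path.Path(patch_vert, closed=True),
                                  edgecolor="black", linewidth=2, fill=False)
    ax = plt.gca()
    ax.add_patch(patch)
    ax.set_xlim(-0.5, 1.5)
    ax.set_ylim(-0.5, 1.5)
    plt.draw()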
class FlowPeaks2DDensityView(By2DView, AnnotatingView, NullView):
    """
    A two-dimensional diagnostic view for :class:`FlowPeaksOp`.  Plots the
    estimated density function of the two channels, then overlays the
    k-means centroids in blue and the clusters-of-k-means in pink.

    Attributes
    ----------
    """

    id = Constant('edu.mit.synbio.cytoflow.view.flowpeaks2ddensityview')
    friendly_id = Constant("FlowPeaks 2D Diagnostic Plot (Density)")

    xchannel = Str
    ychannel = Str
    xscale = util.ScaleEnum
    yscale = util.ScaleEnum
    huefacet = Constant(None)

    def plot(self, experiment, **kwargs):
        """
        Plot the plots.

        Parameters
        ----------
        """
        if experiment is None:
            raise util.CytoflowViewError('experiment', "No experiment specified")

        if self.xchannel in self.op._scale:
            xscale = self.op._scale[self.xchannel]
        else:
            xscale = util.scale_factory(self.xscale, experiment,
                                        channel=self.xchannel)

        if self.ychannel in self.op._scale:
            yscale = self.op._scale[self.ychannel]
        else:
            yscale = util.scale_factory(self.yscale, experiment,
                                        channel=self.ychannel)

        if not self.op._kmeans:
            raise util.CytoflowViewError(None,
                                         "Must estimate a model before plotting "
                                         "the density plot.")

        annotations = {}
        for k in self.op._kmeans:
            annotations[k] = (self.op._kmeans[k],
                              self.op._peaks[k],
                              self.op._cluster_peak[k],
                              self.op._density[k])

        super().plot(experiment,
                     annotations=annotations,
                     xscale=xscale,
                     yscale=yscale,
                     **kwargs)

    def _grid_plot(self, experiment, grid, **kwargs):
        # all the real plotting happens in _annotation_plot.  this just sets
        # some defaults and then stores them for later.
        kwargs.setdefault('antialiased', False)
        kwargs.setdefault('linewidth', 0)
        kwargs.setdefault('edgecolors', 'face')
        kwargs.setdefault('cmap', plt.get_cmap('viridis'))

        xscale = kwargs['scale'][self.xchannel]
        xlim = kwargs['lim'][self.xchannel]
        yscale = kwargs['scale'][self.ychannel]
        ylim = kwargs['lim'][self.ychannel]

        under_color = kwargs.pop('under_color', None)
        if under_color is not None:
            kwargs['cmap'].set_under(color=under_color)
        else:
            kwargs['cmap'].set_under(color=kwargs['cmap'](0.0))

        bad_color = kwargs.pop('bad_color', None)
        if bad_color is not None:
            kwargs['cmap'].set_bad(color=bad_color)

        gridsize = kwargs.pop('gridsize', 50)
        xbins = xscale.inverse(np.linspace(xscale(xlim[0]),
                                           xscale(xlim[1]),
                                           gridsize))
        ybins = yscale.inverse(np.linspace(yscale(ylim[0]),
                                           yscale(ylim[1]),
                                           gridsize))

        for (i, j, _), _ in grid.facet_data():
            ax = grid.facet_axis(i, j)
            ax.fp_xbins = xbins
            ax.fp_ybins = ybins
            ax.fp_keywords = kwargs

        super()._grid_plot(experiment, grid, **kwargs)

        return dict(xscale=xscale,
                    xlim=xlim,
                    yscale=yscale,
                    ylim=ylim,
                    cmap=kwargs['cmap'])

    def _annotation_plot(self, axes, annotation, annotation_facet,
                         annotation_value, annotation_color, **kwargs):
        km = annotation[0]
        peaks = annotation[1]
        cluster_peak = annotation[2]
        density = annotation[3]

        xbins = axes.fp_xbins
        ybins = axes.fp_ybins
        kwargs = axes.fp_keywords

        # get rid of some kwargs that confuse pcolormesh
        kwargs.pop('annotations', None)
        kwargs.pop('annotation_facet', None)
        kwargs.pop('plot_name', None)

        xscale = kwargs['scale'][self.xchannel]
        yscale = kwargs['scale'][self.ychannel]
        kwargs.pop('scale')
        kwargs.pop('lim')

        h = density(util.cartesian([xscale(xbins), yscale(ybins)]))
        h = np.reshape(h, (len(xbins), len(ybins)))
        axes.pcolormesh(xbins, ybins, h.T, **kwargs)

        ix = self.op.channels.index(self.xchannel)
        iy = self.op.channels.index(self.ychannel)

        for k in range(len(km.cluster_centers_)):
            x = self.op._scale[self.xchannel].inverse(km.cluster_centers_[k][ix])
            y = self.op._scale[self.ychannel].inverse(km.cluster_centers_[k][iy])
            plt.plot(x, y, '*', color='blue')

            peak_idx = cluster_peak[k]
            peak = peaks[peak_idx]
            peak_x = xscale.inverse(peak[0])
            peak_y = yscale.inverse(peak[1])
            plt.plot([x, peak_x], [y, peak_y])

        for peak in peaks:
            # peaks are stored as (x, y) in scaled space
            x = self.op._scale[self.xchannel].inverse(peak[0])
            y = self.op._scale[self.ychannel].inverse(peak[1])
            plt.plot(x, y, 'o', color="magenta")
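# The mesh-evaluate-reshape-pcolormesh pattern from _annotation_plot above,
# with a toy Gaussian standing in for the flowPeaks density estimate (all
# names and values are illustrative).
def _sketch_density_mesh_plot():
    import numpy as np
    import matplotlib.pyplot as plt
    import scipy.stats

    density = scipy.stats.multivariate_normal(mean=[0, 0],
                                              cov=[[1, 0], [0, 2]]).pdf
    xbins = np.linspace(-3, 3, 50)
    ybins = np.linspace(-3, 3, 50)
    xx, yy = np.meshgrid(xbins, ybins, indexing='ij')
    h = density(np.column_stack([xx.ravel(), yy.ravel()]))
    h = h.reshape(len(xbins), len(ybins))
    plt.pcolormesh(xbins, ybins, h.T)  # transpose: pcolormesh is (ny, nx)
    plt.show()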
class BruteForceOptimizerStep(ExperimentOptimizerStep):
    """ Optimize a set of simulation parameters to model the provided
    experiment using the grid search (brute force) approach.

    If sim_group_max_size is 0, the step creates 1 simulation grid around a
    simulation built to model each target experiment.  If sim_group_max_size
    is a positive integer, all simulations for a target experiment are split
    into groups of size less than or equal to sim_group_max_size.

    When a simulation grid is fully run, the cost of each simulation to the
    corresponding target experiment is computed using the cost function
    attribute.  The cost data from each simulation grid is stored in the
    group_cost_data dict and combined into the step's cost_data once the
    simulation names are stripped.
    """
    # General step traits ---------------------------------------------------

    #: Type of the optimizer step
    optimizer_step_type = Constant(OPTIMIZER_STEP_TYPE)

    #: List of parameter objects to scan
    parameter_list = List(ParameterScanDescription)

    #: List of parameter names to scan
    scanned_param_names = Property(List(Str), depends_on="parameter_list[]")

    # SimulationGroup related traits ----------------------------------------

    #: List of simulation groups, scanning desired parameters, 1 per target
    #: exp.  Built from start_point_simulation and scanned_params if not
    #: provided.
    simulation_groups = List(Instance(SimulationGroup))

    #: Cost function to minimize, one per simulation group
    group_cost_functions = Dict(Str, Callable)

    #: Maximum size for each of the simulation groups in the step.  If the
    #: step needs a larger grid, it will be split into SimGroups of size
    #: less than or equal to this.
    sim_group_max_size = Int

    #: Number of the next simulation group to run
    _next_group_to_run = Int(0)

    #: Local storage of the job_manager to run subsequent groups
    _job_manager = Instance(JobManager)

    #: Make the run call blocking?
    _wait_on_run = Bool

    # Run related traits ----------------------------------------------------

    #: Total number of simulations involved in the optimization step
    size = Property(Int, depends_on="simulation_groups[]")

    #: Number of simulations already run
    size_run = Property(Int, depends_on="simulation_groups.size_run")

    #: Percentage of the optimizer that has already run
    percent_run = Property(Str, depends_on="size_run")

    # Output related traits -------------------------------------------------

    #: Aggregation method to combine costs for all components & all
    #: experiments
    cost_agg_func = Enum("sum", "mean")

    #: Dict mapping each simulation group to its cost data.
    _group_cost_data = Dict

    #: Dict mapping each component to a list of the best simulations
    optimal_simulation_for_comp = Dict

    # Run related methods ---------------------------------------------------

    def run(self, job_manager, wait=False):
        """ Run optimization step by running all simulation groups it
        contains.
        """
        # Initialize run parameters
        super(BruteForceOptimizerStep, self).run(job_manager, wait=wait)

        if not self.simulation_groups:
            self.initialize_sim_group()

        first_group = self.simulation_groups[0]
        runner = first_group.run(job_manager, wait=wait)
        self._job_manager = job_manager
        self._next_group_to_run = 1
        self._wait_on_run = wait
        return runner

    def wait(self):
        """ Wait for currently known simulation groups to finish running.
        """
        for group in self.simulation_groups:
            msg = "Waiting for {} to finish...".format(group.name)
            logger.debug(msg)
            group.wait()

    def initialize_sim_group(self):
        """ Initialize the simulation groups from the step's attributes.
        Depending on the group_max_size, there may be multiple simulation
        groups to target a given experiment.
        """
        for exp, start_point_sim in zip(self.target_experiments,
                                        self.starting_point_simulations):
            name = "Grid {}_{}".format(exp.name, self.name)
            groups = param_scans_to_sim_group(
                name, self.parameter_list, start_point_sim,
                max_size=self.sim_group_max_size
            )
            self.simulation_groups.extend(groups)

    # Cost related methods --------------------------------------------------

    def recompute_costs_for_weights(self, new_weights):
        """ Assume new weights for all cost functions.

        Also recompute costs for all groups if they have already been
        computed.
        """
        if not self.has_run:
            self.cost_func_kw["weights"] = new_weights
            return

        # Otherwise, recompute all cost data (using cached metrics stored
        # in the cost functions):
        self.invalidate_group_cost_data()

        for group in self.simulation_groups:
            # Rebuild the simulations so that we can recover parameter
            # values for the cost data dataframe:
            if not group.simulations:
                group.initialize_simulations(use_output_cache=True)

            group_name = group.name
            cost_func = self.group_cost_functions[group_name]
            cost_func.weights = new_weights
            cost_data = cost_func.compute_costs()
            # Don't aggregate yet, to avoid triggering listeners until all
            # cost_data is recomputed:
            self.update_cost_data_dict(group, cost_data, skip_aggregate=True)

        # Now we are ready to compute the step's cost_data:
        self.aggregate_cost_data()

    def compute_costs(self, sim_group, cost_function=None):
        """ Compute the costs of one of the SimulationGroups of the step.

        Also cache the cost_function for each sim_group, so that costs can
        be recomputed if weights are changed.

        Parameters
        ----------
        sim_group : SimulationGroup
            Group for which to compute costs.

        cost_function : Callable [OPTIONAL]
            Target cost function to use to compute costs.  Optional: if a
            cost_function_type has been provided at step creation, and this
            is None, a cost_function will be created.
        """
        if cost_function is None:
            klass = ALL_COST_FUNCTIONS[self.cost_function_type]
            cost_function = klass(**self.cost_func_kw)

        target_exp = sim_group.center_point_simulation.source_experiment
        cost_data = cost_function(sim_group.simulations,
                                  target_exps=target_exp)
        self.group_cost_functions[sim_group.name] = cost_function
        self.update_cost_data_dict(sim_group, cost_data)

    def update_cost_data_dict(self, group, cost_data, skip_aggregate=False):
        """ Collect all cost_function cost data for all sim groups.

        Also aggregates all of it into the step's cost_data if the step has
        finished running.  The step's cost data will aggregate data from
        all simulation groups, sum/average it over all components, and
        display the scanned parameter values alongside the aggregate cost.
""" if cost_data is None: return # Copy to avoid modifying the cost function object which has a hold on # the cost_data cost_data = cost_data.copy() simulations = group.simulations # Aggregate the cost function data df_agg_method = getattr(cost_data, self.cost_agg_func) cost_data[ALL_COST_COL_NAME] = df_agg_method(axis=1) # Add the values of the scanned parameters self.append_param_values(cost_data, simulations) # Collect the group's cost data with the rest of the data targeting the # same experiment if any: exp_name = group.center_point_simulation.source_experiment.name if exp_name in self._group_cost_data: existing = self._group_cost_data[exp_name] self._group_cost_data[exp_name] = pd.concat([existing, cost_data]) else: self._group_cost_data[exp_name] = cost_data if self.has_run and not skip_aggregate: self.aggregate_cost_data() def invalidate_group_cost_data(self): """ Past cost_data are invalid. Delete them. """ self._group_cost_data = {} def aggregate_cost_data(self): """ Aggregate cost data over all target experiment. The step's cost data will aggregate data from all simulation groups, sum/average it over all components, and display the scanned parameters values along side with the aggregate cost. """ # Remove the column name from the final cost_data since there may be # more than 1 simulation for a given parameter setup, one per target # experiment: cost_data_list = [data.drop(SIM_COL_NAME, axis=1) for data in self._group_cost_data.values()] average_cost_data = sum(cost_data_list) if self.cost_agg_func == "mean": average_cost_data /= len(self.target_experiments) self.cost_data = average_cost_data def append_param_values(self, costs_df, simulations): """ Evaluate parameters for provided sims and reset as cost DF index. """ for param_name in self.scanned_param_names: expr = "sim.{}".format(param_name) costs_df[param_name] = [eval(expr, {"sim": sim}) for sim in simulations] first_val = costs_df[param_name][0] if isinstance(first_val, UnitScalar): costs_df[param_name] = costs_df[param_name].apply(float) elif is_squeezable(first_val): # FIXME: WHEN DOES THIS HAPPEN? costs_df[param_name] = costs_df[param_name].apply(float) elif is_repeating_array(first_val): # This can happen when a parameter is a slice of an array: # replace with the first value if all the same because we can't # index with an array (unhashable). costs_df[param_name] = costs_df[param_name].apply( lambda x: x[0] ) costs_df.reset_index(inplace=True) costs_df.set_index(self.scanned_param_names, inplace=True) # Optimal simulation methods ---------------------------------------------- def update_optimal_simulation_for_comp(self): """ Extract the best simulation for each product component. """ best_simulations = defaultdict(list) for comp in self.target_components: for group_cost_data in self._group_cost_data.values(): data = group_cost_data[comp] try: idx = data.argmin(axis=0) sim_name = group_cost_data.loc[idx, SIM_COL_NAME] sim = self._get_sim_from_name(sim_name) best_simulations[comp].append(sim) except Exception as e: msg = "Failing to find the simulation with minimal cost " \ "for component {}. Data was {}. (Exception was {})" logger.error(msg.format(comp, data, e)) self.optimal_simulation_for_comp = best_simulations def get_optimal_sims(self, exp_name, num_sims): """ Collect optimal num_sims simulations matching specific experiment. 
""" if len(self.cost_data) == 0: return [] # Make sure we are not trying to extract more optimal simulations that # the total number of available simulations (for a given experiment) sorted_data = self.cost_data.sort_values(by=ALL_COST_COL_NAME) optim_sim_idx = sorted_data.index[:num_sims] # This assumes that self.cost_data and elements of # self._group_cost_data are indexed on the same columns: group_data = self._group_cost_data[exp_name] sim_names = group_data.loc[optim_sim_idx, SIM_COL_NAME].tolist() return [self._get_sim_from_name(name) for name in sim_names] # Private interface ------------------------------------------------------- def _get_sim_from_name(self, sim_name): """ Find a simulation ran in the step in the simulation sim groups. Raises ------ ValueError If the simulation isn't found. """ pattern = "Sim (\d+)_(.+)" match = re.match(pattern, sim_name) target_sim_num, target_group_name = match.groups() group = self._get_group_from_name(target_group_name) try: sim = group.get_simulation(int(target_sim_num)) if sim.name != sim_name: msg = "Logical error: the simulation's name isn't what was " \ "expected!" logger.exception(msg) raise ValueError(msg) return sim except (IndexError, AssertionError) as e: msg = "Simulation with name {} not found in step's simulation " \ "groups. Error was {}." msg = msg.format(sim_name, e) logger.error(msg) raise ValueError(msg) def _get_group_from_name(self, group_name): """ Return the simulation group with provided name. """ for group in self.simulation_groups: if group.name.startswith(group_name): return group msg = "SimulationGroup with name {} not found in step's groups. " \ "Known names are {}" known_group_names = [group.name for group in self.simulation_groups] msg = msg.format(group_name, known_group_names) logger.error(msg) raise ValueError(msg) def _get_step_has_run(self): if not self.simulation_groups: return False return all([group.has_run for group in self.simulation_groups]) # Traits listeners -------------------------------------------------------- @on_trait_change("simulation_groups:has_run") def optimize_costs(self, sim_group, attr_name, group_has_run): self.has_run = self._get_step_has_run() if group_has_run: msg = "Group {} has finished running: updating costs." msg = msg.format(sim_group.name) logger.info(msg) self.compute_costs(sim_group) if self.has_run: self.update_optimal_simulation_for_comp() else: self._run_next_sim_group() # Save memory by throwing away simulations: they can be rebuilt # from the simulation diffs. sim_group.release_simulation_list() self.data_updated = True def _run_next_sim_group(self): """ A simGroup has finished running: run the next one. """ next_group = self.simulation_groups[self._next_group_to_run] msg = "Now submitting {} to run...".format(next_group.name) logger.debug(msg) next_group.run(self._job_manager, wait=self._wait_on_run) self._next_group_to_run += 1 # Traits property getters ------------------------------------------------- def _get_size(self): return sum([group.size for group in self.simulation_groups]) def _get_size_run(self): return sum([group.size_run for group in self.simulation_groups]) def _get_percent_run(self): if self.size: percent_run = self.size_run / self.size * 100. 
        else:
            percent_run = np.nan
        return "{:.2f} %".format(percent_run)

    def _get_scanned_param_names(self):
        step_params = []
        for param in self.parameter_list:
            p_name = param.name
            parallel_params = hasattr(param, "parallel_parameters") and \
                len(param.parallel_parameters) > 0
            if parallel_params:
                step_params.extend([p.name
                                    for p in param.parallel_parameters])
            step_params.append(p_name)
        return step_params

    # Traits initialization methods -------------------------------------------

    def _cost_data_default(self):
        cols = self.target_components + [ALL_COST_COL_NAME]
        data = {name: [] for name in cols}
        return pd.DataFrame(data, index=[])

    def _sim_group_max_size_default(self):
        preferences = get_preferences()
        return preferences.optimizer_preferences.optimizer_step_chunk_size
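# The aggregation in aggregate_cost_data() above, in miniature: the
# per-experiment cost frames share an index of scanned-parameter values, so
# they can be summed elementwise and divided by the number of experiments
# for the "mean" aggregation.  The frames here are illustrative.
def _sketch_cost_aggregation():
    import pandas as pd

    costs_exp1 = pd.DataFrame({"cost": [1.0, 4.0]}, index=[0.1, 0.2])
    costs_exp2 = pd.DataFrame({"cost": [3.0, 2.0]}, index=[0.1, 0.2])
    cost_data_list = [costs_exp1, costs_exp2]
    total = sum(cost_data_list)          # elementwise sum over experiments
    return total / len(cost_data_list)   # "mean" aggregation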
class FlowPeaks1DView(By1DView, AnnotatingView, HistogramView):
    """
    A one-dimensional diagnostic view for :class:`FlowPeaksOp`.  Plots a
    histogram of the channel, then overlays the k-means centroids in blue.

    Attributes
    ----------
    """

    id = Constant('edu.mit.synbio.cytoflow.view.flowpeaks1dview')
    friendly_id = Constant("1D FlowPeaks Diagnostic Plot")

    channel = Str
    scale = util.ScaleEnum

    def plot(self, experiment, **kwargs):
        """
        Plot the plots.

        Parameters
        ----------
        """
        if experiment is None:
            raise util.CytoflowViewError('experiment', "No experiment specified")

        view, trait_name = self._strip_trait(self.op.name)

        if self.channel in self.op._scale:
            scale = self.op._scale[self.channel]
        else:
            scale = util.scale_factory(self.scale, experiment,
                                       channel=self.channel)

        super(FlowPeaks1DView, view).plot(experiment,
                                          annotation_facet=self.op.name,
                                          annotation_trait=trait_name,
                                          annotations=self.op._kmeans,
                                          scale=scale,
                                          **kwargs)

    def _annotation_plot(self, axes, annotation, annotation_facet,
                         annotation_value, annotation_color, **kwargs):
        kwargs.setdefault('orientation', 'vertical')

        cidx = self.op.channels.index(self.channel)
        if kwargs['orientation'] == 'horizontal':
            for k in range(0, self.op.num_clusters):
                c = self.op._scale[self.channel].inverse(
                        annotation.cluster_centers_[k][cidx])
                plt.axhline(c, linewidth=3, color='blue')
        else:
            for k in range(0, self.op.num_clusters):
                c = self.op._scale[self.channel].inverse(
                        annotation.cluster_centers_[k][cidx])
                plt.axvline(c, linewidth=3, color='blue')
class FlowPeaks2DView(By2DView, AnnotatingView, ScatterplotView):
    """
    A two-dimensional diagnostic view for :class:`FlowPeaksOp`.  Plots a
    scatter-plot of the two channels, then overlays the k-means centroids
    in blue and the clusters-of-k-means in pink.

    Attributes
    ----------
    """

    id = Constant('edu.mit.synbio.cytoflow.view.flowpeaks2dview')
    friendly_id = Constant("FlowPeaks 2D Diagnostic Plot")

    xchannel = Str
    ychannel = Str
    xscale = util.ScaleEnum
    yscale = util.ScaleEnum

    def plot(self, experiment, **kwargs):
        """
        Plot the plots.

        Parameters
        ----------
        """
        if experiment is None:
            raise util.CytoflowViewError('experiment', "No experiment specified")

        annotations = {}
        for k in self.op._kmeans:
            annotations[k] = (self.op._kmeans[k],
                              self.op._peaks[k],
                              self.op._cluster_peak[k])

        view, trait_name = self._strip_trait(self.op.name)

        if self.xchannel in self.op._scale:
            xscale = self.op._scale[self.xchannel]
        else:
            xscale = util.scale_factory(self.xscale, experiment,
                                        channel=self.xchannel)

        if self.ychannel in self.op._scale:
            yscale = self.op._scale[self.ychannel]
        else:
            yscale = util.scale_factory(self.yscale, experiment,
                                        channel=self.ychannel)

        super(FlowPeaks2DView, view).plot(experiment,
                                          annotation_facet=self.op.name,
                                          annotation_trait=trait_name,
                                          annotations=annotations,
                                          xscale=xscale,
                                          yscale=yscale,
                                          **kwargs)

    def _annotation_plot(self, axes, annotation, annotation_facet,
                         annotation_value, annotation_color, **kwargs):
        ix = self.op.channels.index(self.xchannel)
        iy = self.op.channels.index(self.ychannel)

        xscale = kwargs['xscale']
        yscale = kwargs['yscale']

        km = annotation[0]
        peaks = annotation[1]
        cluster_peak = annotation[2]

        for k in range(len(km.cluster_centers_)):
            x = self.op._scale[self.xchannel].inverse(km.cluster_centers_[k][ix])
            y = self.op._scale[self.ychannel].inverse(km.cluster_centers_[k][iy])
            plt.plot(x, y, '*', color='blue')

            peak_idx = cluster_peak[k]
            peak = peaks[peak_idx]
            peak_x = xscale.inverse(peak[0])
            peak_y = yscale.inverse(peak[1])
            plt.plot([x, peak_x], [y, peak_y])

        for peak in peaks:
            # peaks are stored as (x, y) in scaled space
            x = self.op._scale[self.xchannel].inverse(peak[0])
            y = self.op._scale[self.ychannel].inverse(peak[1])
            plt.plot(x, y, 'o', color="magenta")
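# The centroid bookkeeping both FlowPeaks views rely on, reduced to its
# essentials: k-means is fit in scaled space, so centroids must go through
# the scale's inverse before plotting.  log10 stands in for the operation's
# scale objects; the data is synthetic.
def _sketch_centroid_inverse_scaling():
    import numpy as np
    import sklearn.cluster

    data = np.random.default_rng(3).lognormal(mean=3, sigma=1,
                                              size=(1000, 2))
    scaled = np.log10(data)                   # forward scale
    km = sklearn.cluster.MiniBatchKMeans(n_clusters=3,
                                         random_state=0).fit(scaled)
    return 10 ** km.cluster_centers_          # inverse, back to data space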
class FlowPeaksOp(HasStrictTraits):
    """
    This module uses the **flowPeaks** algorithm to assign events to
    clusters in an unsupervised manner.

    Call :meth:`estimate` to compute the clusters.

    Calling :meth:`apply` creates a new categorical metadata variable named
    ``name``, with possible values ``{name}_1`` .... ``{name}_n`` where
    ``n`` is the number of clusters estimated.

    The same model may not be appropriate for different subsets of the data
    set.  If this is the case, you can use the :attr:`by` attribute to
    specify metadata by which to aggregate the data before estimating (and
    applying) a model.  The number of clusters is a model parameter and it
    may vary in each subset.

    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column

    channels : List(Str)
        The channels to apply the clustering algorithm to.

    scale : Dict(Str : Enum("linear", "logicle", "log"))
        Re-scale the data in the specified channels before fitting.  If a
        channel is in :attr:`channels` but not in :attr:`scale`, the
        current package-wide default (set with :func:`set_default_scale`)
        is used.

    by : List(Str)
        A list of metadata attributes to aggregate the data before
        estimating the model.  For example, if the experiment has two
        pieces of metadata, ``Time`` and ``Dox``, setting
        ``by = ["Time", "Dox"]`` will fit the model separately to each
        subset of the data with a unique combination of ``Time`` and
        ``Dox``.

    h : Float (default = 1.5)
        A scalar value by which to scale the covariance matrices of the
        underlying density function.  (See ``Notes``, below, for more
        details.)

    h0 : Float (default = 1.0)
        A scalar value by which to smooth the covariance matrices of the
        underlying density function.  (See ``Notes``, below, for more
        details.)

    tol : Float (default = 0.5)
        How readily should clusters be merged?  Must be between 0 and 1.
        See ``Notes``, below, for more details.

    merge_dist : Float (default = 5)
        How far apart can clusters be before they are merged?  This is a
        unit-free scalar, and is approximately the maximum number of
        k-means clusters between peaks.

    find_outliers : Bool (default = False)
        Should the algorithm use an extra step to identify outliers?

        .. note::
            I have disabled this code until I can try to make it faster.

    Notes
    -----
    This algorithm uses k-means to find a large number of clusters, then
    hierarchically merges those clusters.  Thus, the user does not need to
    specify the number of clusters in advance, and it can find non-convex
    clusters.  It also operates in an arbitrary number of dimensions.

    The merging happens in two steps.  First, the cluster centroids are
    used to estimate an underlying density function.  Then, the local
    maxima of the density function are found using a numerical optimization
    starting from each centroid, and k-means clusters that converge to the
    same local maximum are merged.  Finally, these clusters-of-clusters are
    merged if their local maxima are (a) close enough, and (b) the density
    function between them is smooth enough.  Thus, the final assignment of
    each event depends on the k-means cluster it ends up in, and which
    cluster-of-clusters that k-means centroid is assigned to.

    There are a lot of parameters that affect this process.  The k-means
    clustering is pretty robust (though somewhat sensitive to the number of
    clusters, which is currently not exposed in the API.)  The most
    important are exposed as attributes of the :class:`FlowPeaksOp` class.
    These include:

    - :attr:`h`, :attr:`h0`: sometimes the density function is too "rough"
      to find good local maxima.
      These parameters smooth it out by widening the covariance matrices.
      Increasing :attr:`h` makes the density rougher; increasing :attr:`h0`
      makes it smoother.

    - :attr:`tol`: How smooth does the density function have to be between
      two density maxima to merge them?  Must be between 0 and 1.

    - :attr:`merge_dist`: How close must two maxima be to merge them?  This
      value is a unit-free scalar, and is approximately the number of
      k-means clusters between the two maxima.

    For details and a theoretical justification, see [1]_

    References
    ----------

    .. [1] Ge, Yongchao and Sealfon, Stuart C.  "flowPeaks: a fast
       unsupervised clustering for flow cytometry data via K-means and
       density peak finding"  Bioinformatics (2012) 28 (15): 2052-2058.

    Examples
    --------

    .. plot::
        :context: close-figs

        Make a little data set.

        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()

    Create and parameterize the operation.

    .. plot::
        :context: close-figs

        >>> fp_op = flow.FlowPeaksOp(name = 'Flow',
        ...                          channels = ['V2-A', 'Y2-A'],
        ...                          scale = {'V2-A' : 'log',
        ...                                   'Y2-A' : 'log'},
        ...                          h0 = 3)

    Estimate the clusters

    .. plot::
        :context: close-figs

        >>> fp_op.estimate(ex)

    Plot a diagnostic view of the underlying density

    .. plot::
        :context: close-figs

        >>> fp_op.default_view(density = True).plot(ex)

    Apply the gate

    .. plot::
        :context: close-figs

        >>> ex2 = fp_op.apply(ex)

    Plot a diagnostic view with the event assignments

    .. plot::
        :context: close-figs

        >>> fp_op.default_view().plot(ex2)
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.flowpeaks')
    friendly_id = Constant("FlowPeaks Clustering")

    name = CStr()
    channels = List(Str)
    scale = Dict(Str, util.ScaleEnum)
    by = List(Str)
    # find_outliers = Bool(False)

    # parameters that control estimation, with sensible defaults
    h = util.PositiveFloat(1.5, allow_zero=False)
    h0 = util.PositiveFloat(1, allow_zero=False)
    tol = util.PositiveFloat(0.5, allow_zero=False)
    merge_dist = util.PositiveFloat(5, allow_zero=False)

    # parameters that control outlier selection, with sensible defaults

    _kmeans = Dict(Any, Instance(sklearn.cluster.MiniBatchKMeans),
                   transient=True)
    _normals = Dict(Any, List(Function), transient=True)
    _density = Dict(Any, Function, transient=True)
    _peaks = Dict(Any, List(Array), transient=True)
    _cluster_peak = Dict(Any, List, transient=True)   # kmeans cluster idx --> peak idx
    _cluster_group = Dict(Any, List, transient=True)  # kmeans cluster idx --> group idx
    _scale = Dict(Str, Instance(util.IScale), transient=True)

    def estimate(self, experiment, subset=None):
        """
        Estimate the k-means clusters, then hierarchically merge them.

        Parameters
        ----------
        experiment : Experiment
            The :class:`.Experiment` to use to estimate the k-means
            clusters

        subset : str (default = None)
            A Python expression that specifies a subset of the data in
            ``experiment`` to use to parameterize the operation.
""" if experiment is None: raise util.CytoflowOpError('experiment', "No experiment specified") if len(self.channels) == 0: raise util.CytoflowOpError('channels', "Must set at least one channel") for c in self.channels: if c not in experiment.data: raise util.CytoflowOpError( 'channels', "Channel {0} not found in the experiment".format(c)) for c in self.scale: if c not in self.channels: raise util.CytoflowOpError( 'scale', "Scale set for channel {0}, but it isn't " "in the experiment".format(c)) for b in self.by: if b not in experiment.conditions: raise util.CytoflowOpError( 'by', "Aggregation metadata {} not found, " "must be one of {}".format(b, experiment.conditions)) if subset: try: experiment = experiment.query(subset) except: raise util.CytoflowOpError( 'subset', "Subset string '{0}' isn't valid".format(subset)) if len(experiment) == 0: raise util.CytoflowOpError( 'subset', "Subset string '{0}' returned no events".format(subset)) if self.by: groupby = experiment.data.groupby(self.by) else: # use a lambda expression to return a group that contains # all the events groupby = experiment.data.groupby(lambda _: True) # get the scale. estimate the scale params for the ENTIRE data set, # not subsets we get from groupby(). And we need to save it so that # the data is transformed the same way when we apply() for c in self.channels: if c in self.scale: self._scale[c] = util.scale_factory(self.scale[c], experiment, channel=c) # if self.scale[c] == 'log': # self._scale[c].mode = 'mask' else: self._scale[c] = util.scale_factory(util.get_default_scale(), experiment, channel=c) for data_group, data_subset in groupby: if len(data_subset) == 0: raise util.CytoflowOpError( 'by', "Group {} had no data".format(data_group)) x = data_subset.loc[:, self.channels[:]] for c in self.channels: x[c] = self._scale[c](x[c]) # drop data that isn't in the scale range for c in self.channels: x = x[~(np.isnan(x[c]))] x = x.values #### choose the number of clusters and fit the kmeans num_clusters = [ util.num_hist_bins(x[:, c]) for c in range(len(self.channels)) ] num_clusters = np.ceil(np.median(num_clusters)) num_clusters = int(num_clusters) self._kmeans[data_group] = kmeans = \ sklearn.cluster.MiniBatchKMeans(n_clusters = num_clusters, random_state = 0) kmeans.fit(x) x_labels = kmeans.predict(x) d = len(self.channels) #### use the kmeans centroids to parameterize a finite gaussian #### mixture model which estimates the density function d = len(self.channels) s0 = np.zeros([d, d]) for j in range(d): r = x[d].max() - x[d].min() s0[j, j] = (r / (num_clusters**(1. 
/ d)))**0.5 means = [] weights = [] normals = [] for k in range(num_clusters): xk = x[x_labels == k] num_k = np.sum(x_labels == k) weight_k = num_k / len(x_labels) mu = xk.mean(axis=0) means.append(mu) s = np.cov(xk, rowvar=False) el = num_k / (num_clusters + num_k) s_smooth = el * self.h * s + (1.0 - el) * self.h0 * s0 n = scipy.stats.multivariate_normal(mean=mu, cov=s_smooth) weights.append(weight_k) normals.append(lambda x, n=n: n.pdf(x)) self._normals[data_group] = normals self._density[ data_group] = density = lambda x, weights=weights, normals=normals: np.sum( [w * n(x) for w, n in zip(weights, normals)], axis=0) ### use optimization on the finite gmm to find the local peak for ### each kmeans cluster peaks = [] peak_clusters = [] # peak idx --> list of clusters min_mu = [np.inf] * len(self.channels) max_mu = [-1.0 * np.inf] * len(self.channels) for k in range(num_clusters): mu = means[k] for ci in range(len(self.channels)): if mu[ci] < min_mu[ci]: min_mu[ci] = mu[ci] if mu[ci] > max_mu[ci]: max_mu[ci] = mu[ci] for k in range(num_clusters): mu = means[k] f = lambda x: -1.0 * density(x) res = scipy.optimize.minimize(f, mu, method="CG", options={'gtol': 1e-3}) if not res.success: warn( "Peak finding failed for cluster {}: {}".format( k, res.message), util.CytoflowWarning) # ### The peak-searching algorithm from the paper. works fine, # ### but slow! we get similar results with the conjugate gradient # ### optimization method from scipy # x0 = x = means[k] # k0 = k # b = beta_max[k] / 10.0 # Nsuc = 0 # n = 0 # # while(n < 1000): # # df = scipy.misc.derivative(density, x, 1e-6) # df = statsmodels.tools.numdiff.approx_fprime(x, density) # if np.linalg.norm(df) < 1e-3: # break # # y = x + b * df / np.linalg.norm(df) # if density(y) <= density(x): # Nsuc = 0 # b = b / 2.0 # continue # # Nsuc += 1 # if Nsuc >= 2: # b = min(2*b, beta_max[k]) # # ky = kmeans.predict(y[np.newaxis, :])[0] # if ky == k: # x = y # else: # k = ky # b = beta_max[k] / 10.0 # mu = means[k] # if density(mu) > density(y): # x = mu # else: # x = y # # n += 1 merged = False for pi, p in enumerate(peaks): # TODO - this probably only works for scaled measurements if np.linalg.norm(p - res.x) < (1e-2): peak_clusters[pi].append(k) merged = True break if not merged: peak_clusters.append([k]) peaks.append(res.x) self._peaks[data_group] = peaks ### merge peaks that are sufficiently close groups = [[x] for x in range(len(peaks))] peak_groups = [x for x in range(len(peaks)) ] # peak idx --> group idx def max_tol(x, y): f = lambda a: density(a[np.newaxis, :]) # lx = kmeans.predict(x[np.newaxis, :])[0] # ly = kmeans.predict(y[np.newaxis, :])[0] n = len(x) n_scale = 1 # n_scale = np.sqrt(((nx + ny) / 2.0) / (n / num_clusters)) def tol(t): zt = x + t * (y - x) fhat_zt = f(x) + t * (f(y) - f(x)) return -1.0 * abs((f(zt) - fhat_zt) / fhat_zt) * n_scale res = scipy.optimize.minimize_scalar(tol, bounds=[0, 1], method='Bounded') if res.status != 0: raise util.CytoflowOpError( None, "tol optimization failed for {}, {}".format(x, y)) return -1.0 * res.fun def nearest_neighbor_dist(k): min_dist = np.inf for i in range(num_clusters): if i == k: continue dist = np.linalg.norm(means[k] - means[i]) if dist < min_dist: min_dist = dist return min_dist sk = [nearest_neighbor_dist(x) for x in range(num_clusters)] def s(x): k = kmeans.predict(x[np.newaxis, :])[0] return sk[k] def can_merge(g, h): for pg in g: for ph in h: vg = peaks[pg] vh = peaks[ph] dist_gh = np.linalg.norm(vg - vh) if max_tol(vg, vh) < self.tol and dist_gh / ( s(vg) + s(vh)) <= 
self.merge_dist: return True return False while True: if len(groups) == 1: break # find closest mergable groups min_dist = np.inf for gi in range(len(groups)): g = groups[gi] for hi in range(gi + 1, len(groups)): h = groups[hi] if can_merge(g, h): dist_gh = np.inf for pg in g: vg = peaks[pg] for ph in h: vh = peaks[ph] # print("vg {} vh {}".format(vg, vh)) dist_gh = min(dist_gh, np.linalg.norm(vg - vh)) if dist_gh < min_dist: min_gi = gi min_hi = hi min_dist = dist_gh if min_dist == np.inf: break # merge the groups groups[min_gi].extend(groups[min_hi]) for g in groups[min_hi]: peak_groups[g] = min_gi del groups[min_hi] cluster_group = [0] * num_clusters cluster_peaks = [0] * num_clusters for gi, g in enumerate(groups): for p in g: for cluster in peak_clusters[p]: cluster_group[cluster] = gi cluster_peaks[cluster] = p self._cluster_peak[data_group] = cluster_peaks self._cluster_group[data_group] = cluster_group def apply(self, experiment): """ Assign events to a cluster. Assigns each event to one of the k-means centroids from :meth:`estimate`, then groups together events in the same cluster hierarchy. Parameters ---------- experiment : Experiment the :class:`.Experiment` to apply the gate to. Returns ------- Experiment A new :class:`.Experiment` with the gate applied to it. TODO - document the extra statistics """ if experiment is None: raise util.CytoflowOpError('experiment', "No experiment specified") # make sure name got set! if not self.name: raise util.CytoflowOpError( 'name', "You have to set the gate's name " "before applying it!") if self.name in experiment.data.columns: raise util.CytoflowOpError( 'name', "Experiment already has a column named {0}".format(self.name)) if len(self.channels) == 0: raise util.CytoflowOpError('channels', "Must set at least one channel") if not self._peaks: raise util.CytoflowOpError( None, "No model found. Did you forget to " "call estimate()?") for c in self.channels: if c not in experiment.data: raise util.CytoflowOpError( 'channels', "Channel {0} not found in the experiment".format(c)) for c in self.scale: if c not in self.channels: raise util.CytoflowOpError( 'scale', "Scale set for channel {0}, but it isn't " "in the experiment".format(c)) for b in self.by: if b not in experiment.conditions: raise util.CytoflowOpError( 'by', "Aggregation metadata {} not found, " "must be one of {}".format(b, experiment.conditions)) if self.by: groupby = experiment.data.groupby(self.by) else: # use a lambda expression to return a group that contains # all the events groupby = experiment.data.groupby(lambda _: True) event_assignments = pd.Series(["{}_None".format(self.name)] * len(experiment), dtype="object") # make the statistics # clusters = [x + 1 for x in range(self.num_clusters)] # # idx = pd.MultiIndex.from_product([experiment[x].unique() for x in self.by] + [clusters] + [self.channels], # names = list(self.by) + ["Cluster"] + ["Channel"]) # centers_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index() for group, data_subset in groupby: if len(data_subset) == 0: raise util.CytoflowOpError( 'by', "Group {} had no data".format(group)) if group not in self._kmeans: raise util.CytoflowOpError( 'by', "Group {} not found in the estimated " "model. Do you need to re-run estimate()?".format(group)) x = data_subset.loc[:, self.channels[:]] for c in self.channels: x[c] = self._scale[c](x[c]) # which values are missing? 
x_na = pd.Series([False] * len(x)) for c in self.channels: x_na[np.isnan(x[c]).values] = True x = x.values x_na = x_na.values group_idx = groupby.groups[group] kmeans = self._kmeans[group] predicted_km = np.full(len(x), -1, "int") predicted_km[~x_na] = kmeans.predict(x[~x_na]) groups = np.asarray(self._cluster_group[group]) predicted_group = np.full(len(x), -1, "int") predicted_group[~x_na] = groups[predicted_km[~x_na]] # outlier detection code. this is disabled for the moment # because it is really slow. # num_groups = len(set(groups)) # if self.find_outliers: # density = self._density[group] # max_d = [-1.0 * np.inf] * num_groups # # for xi in range(len(x)): # if x_na[xi]: # continue # # x_c = predicted_group[xi] # d_x_c = density(x[xi]) # if d_x_c > max_d[x_c]: # max_d[x_c] = d_x_c # # group_density = [None] * num_groups # group_weight = [0.0] * num_groups # # for c in range(num_groups): # num_c = np.sum(predicted_group == c) # clusters = np.argwhere(groups == c).flatten() # # normals = [] # weights = [] # for k in range(len(clusters)): # num_k = np.sum(predicted_km == k) # weight_k = num_k / num_c # group_weight[c] += num_k / len(x) # weights.append(weight_k) # normals.append(self._normals[group][k]) # # group_density[c] = lambda x, weights = weights, normals = normals: np.sum([w * n(x) for w, n in zip(weights, normals)], axis = 0) # # for xi in range(len(x)): # if x_na[xi]: # continue # # x_c = predicted_group[xi] # # if density(x[xi]) / max_d[x_c] < 0.01: # predicted_group[xi] = -1 # continue # # sum_d = 0 # for c in set(groups): # sum_d += group_weight[c] * group_density[c](x[xi]) # # if group_weight[x_c] * group_density[x_c](x[xi]) / sum_d < 0.8: # predicted_group[xi] = -1 # # max_d = -1.0 * np.inf # for x_c in x[predicted_group == c]: # x_c_d = density(x_c) # if x_c_d > max_d: # max_d = x_c_d # # for i in range(len(x)): # if predicted_group[i] == c and density(x[i]) / max_d <= 0.01: # predicted_group[i] = -1 # # predicted_str = pd.Series(["(none)"] * len(predicted_group)) for c in range(len(self._cluster_group[group])): predicted_str[predicted_group == c] = "{0}_{1}".format( self.name, c + 1) predicted_str[predicted_group == -1] = "{0}_None".format(self.name) predicted_str.index = group_idx event_assignments.iloc[group_idx] = predicted_str new_experiment = experiment.clone() new_experiment.add_condition(self.name, "category", event_assignments) # new_experiment.statistics[(self.name, "centers")] = pd.to_numeric(centers_stat) new_experiment.history.append( self.clone_traits(transient=lambda _: True)) return new_experiment def default_view(self, **kwargs): """ Returns a diagnostic plot of the Gaussian mixture model. Parameters ---------- channels : List(Str) Which channels to plot? Must be contain either one or two channels. scale : List({'linear', 'log', 'logicle'}) How to scale the channels before plotting them density : bool Should we plot a scatterplot or the estimated density function? Returns ------- IView an IView, call :meth:`plot` to see the diagnostic plot. 
""" channels = kwargs.pop('channels', self.channels) scale = kwargs.pop('scale', self.scale) density = kwargs.pop('density', False) for c in channels: if c not in self.channels: raise util.CytoflowViewError( 'channels', "Channel {} isn't in the operation's channels".format(c)) for s in scale: if s not in self.channels: raise util.CytoflowViewError( 'channels', "Channel {} isn't in the operation's channels".format(s)) for c in channels: if c not in scale: scale[c] = util.get_default_scale() if len(channels) == 0: raise util.CytoflowViewError( 'channels', "Must specify at least one channel for a default view") elif len(channels) == 1: v = FlowPeaks1DView(op=self) v.trait_set(channel=channels[0], scale=scale[channels[0]], **kwargs) return v elif len(channels) == 2: if density: v = FlowPeaks2DDensityView(op=self) v.trait_set(xchannel=channels[0], ychannel=channels[1], xscale=scale[channels[0]], yscale=scale[channels[1]], **kwargs) return v else: v = FlowPeaks2DView(op=self) v.trait_set(xchannel=channels[0], ychannel=channels[1], xscale=scale[channels[0]], yscale=scale[channels[1]], **kwargs) return v else: raise util.CytoflowViewError( None, "Can't specify more than two channels for a default view")
class BeadCalibrationOp(HasStrictTraits):
    """
    Calibrate arbitrary channels to molecules-of-fluorophore using fluorescent
    beads (eg, the Spherotech RCP-30-5A rainbow beads.)

    To use, set the `beads_file` property to an FCS file containing the beads'
    events; specify which beads you ran by setting the `beads_type` property
    to match one of the values of BeadCalibrationOp.BEADS; and set the `units`
    dict to which channels you want calibrated and in which units.  Then,
    call `estimate()` and check the peak-finding with `default_view().plot()`.
    If the peak-finding is wacky, try adjusting `bead_peak_quantile` and
    `bead_brightness_threshold`.  When the peaks are successfully identified,
    call apply() on your experimental data set.

    If you can't make the peak finding work, please submit a bug report!

    This procedure works best when the beads file contains very clean data.
    It does not do its own gating (maybe a future addition?)  In the meantime,
    I recommend gating the *acquisition* on the FSC/SSC channels in order
    to get rid of debris, cells, and other noise.

    Finally, because you can't have a negative number of fluorescent molecules
    (MEFLs, etc) (as well as for math reasons), this module filters out
    negative values.

    Attributes
    ----------
    name : Str
        The operation name (for UI representation.)

    units : Dict(Str, Str)
        A dictionary specifying the channels you want calibrated (keys) and
        the units you want them calibrated in (values).  The units must be
        keys of the `beads` attribute.

    beads_file : File
        A file containing the FCS events from the beads.  Must be set to use
        `estimate()`.  This isn't persisted by `pickle()`.

    beads : Dict(Str, List(Float))
        The beads' characteristics.  Keys are calibrated units (ie, MEFL or
        MEAP) and values are ordered lists of known fluorophore levels.
        Common values for this dict are included in BeadCalibrationOp.BEADS.
        Must be set to use `estimate()`.

    bead_peak_quantile : Int
        The quantile threshold used to choose bead peaks.  Default == 80.
        Must be set to use `estimate()`.

    bead_brightness_threshold : Float
        How bright must a bead peak be to be considered?  Default == 100.
        Must be set to use `estimate()`.

    bead_brightness_cutoff : Float
        If a bead peak is above this, then don't consider it.  Takes care of
        clipping saturated detector output.  Defaults to 70% of the detector
        range.

    Notes
    -----
    The peak finding is rather sophisticated.

    For each channel, a 256-bin histogram is computed on the log-transformed
    bead data, and then the histogram is smoothed with a Savitzky-Golay
    filter (with a window length of 5 and a polynomial order of 1).

    Next, a wavelet-based peak-finding algorithm is used: it convolves the
    smoothed histogram with a series of wavelets and looks for relative
    maxima at various length-scales.  The parameters of the smoothing
    algorithm were arrived at empirically, using beads collected at a wide
    range of PMT voltages.

    Finally, the peaks are filtered by height (the histogram bin has a
    quantile greater than `bead_peak_quantile`) and intensity (brighter than
    `bead_brightness_threshold`).

    How to convert from a series of peaks to mean equivalent fluorochrome?
    If there's one peak, we assume that it's the brightest peak.  If there
    are two peaks, we assume they're the brightest two.  If there are
    n >= 3 peaks, we check all the contiguous n-subsets of the bead
    intensities and find the one whose linear regression (in log space!)
    has the smallest norm (square-root sum-of-squared-residuals.)

    There's a slight subtlety in the fact that we're performing the linear
    regression in log-space: if the relationship in log10-space is
    Y = aX + b, then the same relationship in linear space is
    x = 10**X, y = 10**Y, and y = (10**b) * (x ** a).

    One more thing.  Because the beads are (log) evenly spaced across all
    the channels, we can directly compute the fluorophore equivalent in
    channels where we wouldn't usually measure that fluorophore: for example,
    you can compute MEFL (mean equivalent fluorescein) in the PE-Texas Red
    channel, because the bead peak pattern is the same in the PE-Texas Red
    channel as it would be in the FITC channel.

    Examples
    --------
    >>> bead_op = flow.BeadCalibrationOp()
    >>> bead_op.beads = flow.BeadCalibrationOp.BEADS["Spherotech RCP-30-5A Lot AA01-AA04, AB01, AB02, AC01, GAA01-R"]
    >>> bead_op.units = {"Pacific Blue-A" : "MEFL", "FITC-A" : "MEFL", "PE-Tx-Red-YG-A" : "MEFL"}
    >>>
    >>> bead_op.beads_file = "beads.fcs"
    >>> bead_op.estimate(ex3)
    >>>
    >>> bead_op.default_view().plot(ex3)
    >>> # check the plot!
    >>>
    >>> ex4 = bead_op.apply(ex3)
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.beads_calibrate')
    friendly_id = Constant("Bead Calibration")

    name = Constant("Bead Calibration")
    units = Dict(Str, Str)

    beads_file = File(exists = True)
    bead_peak_quantile = Int(80)

    bead_brightness_threshold = Float(100)
    bead_brightness_cutoff = Float(Undefined)
    # TODO - bead_brightness_threshold should probably be different depending
    # on the data range of the input.

    beads = Dict(Str, List(Float))

    _calibration_functions = Dict(Str, Python, transient = True)
    _peaks = Dict(Str, Python, transient = True)
    _mefs = Dict(Str, Python, transient = True)

    def estimate(self, experiment, subset = None):
        """
        Estimate the calibration coefficients from the beads file.
        """
        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        if not self.beads_file:
            raise util.CytoflowOpError("No beads file specified")

        if not set(self.units.keys()) <= set(experiment.channels):
            raise util.CytoflowOpError("Specified channels that weren't found "
                                       "in the experiment.")

        if not set(self.units.values()) <= set(self.beads.keys()):
            raise util.CytoflowOpError("Units don't match beads.")

        # make a little Experiment
        check_tube(self.beads_file, experiment)
        beads_exp = ImportOp(tubes = [Tube(file = self.beads_file)],
                             name_metadata = experiment.metadata['name_metadata']).apply()

        channels = self.units.keys()

        for channel in channels:
            data = beads_exp.data[channel]

            # TODO - this assumes the data is on a linear scale.  check it!
            data_range = experiment.metadata[channel]['range']

            if self.bead_brightness_cutoff is Undefined:
                cutoff = 0.7 * data_range
            else:
                cutoff = self.bead_brightness_cutoff

            # bin the data on a log scale
            hist_bins = np.logspace(1, math.log(data_range, 2), num = 256, base = 2)
            hist = np.histogram(data, bins = hist_bins)

            # mask off-scale values
            hist[0][0] = 0
            hist[0][-1] = 0

            # smooth it with a Savitzky-Golay filter
            hist_smooth = scipy.signal.savgol_filter(hist[0], 5, 1)

            # find peaks
            peak_bins = scipy.signal.find_peaks_cwt(hist_smooth,
                                                    widths = np.arange(3, 20),
                                                    max_distances = np.arange(3, 20) / 2)

            # filter by height and intensity
            peak_threshold = np.percentile(hist_smooth, self.bead_peak_quantile)
            peak_bins_filtered = \
                [x for x in peak_bins if hist_smooth[x] > peak_threshold
                 and hist[1][x] > self.bead_brightness_threshold
                 and hist[1][x] < cutoff]

            peaks = [hist_bins[x] for x in peak_bins_filtered]

            mef_unit = self.units[channel]

            if not mef_unit in self.beads:
                raise util.CytoflowOpError("Invalid unit {0} specified for channel {1}".format(mef_unit, channel))

            # "mean equivalent fluorochrome"
            mef = self.beads[mef_unit]

            if len(peaks) == 0:
                raise util.CytoflowOpError("Didn't find any peaks; check the diagnostic plot")
            elif len(peaks) > len(self.beads):
                raise util.CytoflowOpError("Found too many peaks; check the diagnostic plot")
            elif len(peaks) == 1:
                # if we only have one peak, assume it's the brightest peak
                a = mef[-1] / peaks[0]
                self._peaks[channel] = peaks
                self._mefs[channel] = [mef[-1]]
                self._calibration_functions[channel] = lambda x, a = a: a * x
            elif len(peaks) == 2:
                # if we have only two peaks, assume they're the brightest two.
                # peaks are in ascending order, so pair them with the two
                # brightest bead intensities in the same order.
                self._peaks[channel] = peaks
                self._mefs[channel] = [mef[-2], mef[-1]]
                a = (mef[-1] - mef[-2]) / (peaks[1] - peaks[0])
                self._calibration_functions[channel] = lambda x, a = a: a * x
            else:
                # if there are n > 2 peaks, check all the contiguous n-subsets
                # of mef for the one whose linear regression with the peaks
                # has the smallest (norm) sum-of-residuals.

                # do it in log10 space because otherwise the brightest peaks
                # have an outsized influence.

                best_resid = np.inf
                for start, end in [(x, x + len(peaks)) for x in range(len(mef) - len(peaks) + 1)]:
                    mef_subset = mef[start:end]

                    # linear regression of the peak locations against mef subset
                    lr = np.polyfit(np.log10(peaks),
                                    np.log10(mef_subset),
                                    deg = 1,
                                    full = True)

                    resid = lr[1][0]
                    if resid < best_resid:
                        best_lr = lr[0]
                        best_resid = resid
                        # keep the best-fitting subset, not the last one tried
                        self._peaks[channel] = peaks
                        self._mefs[channel] = mef_subset

                # remember, these (linear) coefficients came from logspace, so
                # if the relationship in log10 space is Y = aX + b, then in
                # linear space the relationship is x = 10**X, y = 10**Y,
                # and y = (10**b) * x ^ a

                # also remember that the result of np.polyfit is a list of
                # coefficients with the highest power first!  so if we
                # solve y = ax + b, coeff #0 is a and coeff #1 is b
                a = best_lr[0]
                b = 10 ** best_lr[1]
                self._calibration_functions[channel] = \
                    lambda x, a = a, b = b: b * np.power(x, a)

    def apply(self, experiment):
        """Applies the bead calibration to an experiment.

        Parameters
        ----------
        experiment : Experiment
            the experiment to which this op is applied

        Returns
        -------
            a new experiment calibrated in physical units.
        """
        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        channels = self.units.keys()

        if not self.units:
            raise util.CytoflowOpError("No channels to calibrate.")

        if not self._calibration_functions:
            raise util.CytoflowOpError("Calibration not found. 
" "Did you forget to call estimate()?") if not set(channels) <= set(experiment.channels): raise util.CytoflowOpError("Module units don't match experiment channels") if set(channels) != set(self._calibration_functions.keys()): raise util.CytoflowOpError("Calibration doesn't match units. " "Did you forget to call estimate()?") # two things. first, you can't raise a negative value to a non-integer # power. second, negative physical units don't make sense -- how can # you have the equivalent of -5 molecules of fluoresceine? so, # we filter out negative values here. new_experiment = experiment.clone() for channel in channels: new_experiment.data = \ new_experiment.data[new_experiment.data[channel] > 0] new_experiment.data.reset_index(drop = True, inplace = True) for channel in channels: calibration_fn = self._calibration_functions[channel] new_experiment[channel] = calibration_fn(new_experiment[channel]) new_experiment.metadata[channel]['bead_calibration_fn'] = calibration_fn new_experiment.metadata[channel]['units'] = self.units[channel] if 'range' in experiment.metadata[channel]: new_experiment.metadata[channel]['range'] = calibration_fn(experiment.metadata[channel]['range']) new_experiment.history.append(self.clone_traits(transient = lambda t: True)) return new_experiment def default_view(self, **kwargs): """ Returns a diagnostic plot to see if the bleedthrough spline estimation is working. Returns ------- IView : An IView, call plot() to see the diagnostic plots """ return BeadCalibrationDiagnostic(op = self, **kwargs) BEADS = { # from http://www.spherotech.com/RCP-30-5a%20%20rev%20H%20ML%20071712.xls "Spherotech RCP-30-5A Lot AG01, AF02, AD04 and AAE01" : { "MECSB" : [216, 464, 1232, 2940, 7669, 19812, 35474], "MEBFP" : [861, 1997, 5776, 15233, 45389, 152562, 396759], "MEFL" : [792, 2079, 6588, 16471, 47497, 137049, 271647], "MEPE" : [531, 1504, 4819, 12506, 36159, 109588, 250892], "MEPTR" : [233, 669, 2179, 5929, 18219, 63944, 188785], "MECY" : [1614, 4035, 12025, 31896, 95682, 353225, 1077421], "MEPCY7" : [14916, 42336, 153840, 494263], "MEAP" : [373, 1079, 3633, 9896, 28189, 79831, 151008], "MEAPCY7" : [2864, 7644, 19081, 37258]}, # from http://www.spherotech.com/RCP-30-5a%20%20rev%20G.2.xls "Spherotech RCP-30-5A Lot AA01-AA04, AB01, AB02, AC01, GAA01-R": { "MECSB" : [179, 400, 993, 3203, 6083, 17777, 36331], "MEBFP" : [700, 1705, 4262, 17546, 35669, 133387, 412089], "MEFL" : [692, 2192, 6028, 17493, 35674, 126907, 290983], "MEPE" : [505, 1777, 4974, 13118, 26757, 94930, 250470], "MEPTR" : [207, 750, 2198, 6063, 12887, 51686, 170219], "MECY" : [1437, 4693, 12901, 36837, 76621, 261671, 1069858], "MEPCY7" : [32907, 107787, 503797], "MEAP" : [587, 2433, 6720, 17962, 30866, 51704, 146080], "MEAPCY7" : [718, 1920, 5133, 9324, 14210, 26735]}}
class LogScale(ScaleMixin):
    id = Constant("edu.mit.synbio.cytoflow.utility.log_scale")
    name = "log"

    experiment = Instance("cytoflow.Experiment")

    # must set one of these.  they're considered in order.
    channel = Str
    condition = Str
    statistic = Tuple(Str, Str)
    error_statistic = Tuple(Str, Str)
    data = Array

    mode = Enum("mask", "clip")
    threshold = Property(Float, depends_on = "[experiment, condition, channel, statistic, error_statistic]")
    _channel_threshold = Float(0.1)

    mpl_params = Property(Dict)

    def _get_mpl_params(self):
        return {"nonposx" : self.mode,
                "nonposy" : self.mode}

    def _set_threshold(self, threshold):
        self._channel_threshold = threshold

    def _get_threshold(self):
        if self.channel:
            return self._channel_threshold
        elif self.condition:
            cond = self.experiment[self.condition][self.experiment[self.condition] > 0]
            return cond.min()
        elif self.statistic in self.experiment.statistics \
             and not self.error_statistic in self.experiment.statistics:
            stat = self.experiment.statistics[self.statistic]
            assert is_numeric(stat)
            return stat[stat > 0].min()
        elif self.statistic in self.experiment.statistics \
             and self.error_statistic in self.experiment.statistics:
            stat = self.experiment.statistics[self.statistic]
            err_stat = self.experiment.statistics[self.error_statistic]
            stat_min = stat[stat > 0].min()
            try:
                err_min = min([x for x in [min(x) for x in err_stat] if x > 0])
                return err_min
            except (TypeError, IndexError):
                err_min = min([x for x in err_stat if stat_min - x > 0])
                return stat_min - err_min
        elif self.data.size > 0:
            return self.data[self.data > 0].min()

    def __call__(self, data):
        # this function should work with: int, float, tuple, list, pd.Series,
        # np.ndarray.  it should return the same data type as it was passed.
        if isinstance(data, (int, float)):
            if self.mode == "mask":
                if data < self.threshold:
                    raise CytoflowError("data < scale.threshold (currently: {})".format(self.threshold))
                else:
                    return np.log10(data)
            else:
                if data < self.threshold:
                    return np.log10(self.threshold)
                else:
                    return np.log10(data)
        elif isinstance(data, (list, tuple)):
            ret = [self.__call__(x) for x in data]
            if isinstance(data, tuple):
                return tuple(ret)
            else:
                return ret
        elif isinstance(data, (np.ndarray, pd.Series)):
            mask_value = np.nan if self.mode == "mask" else self.threshold
            x = pd.Series(data)
            x = x.mask(lambda x: x < self.threshold, other = mask_value)
            ret = np.log10(x)

            if isinstance(data, pd.Series):
                return ret
            else:
                return ret.values
        else:
            raise CytoflowError("Unknown type {} passed to log_scale.__call__".format(type(data)))

    def inverse(self, data):
        # this function should work with: int, float, tuple, list, pd.Series,
        # np.ndarray
        if isinstance(data, (int, float)):
            return np.power(10, data)
        elif isinstance(data, (list, tuple)):
            ret = [np.power(10, x) for x in data]
            if isinstance(data, tuple):
                return tuple(ret)
            else:
                return ret
        elif isinstance(data, (np.ndarray, pd.Series)):
            return np.power(10, data)
        else:
            raise CytoflowError("Unknown type {} passed to log_scale.inverse".format(type(data)))

    def clip(self, data):
        if isinstance(data, pd.Series):
            return data.clip(lower = self.threshold)
        elif isinstance(data, np.ndarray):
            return data.clip(min = self.threshold)
        elif isinstance(data, float):
            return max(data, self.threshold)
        else:
            try:
                # clip each element, not the whole container
                return [max(x, self.threshold) for x in data]
            except TypeError as e:
                raise CytoflowError("Unknown data type in LogScale.clip") from e

    def color_norm(self):
        if self.channel:
            vmin = self.experiment[self.channel].min()
            vmax = self.experiment[self.channel].max()
        elif self.condition:
            vmin =
self.experiment[self.condition].min() vmax = self.experiment[self.condition].max() elif self.statistic in self.experiment.statistics: stat = self.experiment.statistics[self.statistic] try: vmin = min([min(x) for x in stat]) vmax = max([max(x) for x in stat]) except (TypeError, IndexError): vmin = stat.min() vmax = stat.max() if self.error_statistic in self.experiment.statistics: err_stat = self.experiment.statistics[self.error_statistic] try: vmin = min([min(x) for x in err_stat]) vmax = max([max(x) for x in err_stat]) except (TypeError, IndexError): vmin = vmin - err_stat.min() vmax = vmax + err_stat.max() elif self.data.size > 0: vmin = self.data.min() vmax = self.data.max() else: raise CytoflowError("Must set one of 'channel', 'condition' " "or 'statistic'.") return matplotlib.colors.LogNorm(vmin=self.clip(vmin), vmax=self.clip(vmax))
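# A short sketch (hypothetical values) of how LogScale's two modes differ on
# non-positive data: "mask" turns sub-threshold values into NaN, while "clip"
# pins them to the threshold before taking log10.
def _log_scale_mode_sketch():
    import numpy as np
    import pandas as pd

    data = pd.Series([-5.0, 0.0, 0.05, 10.0, 1000.0])
    threshold = 0.1

    masked = np.log10(data.mask(data < threshold, other = np.nan))       # mode == "mask"
    clipped = np.log10(data.mask(data < threshold, other = threshold))   # mode == "clip"
    return masked, clipped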
class GaussianMixture2DOp(HasStrictTraits):
    """
    This module fits a 2D Gaussian mixture model with a specified number of
    components to a pair of channels.

    Creates a new categorical metadata variable named `name`, with possible
    values `name_1` .... `name_n` where `n` is the number of components.
    An event is assigned to the `name_i` category if it falls within `sigma`
    standard deviations of the component's mean.  If that is true for multiple
    categories (or if `sigma == 0.0`), the event is assigned to the category
    with the highest posterior probability.  If the event doesn't fall into
    any category, it is assigned to `name_None`.

    As a special case, if `num_components` is `1` and `sigma` > 0.0, then
    the new condition is boolean, `True` if the event fell in the gate and
    `False` otherwise.

    Optionally, if `posteriors` is `True`, this module will also compute the
    posterior probability of each event in its assigned component, returning
    it in a new column named `{name}_Posterior`.

    Finally, the same mixture model (mean and standard deviation) may not
    be appropriate for every subset of the data.  If this is the case, you
    can use the `by` attribute to specify metadata by which to aggregate
    the data before estimating (and applying) a mixture model.  The number of
    components is the same across each subset, though.

    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column

    xchannel : Str
        The X channel to apply the mixture model to.

    ychannel : Str
        The Y channel to apply the mixture model to.

    xscale : Enum("linear", "logicle", "log") (default = "linear")
        Re-scale the data on the X axis before fitting the data?

    yscale : Enum("linear", "logicle", "log") (default = "linear")
        Re-scale the data on the Y axis before fitting the data?

    num_components : Int (default = 1)
        How many components to fit to the data?  Must be positive.

    sigma : Float (default = 0.0)
        How many standard deviations on either side of the mean to include
        in each category?  If an event is in multiple components, assign it
        to the component with the highest posterior probability.  If
        `sigma == 0.0`, categorize *all* the data by assigning each event to
        the component with the highest posterior probability.  Must be >= 0.0.

    by : List(Str)
        A list of metadata attributes to aggregate the data before estimating
        the model.  For example, if the experiment has two pieces of metadata,
        `Time` and `Dox`, setting `by = ["Time", "Dox"]` will fit the model
        separately to each subset of the data with a unique combination of
        `Time` and `Dox`.

    posteriors : Bool (default = False)
        If `True`, add a column named `{name}_Posterior` giving the posterior
        probability that the event is in the component to which it was
        assigned.  Useful for filtering out low-probability events.

    Statistics
    ----------
    xmean : Float
        the mean of the fitted gaussian in the x dimension.

    ymean : Float
        the mean of the fitted gaussian in the y dimension.

    proportion : Float
        the proportion of events in each component of the mixture model.
        Only set if `num_components` > 1.

    PS -- if someone has good ideas for summarizing spread in a 2D
    (non-isotropic) Gaussian, or other useful statistics, let me know!

    Examples
    --------
    >>> gauss_op = GaussianMixture2DOp(name = "Gaussian",
    ...                                xchannel = "V2-A",
    ...                                ychannel = "Y2-A",
    ...
num_components = 2) >>> gauss_op.estimate(ex2) >>> gauss_op.default_view().plot(ex2) >>> ex3 = gauss_op.apply(ex2) """ id = Constant('edu.mit.synbio.cytoflow.operations.gaussian_2d') friendly_id = Constant("2D Gaussian Mixture") name = CStr() xchannel = Str() ychannel = Str() xscale = util.ScaleEnum yscale = util.ScaleEnum num_components = util.PositiveInt sigma = util.PositiveFloat(0.0, allow_zero=True) by = List(Str) posteriors = Bool(False) # the key is either a single value or a tuple _gmms = Dict(Any, Instance(mixture.GaussianMixture), transient=True) _xscale = Instance(util.IScale, transient=True) _yscale = Instance(util.IScale, transient=True) def estimate(self, experiment, subset=None): """ Estimate the Gaussian mixture model parameters """ if not experiment: raise util.CytoflowOpError("No experiment specified") if self.xchannel not in experiment.data: raise util.CytoflowOpError( "Column {0} not found in the experiment".format(self.xchannel)) if self.ychannel not in experiment.data: raise util.CytoflowOpError( "Column {0} not found in the experiment".format(self.ychannel)) for b in self.by: if b not in experiment.data: raise util.CytoflowOpError("Aggregation metadata {0} not found" " in the experiment".format(b)) if len(experiment.data[b].unique()) > 100: #WARNING - magic number raise util.CytoflowOpError( "More than 100 unique values found for" " aggregation metadata {0}. Did you" " accidentally specify a data channel?".format(b)) if self.num_components == 1 and self.posteriors: raise util.CytoflowOpError( "If num_components == 1, all posteriors are 1.") if subset: try: experiment = experiment.query(subset) except: raise util.CytoflowViewError( "Subset string '{0}' isn't valid".format(subset)) if len(experiment) == 0: raise util.CytoflowViewError( "Subset string '{0}' returned no events".format(subset)) if self.by: groupby = experiment.data.groupby(self.by) else: # use a lambda expression to return a group that contains # all the events groupby = experiment.data.groupby(lambda x: True) # get the scale. estimate the scale params for the ENTIRE data set, # not subsets we get from groupby(). And we need to save it so that # the data is transformed the same way when we apply() self._xscale = util.scale_factory(self.xscale, experiment, channel=self.xchannel) self._yscale = util.scale_factory(self.yscale, experiment, channel=self.ychannel) gmms = {} for group, data_subset in groupby: if len(data_subset) == 0: raise util.CytoflowOpError( "Group {} had no data".format(group)) x = data_subset.loc[:, [self.xchannel, self.ychannel]] x[self.xchannel] = self._xscale(x[self.xchannel]) x[self.ychannel] = self._yscale(x[self.ychannel]) # drop data that isn't in the scale range x = x[~(np.isnan(x[self.xchannel]) | np.isnan(x[self.ychannel]))] x = x.values gmm = mixture.GaussianMixture(n_components=self.num_components, covariance_type="full", random_state=1) gmm.fit(x) if not gmm.converged_: raise util.CytoflowOpError("Estimator didn't converge" " for group {0}".format(group)) # in the 1D version, we sort the components by the means -- so # the first component has the lowest mean, the second component # has the next-lowest mean, etc. that doesn't work in a 2D area, # obviously. # instead, we assume that the clusters are likely (?) to be # arranged along *one* of the axes, so we take the |norm| of the # x,y mean of each cluster and sort that way. 
norms = (gmm.means_[:, 0]**2 + gmm.means_[:, 1]**2)**0.5 sort_idx = np.argsort(norms) gmm.means_ = gmm.means_[sort_idx] gmm.weights_ = gmm.weights_[sort_idx] gmm.covariances_ = gmm.covariances_[sort_idx] gmms[group] = gmm self._gmms = gmms def apply(self, experiment): """ Assigns new metadata to events using the mixture model estimated in `estimate`. """ if not experiment: raise util.CytoflowOpError("No experiment specified") if not self.xchannel: raise util.CytoflowOpError("Must set X channel") if not self.ychannel: raise util.CytoflowOpError("Must set Y channel") # make sure name got set! if not self.name: raise util.CytoflowOpError("You have to set the gate's name " "before applying it!") if self.name in experiment.data.columns: raise util.CytoflowOpError( "Experiment already has a column named {0}".format(self.name)) if not self._gmms: raise util.CytoflowOpError( "No components found. Did you forget to " "call estimate()?") if not self._xscale: raise util.CytoflowOpError( "Couldn't find _xscale. What happened??") if not self._yscale: raise util.CytoflowOpError( "Couldn't find _yscale. What happened??") if self.xchannel not in experiment.data: raise util.CytoflowOpError( "Column {0} not found in the experiment".format(self.xchannel)) if self.ychannel not in experiment.data: raise util.CytoflowOpError( "Column {0} not found in the experiment".format(self.ychannel)) if self.posteriors: col_name = "{0}_Posterior".format(self.name) if col_name in experiment.data: raise util.CytoflowOpError( "Column {0} already found in the experiment".format( col_name)) for b in self.by: if b not in experiment.data: raise util.CytoflowOpError("Aggregation metadata {0} not found" " in the experiment".format(b)) if len(experiment.data[b].unique()) > 100: #WARNING - magic number raise util.CytoflowOpError( "More than 100 unique values found for" " aggregation metadata {0}. Did you" " accidentally specify a data channel?".format(b)) if self.sigma < 0.0: raise util.CytoflowOpError("sigma must be >= 0.0") event_assignments = pd.Series([None] * len(experiment), dtype="object") if self.posteriors: event_posteriors = pd.Series([0.0] * len(experiment)) # what we DON'T want to do is iterate through event-by-event. # the more of this we can push into numpy, sklearn and pandas, # the faster it's going to be. for example, this is why # we don't use Ellipse.contains(). if self.by: groupby = experiment.data.groupby(self.by) else: # use a lambda expression to return a group that # contains all the events groupby = experiment.data.groupby(lambda x: True) for group, data_subset in groupby: if group not in self._gmms: # there weren't any events in this group, so we didn't get # a gmm. continue gmm = self._gmms[group] x = data_subset.loc[:, [self.xchannel, self.ychannel]] x[self.xchannel] = self._xscale(x[self.xchannel]) x[self.ychannel] = self._yscale(x[self.ychannel]) # which values are missing? x_na = np.isnan(x[self.xchannel]) | np.isnan(x[self.ychannel]) x_na = x_na.values x = x.values group_idx = groupby.groups[group] # make a preliminary assignment predicted = np.full(len(x), -1, "int") predicted[~x_na] = gmm.predict(x[~x_na]) # if we're doing sigma-based gating, for each component check # to see if the event is in the sigma gate. if self.sigma > 0.0: # make a quick dataframe with the value and the predicted # component gate_df = pd.DataFrame({ "x": x[:, 0], "y": x[:, 1], "p": predicted }) # for each component, get the ellipse that follows the isoline # around the mixture component # cf. 
                #     http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm.html
                # and http://www.mathworks.com/matlabcentral/newsreader/view_thread/298389
                # and http://stackoverflow.com/questions/7946187/point-and-ellipse-rotated-position-test-algorithm
                # i am not proud of how many tries this took me to get right.
                for c in range(0, self.num_components):
                    mean = gmm.means_[c]
                    covar = gmm.covariances_[c]

                    # xc is the center on the x axis
                    # yc is the center on the y axis
                    xc = mean[0]  # @UnusedVariable
                    yc = mean[1]  # @UnusedVariable

                    v, w = linalg.eigh(covar)
                    u = w[0] / linalg.norm(w[0])

                    # xl is the length along the x axis
                    # yl is the length along the y axis
                    xl = np.sqrt(v[0]) * self.sigma  # @UnusedVariable
                    yl = np.sqrt(v[1]) * self.sigma  # @UnusedVariable

                    # t is the rotation in radians (counter-clockwise)
                    t = 2 * np.pi - np.arctan(u[1] / u[0])

                    sin_t = np.sin(t)  # @UnusedVariable
                    cos_t = np.cos(t)  # @UnusedVariable

                    # and build an expression with numexpr so it evaluates fast!
                    gate_bool = gate_df.eval(
                        "p == @c and "
                        "((x - @xc) * @cos_t - (y - @yc) * @sin_t) ** 2 / ((@xl / 2) ** 2) + "
                        "((x - @xc) * @sin_t + (y - @yc) * @cos_t) ** 2 / ((@yl / 2) ** 2) <= 1").values

                    predicted[np.logical_and(predicted == c, gate_bool == False)] = -1

            predicted_str = pd.Series(["(none)"] * len(predicted))
            for c in range(0, self.num_components):
                predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
            predicted_str[predicted == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            event_assignments.iloc[group_idx] = predicted_str

            if self.posteriors:
                probability = np.full((len(x), self.num_components), 0.0, "float")
                probability[~x_na, :] = gmm.predict_proba(x[~x_na, :])
                posteriors = pd.Series([0.0] * len(predicted))
                for c in range(0, self.num_components):
                    posteriors[predicted == c] = probability[predicted == c, c]
                posteriors.index = group_idx
                event_posteriors.iloc[group_idx] = posteriors

        new_experiment = experiment.clone()

        if self.num_components == 1 and self.sigma > 0:
            new_experiment.add_condition(
                self.name, "bool",
                event_assignments == "{0}_1".format(self.name))
        elif self.num_components > 1:
            new_experiment.add_condition(self.name, "category", event_assignments)

        if self.posteriors and self.num_components > 1:
            col_name = "{0}_Posterior".format(self.name)
            new_experiment.add_condition(col_name, "float", event_posteriors)

        # add the statistics
        levels = list(self.by)
        if self.num_components > 1:
            levels.append(self.name)

        if levels:
            idx = pd.MultiIndex.from_product(
                [new_experiment[x].unique() for x in levels],
                names = levels)

            xmean_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()
            ymean_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()
            prop_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()

            for group, _ in groupby:
                if group not in self._gmms:
                    # no model was fit for this group (it had no events)
                    continue
                gmm = self._gmms[group]
                for c in range(self.num_components):
                    if self.num_components > 1:
                        component_name = "{}_{}".format(self.name, c + 1)

                        if group is True:
                            g = [component_name]
                        elif isinstance(group, tuple):
                            g = list(group)
                            g.append(component_name)
                        else:
                            g = list([group])
                            g.append(component_name)

                        if len(g) > 1:
                            g = tuple(g)
                        else:
                            g = g[0]
                    else:
                        g = group

                    xmean_stat.loc[g] = self._xscale.inverse(gmm.means_[c][0])
                    # the y statistic comes from the y component of the mean
                    ymean_stat.loc[g] = self._yscale.inverse(gmm.means_[c][1])
                    prop_stat.loc[g] = gmm.weights_[c]

            new_experiment.statistics[(self.name, "xmean")] = xmean_stat
            new_experiment.statistics[(self.name, "ymean")] = ymean_stat
            if self.num_components > 1:
                new_experiment.statistics[(self.name, "proportion")] = prop_stat

        new_experiment.history.append(
self.clone_traits(transient=lambda t: True)) return new_experiment def default_view(self, **kwargs): """ Returns a diagnostic plot of the Gaussian mixture model. Returns ------- IView : an IView, call plot() to see the diagnostic plot. """ return GaussianMixture2DView(op=self, **kwargs)
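# A minimal sketch (not library code) of the sigma-gating test that
# GaussianMixture2DOp.apply() evaluates above with numexpr: rotate each point
# into the component's eigenbasis and test it against the axis-aligned
# ellipse.  All numbers are illustrative.
def _ellipse_gate_sketch():
    import numpy as np
    from numpy import linalg

    mean = np.array([1.0, 2.0])
    covar = np.array([[2.0, 0.5],
                      [0.5, 1.0]])
    sigma = 2.0
    pts = np.array([[1.2, 2.1], [10.0, 10.0]])

    v, w = linalg.eigh(covar)                    # eigenvalues, eigenvectors
    u = w[0] / linalg.norm(w[0])                 # same convention as apply()
    xl, yl = np.sqrt(v[0]) * sigma, np.sqrt(v[1]) * sigma
    t = 2.0 * np.pi - np.arctan(u[1] / u[0])     # rotation angle, as in apply()

    dx, dy = pts[:, 0] - mean[0], pts[:, 1] - mean[1]
    in_gate = ((dx * np.cos(t) - dy * np.sin(t)) ** 2 / (xl / 2) ** 2 +
               (dx * np.sin(t) + dy * np.cos(t)) ** 2 / (yl / 2) ** 2) <= 1
    return in_gate    # array([ True, False]) for these points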
class MATS3DMplDamageEEQ(MATS3DEval):
    # To use the model directly in the simulator specify the
    # time stepping classes

    epsilon_0 = Float(59.0e-6,
                      label="epsilon_0",
                      desc="equivalent strain at the onset of damage",
                      enter_set=True,
                      auto_set=False)

    epsilon_f = Float(250.0e-6,
                      label="epsilon_f",
                      desc="strain controlling the rate of damage evolution "
                           "(softening)",
                      enter_set=True,
                      auto_set=False)

    c_T = Float(0.01,
                label="c_T",
                desc="ratio of tangential to normal strain in the "
                     "equivalent strain measure",
                enter_set=True,
                auto_set=False)

    #=========================================================================
    # Configurational parameters
    #=========================================================================

    state_var_shapes = tr.Property(tr.Dict(), depends_on='n_mp')
    r'''
    Shapes of the state variables to be stored in the global array
    at the level of the domain.
    '''
    @cached_property
    def _get_state_var_shapes(self):
        return {'kappa_n': (self.n_mp,),
                'omega_n': (self.n_mp,)}

    U_var_shape = (6,)
    '''Shape of the primary variable required by the TStepState.
    '''

    node_name = 'Desmorat model'

    tree_node_list = tr.List([])

    #=========================================================================
    # Evaluation - get the corrector and predictor
    #=========================================================================

    def get_corr_pred(self, eps_ab, tn1, kappa_n, omega_n):
        self._update_state_variables(eps_ab, kappa_n, omega_n)
        #----------------------------------------------------------------------
        # if the regularization using the crack-band concept is on, calculate
        # the effective element length in the direction of principal strains
        #----------------------------------------------------------------------
        # if self.regularization:
        #     h = self.get_regularizing_length(sctx, eps_app_eng)
        #     self.phi_fn.h = h

        #------------------------------------------------------------------
        # Damage tensor (2nd order):
        #------------------------------------------------------------------
        phi_ab = self._get_phi_ab(kappa_n)

        #------------------------------------------------------------------
        # Damage tensor (4th order) using product- or sum-type symmetrization:
        #------------------------------------------------------------------
        beta_abcd = self._get_beta_abcd(phi_ab)

        #------------------------------------------------------------------
        # Damaged stiffness tensor calculated based on the damage tensor beta4:
        #------------------------------------------------------------------
        D_ijab = einsum('...ijab, abef, ...cdef -> ...ijcd',
                        beta_abcd, self.D_abef, beta_abcd)
        sig_ab = einsum('...abef,...ef -> ...ab', D_ijab, eps_ab)

        return sig_ab, D_ijab

    #=========================================================================
    # MICROPLANE-Kinematic constraints
    #=========================================================================

    _MPNN = Property(depends_on='n_mp')
    r'''Get the dyadic product of the microplane normals
    '''
    @cached_property
    def _get__MPNN(self):
        # dyadic product of the microplane normals
        MPNN_nij = einsum('ni,nj->nij', self._MPN, self._MPN)
        return MPNN_nij

    _MPTT = Property(depends_on='n_mp')
    r'''Get the third order tangential tensor (operator) for each microplane
    '''
    @cached_property
    def _get__MPTT(self):
        # Third order tangential tensor for each microplane
        delta = identity(3)
        MPTT_nijr = 0.5 * (
            einsum('ni,jr -> nijr', self._MPN, delta) +
            einsum('nj,ir -> njir', self._MPN, delta) -
            2.0 * einsum('ni,nj,nr -> nijr', self._MPN, self._MPN, self._MPN))
        return MPTT_nijr

    def _get_e_na(self, eps_ab):
        r'''
        Projection of apparent strain onto the individual microplanes
        '''
        e_ni = einsum('nb,...ba->...na', self._MPN, eps_ab)
        return e_ni

    def
_get_e_N_n(self, e_na): r''' Get the normal strain array for each microplane ''' e_N_n = einsum('...na, na->...n', e_na, self._MPN) return e_N_n def _get_e_equiv_n(self, e_na): r''' Returns a list of the microplane equivalent strains based on the list of microplane strain vectors ''' # magnitude of the normal strain vector for each microplane e_N_n = self._get_e_N_n(e_na) # positive part of the normal strain magnitude for each microplane e_N_pos_n = (np.abs(e_N_n) + e_N_n) / 2.0 # normal strain vector for each microplane e_N_na = einsum('...n,ni -> ...ni', e_N_n, self._MPN) # tangent strain ratio c_T = self.c_T # tangential strain vector for each microplane e_T_na = e_na - e_N_na # squared tangential strain vector for each microplane e_TT_n = einsum('...ni,...ni -> ...n', e_T_na, e_T_na) # equivalent strain for each microplane e_equiv_n = sqrt(e_N_pos_n * e_N_pos_n + c_T * e_TT_n) return e_equiv_n def _update_state_variables(self, eps_ab, kappa_n, omega_n): e_na = self._get_e_na(eps_ab) eps_eq_n = self._get_e_equiv_n(e_na) f_trial_n = eps_eq_n - self.epsilon_0 I = np.where(f_trial_n > 0) k_n = np.max(np.array([kappa_n[I], eps_eq_n[I]]), axis=0) kappa_n[I] = k_n omega_n[I] = self._get_omega(k_n) def _get_omega(self, kappa_n): ''' Return new value of damage parameter @param kappa: ''' omega_n = np.zeros_like(kappa_n) epsilon_0 = self.epsilon_0 epsilon_f = self.epsilon_f I = np.where(kappa_n >= epsilon_0) omega_n[I] = ( 1.0 - (epsilon_0 / kappa_n[I] * np.exp(-1.0 * (kappa_n[I] - epsilon_0) / (epsilon_f - epsilon_0)))) return omega_n def _get_phi_ab(self, kappa_n): # Returns the 2nd order damage tensor 'phi_mtx' # scalar integrity factor for each microplane phi_n = np.sqrt(1.0 - self._get_omega(kappa_n)) # print 'phi_Emn', phi_Emn[:, -1, :] # integration terms for each microplanes phi_ab = einsum('...n,n,nab->...ab', phi_n, self._MPW, self._MPNN) return phi_ab def _get_beta_abcd(self, phi_ab): ''' Returns the 4th order damage tensor 'beta4' using sum-type symmetrization (cf. 
[Jir99], Eq.(21)) ''' delta = identity(3) beta_ijkl = 0.25 * (einsum('...ik,jl->...ijkl', phi_ab, delta) + einsum('...il,jk->...ijkl', phi_ab, delta) + einsum('...jk,il->...ijkl', phi_ab, delta) + einsum('...jl,ik->...ijkl', phi_ab, delta)) return beta_ijkl #----------------------------------------------- # number of microplanes - currently fixed for 3D #----------------------------------------------- n_mp = Constant(28) #----------------------------------------------- # get the normal vectors of the microplanes #----------------------------------------------- _MPN = Property(depends_on='n_mp') @cached_property def _get__MPN(self): return array([[.577350259, .577350259, .577350259], [.577350259, .577350259, -.577350259], [.577350259, -.577350259, .577350259], [.577350259, -.577350259, -.577350259], [.935113132, .250562787, .250562787], [.935113132, .250562787, -.250562787], [.935113132, -.250562787, .250562787], [.935113132, -.250562787, -.250562787], [.250562787, .935113132, .250562787], [.250562787, .935113132, -.250562787], [.250562787, -.935113132, .250562787], [.250562787, -.935113132, -.250562787], [.250562787, .250562787, .935113132], [.250562787, .250562787, -.935113132], [.250562787, -.250562787, .935113132], [.250562787, -.250562787, -.935113132], [.186156720, .694746614, .694746614], [.186156720, .694746614, -.694746614], [.186156720, -.694746614, .694746614], [.186156720, -.694746614, -.694746614], [.694746614, .186156720, .694746614], [.694746614, .186156720, -.694746614], [.694746614, -.186156720, .694746614], [.694746614, -.186156720, -.694746614], [.694746614, .694746614, .186156720], [.694746614, .694746614, -.186156720], [.694746614, -.694746614, .186156720], [.694746614, -.694746614, -.186156720]]) #------------------------------------- # get the weights of the microplanes #------------------------------------- _MPW = Property(depends_on='n_mp') @cached_property def _get__MPW(self): return array([ .0160714276, .0160714276, .0160714276, .0160714276, .0204744730, .0204744730, .0204744730, .0204744730, .0204744730, .0204744730, .0204744730, .0204744730, .0204744730, .0204744730, .0204744730, .0204744730, .0158350505, .0158350505, .0158350505, .0158350505, .0158350505, .0158350505, .0158350505, .0158350505, .0158350505, .0158350505, .0158350505, .0158350505 ]) * 6.0 def _get_lame_params(self): la = self.E * self.nu / ((1. + self.nu) * (1. - 2. * self.nu)) # second Lame parameter (shear modulus) mu = self.E / (2. + 2. * self.nu) return la, mu D_abef = tr.Property(tr.Array, depends_on='+input') @tr.cached_property def _get_D_abef(self): la = self._get_lame_params()[0] mu = self._get_lame_params()[1] delta = identity(3) D_abef = (einsum(',ij,kl->ijkl', la, delta, delta) + einsum(',ik,jl->ijkl', mu, delta, delta) + einsum(',il,jk->ijkl', mu, delta, delta)) return D_abef def _get_var_dict(self): var_dict = super(MATS3DMplDamageEEQ, self)._get_var_dict() var_dict.update(phi_ab=self.get_phi_ab) return var_dict def get_phi_ab(self, eps_ab, tn1, kappa_n, omega_n): return self._get_phi_ab(kappa_n)
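# A small numeric sketch (made-up strain values) of the exponential damage law
# implemented in MATS3DMplDamageEEQ._get_omega() above: no damage below
# epsilon_0, then an exponential approach to full damage governed by epsilon_f.
def _damage_law_sketch():
    import numpy as np

    epsilon_0, epsilon_f = 59.0e-6, 250.0e-6
    kappa = np.array([10e-6, 59e-6, 100e-6, 500e-6])   # illustrative history values

    omega = np.zeros_like(kappa)
    I = np.where(kappa >= epsilon_0)
    omega[I] = 1.0 - (epsilon_0 / kappa[I] *
                      np.exp(-(kappa[I] - epsilon_0) / (epsilon_f - epsilon_0)))
    return omega   # increases monotonically toward 1.0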
class TasbeCalibrationOp(PluginOpMixin):
    handler_factory = Callable(TasbeHandler)

    id = Constant('edu.mit.synbio.cytoflowgui.op_plugins.bleedthrough_piecewise')
    friendly_id = Constant("Quantitative Pipeline")
    name = Constant("TASBE")

    fsc_channel = DelegatesTo('_polygon_op', 'xchannel', estimate = True)
    ssc_channel = DelegatesTo('_polygon_op', 'ychannel', estimate = True)
    vertices = DelegatesTo('_polygon_op', 'vertices', estimate = True)
    channels = List(Str, estimate = True)

    blank_file = File(filter = ["*.fcs"], estimate = True)

    bleedthrough_list = List(_BleedthroughControl, estimate = True)

    beads_name = Str(estimate = True)
    beads_file = File(filter = ["*.fcs"], estimate = True)
    units_list = List(_Unit, estimate = True)

    bead_peak_quantile = Int(80, estimate = True)
    bead_brightness_threshold = Float(100, estimate = True)
    bead_brightness_cutoff = util.FloatOrNone("", estimate = True)

    do_color_translation = Bool(estimate = True)
    to_channel = Str(estimate = True)
    translation_list = List(_TranslationControl, estimate = True)
    mixture_model = Bool(False, estimate = True)

    do_estimate = Event
    valid_model = Bool(False, status = True)
    do_exit = Event
    input_files = List(File)
    output_directory = Directory

    _blank_exp = Instance(Experiment, transient = True)
    _blank_exp_file = File(transient = True)
    _blank_exp_channels = List(Str, status = True)
    _polygon_op = Instance(PolygonOp,
                           kw = {'name' : 'polygon',
                                 'xscale' : 'log',
                                 'yscale' : 'log'},
                           transient = True)
    _af_op = Instance(AutofluorescenceOp, (), transient = True)
    _bleedthrough_op = Instance(BleedthroughLinearOp, (), transient = True)
    _bead_calibration_op = Instance(BeadCalibrationOp, (), transient = True)
    _color_translation_op = Instance(ColorTranslationOp, (), transient = True)

    status = Str(status = True)

    @on_trait_change('channels[], to_channel, do_color_translation', post_init = True)
    def _channels_changed(self, obj, name, old, new):
        for channel in self.channels:
            if channel not in [control.channel for control in self.bleedthrough_list]:
                self.bleedthrough_list.append(_BleedthroughControl(channel = channel))

            if channel not in [unit.channel for unit in self.units_list]:
                self.units_list.append(_Unit(channel = channel))

        to_remove = []
        for control in self.bleedthrough_list:
            if control.channel not in self.channels:
                to_remove.append(control)
        for control in to_remove:
            self.bleedthrough_list.remove(control)

        to_remove = []
        for unit in self.units_list:
            if unit.channel not in self.channels:
                to_remove.append(unit)
        for unit in to_remove:
            self.units_list.remove(unit)

        if self.do_color_translation:
            to_remove = []
            for unit in self.units_list:
                if unit.channel != self.to_channel:
                    to_remove.append(unit)
            for unit in to_remove:
                self.units_list.remove(unit)

            self.translation_list = []
            for c in self.channels:
                if c == self.to_channel:
                    continue
                self.translation_list.append(
                    _TranslationControl(from_channel = c,
                                        to_channel = self.to_channel))

            self.changed = (Changed.ESTIMATE, ('translation_list', self.translation_list))

        self.changed = (Changed.ESTIMATE, ('bleedthrough_list', self.bleedthrough_list))
        self.changed = (Changed.ESTIMATE, ('units_list', self.units_list))

    @on_trait_change('_polygon_op:vertices', post_init = True)
    def _polygon_changed(self, obj, name, old, new):
        self.changed = (Changed.ESTIMATE, (None, None))

    @on_trait_change("bleedthrough_list_items, bleedthrough_list.+", post_init = True)
    def _bleedthrough_controls_changed(self, obj, name, old, new):
        self.changed = (Changed.ESTIMATE, ('bleedthrough_list', self.bleedthrough_list))

    @on_trait_change("translation_list_items, translation_list.+", post_init = True)
    def _translation_controls_changed(self, obj, name, old, new):
        self.changed = (Changed.ESTIMATE, ('translation_list', self.translation_list))

    @on_trait_change('units_list_items,units_list.+', post_init = True)
    def _units_changed(self, obj, name, old, new):
        self.changed = (Changed.ESTIMATE, ('units_list', self.units_list))

    def estimate(self, experiment, subset = None):
        # if not self.subset:
        #     warnings.warn("Are you sure you don't want to specify a subset "
        #                   "used to estimate the model?",
        #                   util.CytoflowOpWarning)
        #
        # if experiment is None:
        #     raise util.CytoflowOpError("No valid result to estimate with")
        #
        # experiment = experiment.clone()

        if not self.fsc_channel:
            raise util.CytoflowOpError('fsc_channel', "Must set FSC channel")

        if not self.ssc_channel:
            raise util.CytoflowOpError('ssc_channel', "Must set SSC channel")

        if not self._polygon_op.vertices:
            raise util.CytoflowOpError(None, "Please draw a polygon around the "
                                             "single-cell population in the "
                                             "Morphology tab")

        experiment = self._blank_exp.clone()
        experiment = self._polygon_op.apply(experiment)

        self._af_op.channels = self.channels
        self._af_op.blank_file = self.blank_file

        self._af_op.estimate(experiment, subset = "polygon == True")
        self.changed = (Changed.ESTIMATE_RESULT, "Autofluorescence")
        experiment = self._af_op.apply(experiment)

        self.status = "Estimating bleedthrough"

        self._bleedthrough_op.controls.clear()
        for control in self.bleedthrough_list:
            self._bleedthrough_op.controls[control.channel] = control.file

        self._bleedthrough_op.estimate(experiment, subset = "polygon == True")
        self.changed = (Changed.ESTIMATE_RESULT, "Bleedthrough")
        experiment = self._bleedthrough_op.apply(experiment)

        self.status = "Estimating bead calibration"

        self._bead_calibration_op.beads = BeadCalibrationOp.BEADS[self.beads_name]
        self._bead_calibration_op.beads_file = self.beads_file
        self._bead_calibration_op.bead_peak_quantile = self.bead_peak_quantile
        self._bead_calibration_op.bead_brightness_threshold = self.bead_brightness_threshold
        self._bead_calibration_op.bead_brightness_cutoff = self.bead_brightness_cutoff

        self._bead_calibration_op.units.clear()
        for unit in self.units_list:
            self._bead_calibration_op.units[unit.channel] = unit.unit

        self._bead_calibration_op.estimate(experiment)
        self.changed = (Changed.ESTIMATE_RESULT, "Bead Calibration")

        if self.do_color_translation:
            self.status = "Estimating color translation"

            experiment = self._bead_calibration_op.apply(experiment)

            self._color_translation_op.mixture_model = self.mixture_model

            self._color_translation_op.controls.clear()
            for control in self.translation_list:
                self._color_translation_op.controls[(control.from_channel,
                                                     control.to_channel)] = control.file

            self._color_translation_op.estimate(experiment, subset = 'polygon == True')
            self.changed = (Changed.ESTIMATE_RESULT, "Color Translation")

        self.status = "Done estimating"
        self.valid_model = True

    def should_clear_estimate(self, changed, payload):
        """
        Should the owning WorkflowItem clear the estimated model by calling
        op.clear_estimate()?
        `changed` can be:
        - Changed.ESTIMATE -- the parameters required to call 'estimate()'
          (ie traits with estimate = True metadata) have changed
        - Changed.PREV_RESULT -- the previous WorkflowItem's result changed
        """
        if changed == Changed.ESTIMATE:
            name, val = payload
            if name == 'fsc_channel' or name == 'ssc_channel':
                return False

        return True

    def clear_estimate(self):
        self._af_op = AutofluorescenceOp()
        self._bleedthrough_op = BleedthroughLinearOp()
        self._bead_calibration_op = BeadCalibrationOp()
        self._color_translation_op = ColorTranslationOp()
        self.valid_model = False

        self.changed = (Changed.ESTIMATE_RESULT, self)

    def should_apply(self, changed, payload):
        """
        Should the owning WorkflowItem apply this operation when certain
        things change?

        `changed` can be:
        - Changed.OPERATION -- the operation's parameters changed
        - Changed.PREV_RESULT -- the previous WorkflowItem's result changed
        - Changed.ESTIMATE_RESULT -- the results of calling "estimate" changed
        """
        if changed == Changed.ESTIMATE_RESULT and \
           self.blank_file != self._blank_exp_file:
            return True

        elif changed == Changed.OPERATION:
            name, _ = payload
            if name == "output_directory":
                return False

            return True

        return False

    def apply(self, experiment):
        if self.blank_file != self._blank_exp_file:
            self._blank_exp = ImportOp(tubes=[Tube(file=self.blank_file)]).apply()
            self._blank_exp_file = self.blank_file
            self._blank_exp_channels = self._blank_exp.channels
            self.changed = (Changed.PREV_RESULT, None)
            return

        out_dir = Path(self.output_directory)
        for path in self.input_files:
            in_file_path = Path(path)
            out_file_path = out_dir / in_file_path.name
            if out_file_path.exists():
                raise util.CytoflowOpError(None,
                                           "File {} already exists".format(out_file_path))

        tubes = [Tube(file=path, conditions={'filename': Path(path).stem})
                 for path in self.input_files]

        for tube in tubes:
            self.status = "Converting " + Path(tube.file).stem
            experiment = ImportOp(tubes=[tube],
                                  conditions={'filename': 'category'}).apply()

            experiment = self._af_op.apply(experiment)
            experiment = self._bleedthrough_op.apply(experiment)
            experiment = self._bead_calibration_op.apply(experiment)
            if self.do_color_translation:
                experiment = self._color_translation_op.apply(experiment)

            ExportFCS(path=self.output_directory,
                      by=['filename'],
                      _include_by=False).export(experiment)

        self.input_files = []
        self.status = "Done converting!"

    def default_view(self, **kwargs):
        return TasbeCalibrationView(op=self, **kwargs)

    def get_help(self):
        current_dir = os.path.abspath(__file__)
        help_dir = os.path.split(current_dir)[0]
        help_dir = os.path.join(help_dir, "help")

        help_file = None
        for klass in self.__class__.__mro__:
            mod = klass.__module__
            mod_html = mod + ".html"

            h = os.path.join(help_dir, mod_html)
            if os.path.exists(h):
                help_file = h
                break

        with open(help_file, encoding='utf-8') as f:
            help_html = f.read()

        return help_html
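# The estimate() above chains four cytoflow operations.  Below is a minimal
# sketch of the same pipeline using those operations directly, outside the
# GUI.  The file names and channel names are hypothetical placeholders; the
# operations and their traits are the ones imported at the top of this module.
def _sketch_tasbe_pipeline(ex):
    af = AutofluorescenceOp(channels=["FITC-A", "PE-Tx-Red-YG-A"],
                            blank_file="blank.fcs")
    af.estimate(ex)
    ex = af.apply(ex)

    bl = BleedthroughLinearOp(controls={"FITC-A": "eyfp.fcs",
                                        "PE-Tx-Red-YG-A": "mkate.fcs"})
    bl.estimate(ex)
    ex = bl.apply(ex)

    beads = BeadCalibrationOp(
        beads=BeadCalibrationOp.BEADS["Spherotech RCP-30-5A Lot AA01-AA04, AB01, AB02, AC01, GAA01-R"],
        beads_file="beads.fcs",
        units={"FITC-A": "MEFL", "PE-Tx-Red-YG-A": "MEPTR"})
    beads.estimate(ex)
    ex = beads.apply(ex)

    # translate the mKate channel into FITC-A units, as estimate() does when
    # do_color_translation is set
    ct = ColorTranslationOp(controls={("PE-Tx-Red-YG-A", "FITC-A"): "rby.fcs"},
                            mixture_model=True)
    ct.estimate(ex)
    return ct.apply(ex)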
class HlogScale(ScaleMixin):
    """
    A scale that transforms the data using the `hyperlog` function.

    This scaling method implements a "linear-like" region around 0, and a
    "log-like" region for large values, with a smooth transition between
    them.

    The transformation has one parameter, `b`, which specifies the location
    of the transition from linear to log-like.  The default is `200`; adjust
    it to match the range of your data.

    Attributes
    ----------
    b : Float (default = 200)
        the location of the transition from linear to log-like.

    References
    ----------
    [1] Hyperlog-a flexible log-like transform for negative, zero, and
        positive valued data.
        Bagwell CB.
        Cytometry A. 2005 Mar;64(1):34-42.
        PMID: 15700280
        http://onlinelibrary.wiley.com/doi/10.1002/cyto.a.20114/abstract
    """

    id = Constant("edu.mit.synbio.cytoflow.utility.hlog")
    name = "hlog"

    experiment = Instance("cytoflow.Experiment")

    # what data do we use to compute scale parameters?  set one.
    channel = Str
    condition = Str
    statistic = Tuple(Str, Str)

    range = Property(Float)
    b = Float(200, desc="location of the log transition")

    mpl_params = Property(Dict, depends_on="[b, range, scale_min, scale_max]")

    def __call__(self, data):
        """
        Transforms `data` using this scale.

        Careful!  May return `NaN` if the scale domain doesn't match the data
        (ie, applying a log10 scale to negative numbers.)
        """
        f = _make_hlog_numeric(self.b, 1.0, np.log10(self.range))

        if isinstance(data, pd.Series):
            return data.apply(f)
        elif isinstance(data, np.ndarray):
            return f(data)
        elif isinstance(data, (int, float)):
            # numpy returns a 0-dim array.  wtf.
            return float(f(data))
        else:
            try:
                return list(map(f, data))
            except TypeError:
                raise CytoflowError("Unknown data type in HlogScale.__call__")

    def inverse(self, data):
        """
        Transforms 'data' using the inverse of this scale.
        """
        f_inv = lambda y, b=self.b, d=np.log10(self.range): hlog_inv(y, b, 1.0, d)

        if isinstance(data, pd.Series):
            return data.apply(f_inv)
        elif isinstance(data, np.ndarray):
            inverse = np.vectorize(f_inv)
            return inverse(data)
        elif isinstance(data, (int, float)):
            return f_inv(data)
        else:
            try:
                return list(map(f_inv, data))
            except TypeError:
                raise CytoflowError("Unknown data type in HlogScale.inverse")

    def clip(self, data):
        return data

    def _get_range(self):
        if self.experiment:
            if self.channel and self.channel in self.experiment.channels:
                if "range" in self.experiment.metadata[self.channel]:
                    return self.experiment.metadata[self.channel]["range"]
                else:
                    return self.experiment.data[self.channel].max()
            elif self.condition and self.condition in self.experiment.conditions:
                return self.experiment.data[self.condition].max()
            elif self.statistic and self.statistic in self.experiment.statistics:
                return self.experiment.statistics[self.statistic].max()
            else:
                return Undefined
        else:
            return Undefined

    @cached_property
    def _get_mpl_params(self):
        return {"b": self.b, "range": self.range}
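# A minimal sketch of how HlogScale behaves, assuming `ex` is an Experiment
# whose 'FITC-A' metadata carries a 'range' entry (ImportOp sets this).  The
# hyperlog is roughly linear near zero (so it can represent negative values)
# and log-like for large values.
def _sketch_hlog_behavior(ex):
    scale = HlogScale(experiment=ex, channel="FITC-A", b=200)

    x = np.array([-100.0, 0.0, 100.0, 1e3, 1e4, 1e5])
    y = scale(x)                # forward transform; negative values are fine
    x_back = scale.inverse(y)   # approximately round-trips the original data

    for xi, yi, xb in zip(x, y, x_back):
        print("{:>10.1f} -> {:8.3f} -> {:>10.1f}".format(xi, yi, xb))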
class Heading(Label):
    """ An item that is a fancy label. """

    # Override the 'style' trait to default to the fancy 'custom' style:
    style = Constant('custom')
class MATS2DMicroplaneDamageJir(MATSXDMicroplaneDamageFatigueJir):

    # implements(IMATSEval)

    #-----------------------------------------------
    # number of microplanes
    #-----------------------------------------------
    n_mp = Constant(360)

    #-----------------------------------------------
    # get the normal vectors of the microplanes
    #-----------------------------------------------
    _MPN = Property(depends_on='n_mp')

    @cached_property
    def _get__MPN(self):
        # microplane normals:
        alpha_list = linspace(0, 2 * pi, self.n_mp)
        MPN = array([[cos(alpha), sin(alpha)] for alpha in alpha_list])
        return MPN

    #-------------------------------------
    # get the weights of the microplanes
    #-------------------------------------
    _MPW = Property(depends_on='n_mp')

    @cached_property
    def _get__MPW(self):
        # equal weights for the numerical integration around the unit
        # circle; the weights sum to 2.
        MPW = ones(self.n_mp) / self.n_mp * 2
        return MPW

    #-------------------------------------------------------------------------
    # Cached elasticity tensors
    #-------------------------------------------------------------------------
    elasticity_tensors = Property(depends_on='E, nu, dimensionality, stress_state')

    @cached_property
    def _get_elasticity_tensors(self):
        '''
        Initialize the fourth-order elasticity tensor
        for the 2D case (plane strain or plane stress)
        '''
        # ----------------------------------------------------------------------------
        # Lame constants calculated from E and nu
        # ----------------------------------------------------------------------------
        # first Lame parameter
        la = self.E * self.nu / ((1 + self.nu) * (1 - 2 * self.nu))
        # second Lame parameter (shear modulus)
        mu = self.E / (2 + 2 * self.nu)

        # -----------------------------------------------------------------------------
        # Get the fourth-order elasticity tensor for the 2D case
        # -----------------------------------------------------------------------------
        # construct the elasticity tensor (using the numpy einsum function)
        delta = identity(2)
        D_ijkl = (einsum(',ij,kl->ijkl', la, delta, delta) +
                  einsum(',ik,jl->ijkl', mu, delta, delta) +
                  einsum(',il,jk->ijkl', mu, delta, delta))

        return D_ijkl
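# A quick numerical sanity check (a sketch, not part of the model): the 2D
# microplane normals defined above are unit vectors, and the weights sum
# to 2, which is the normalization the integration scheme expects.
def _check_2d_microplanes(n_mp=360):
    import numpy as np

    alpha = np.linspace(0, 2 * np.pi, n_mp)
    MPN = np.array([[np.cos(a), np.sin(a)] for a in alpha])
    MPW = np.ones(n_mp) / n_mp * 2

    assert np.allclose(np.linalg.norm(MPN, axis=1), 1.0)
    assert np.isclose(MPW.sum(), 2.0)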
class PolygonOp(HasStrictTraits):
    """
    Apply a polygon gate to a cytometry experiment.

    Attributes
    ----------
    name : Str
        The operation name.  Used to name the new metadata field in the
        experiment that's created by :meth:`apply`

    xchannel, ychannel : Str
        The names of the x and y channels to apply the gate.

    xscale, yscale : {'linear', 'log', 'logicle'} (default = 'linear')
        The scales applied to the data before drawing the polygon.

    vertices : List((Float, Float))
        The polygon vertices.  An ordered list of 2-tuples, representing
        the x and y coordinates of the vertices.

    Notes
    -----
    This module uses :class:`matplotlib.path.Path` to represent the polygon,
    because membership testing is very fast.

    You can set the vertices by hand, I suppose, but it's much easier to use
    the interactive view you get from :meth:`default_view` to do so.

    Examples
    --------

    .. plot::
        :context: close-figs

        Make a little data set.

        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()

    Create and parameterize the operation.

    .. plot::
        :context: close-figs

        >>> p = flow.PolygonOp(name = "Polygon",
        ...                    xchannel = "V2-A",
        ...                    ychannel = "Y2-A")
        >>> p.vertices = [(23.411982294776319, 5158.7027015021222),
        ...               (102.22182270573683, 23124.058843387455),
        ...               (510.94519955277201, 23124.058843387455),
        ...               (1089.5215641232173, 3800.3424832180476),
        ...               (340.56382570202402, 801.98947404942271),
        ...               (65.42597937575897, 1119.3133482602157)]

    Show the default view.

    .. plot::
        :context: close-figs

        >>> df = p.default_view(huefacet = "Dox",
        ...                     xscale = 'log',
        ...                     yscale = 'log')
        >>> df.plot(ex)

    .. note::
       If you want to use the interactive default view in a Jupyter notebook,
       make sure you say ``%matplotlib notebook`` in the first cell
       (instead of ``%matplotlib inline`` or similar).  Then call
       ``default_view()`` with ``interactive = True``::

           df = p.default_view(huefacet = "Dox",
                               xscale = 'log',
                               yscale = 'log',
                               interactive = True)
           df.plot(ex)

    Apply the gate, and show the result

    .. plot::
        :context: close-figs

        >>> ex2 = p.apply(ex)
        >>> ex2.data.groupby('Polygon').size()
        Polygon
        False    15875
        True      4125
        dtype: int64
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.polygon')
    friendly_id = Constant("Polygon")

    name = CStr()
    xchannel = Str()
    ychannel = Str()
    vertices = List((Float, Float))

    xscale = util.ScaleEnum()
    yscale = util.ScaleEnum()

    _selection_view = Instance('PolygonSelection', transient=True)

    def apply(self, experiment):
        """Applies the polygon gate to an experiment.

        Parameters
        ----------
        experiment : Experiment
            the old :class:`Experiment` to which this op is applied

        Returns
        -------
        Experiment
            a new :class:`Experiment`, the same as ``experiment`` but with
            a new column of type `bool` with the same name as the operation.
            The bool is ``True`` if the event's measurement is within the
            polygon, and ``False`` otherwise.

        Raises
        ------
        util.CytoflowOpError
            if for some reason the operation can't be applied to this
            experiment.
            The reason is in :attr:`.CytoflowOpError.args`
        """
        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                'name',
                "{} is in the experiment already!".format(self.name))

        if not self.xchannel:
            raise util.CytoflowOpError('xchannel', "Must specify an x channel")

        if not self.ychannel:
            raise util.CytoflowOpError('ychannel', "Must specify a y channel")

        if not self.xchannel in experiment.channels:
            raise util.CytoflowOpError(
                'xchannel',
                "xchannel {0} is not in the experiment".format(self.xchannel))

        if not self.ychannel in experiment.channels:
            raise util.CytoflowOpError(
                'ychannel',
                "ychannel {0} is not in the experiment".format(self.ychannel))

        if len(self.vertices) < 3:
            raise util.CytoflowOpError('vertices', "Must have at least 3 vertices")

        if any([len(x) != 2 for x in self.vertices]):
            raise util.CytoflowOpError(
                'vertices',
                "All vertices must be lists or tuples of length = 2")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError(
                'name',
                "You have to set the Polygon gate's name "
                "before applying it!")

        # there's a bit of a subtlety here: if the vertices were
        # selected with an interactive plot, and that plot had scaled
        # axes, we need to apply that scale function to both the
        # vertices and the data before looking for path membership
        xscale = util.scale_factory(self.xscale, experiment, channel=self.xchannel)
        yscale = util.scale_factory(self.yscale, experiment, channel=self.ychannel)
        vertices = [(xscale(x), yscale(y)) for (x, y) in self.vertices]
        data = experiment.data[[self.xchannel, self.ychannel]].copy()
        data[self.xchannel] = xscale(data[self.xchannel])
        data[self.ychannel] = yscale(data[self.ychannel])

        # use a matplotlib Path because testing for membership is a fast C fn.
        path = mpl.path.Path(np.array(vertices))
        xy_data = data[[self.xchannel, self.ychannel]].values

        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name,
                                     "bool",
                                     path.contains_points(xy_data))
        new_experiment.history.append(self.clone_traits(transient=lambda _: True))

        return new_experiment

    def default_view(self, **kwargs):
        self._selection_view = PolygonSelection(op=self)
        self._selection_view.trait_set(**kwargs)
        return self._selection_view
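# A minimal sketch of the membership test PolygonOp relies on:
# matplotlib.path.Path.contains_points() runs in compiled code, which is why
# gating hundreds of thousands of events is fast.  The vertices and points
# below are made up.
def _sketch_polygon_membership():
    import numpy as np
    import matplotlib.path

    vertices = np.array([[0.0, 0.0], [4.0, 0.0], [4.0, 3.0], [0.0, 3.0]])
    path = matplotlib.path.Path(vertices)

    events = np.array([[1.0, 1.0],    # inside
                       [5.0, 1.0],    # outside
                       [2.0, 2.9]])   # inside
    print(path.contains_points(events))   # [ True False  True]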
class MATS3DMicroplaneDamage(MATSXDMicroplaneDamage, MATS3DEval):

    implements(IMATSEval)

    # number of spatial dimensions
    #
    n_dim = Constant(3)

    # number of components of engineering tensor representation
    #
    n_eng = Constant(6)

    #-------------------------------------------------------------------------
    # PolarDiscr related data
    #-------------------------------------------------------------------------
    #
    # number of microplanes - currently fixed for 3D
    #
    n_mp = Constant(28)

    # get the normal vectors of the microplanes
    #
    _MPN = Property(depends_on='n_mp')

    @cached_property
    def _get__MPN(self):
        # microplane normals:
        return array([[.577350259, .577350259, .577350259],
                      [.577350259, .577350259, -.577350259],
                      [.577350259, -.577350259, .577350259],
                      [.577350259, -.577350259, -.577350259],
                      [.935113132, .250562787, .250562787],
                      [.935113132, .250562787, -.250562787],
                      [.935113132, -.250562787, .250562787],
                      [.935113132, -.250562787, -.250562787],
                      [.250562787, .935113132, .250562787],
                      [.250562787, .935113132, -.250562787],
                      [.250562787, -.935113132, .250562787],
                      [.250562787, -.935113132, -.250562787],
                      [.250562787, .250562787, .935113132],
                      [.250562787, .250562787, -.935113132],
                      [.250562787, -.250562787, .935113132],
                      [.250562787, -.250562787, -.935113132],
                      [.186156720, .694746614, .694746614],
                      [.186156720, .694746614, -.694746614],
                      [.186156720, -.694746614, .694746614],
                      [.186156720, -.694746614, -.694746614],
                      [.694746614, .186156720, .694746614],
                      [.694746614, .186156720, -.694746614],
                      [.694746614, -.186156720, .694746614],
                      [.694746614, -.186156720, -.694746614],
                      [.694746614, .694746614, .186156720],
                      [.694746614, .694746614, -.186156720],
                      [.694746614, -.694746614, .186156720],
                      [.694746614, -.694746614, -.186156720]])

    # get the weights of the microplanes
    #
    _MPW = Property(depends_on='n_mp')

    @cached_property
    def _get__MPW(self):
        # Note that the values in the array must be multiplied by 6 (cf. [Baz05])!
        # The sum of the array equals 0.5 (cf. [BazLuz04]).
        # The values are given for a Gaussian integration over the unit
        # hemisphere.
        return array([.0160714276, .0160714276, .0160714276, .0160714276,
                      .0204744730, .0204744730, .0204744730, .0204744730,
                      .0204744730, .0204744730, .0204744730, .0204744730,
                      .0204744730, .0204744730, .0204744730, .0204744730,
                      .0158350505, .0158350505, .0158350505, .0158350505,
                      .0158350505, .0158350505, .0158350505, .0158350505,
                      .0158350505, .0158350505, .0158350505, .0158350505]) * 6.0

    #-------------------------------------------------------------------------
    # Cached elasticity tensors
    #-------------------------------------------------------------------------
    elasticity_tensors = Property(depends_on='E, nu, dimensionality, stress_state')

    @cached_property
    def _get_elasticity_tensors(self):
        '''
        Initialize the fourth-order elasticity tensor
        for 3D or 2D plane strain or 2D plane stress
        '''
        # ----------------------------------------------------------------------------
        # Lame constants calculated from E and nu
        # ----------------------------------------------------------------------------
        E = self.E
        nu = self.nu

        # first Lame parameter
        la = E * nu / ((1 + nu) * (1 - 2 * nu))
        # second Lame parameter (shear modulus)
        mu = E / (2 + 2 * nu)

        # -----------------------------------------------------------------------------
        # Get the fourth-order elasticity and compliance tensors for the 3D case
        # -----------------------------------------------------------------------------
        # The following lines correspond to the tensorial expression
        # (using numpy functionality in order to avoid the loop):
        #
        # D4_e_3D = zeros((3,3,3,3),dtype=float)
        # C4_e_3D = zeros((3,3,3,3),dtype=float)
        # delta = identity(3)
        # for i in range(0,3):
        #     for j in range(0,3):
        #         for k in range(0,3):
        #             for l in range(0,3):
        #                 # elasticity tensor (cf. Jir/Baz Inelastic analysis of structures Eq.D25):
        #                 D4_e_3D[i,j,k,l] = la * delta[i,j] * delta[k,l] + \
        #                     mu * (delta[i,k] * delta[j,l] + delta[i,l] * delta[j,k])
        #                 # elastic compliance tensor (cf. Simo, Computational Inelasticity, Eq.(2.7.16) AND (2.1.16)):
        #                 C4_e_3D[i,j,k,l] = (1+nu)/(E) * \
        #                     (delta[i,k] * delta[j,l] + delta[i,l] * delta[j,k]) - \
        #                     nu / E * delta[i,j] * delta[k,l]

        # NOTE: swapaxes returns a reference, not a copy!
        # (the index notation always refers to the initial indexing (i=0,j=1,k=2,l=3))
        delta = identity(3)
        delta_ijkl = outer(delta, delta).reshape(3, 3, 3, 3)
        delta_ikjl = delta_ijkl.swapaxes(1, 2)
        delta_iljk = delta_ikjl.swapaxes(2, 3)
        D4_e_3D = la * delta_ijkl + mu * (delta_ikjl + delta_iljk)
        C4_e_3D = -nu / E * delta_ijkl + \
            (1 + nu) / (2 * E) * (delta_ikjl + delta_iljk)

        # -----------------------------------------------------------------------------
        # Map the fourth-order elasticity tensor to its second-order (matrix) form
        # -----------------------------------------------------------------------------
        D2_e_3D = self.map_tns4_to_tns2(D4_e_3D)

        return D4_e_3D, C4_e_3D, D2_e_3D

    #-------------------------------------------------------------------------
    # Dock-based view with its own id
    #-------------------------------------------------------------------------
    traits_view = View(Include('polar_fn_group'),
                       dock='tab',
                       id='ibvpy.mats.mats3D.mats_3D_cmdm.MATS3D_cmdm',
                       kind='modal',
                       resizable=True,
                       scrollable=True,
                       width=0.6,
                       height=0.8,
                       buttons=['OK', 'Cancel'])
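# A short sketch verifying the isotropic elasticity tensor constructed above:
# contracting D with a purely volumetric strain eps = I must give the bulk
# response sigma = (3*lambda + 2*mu) * I.  E and nu here are arbitrary test
# values.
def _check_isotropic_elasticity(E=30000.0, nu=0.2):
    import numpy as np

    la = E * nu / ((1 + nu) * (1 - 2 * nu))
    mu = E / (2 + 2 * nu)

    delta = np.identity(3)
    D4 = (la * np.einsum('ij,kl->ijkl', delta, delta) +
          mu * (np.einsum('ik,jl->ijkl', delta, delta) +
                np.einsum('il,jk->ijkl', delta, delta)))

    sig = np.einsum('ijkl,kl->ij', D4, np.identity(3))
    assert np.allclose(sig, (3 * la + 2 * mu) * delta)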
class BeadCalibrationOp(HasStrictTraits):
    """
    Calibrate arbitrary channels to molecules-of-fluorophore using fluorescent
    beads (eg, the Spherotech RCP-30-5A rainbow beads.)

    Computes a log-linear calibration function that maps arbitrary fluorescence
    units to physical units (ie molecules equivalent fluorophore, or *MEF*).

    To use, set :attr:`beads_file` to an FCS file containing events collected
    *using the same cytometer settings as the data you're calibrating*.
    Specify which beads you ran by setting :attr:`beads` to match one of the
    values of :data:`BeadCalibrationOp.BEADS`; and set :attr:`units` to which
    channels you want calibrated and in which units.  Then, call
    :meth:`estimate()` and check the peak-finding with
    :meth:`default_view().plot()`.  If the peak-finding is wacky, try
    adjusting :attr:`bead_peak_quantile` and :attr:`bead_brightness_threshold`.
    When the peaks are successfully identified, call :meth:`apply` to scale
    your experimental data set.

    If you can't make the peak finding work, please submit a bug report!

    This procedure works best when the beads file contains very clean data.
    It does not do its own gating (maybe a future addition?)  In the meantime,
    I recommend gating the *acquisition* on the FSC/SSC channels in order to
    get rid of debris, cells, and other noise.

    Finally, because you can't have a negative number of fluorescent
    molecules (MEFLs, etc) (as well as for math reasons), this module filters
    out negative values.

    Attributes
    ----------
    units : Dict(Str, Str)
        A dictionary specifying the channels you want calibrated (keys) and
        the units you want them calibrated in (values).  The units must be
        keys of the :attr:`beads` attribute.

    beads_file : File
        A file containing the FCS events from the beads.

    beads : Dict(Str, List(Float))
        The beads' characteristics.  Keys are calibrated units (ie, MEFL or
        MEAP) and values are ordered lists of known fluorophore levels.
        Common values for this dict are included in
        :data:`BeadCalibrationOp.BEADS`.

    bead_peak_quantile : Int (default = 80)
        The quantile threshold used to choose bead peaks.

    bead_brightness_threshold : Float (default = 100)
        How bright must a bead peak be to be considered?

    bead_brightness_cutoff : Float
        If a bead peak is above this, then don't consider it.  Takes care of
        clipping due to detector saturation.  Defaults to 70% of the detector
        range.

    bead_histogram_bins : Int (default = 512)
        The number of bins to use in computing the bead histogram.  Tweak
        this if the peak-finding is having difficulty, or if you have a
        small number of events.

    force_linear : Bool (default = False)
        A linear fit in log space doesn't always go through the origin, which
        means that the calibration function isn't strictly a multiplicative
        scaling operation.  Set :attr:`force_linear` to force such behavior.
        Keep an eye on the diagnostic plot, though, to see how much error
        you're introducing!

    Notes
    -----
    The peak finding is rather sophisticated.

    For each channel, a histogram (with :attr:`bead_histogram_bins` bins) is
    computed on the log-transformed bead data, and then the histogram is
    smoothed with a Savitzky-Golay filter (with a window length of 5 and a
    polynomial order of 1).

    Next, a wavelet-based peak-finding algorithm is used: it convolves the
    smoothed histogram with a series of wavelets and looks for relative
    maxima at various length-scales.  The parameters of the smoothing
    algorithm were arrived at empirically, using beads collected at a wide
    range of PMT voltages.
    Finally, the peaks are filtered by height (the histogram bin has a
    quantile greater than `bead_peak_quantile`) and intensity (brighter than
    :attr:`bead_brightness_threshold`).

    How to convert from a series of peaks to mean equivalent fluorochrome?
    If there's one peak, we assume that it's the brightest peak.  If there
    are two peaks, we assume they're the brightest two.  If there are
    ``n >= 3`` peaks, we check all the contiguous `n`-subsets of the bead
    intensities and find the one whose linear regression (in log space!)
    has the smallest norm (square-root sum-of-squared-residuals.)

    There's a slight subtlety in the fact that we're performing the linear
    regression in log-space: if the relationship in log10-space is
    ``Y = aX + b``, then the same relationship in linear space is
    ``x = 10**X``, ``y = 10**Y``, and ``y = (10**b) * (x ** a)``.

    Examples
    --------
    Create a small experiment:

    .. plot::
        :context: close-figs

        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "tasbe/rby.fcs")]
        >>> ex = import_op.apply()

    Create and parameterize the operation

    .. plot::
        :context: close-figs

        >>> bead_op = flow.BeadCalibrationOp()
        >>> beads = "Spherotech RCP-30-5A Lot AA01-AA04, AB01, AB02, AC01, GAA01-R"
        >>> bead_op.beads = flow.BeadCalibrationOp.BEADS[beads]
        >>> bead_op.units = {"Pacific Blue-A" : "MEBFP",
        ...                  "FITC-A" : "MEFL",
        ...                  "PE-Tx-Red-YG-A" : "MEPTR"}
        >>>
        >>> bead_op.beads_file = "tasbe/beads.fcs"

    Estimate the model parameters

    .. plot::
        :context: close-figs

        >>> bead_op.estimate(ex)

    Plot the diagnostic plot

    .. plot::
        :context: close-figs

        >>> bead_op.default_view().plot(ex)

    Apply the operation to the experiment

    .. plot::
        :context: close-figs

        >>> ex = bead_op.apply(ex)
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.beads_calibrate')
    friendly_id = Constant("Bead Calibration")

    name = Constant("Beads")
    units = Dict(Str, Str)

    beads_file = File(exists=True)
    bead_peak_quantile = Int(80)
    bead_brightness_threshold = Float(100.0)
    bead_brightness_cutoff = util.FloatOrNone(None)
    bead_histogram_bins = Int(512)

    # TODO - bead_brightness_threshold should probably be different depending
    # on the data range of the input.

    force_linear = Bool(False)

    beads = Dict(Str, List(Float))

    _histograms = Dict(Str, Any, transient=True)
    _calibration_functions = Dict(Str, Callable, transient=True)
    _peaks = Dict(Str, Any, transient=True)
    _mefs = Dict(Str, Any, transient=True)

    def estimate(self, experiment):
        """
        Estimate the calibration coefficients from the beads file.

        Parameters
        ----------
        experiment : Experiment
            The experiment used to compute the calibration.
""" if experiment is None: raise util.CytoflowOpError('experiment', "No experiment specified") if not self.beads_file: raise util.CytoflowOpError('beads_file', "No beads file specified") if not set(self.units.keys()) <= set(experiment.channels): raise util.CytoflowOpError('units', "Specified channels that weren't found in " "the experiment.") if not set(self.units.values()) <= set(self.beads.keys()): raise util.CytoflowOpError('units', "Units don't match beads.") self._histograms.clear() self._calibration_functions.clear() self._peaks.clear() self._mefs.clear() # make a little Experiment check_tube(self.beads_file, experiment) beads_exp = ImportOp(tubes = [Tube(file = self.beads_file)], channels = {experiment.metadata[c]["fcs_name"] : c for c in experiment.channels}, name_metadata = experiment.metadata['name_metadata']).apply() channels = list(self.units.keys()) # make the histogram for channel in channels: data = beads_exp.data[channel] # TODO - this assumes the data is on a linear scale. check it! data_range = experiment.metadata[channel]['range'] if self.bead_brightness_cutoff is None: cutoff = 0.7 * data_range else: cutoff = self.bead_brightness_cutoff # bin the data on a log scale hist_bins = np.logspace(1, math.log(data_range, 2), num = self.bead_histogram_bins, base = 2) hist = np.histogram(data, bins = hist_bins) # mask off-scale values hist[0][0] = 0 hist[0][-1] = 0 # smooth it with a Savitzky-Golay filter hist_smooth = scipy.signal.savgol_filter(hist[0], 5, 1) self._histograms[channel] = (hist, hist_bins, hist_smooth) # find peaks for channel in channels: hist = self._histograms[channel][0] hist_bins = self._histograms[channel][1] hist_smooth = self._histograms[channel][2] peak_bins = scipy.signal.find_peaks_cwt(hist_smooth, widths = np.arange(3, 20), max_distances = np.arange(3, 20) / 2) # filter by height and intensity peak_threshold = np.percentile(hist_smooth, self.bead_peak_quantile) peak_bins_filtered = \ [x for x in peak_bins if hist_smooth[x] > peak_threshold and hist[1][x] > self.bead_brightness_threshold and hist[1][x] < cutoff] self._peaks[channel] = [hist_bins[x] for x in peak_bins_filtered] # compute the conversion for channel in channels: peaks = self._peaks[channel] mef_unit = self.units[channel] if not mef_unit in self.beads: raise util.CytoflowOpError('units', "Invalid unit {0} specified for channel {1}".format(mef_unit, channel)) # "mean equivalent fluorochrome" mef = self.beads[mef_unit] if len(peaks) == 0: raise util.CytoflowOpError(None, "Didn't find any peaks for channel {}; " "check the diagnostic plot" .format(channel)) elif len(peaks) > len(mef): raise util.CytoflowOpError(None, "Found too many peaks for channel {}; " "check the diagnostic plot" .format(channel)) elif len(peaks) == 1: # if we only have one peak, assume it's the brightest peak a = mef[-1] / peaks[0] self._mefs[channel] = [mef[-1]] self._calibration_functions[channel] = lambda x, a=a: a * x elif len(peaks) == 2: # if we have only two peaks, assume they're the brightest two self._mefs[channel] = [mef[-2], mef[-1]] a = (mef[-1] - mef[-2]) / (peaks[1] - peaks[0]) self._calibration_functions[channel] = lambda x, a=a: a * x else: # if there are n > 2 peaks, check all the contiguous n-subsets # of mef for the one whose linear regression with the peaks # has the smallest (norm) sum-of-residuals. # do it in log10 space because otherwise the brightest peaks # have an outsized influence. 
                best_resid = np.inf
                for start, end in [(x, x + len(peaks))
                                   for x in range(len(mef) - len(peaks) + 1)]:
                    mef_subset = mef[start:end]

                    # linear regression of the peak locations against mef subset
                    lr = np.polyfit(np.log10(peaks),
                                    np.log10(mef_subset),
                                    deg=1,
                                    full=True)

                    resid = lr[1][0]
                    if resid < best_resid:
                        best_lr = lr[0]
                        best_resid = resid
                        self._mefs[channel] = mef_subset

                if self.force_linear:
                    # if we're forcing a linear scale for the calibration
                    # function, find that scale with an optimization.  (we can't
                    # use this above, to find the MEFs from the peaks, because
                    # when i tried it mis-identified the proper subset.)

                    # even though this keeps things a linear scale, it can
                    # actually introduce *more* errors because "blank" beads
                    # still fluoresce.
                    def s(x):
                        p = np.multiply(self._peaks[channel], x)
                        return np.sum(np.abs(np.subtract(p, self._mefs[channel])))

                    res = scipy.optimize.minimize(s, [1])

                    a = res.x[0]
                    self._calibration_functions[channel] = \
                        lambda x, a=a: a * x

                else:
                    # remember, these (linear) coefficients came from logspace, so
                    # if the relationship in log10 space is Y = aX + b, then in
                    # linear space the relationship is x = 10**X, y = 10**Y,
                    # and y = (10**b) * x ^ a

                    # also remember that the result of np.polyfit is a list of
                    # coefficients with the highest power first!  so if we
                    # solve y = ax + b, coeff #0 is a and coeff #1 is b
                    a = best_lr[0]
                    b = 10 ** best_lr[1]
                    self._calibration_functions[channel] = \
                        lambda x, a=a, b=b: b * np.power(x, a)

    def apply(self, experiment):
        """
        Applies the bead calibration to an experiment.

        Parameters
        ----------
        experiment : Experiment
            the experiment to which this operation is applied

        Returns
        -------
        Experiment
            A new experiment with the specified channels calibrated in
            physical units.  The calibrated channels also have new metadata:

            - **bead_calibration_fn** : Callable (pandas.Series --> pandas.Series)
              The function to calibrate raw data to bead units

            - **bead_units** : String
              The units this channel was calibrated to
        """
        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        channels = list(self.units.keys())

        if not self.units:
            raise util.CytoflowOpError('units', "No channels to calibrate.")

        if not self._calibration_functions:
            raise util.CytoflowOpError(None,
                                       "Calibration not found.  "
                                       "Did you forget to call estimate()?")

        if not set(channels) <= set(experiment.channels):
            raise util.CytoflowOpError('units',
                                       "Module units don't match experiment channels")

        if set(channels) != set(self._calibration_functions.keys()):
            raise util.CytoflowOpError('units',
                                       "Calibration doesn't match units.  "
                                       "Did you forget to call estimate()?")

        # two things.  first, you can't raise a negative value to a non-integer
        # power.  second, negative physical units don't make sense -- how can
        # you have the equivalent of -5 molecules of fluorescein?  so,
        # we filter out negative values here.
        new_experiment = experiment.clone()

        for channel in channels:
            new_experiment.data = \
                new_experiment.data[new_experiment.data[channel] > 0]
        new_experiment.data.reset_index(drop=True, inplace=True)

        for channel in channels:
            calibration_fn = self._calibration_functions[channel]

            new_experiment[channel] = calibration_fn(new_experiment[channel])
            new_experiment.metadata[channel]['bead_calibration_fn'] = calibration_fn
            new_experiment.metadata[channel]['bead_units'] = self.units[channel]
            if 'range' in experiment.metadata[channel]:
                new_experiment.metadata[channel]['range'] = \
                    calibration_fn(experiment.metadata[channel]['range'])
            if 'voltage' in experiment.metadata[channel]:
                del new_experiment.metadata[channel]['voltage']

        new_experiment.history.append(self.clone_traits(transient=lambda t: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot to see if the peak finding is working.

        Returns
        -------
        IView
            A diagnostic view; call :meth:`~BeadCalibrationDiagnostic.plot` to
            see the diagnostic plots
        """
        v = BeadCalibrationDiagnostic(op=self)
        v.trait_set(**kwargs)
        return v

    BEADS = \
        {
            # from http://www.spherotech.com/RCP-30-5A%20%20Rev%20K%20ML%23%20073112%20Rev.%20B.xls
            "Spherotech RCP-30-5A Lot AK02, AK03, AK04":
            {"MECSB":   [205, 470, 1211, 2740, 7516, 20122, 35573],
             "MEBFP":   [844, 1958, 5422, 13522, 42717, 153501, 420359],
             "MEFL":    [771, 2106, 6262, 15183, 45292, 136258, 291042],
             "MEPE":    [487, 1474, 4516, 11260, 34341, 107608, 260461],
             "MEPTR":   [205, 643, 2021, 5278, 17018, 62451, 198933],
             "MECY":    [1414, 3809, 10852, 27904, 85866, 324106, 1040895],
             "MECY7":   [12752, 39057, 142958, 448890],
             "MEAP":    [341, 1027, 3156, 7750, 23446, 68702, 116813],
             "MEAPCY7": [173, 427, 1097, 2399, 6359, 17475, 30725]},

            # from http://www.spherotech.com/RCP-30-5a%20%20rev%20H%20ML%20071712.xls
            "Spherotech RCP-30-5A Lot AG01, AF02, AD04 and AAE01":
            {"MECSB":   [216, 464, 1232, 2940, 7669, 19812, 35474],
             "MEBFP":   [861, 1997, 5776, 15233, 45389, 152562, 396759],
             "MEFL":    [792, 2079, 6588, 16471, 47497, 137049, 271647],
             "MEPE":    [531, 1504, 4819, 12506, 36159, 109588, 250892],
             "MEPTR":   [233, 669, 2179, 5929, 18219, 63944, 188785],
             "MECY":    [1614, 4035, 12025, 31896, 95682, 353225, 1077421],
             "MEPCY7":  [14916, 42336, 153840, 494263],
             "MEAP":    [373, 1079, 3633, 9896, 28189, 79831, 151008],
             "MEAPCY7": [2864, 7644, 19081, 37258]},

            # from http://www.spherotech.com/RCP-30-5a%20%20rev%20G.2.xls
            "Spherotech RCP-30-5A Lot AA01-AA04, AB01, AB02, AC01, GAA01-R":
            {"MECSB":   [179, 400, 993, 3203, 6083, 17777, 36331],
             "MEBFP":   [700, 1705, 4262, 17546, 35669, 133387, 412089],
             "MEFL":    [692, 2192, 6028, 17493, 35674, 126907, 290983],
             "MEPE":    [505, 1777, 4974, 13118, 26757, 94930, 250470],
             "MEPTR":   [207, 750, 2198, 6063, 12887, 51686, 170219],
             "MECY":    [1437, 4693, 12901, 36837, 76621, 261671, 1069858],
             "MEPCY7":  [32907, 107787, 503797],
             "MEAP":    [587, 2433, 6720, 17962, 30866, 51704, 146080],
             "MEAPCY7": [718, 1920, 5133, 9324, 14210, 26735]},

            "Spherotech URCP-100-2H (9 peaks)":
            {"MEFL":    [3531, 11373, 34643, 107265, 324936, 835306, 2517654, 6069240],
             "MEPE":    [2785, 9525, 28421, 90313, 275589, 713181, 2209251, 5738784],
             "MEPTR":   [1158, 4161, 12528, 41140, 130347, 344149, 1091393, 2938710],
             "MEPCY":   [6501, 20302, 59517, 183870, 550645, 1569470, 5109318, 17854584],
             "MEPCY7":  [4490, 10967, 30210, 87027, 283621, 975312, 4409101, 24259524],
             "MEAP":    [369, 749, 3426, 10413, 50013, 177490, 500257, 1252120],
             "MEAPCY7": [1363, 2656, 9791, 25120, 96513, 328967, 864905, 2268931],
             "MECSB":
             [989, 2959, 8277, 25524, 71603, 173069, 491388, 1171641],
             "MEBFP":   [1957, 5579, 16005, 53621, 168302, 459809, 1581762, 4999251]}}
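# A worked sketch of the log-space fit described in the Notes above: if the
# regression of log10(MEF) against log10(peak location) gives Y = a*X + b,
# the linear-space calibration function is y = (10**b) * x**a.  The peak
# locations below are made up; the MEF values are from the BEADS table.
def _sketch_bead_regression():
    mefs = np.array([692.0, 2192.0, 6028.0, 17493.0])    # known bead values
    peaks = np.array([210.0, 660.0, 1800.0, 5300.0])     # hypothetical peaks

    # np.polyfit returns the highest power first: coeff #0 is a, coeff #1 is b
    a, b = np.polyfit(np.log10(peaks), np.log10(mefs), deg=1)
    calibrate = lambda x: (10 ** b) * np.power(x, a)

    print(calibrate(peaks))   # approximately reproduces `mefs`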
class GaussianMixtureOp(HasStrictTraits):
    """
    This module fits a Gaussian mixture model with a specified number of
    components to one or more channels.

    If :attr:`num_components` ``> 1``, :meth:`apply` creates a new categorical
    metadata variable named ``name``, with possible values ``{name}_1`` ...
    ``{name}_n`` where ``n`` is the number of components.  An event is
    assigned to the ``{name}_i`` category if it has the highest posterior
    probability of having been produced by component ``i``.  If an event has
    a value that is outside the range of one of the channels' scales, then it
    is assigned to ``{name}_None``.

    Optionally, if :attr:`sigma` is greater than 0, :meth:`apply` creates new
    ``boolean`` metadata variables named ``{name}_1`` ... ``{name}_n`` where
    ``n`` is the number of components.  The column ``{name}_i`` is ``True``
    if the event is less than :attr:`sigma` standard deviations from the mean
    of component ``i``.  If :attr:`num_components` is ``1``, :attr:`sigma`
    must be greater than 0.

    Optionally, if :attr:`posteriors` is ``True``, :meth:`apply` creates new
    ``double`` metadata variables named ``{name}_1_posterior`` ...
    ``{name}_n_posterior`` where ``n`` is the number of components.  The
    column ``{name}_i_posterior`` contains the posterior probability that
    this event is a member of component ``i``.

    Finally, the same mixture model (mean and standard deviation) may not be
    appropriate for every subset of the data.  If this is the case, you can
    use the :attr:`by` attribute to specify metadata by which to aggregate
    the data before estimating (and applying) a mixture model.  The number of
    components must be the same across each subset, though.

    Attributes
    ----------
    name : Str
        The operation name; determines the name of the new metadata column

    channels : List(Str)
        The channels to apply the mixture model to.

    scale : Dict(Str : {"linear", "logicle", "log"})
        Re-scale the data in the specified channels before fitting.  If a
        channel is in :attr:`channels` but not in :attr:`scale`, the current
        package-wide default (set with :func:`~.set_default_scale`) is used.

    num_components : Int (default = 1)
        How many components to fit to the data?  Must be a positive integer.

    sigma : Float (default = 0.0)
        How many standard deviations on either side of the mean to include
        in the boolean variable ``{name}_i``?  Must be ``>= 0.0``.  If
        :attr:`num_components` is ``1``, must be ``> 0``.

    by : List(Str)
        A list of metadata attributes to aggregate the data before
        estimating the model.  For example, if the experiment has two pieces
        of metadata, ``Time`` and ``Dox``, setting :attr:`by` to
        ``["Time", "Dox"]`` will fit the model separately to each subset of
        the data with a unique combination of ``Time`` and ``Dox``.

    posteriors : Bool (default = False)
        If ``True``, add columns named ``{name}_{i}_posterior`` giving the
        posterior probability that the event is in component ``i``.  Useful
        for filtering out low-probability events.

    Notes
    -----
    We use the Mahalanobis distance as a multivariate generalization of the
    number of standard deviations an event is from the mean of the
    multivariate gaussian.  If :math:`\\vec{x}` is an observation from a
    distribution with mean :math:`\\vec{\\mu}` and :math:`S` is the
    covariance matrix, then the Mahalanobis distance is
    :math:`\\sqrt{(x - \\mu)^T \\cdot S^{-1} \\cdot (x - \\mu)}`.

    Examples
    --------

    .. plot::
        :context: close-figs

        Make a little data set.

        >>> import cytoflow as flow
        >>> import_op = flow.ImportOp()
        >>> import_op.tubes = [flow.Tube(file = "Plate01/RFP_Well_A3.fcs",
        ...                              conditions = {'Dox' : 10.0}),
        ...                    flow.Tube(file = "Plate01/CFP_Well_A4.fcs",
        ...                              conditions = {'Dox' : 1.0})]
        >>> import_op.conditions = {'Dox' : 'float'}
        >>> ex = import_op.apply()

    Create and parameterize the operation.

    .. plot::
        :context: close-figs

        >>> gm_op = flow.GaussianMixtureOp(name = 'Gauss',
        ...                                channels = ['Y2-A'],
        ...                                scale = {'Y2-A' : 'log'},
        ...                                num_components = 2)

    Estimate the clusters

    .. plot::
        :context: close-figs

        >>> gm_op.estimate(ex)

    Plot a diagnostic view

    .. plot::
        :context: close-figs

        >>> gm_op.default_view().plot(ex)

    Apply the gate

    .. plot::
        :context: close-figs

        >>> ex2 = gm_op.apply(ex)

    Plot a diagnostic view with the event assignments

    .. plot::
        :context: close-figs

        >>> gm_op.default_view().plot(ex2)

    And with two channels:

    .. plot::
        :context: close-figs

        >>> gm_op = flow.GaussianMixtureOp(name = 'Gauss',
        ...                                channels = ['V2-A', 'Y2-A'],
        ...                                scale = {'V2-A' : 'log',
        ...                                         'Y2-A' : 'log'},
        ...                                num_components = 2)
        >>> gm_op.estimate(ex)
        >>> ex2 = gm_op.apply(ex)
        >>> gm_op.default_view().plot(ex2)
    """

    id = Constant('edu.mit.synbio.cytoflow.operations.gaussian')
    friendly_id = Constant("Gaussian Mixture Model")

    name = CStr()
    channels = List(Str)
    scale = Dict(Str, util.ScaleEnum)
    num_components = util.PositiveInt(1, allow_zero=False)
    sigma = util.PositiveFloat(allow_zero=True)
    by = List(Str)
    posteriors = Bool(False)

    # the key is either a single value or a tuple
    _gmms = Dict(Any, Instance(sklearn.mixture.GaussianMixture), transient=True)
    _scale = Dict(Str, Instance(util.IScale), transient=True)

    def estimate(self, experiment, subset=None):
        """
        Estimate the Gaussian mixture model parameters

        Parameters
        ----------
        experiment : Experiment
            The data to use to estimate the mixture parameters

        subset : str (default = None)
            If set, a Python expression to determine the subset of the data
            to use to in the estimation.
        """
        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels', "Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError(
                    'channels',
                    "Channel {0} not found in the experiment".format(c))

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError(
                    'channels',
                    "Scale set for channel {0}, but it isn't "
                    "in 'channels'".format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError(
                    'by',
                    "Aggregation metadata {} not found, "
                    "must be one of {}".format(b, experiment.conditions))

        if subset:
            try:
                experiment = experiment.query(subset)
            except Exception as e:
                raise util.CytoflowOpError(
                    'subset',
                    "Subset string '{0}' isn't valid".format(subset)) from e

            if len(experiment) == 0:
                raise util.CytoflowOpError(
                    'subset',
                    "Subset string '{0}' returned no events".format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale.  estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().
        # And we need to save it so that the data is transformed the same
        # way when we apply()
        for c in self.channels:
            if c in self.scale:
                self._scale[c] = util.scale_factory(self.scale[c], experiment, channel=c)
            else:
                self._scale[c] = util.scale_factory(util.get_default_scale(), experiment, channel=c)

        gmms = {}

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(None,
                                           "Group {} had no data".format(group))

            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # drop data that isn't in the scale range
            for c in self.channels:
                x = x[~(np.isnan(x[c]))]
            x = x.values

            gmm = sklearn.mixture.GaussianMixture(n_components=self.num_components,
                                                  covariance_type="full",
                                                  random_state=1)
            gmm.fit(x)

            if not gmm.converged_:
                raise util.CytoflowOpError(None,
                                           "Estimator didn't converge"
                                           " for group {0}".format(group))

            # in the 1D version, we sorted the components by the means -- so
            # the first component has the lowest mean, the second component
            # has the next-lowest mean, etc.

            # that doesn't work in the general case.  instead, we assume that
            # the clusters are likely (?) to be arranged along *one* of the
            # axes, so we take the |norm| of the mean of each cluster and
            # sort that way.

            norms = np.sum(gmm.means_ ** 2, axis=1) ** 0.5
            sort_idx = np.argsort(norms)
            gmm.means_ = gmm.means_[sort_idx]
            gmm.weights_ = gmm.weights_[sort_idx]
            gmm.covariances_ = gmm.covariances_[sort_idx]
            gmm.precisions_ = gmm.precisions_[sort_idx]
            gmm.precisions_cholesky_ = gmm.precisions_cholesky_[sort_idx]

            gmms[group] = gmm

        self._gmms = gmms

    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in :meth:`estimate`.

        Returns
        -------
        Experiment
            A new :class:`.Experiment` with the new condition variables as
            described in the class documentation.  Also adds the following
            new statistics:

            - **mean** : Float
              the mean of the fitted gaussian in each channel for each
              component.

            - **sigma** : (Float, Float)
              the locations of the mean +/- one standard deviation in each
              channel for each component.

            - **correlation** : Float
              the correlation coefficient between each pair of channels for
              each component.

            - **proportion** : Float
              the proportion of events in each component of the mixture
              model.  only added if :attr:`num_components` ``> 1``.
        """
        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels', "Must set at least one channel")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError('name',
                                       "You have to set the gate's name "
                                       "before applying it!")

        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError('name',
                                       "Name can only contain letters, "
                                       "numbers and underscores.")

        if self.num_components > 1 and self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                'name',
                "Experiment already has a column named {0}".format(self.name))

        if self.sigma > 0:
            for i in range(1, self.num_components + 1):
                cname = "{}_{}".format(self.name, i)
                if cname in experiment.data.columns:
                    raise util.CytoflowOpError(
                        'name',
                        "Experiment already has a column named {}".format(cname))

        if self.posteriors:
            for i in range(1, self.num_components + 1):
                cname = "{}_{}_posterior".format(self.name, i)
                if cname in experiment.data.columns:
                    raise util.CytoflowOpError(
                        'name',
                        "Experiment already has a column named {}".format(cname))

        if not self._gmms:
            raise util.CytoflowOpError(None,
                                       "No components found.  Did you forget to "
                                       "call estimate()?")

        for c in self.channels:
            if c not in self._scale:
                raise util.CytoflowOpError(None,
                                           "Model scale not set.  Did you forget "
                                           "to call estimate()?")

        for c in self.channels:
            if c not in experiment.channels:
                raise util.CytoflowOpError(
                    'channels',
                    "Channel {0} not found in the experiment".format(c))

        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError(
                    'by',
                    "Aggregation metadata {} not found, "
                    "must be one of {}".format(b, experiment.conditions))

#         if self.num_components == 1 and self.sigma == 0.0:
#             raise util.CytoflowOpError('sigma',
#                                        "if num_components is 1, sigma must be > 0.0")

        if self.num_components == 1 and self.posteriors:
            warn("If num_components == 1, all posteriors will be 1",
                 util.CytoflowOpWarning)
#             raise util.CytoflowOpError('posteriors',
#                                        "If num_components == 1, all posteriors will be 1.")

        if self.num_components > 1:
            event_assignments = pd.Series(["{}_None".format(self.name)] * len(experiment),
                                          dtype="object")

        if self.sigma > 0:
            event_gate = {i: pd.Series([False] * len(experiment), dtype="bool")
                          for i in range(self.num_components)}

        if self.posteriors:
            event_posteriors = {i: pd.Series([0.0] * len(experiment), dtype="double")
                                for i in range(self.num_components)}

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda _: True)

        # make the statistics
        components = [x + 1 for x in range(self.num_components)]

        prop_idx = pd.MultiIndex.from_product(
            [experiment[x].unique() for x in self.by] + [components],
            names=list(self.by) + ["Component"])
        prop_stat = pd.Series(name="{} : {}".format(self.name, "proportion"),
                              index=prop_idx,
                              dtype=np.dtype(object)).sort_index()

        mean_idx = pd.MultiIndex.from_product(
            [experiment[x].unique() for x in self.by] + [components] + [self.channels],
            names=list(self.by) + ["Component"] + ["Channel"])
        mean_stat = pd.Series(name="{} : {}".format(self.name, "mean"),
                              index=mean_idx,
                              dtype=np.dtype(object)).sort_index()
        sigma_stat = pd.Series(name="{} : {}".format(self.name, "sigma"),
                               index=mean_idx,
                               dtype=np.dtype(object)).sort_index()
        interval_stat = pd.Series(name="{} : {}".format(self.name, "interval"),
                                  index=mean_idx,
                                  dtype=np.dtype(object)).sort_index()

        corr_idx = pd.MultiIndex.from_product(
            [experiment[x].unique() for x in self.by] + [components]
            + [self.channels] + [self.channels],
            names=list(self.by) + ["Component"] + ["Channel_1"] + ["Channel_2"])
        corr_stat = pd.Series(name="{} : {}".format(self.name, "correlation"),
                              index=corr_idx,
                              dtype=np.dtype(object)).sort_index()

        for group, data_subset in groupby:
            if group not in self._gmms:
                # there weren't any events in this group, so we didn't get
                # a gmm.
                continue

            gmm = self._gmms[group]
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # which values are missing?
            x_na = pd.Series([False] * len(x))
            for c in self.channels:
                x_na[np.isnan(x[c]).values] = True

            x = x.values
            x_na = x_na.values
            group_idx = groupby.groups[group]

            if self.num_components > 1:
                predicted = np.full(len(x), -1, "int")
                predicted[~x_na] = gmm.predict(x[~x_na])

                predicted_str = pd.Series(["(none)"] * len(predicted))
                for c in range(0, self.num_components):
                    predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
                predicted_str[predicted == -1] = "{0}_None".format(self.name)
                predicted_str.index = group_idx

                event_assignments.iloc[group_idx] = predicted_str

            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma > 0.0:
                for c in range(self.num_components):
                    s = np.linalg.pinv(gmm.covariances_[c])
                    mu = gmm.means_[c]

                    # compute the Mahalanobis distance
                    f = lambda x, mu, s: np.dot(np.dot((x - mu).T, s), (x - mu))
                    dist = np.apply_along_axis(f, 1, x, mu, s)

                    # come up with a threshold based on sigma.  you'll note we
                    # didn't sqrt dist: that's because for a multivariate
                    # Gaussian, the square of the Mahalanobis distance is
                    # chi-square distributed
                    p = (scipy.stats.norm.cdf(self.sigma) - 0.5) * 2
                    thresh = scipy.stats.chi2.ppf(p, 1)

                    event_gate[c].iloc[group_idx] = np.less_equal(dist, thresh)

            if self.posteriors:
                p = np.full((len(x), self.num_components), 0.0)
                p[~x_na] = gmm.predict_proba(x[~x_na])
                for c in range(self.num_components):
                    event_posteriors[c].iloc[group_idx] = p[:, c]

            for c in range(self.num_components):
                if len(self.by) == 0:
                    g = tuple([c + 1])
                elif hasattr(group, '__iter__') and not isinstance(group, (str, bytes)):
                    g = tuple(list(group) + [c + 1])
                else:
                    g = tuple([group] + [c + 1])

                prop_stat.at[g] = gmm.weights_[c]

                for cidx1, channel1 in enumerate(self.channels):
                    g2 = tuple(list(g) + [channel1])
                    mean_stat.at[g2] = self._scale[channel1].inverse(gmm.means_[c, cidx1])

                    s, corr = util.cov2corr(gmm.covariances_[c])
                    sigma_stat[g2] = (self._scale[channel1].inverse(s[cidx1]))
                    interval_stat.at[g2] = (
                        self._scale[channel1].inverse(gmm.means_[c, cidx1] - s[cidx1]),
                        self._scale[channel1].inverse(gmm.means_[c, cidx1] + s[cidx1]))

                    for cidx2, channel2 in enumerate(self.channels):
                        g3 = tuple(list(g2) + [channel2])
                        corr_stat[g3] = corr[cidx1, cidx2]

                    corr_stat.drop(tuple(list(g2) + [channel1]), inplace=True)

        new_experiment = experiment.clone()

        if self.num_components > 1:
            new_experiment.add_condition(self.name, "category", event_assignments)

        if self.sigma > 0:
            for c in range(self.num_components):
                gate_name = "{}_{}".format(self.name, c + 1)
                new_experiment.add_condition(gate_name, "bool", event_gate[c])

        if self.posteriors:
            for c in range(self.num_components):
                post_name = "{}_{}_posterior".format(self.name, c + 1)
                new_experiment.add_condition(post_name, "double", event_posteriors[c])

        new_experiment.statistics[(self.name, "mean")] = pd.to_numeric(mean_stat)
        new_experiment.statistics[(self.name, "sigma")] = sigma_stat
        new_experiment.statistics[(self.name, "interval")] = interval_stat
        if len(corr_stat) > 0:
            new_experiment.statistics[(self.name, "correlation")] = pd.to_numeric(corr_stat)
        if self.num_components > 1:
            new_experiment.statistics[(self.name, "proportion")] = pd.to_numeric(prop_stat)

        new_experiment.history.append(self.clone_traits(transient=lambda _: True))
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the Gaussian mixture model.

        Returns
        -------
        IView
            an IView, call plot() to see the diagnostic plot.
""" channels = kwargs.pop('channels', self.channels) scale = kwargs.pop('scale', self.scale) for c in channels: if c not in self.channels: raise util.CytoflowViewError( 'channels', "Channel {} isn't in the operation's channels".format(c)) for s in scale: if s not in self.channels: raise util.CytoflowViewError( 'scale', "Channel {} isn't in the operation's channels".format(s)) for c in channels: if c not in scale: scale[c] = util.get_default_scale() if len(channels) == 0: raise util.CytoflowViewError( 'channels', "Must specify at least one channel for a default view") elif len(channels) == 1: v = GaussianMixture1DView(op=self) v.trait_set(channel=channels[0], scale=scale[channels[0]], **kwargs) return v elif len(channels) == 2: v = GaussianMixture2DView(op=self) v.trait_set(xchannel=channels[0], ychannel=channels[1], xscale=scale[channels[0]], yscale=scale[channels[1]], **kwargs) return v else: raise util.CytoflowViewError( 'channels', "Can't specify more than two channels for a default view")
class BleedthroughPiecewiseDiagnostic(HasStrictTraits):
    """
    Plots a scatterplot of each channel vs every other channel and the
    bleedthrough spline

    Attributes
    ----------
    name : Str
        The instance name (for serialization, UI etc.)

    op : Instance(BleedthroughPiecewiseOp)
        The op whose parameters we're viewing
    """

    # traits
    id = Constant("edu.mit.synbio.cytoflow.view.bleedthroughpiecewisediagnostic")
    friendly_id = Constant("Piecewise Bleedthrough Diagnostic")

    name = Str
    subset = Str

    # TODO - why can't I use BleedthroughPiecewiseOp here?
    op = Instance(IOperation)

    def plot(self, experiment=None, **kwargs):
        """Plot a faceted histogram view of a channel"""

        if experiment is None:
            raise util.CytoflowViewError("No experiment specified")

        if not self.op.controls:
            raise util.CytoflowViewError("No controls specified")

        if not self.op._splines:
            raise util.CytoflowViewError("No splines.  Did you forget to call estimate()?")

        kwargs.setdefault('histtype', 'stepfilled')
        kwargs.setdefault('alpha', 0.5)
        kwargs.setdefault('antialiased', True)

        plt.figure()

        channels = list(self.op._splines.keys())
        num_channels = len(channels)

        for from_idx, from_channel in enumerate(channels):
            for to_idx, to_channel in enumerate(channels):
                if from_idx == to_idx:
                    continue

                # make a little Experiment
                check_tube(self.op.controls[from_channel], experiment)
                tube_exp = ImportOp(tubes=[Tube(file=self.op.controls[from_channel])],
                                    channels={experiment.metadata[c]["fcs_name"]: c
                                              for c in experiment.channels},
                                    name_metadata=experiment.metadata['name_metadata'],
                                    events=10000).apply()

                # apply previous operations
                for op in experiment.history:
                    tube_exp = op.apply(tube_exp)

                # subset it
                if self.subset:
                    try:
                        tube_exp = tube_exp.query(self.subset)
                    except Exception as e:
                        raise util.CytoflowViewError(
                            "Subset string '{0}' isn't valid".format(self.subset)) from e

                    if len(tube_exp.data) == 0:
                        raise util.CytoflowViewError(
                            "Subset string '{0}' returned no events".format(self.subset))

                # get scales
                xscale = util.scale_factory("logicle", tube_exp, channel=from_channel)
                yscale = util.scale_factory("logicle", tube_exp, channel=to_channel)

                tube_data = tube_exp.data

                plt.subplot(num_channels,
                            num_channels,
                            from_idx + (to_idx * num_channels) + 1)
                plt.xscale('logicle', **xscale.mpl_params)
                plt.yscale('logicle', **yscale.mpl_params)
                plt.xlabel(from_channel)
                plt.ylabel(to_channel)
                plt.scatter(tube_data[from_channel],
                            tube_data[to_channel],
                            alpha=0.5,
                            s=1,
                            marker='o')

                spline = self.op._splines[from_channel][to_channel]
                xs = np.logspace(-1, math.log(tube_data[from_channel].max(), 10))

                plt.plot(xs,
                         spline(xs),
                         'g-',
                         lw=3)

        plt.tight_layout(pad=0.8)
class GaussianMixture1DView(By1DView, AnnotatingView, HistogramView):
    """
    A default view for :class:`GaussianMixtureOp` that plots the histogram
    of a single channel, then the estimated Gaussian distributions on top
    of it.

    Attributes
    ----------
    """

    id = Constant('edu.mit.synbio.cytoflow.view.gaussianmixture1dview')
    friendly_id = Constant("1D Gaussian Mixture Diagnostic Plot")

    channel = Str
    scale = util.ScaleEnum

    def plot(self, experiment, **kwargs):
        """
        Plot the plots.

        Parameters
        ----------
        """
        if experiment is None:
            raise util.CytoflowViewError('experiment', "No experiment specified")

        if self.op.num_components == 1:
            annotation_facet = self.op.name + "_1"
        else:
            annotation_facet = self.op.name

        view, trait_name = self._strip_trait(annotation_facet)

        if self.channel in self.op._scale:
            scale = self.op._scale[self.channel]
        else:
            scale = util.scale_factory(self.scale, experiment, channel=self.channel)

        super(GaussianMixture1DView, view).plot(experiment,
                                                annotation_facet=annotation_facet,
                                                annotation_trait=trait_name,
                                                annotations=self.op._gmms,
                                                scale=scale,
                                                **kwargs)

    def _annotation_plot(self, axes, annotation, annotation_facet,
                         annotation_value, annotation_color, **kwargs):

        # annotation is an instance of mixture.GaussianMixture
        gmm = annotation

        if annotation_value is None:
            for i in range(len(gmm.means_)):
                self._annotation_plot(axes, annotation, annotation_facet,
                                      i, annotation_color, **kwargs)
            return
        elif type(annotation_value) is str:
            try:
                idx_re = re.compile(annotation_facet + r'_(\d+)')
                idx = idx_re.match(annotation_value).group(1)
                idx = int(idx) - 1
            except (AttributeError, ValueError):
                return
        elif isinstance(annotation_value, np.bool_):
            if annotation_value:
                idx = 0
            else:
                return
        else:
            idx = annotation_value

        kwargs.setdefault('orientation', 'vertical')

        if kwargs['orientation'] == 'horizontal':
            scale = kwargs['yscale']
            patch_area = 0.0

            for k in range(0, len(axes.patches)):
                patch = axes.patches[k]
                if isinstance(patch, Polygon):
                    xy = patch.get_xy()
                    patch_area += poly_area([scale(p[1]) for p in xy],
                                            [p[0] for p in xy])
                elif isinstance(patch, Rectangle):
                    for xy in patch.get_path().to_polygons():
                        patch_area += poly_area([p[1] for p in xy],
                                                [p[0] for p in xy])

            plt_min, plt_max = plt.gca().get_ylim()
            y = scale.inverse(np.linspace(scale(scale.clip(plt_min)),
                                          scale(scale.clip(plt_max)),
                                          500))
            pdf_scale = patch_area * gmm.weights_[idx]
            mean = gmm.means_[idx][0]
            stdev = np.sqrt(gmm.covariances_[idx][0])
            x = scipy.stats.norm.pdf(scale(y), mean, stdev) * pdf_scale
            axes.plot(x, y, color=annotation_color)
        else:
            scale = kwargs['xscale']
            patch_area = 0.0

            for k in range(0, len(axes.patches)):
                patch = axes.patches[k]
                if isinstance(patch, Polygon):
                    xy = patch.get_xy()
                    patch_area += poly_area([scale(p[0]) for p in xy],
                                            [p[1] for p in xy])
                elif isinstance(patch, Rectangle):
                    for xy in patch.get_path().to_polygons():
                        patch_area += poly_area([p[0] for p in xy],
                                                [p[1] for p in xy])

            plt_min, plt_max = plt.gca().get_xlim()
            x = scale.inverse(np.linspace(scale(scale.clip(plt_min)),
                                          scale(scale.clip(plt_max)),
                                          500))
            pdf_scale = patch_area * gmm.weights_[idx]
            mean = gmm.means_[idx][0]
            stdev = np.sqrt(gmm.covariances_[idx][0])
            y = scipy.stats.norm.pdf(scale(x), mean, stdev) * pdf_scale
            axes.plot(x, y, color=annotation_color)
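# A sketch of the overlay scaling used in _annotation_plot() above: a
# unit-area pdf is multiplied by the total area of the histogram patches
# (times the component weight) so it sits on top of the histogram at the
# right height.  This sketch assumes a linear axis; the view code above
# additionally integrates the patch areas in transformed (scale) space.
def _sketch_pdf_overlay(data, weight=1.0):
    counts, edges, _ = plt.hist(data, bins=100, histtype='stepfilled', alpha=0.5)
    patch_area = np.sum(counts * np.diff(edges))

    xs = np.linspace(edges[0], edges[-1], 500)
    pdf = scipy.stats.norm.pdf(xs, np.mean(data), np.std(data))
    plt.plot(xs, pdf * patch_area * weight)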
class AutofluorescenceDiagnosticView(HasStrictTraits):
    """
    Plots a histogram of each channel, and its median in red.  Serves as a
    diagnostic for the autofluorescence correction.

    Attributes
    ----------
    op : Instance(AutofluorescenceOp)
        The :class:`AutofluorescenceOp` whose parameters we're viewing.  Set
        automatically if you created the instance using
        :meth:`AutofluorescenceOp.default_view`.

    subset : str (default = "")
        An expression that specifies the events that are plotted in the
        histograms
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.view.autofluorescencediagnosticview')
    friendly_id = Constant("Autofluorescence Diagnostic")

    op = Instance(AutofluorescenceOp)
    subset = Str

    def plot(self, experiment, **kwargs):
        """
        Plot a faceted histogram view of a channel
        """

        if experiment is None:
            raise util.CytoflowViewError('experiment', "No experiment specified")

        if not self.op.channels:
            raise util.CytoflowViewError('op', "No channels specified")

        if not self.op._af_median:
            raise util.CytoflowViewError('op',
                                         "Autofluorescence values aren't set. "
                                         "Did you forget to run estimate()?")

        if not set(self.op._af_median.keys()) <= set(experiment.channels) or \
           not set(self.op._af_stdev.keys()) <= set(experiment.channels):
            raise util.CytoflowViewError('op',
                                         "Autofluorescence estimates aren't set, "
                                         "or are different than those in the "
                                         "experiment parameter. Did you forget "
                                         "to run estimate()?")

        if not set(self.op._af_median.keys()) == set(self.op._af_stdev.keys()):
            raise util.CytoflowOpError('op',
                                       "Median and stdev keys are different! "
                                       "What the hell happened?!")

        if not set(self.op.channels) == set(self.op._af_median.keys()):
            raise util.CytoflowOpError('op',
                                       "Estimated channels differ from the "
                                       "channels parameter. Did you forget to "
                                       "(re)run estimate()?")

        import matplotlib.pyplot as plt
        import seaborn as sns  # @UnusedImport

        kwargs.setdefault('histtype', 'stepfilled')
        kwargs.setdefault('alpha', 0.5)
        kwargs.setdefault('antialiased', True)

        # make a little Experiment
        try:
            check_tube(self.op.blank_file, experiment)
            blank_exp = ImportOp(tubes=[Tube(file=self.op.blank_file)],
                                 channels={experiment.metadata[c]["fcs_name"]: c
                                           for c in experiment.channels},
                                 name_metadata=experiment.metadata['name_metadata']).apply()
        except util.CytoflowOpError as e:
            raise util.CytoflowViewError('op', e.__str__()) from e

        # apply previous operations
        for op in experiment.history:
            blank_exp = op.apply(blank_exp)

        # subset it
        if self.subset:
            try:
                blank_exp = blank_exp.query(self.subset)
            except Exception as exc:
                raise util.CytoflowViewError('subset',
                                             "Subset string '{0}' isn't valid"
                                             .format(self.subset)) from exc

            if len(blank_exp.data) == 0:
                raise util.CytoflowViewError('subset',
                                             "Subset string '{0}' returned no events"
                                             .format(self.subset))

        plt.figure()

        for idx, channel in enumerate(self.op.channels):
            d = blank_exp.data[channel]
            plt.subplot(len(self.op.channels), 1, idx + 1)
            plt.title(channel)
            plt.hist(d, bins=200, **kwargs)
            plt.axvline(self.op._af_median[channel], color='r')

        plt.tight_layout(pad=0.8)
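
# A minimal usage sketch for the diagnostic above -- illustrative only.
# 'blank.fcs' is a hypothetical unstained-control file, and 'experiment' is
# assumed to be an Experiment built upstream; neither exists in this module.
def _demo_autofluorescence_diagnostic(experiment):  # pragma: no cover
    op = AutofluorescenceOp(channels=['FITC-A', 'Pacific Blue-A'],  # hypothetical
                            blank_file='blank.fcs')                 # hypothetical
    op.estimate(experiment)

    # one histogram per channel, with the estimated median in red
    op.default_view().plot(experiment)
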
class GaussianMixture2DView(By2DView, AnnotatingView, ScatterplotView):
    """
    A default view for :class:`GaussianMixtureOp` that plots the scatter plot
    of two channels, then the estimated 2D Gaussian distributions on top of it.

    Attributes
    ----------
    """

    id = Constant('edu.mit.synbio.cytoflow.view.gaussianmixture2dview')
    friendly_id = Constant("2D Gaussian Mixture Diagnostic Plot")

    xchannel = Str
    xscale = util.ScaleEnum
    ychannel = Str
    yscale = util.ScaleEnum

    def plot(self, experiment, **kwargs):
        """
        Plot the scatterplot and the annotated mixture components.

        Parameters
        ----------
        """

        if experiment is None:
            raise util.CytoflowViewError('experiment', "No experiment specified")

        if self.op.num_components == 1:
            annotation_facet = self.op.name + "_1"
        else:
            annotation_facet = self.op.name

        view, trait_name = self._strip_trait(annotation_facet)

        if self.xchannel in self.op._scale:
            xscale = self.op._scale[self.xchannel]
        else:
            xscale = util.scale_factory(self.xscale, experiment, channel=self.xchannel)

        if self.ychannel in self.op._scale:
            yscale = self.op._scale[self.ychannel]
        else:
            yscale = util.scale_factory(self.yscale, experiment, channel=self.ychannel)

        super(GaussianMixture2DView, view).plot(experiment,
                                                annotation_facet=annotation_facet,
                                                annotation_trait=trait_name,
                                                annotations=self.op._gmms,
                                                xscale=xscale,
                                                yscale=yscale,
                                                **kwargs)

    def _annotation_plot(self, axes, annotation, annotation_facet,
                         annotation_value, annotation_color, **kwargs):

        # annotation is an instance of mixture.GaussianMixture
        gmm = annotation

        if annotation_value is None:
            for i in range(len(gmm.means_)):
                self._annotation_plot(axes, annotation, annotation_facet,
                                      i, annotation_color, **kwargs)
            return
        elif isinstance(annotation_value, str):
            try:
                idx_re = re.compile(annotation_facet + r'_(\d+)')
                idx = idx_re.match(annotation_value).group(1)
                idx = int(idx) - 1
            except (AttributeError, ValueError):
                return
        elif isinstance(annotation_value, np.bool_):
            if annotation_value:
                idx = 0
            else:
                return
        else:
            idx = annotation_value

        xscale = kwargs['xscale']
        yscale = kwargs['yscale']

        mean = gmm.means_[idx]
        covar = gmm.covariances_[idx]

        v, w = scipy.linalg.eigh(covar)
        u = w[0] / scipy.linalg.norm(w[0])

        # rotation angle (in degrees)
        t = np.arctan(u[1] / u[0])
        t = 180 * t / np.pi

        # in order to scale the ellipses correctly, we have to make them
        # ourselves out of an affine-scaled unit circle.  The interface
        # is the same as matplotlib.patches.Ellipse
        _plot_ellipse(axes, xscale, yscale, mean,
                      np.sqrt(v[0]), np.sqrt(v[1]), 180 + t,
                      color=annotation_color, fill=False, linewidth=2)
        _plot_ellipse(axes, xscale, yscale, mean,
                      np.sqrt(v[0]) * 2, np.sqrt(v[1]) * 2, 180 + t,
                      color=annotation_color, fill=False, linewidth=2, alpha=0.66)
        _plot_ellipse(axes, xscale, yscale, mean,
                      np.sqrt(v[0]) * 3, np.sqrt(v[1]) * 3, 180 + t,
                      color=annotation_color, fill=False, linewidth=2, alpha=0.33)
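
# The ellipse parameters above come straight from the eigendecomposition of
# the 2x2 covariance: the eigenvalues are the squared semi-axis lengths, and
# the leading eigenvector gives the rotation.  A self-contained sketch of that
# math with a made-up covariance (same w[0] convention as the code above):
def _demo_covariance_to_ellipse():  # pragma: no cover
    covar = np.array([[2.0, 0.6],
                      [0.6, 1.0]])                # synthetic 2x2 covariance

    v, w = scipy.linalg.eigh(covar)               # v: eigenvalues, w: eigenvectors
    width, height = np.sqrt(v[0]), np.sqrt(v[1])  # 1-sigma semi-axes

    u = w[0] / scipy.linalg.norm(w[0])
    angle = 180 * np.arctan(u[1] / u[0]) / np.pi  # rotation angle, in degrees

    return width, height, angle
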
class BleedthroughLinearOp(HasStrictTraits):
    """
    Apply matrix-based bleedthrough correction to a set of fluorescence channels.

    This is a traditional matrix-based compensation for bleedthrough.  For
    each pair of channels, the user specifies the proportion of the first
    channel that bleeds through into the second; then, the module performs
    a matrix multiplication to compensate the raw data.

    The module can also estimate the bleedthrough matrix using one
    single-color control per channel.

    This works best on data that has had autofluorescence removed first;
    if that is the case, then the autofluorescence will be subtracted from
    the single-color controls too.

    To use, set up the `controls` dict with the single color controls;
    call `estimate()` to parameterize the operation; check that the
    bleedthrough plots look good with `default_view().plot()`; and then
    `apply()` to an Experiment.

    Attributes
    ----------
    name : Str
        The operation name (for UI representation; optional for interactive use)

    controls : Dict(Str, File)
        The channel names to correct, and corresponding single-color control
        FCS files to estimate the correction with.  Must be set to use
        `estimate()`.

    spillover : Dict(Tuple(Str, Str), Float)
        The spillover "matrix" to use to correct the data.  The keys are pairs
        of channels, and the values are proportions of spectral overlap.  If
        `("channel1", "channel2")` is present as a key,
        `("channel2", "channel1")` must also be present.  The module does not
        assume that the matrix is symmetric.

    Examples
    --------
    >>> bl_op = flow.BleedthroughLinearOp()
    >>> bl_op.controls = {'Pacific Blue-A' : 'merged/ebfp.fcs',
    ...                   'FITC-A' : 'merged/eyfp.fcs',
    ...                   'PE-Tx-Red-YG-A' : 'merged/mkate.fcs'}
    >>>
    >>> bl_op.estimate(ex2)
    >>> bl_op.default_view().plot(ex2)
    >>>
    >>> ex3 = bl_op.apply(ex2)
    """

    # traits
    id = Constant('edu.mit.synbio.cytoflow.operations.bleedthrough_linear')
    friendly_id = Constant("Linear Bleedthrough Correction")

    name = CStr()

    controls = Dict(Str, File)
    spillover = Dict(Tuple(Str, Str), Float)

    def estimate(self, experiment, subset=None):
        """
        Estimate the bleedthrough from single-channel controls in `controls`
        """
        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        channels = list(self.controls.keys())

        if len(channels) < 2:
            raise util.CytoflowOpError("Need at least two channels to correct "
                                       "bleedthrough.")

        # make sure the control files exist
        for channel in channels:
            if not os.path.isfile(self.controls[channel]):
                raise util.CytoflowOpError("Can't find file {0} for channel {1}."
                                           .format(self.controls[channel], channel))

        for channel in channels:
            # make a little Experiment
            check_tube(self.controls[channel], experiment)
            tube_exp = ImportOp(tubes=[Tube(file=self.controls[channel])]).apply()

            # apply previous operations
            for op in experiment.history:
                tube_exp = op.apply(tube_exp)

            # subset it
            if subset:
                try:
                    tube_exp = tube_exp.query(subset)
                except Exception as e:
                    raise util.CytoflowOpError("Subset string '{0}' isn't valid"
                                               .format(subset)) from e

                if len(tube_exp.data) == 0:
                    raise util.CytoflowOpError("Subset string '{0}' returned no events"
                                               .format(subset))

            tube_data = tube_exp.data

            # polyfit requires sorted data
            tube_data.sort_values(by=channel, inplace=True)

            from_channel = channel
            for to_channel in channels:
                if from_channel == to_channel:
                    continue

                # sometimes some of the data is off the edge of the plot, and
                # this screws up a linear regression.  trim it away -- on a
                # copy, so each (from, to) pair is fit on the full control data.
                pair_data = tube_data.copy()

                from_min = np.min(pair_data[from_channel]) * 1.05
                from_max = np.max(pair_data[from_channel]) * 0.95
                pair_data = pair_data[pair_data[from_channel] > from_min]
                pair_data = pair_data[pair_data[from_channel] < from_max]

                to_min = np.min(pair_data[to_channel]) * 1.05
                to_max = np.max(pair_data[to_channel]) * 0.95
                pair_data = pair_data[pair_data[to_channel] > to_min]
                pair_data = pair_data[pair_data[to_channel] < to_max]

                pair_data.reset_index(drop=True, inplace=True)

                lr = np.polyfit(pair_data[from_channel],
                                pair_data[to_channel],
                                deg=1)

                self.spillover[(from_channel, to_channel)] = lr[0]

    def apply(self, experiment):
        """Applies the bleedthrough correction to an experiment.

        Parameters
        ----------
        experiment : Experiment
            the experiment to which this op is applied

        Returns
        -------
        Experiment
            a new experiment with the bleedthrough subtracted out.
        """
        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        if not self.spillover:
            raise util.CytoflowOpError("Spillover matrix isn't set. "
                                       "Did you forget to run estimate()?")

        for (from_channel, to_channel) in self.spillover:
            if from_channel not in experiment.data:
                raise util.CytoflowOpError("Can't find channel {0} in experiment"
                                           .format(from_channel))
            if to_channel not in experiment.data:
                raise util.CytoflowOpError("Can't find channel {0} in experiment"
                                           .format(to_channel))

            if (to_channel, from_channel) not in self.spillover:
                raise util.CytoflowOpError("Must have both (from, to) and "
                                           "(to, from) keys in self.spillover")

        new_experiment = experiment.clone()

        # the completely arbitrary ordering of the channels
        channels = list(set([x for (x, _) in self.spillover.keys()]))

        # build the spillover matrix from the spillover dictionary
        a = [[self.spillover[(y, x)] if x != y else 1.0 for x in channels]
             for y in channels]

        # invert it.  use the pseudoinverse in case a is singular
        a_inv = np.linalg.pinv(a)

        new_experiment.data[channels] = np.dot(experiment.data[channels], a_inv)

        for channel in channels:
            # add the spillover values to the channel's metadata
            new_experiment.metadata[channel]['linear_bleedthrough'] = \
                {x: self.spillover[(x, channel)]
                 for x in channels if x != channel}

        new_experiment.history.append(self.clone_traits())
        return new_experiment

    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot to make sure spillover estimation is working.

        Returns
        -------
        IView
            An IView; call plot() to see the diagnostic plots
        """

        # the completely arbitrary ordering of the channels
        channels = list(set([x for (x, _) in self.spillover.keys()]))

        if set(self.controls.keys()) != set(channels):
            raise util.CytoflowOpError("Must have both the controls and "
                                       "bleedthrough to plot")

        return BleedthroughLinearDiagnostic(op=self, **kwargs)
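
# apply() above reduces to one linear-algebra step: if A[i][j] is the fraction
# of channel i's true signal observed in channel j (diagonal = 1), then
# observed = true @ A, so true = observed @ pinv(A).  A sketch with made-up
# numbers (illustrative, not library API):
def _demo_spillover_correction():  # pragma: no cover
    true = np.array([[1000.0, 0.0],
                     [0.0, 2000.0]])           # two "pure" single-color events

    a = np.array([[1.0, 0.10],                 # 10% of channel 0 bleeds into channel 1
                  [0.05, 1.0]])                # 5% of channel 1 bleeds into channel 0

    observed = true @ a                        # what the instrument measures
    corrected = observed @ np.linalg.pinv(a)   # pinv guards against a singular A

    assert np.allclose(corrected, true)
    return corrected
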
class GaussianMixture1DWorkflowOp(WorkflowOperation, GaussianMixtureOp):
    # override id so we can differentiate the 1D and 2D ops
    id = Constant('edu.mit.synbio.cytoflowgui.operations.gaussian_1d')

    # add 'estimate' and 'apply' metadata
    name = Str(apply=True)
    channel = Str(estimate=True)
    channel_scale = util.ScaleEnum(estimate=True)
    num_components = util.PositiveCInt(1, allow_zero=False, estimate=True)
    sigma = util.PositiveCFloat(None, allow_zero=True, allow_none=True,
                                estimate=True)
    by = List(Str, estimate=True)

    # add the 'estimate_result' metadata
    _gmms = Dict(Any, Instance(mixture.GaussianMixture),
                 transient=True, estimate_result=True)

    # override the base class's "subset" with one that is dynamically
    # generated / updated from subset_list
    subset = Property(Str, observe="subset_list.items.str")
    subset_list = List(ISubset, estimate=True)

    # bits to support the subset editor
    @observe('subset_list:items.str')
    def _on_subset_changed(self, _):
        self.changed = 'subset_list'

    # MAGIC - returns the value of the "subset" Property, above
    def _get_subset(self):
        return " and ".join([subset.str for subset in self.subset_list
                             if subset.str])

    def estimate(self, experiment):
        self.channels = [self.channel]
        self.scale = {self.channel: self.channel_scale}
        super().estimate(experiment, subset=self.subset)

    def apply(self, experiment):
        if not self._gmms:
            raise util.CytoflowOpError(None, 'Click "Estimate"!')
        return GaussianMixtureOp.apply(self, experiment)

    def default_view(self, **kwargs):
        return GaussianMixture1DWorkflowView(op=self, **kwargs)

    def clear_estimate(self):
        self._gmms = {}
        self._scale = {}

    def get_notebook_code(self, idx):
        op = GaussianMixtureOp()
        op.copy_traits(self, op.copyable_trait_names())
        op.channels = [self.channel]
        op.scale = {self.channel: self.channel_scale}

        return dedent("""
        op_{idx} = {repr}
        op_{idx}.estimate(ex_{prev_idx}{subset})
        ex_{idx} = op_{idx}.apply(ex_{prev_idx})
        """.format(repr=repr(op),
                   idx=idx,
                   prev_idx=idx - 1,
                   subset=", subset = " + repr(self.subset) if self.subset else ""))
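
# get_notebook_code() above just renders repr(op) into a three-line template.
# A sketch of calling it (illustrative; the exact repr depends on the op's
# traits, and 'FITC-A' is a hypothetical channel):
def _demo_notebook_code():  # pragma: no cover
    op = GaussianMixture1DWorkflowOp()
    op.channel = 'FITC-A'
    op.num_components = 2

    # prints something like:
    #   op_2 = GaussianMixtureOp(...)
    #   op_2.estimate(ex_1)
    #   ex_2 = op_2.apply(ex_1)
    print(op.get_notebook_code(idx=2))
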