Ejemplo n.º 1
0
    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the Gaussian mixture model.
         
        Returns
        -------
            IView : an IView, call plot() to see the diagnostic plot.
        """
        channels = kwargs.pop('channels', self.channels)
        scale = kwargs.pop('scale', self.scale)
        density = kwargs.pop('density', False)

        for c in channels:
            if c not in self.channels:
                raise util.CytoflowViewError(
                    "Channel {} isn't in the operation's channels".format(c))

        for s in scale:
            if s not in self.channels:
                raise util.CytoflowViewError(
                    "Channel {} isn't in the operation's channels".format(s))

        for c in channels:
            if c not in scale:
                scale[c] = util.get_default_scale()

        if len(channels) == 0:
            raise util.CytoflowViewError(
                "Must specify at least one channel for a default view")
        elif len(channels) == 1:
            return FlowPeaks1DView(op=self,
                                   channel=channels[0],
                                   scale=scale[channels[0]],
                                   **kwargs)
        elif len(channels) == 2:
            if density:
                return FlowPeaks2DDensityView(op=self,
                                              xchannel=channels[0],
                                              ychannel=channels[1],
                                              xscale=scale[channels[0]],
                                              yscale=scale[channels[1]],
                                              **kwargs)
            else:
                return FlowPeaks2DView(op=self,
                                       xchannel=channels[0],
                                       ychannel=channels[1],
                                       xscale=scale[channels[0]],
                                       yscale=scale[channels[1]],
                                       **kwargs)
        else:
            raise util.CytoflowViewError(
                "Can't specify more than two channels for a default view")
Ejemplo n.º 2
0
    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the k-means clustering.
         
        Returns
        -------
            IView : an IView, call :meth:`KMeans1DView.plot` to see the diagnostic plot.
        """
        channels = kwargs.pop('channels', self.channels)
        scale = kwargs.pop('scale', self.scale)

        for c in channels:
            if c not in self.channels:
                raise util.CytoflowViewError(
                    'channels',
                    "Channel {} isn't in the operation's channels".format(c))

        for s in scale:
            if s not in self.channels:
                raise util.CytoflowViewError(
                    'scale',
                    "Channel {} isn't in the operation's channels".format(s))

        for c in channels:
            if c not in scale:
                scale[c] = util.get_default_scale()

        if len(channels) == 0:
            raise util.CytoflowViewError(
                'channels',
                "Must specify at least one channel for a default view")
        elif len(channels) == 1:
            v = KMeans1DView(op=self)
            v.trait_set(channel=channels[0],
                        scale=scale[channels[0]],
                        **kwargs)
            return v

        elif len(channels) == 2:
            v = KMeans2DView(op=self)
            v.trait_set(xchannel=channels[0],
                        ychannel=channels[1],
                        xscale=scale[channels[0]],
                        yscale=scale[channels[1]],
                        **kwargs)
            return v

        else:
            raise util.CytoflowViewError(
                'channels',
                "Can't specify more than two channels for a default view")
Ejemplo n.º 3
0
    def plot(self, experiment, **kwargs):
        """
        Parameters
        ----------
        lim : Dict(Str : (float, float))
            Set the range of each channel's axis.  If unspecified, assume
            that the limits are the minimum and maximum of the clipped data
        """

        if experiment is None:
            raise util.CytoflowViewError('experiment',
                                         "No experiment specified")

        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")

        if len(self.channels) != len(set(self.channels)):
            raise util.CytoflowOpError('channels',
                                       "Must not duplicate channels")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError(
                    'channels',
                    "Channel {0} not found in the experiment".format(c))

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError(
                    'scale', "Scale set for channel {0}, but it isn't "
                    "in 'channels'".format(c))

        # get the scale
        scale = {}
        for c in self.channels:
            if c in self.scale:
                scale[c] = util.scale_factory(self.scale[c],
                                              experiment,
                                              channel=c)
            else:
                scale[c] = util.scale_factory(util.get_default_scale(),
                                              experiment,
                                              channel=c)

        lim = kwargs.pop("lim", {})
        for c in self.channels:
            if c not in lim:
                lim[c] = None

        super().plot(experiment, lim=lim, scale=scale, **kwargs)
Ejemplo n.º 4
0
    def estimate(self, experiment, subset=None):
        """
        Estimate the Gaussian mixture model parameters
        """

        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        if len(self.channels) == 0:
            raise util.CytoflowOpError("Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError(
                    "Channel {0} not found in the experiment".format(c))

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError(
                    "Scale set for channel {0}, but it isn't "
                    "in the experiment".format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment".format(b))
            if len(experiment.data[b].unique()) > 100:  #WARNING - magic number
                raise util.CytoflowOpError(
                    "More than 100 unique values found for"
                    " aggregation metadata {0}.  Did you"
                    " accidentally specify a data channel?".format(b))

        if subset:
            try:
                experiment = experiment.query(subset)
            except:
                raise util.CytoflowViewError(
                    "Subset string '{0}' isn't valid".format(subset))

            if len(experiment) == 0:
                raise util.CytoflowViewError(
                    "Subset string '{0}' returned no events".format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        for c in self.channels:
            if c in self.scale:
                self._scale[c] = util.scale_factory(self.scale[c],
                                                    experiment,
                                                    channel=c)
#                 if self.scale[c] == 'log':
#                     self._scale[c].mode = 'mask'
            else:
                self._scale[c] = util.scale_factory(util.get_default_scale(),
                                                    experiment,
                                                    channel=c)

        for data_group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    "Group {} had no data".format(data_group))
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # drop data that isn't in the scale range
            for c in self.channels:
                x = x[~(np.isnan(x[c]))]
            x = x.values

            #### choose the number of clusters and fit the kmeans
            num_clusters = [
                util.num_hist_bins(x[:, c]) for c in range(len(self.channels))
            ]
            num_clusters = np.ceil(np.median(num_clusters))
            num_clusters = int(num_clusters)

            self._kmeans[data_group] = kmeans = \
                sklearn.cluster.MiniBatchKMeans(n_clusters = num_clusters)

            kmeans.fit(x)
            x_labels = kmeans.predict(x)
            d = len(self.channels)

            #### use the kmeans centroids to parameterize a finite gaussian
            #### mixture model which estimates the density function

            d = len(self.channels)
            s0 = np.zeros([d, d])
            for j in range(d):
                r = x[d].max() - x[d].min()
                s0[j, j] = (r / (num_clusters**(1. / d)))**0.5

            means = []
            weights = []
            normals = []
            beta_max = []

            for k in range(num_clusters):
                xk = x[x_labels == k]
                num_k = np.sum(x_labels == k)
                weight_k = num_k / len(x_labels)
                mu = xk.mean(axis=0)
                means.append(mu)
                s = np.cov(xk, rowvar=False)

                el = num_k / (num_clusters + num_k)
                s_smooth = el * self.h * s + (1.0 - el) * self.h0 * s0

                n = scipy.stats.multivariate_normal(mean=mu, cov=s_smooth)
                weights.append(weight_k)
                normals.append(lambda x, n=n: n.pdf(x))

                # get appropriate step size for peak finding
                min_b = np.inf
                for b in np.diagonal(s_smooth):
                    if np.sqrt(b) < min_b:
                        min_b = np.sqrt(b)
                beta_max.append(b)

            self._normals[data_group] = normals
            self._density[
                data_group] = density = lambda x, weights=weights, normals=normals: np.sum(
                    [w * n(x) for w, n in zip(weights, normals)], axis=0)

            ### use optimization on the finite gmm to find the local peak for
            ### each kmeans cluster
            peaks = []
            peak_clusters = []  # peak idx --> list of clusters

            min_mu = [np.inf] * len(self.channels)
            max_mu = [-1.0 * np.inf] * len(self.channels)

            for k in range(num_clusters):
                mu = means[k]
                for ci in range(len(self.channels)):
                    if mu[ci] < min_mu[ci]:
                        min_mu[ci] = mu[ci]
                    if mu[ci] > max_mu[ci]:
                        max_mu[ci] = mu[ci]

            constraints = []
            for ci, c in enumerate(self.channels):
                constraints.append({
                    'type':
                    'ineq',
                    'fun':
                    lambda x, min_mu=min_mu[ci]: x - min_mu
                })
                constraints.append({
                    'type':
                    'ineq',
                    'fun':
                    lambda x, max_mu=max_mu[ci]: max_mu - x
                })

            for k in range(num_clusters):
                mu = means[k]
                f = lambda x: -1.0 * density(x)

                res = scipy.optimize.minimize(f,
                                              mu,
                                              method='COBYLA',
                                              constraints=constraints,
                                              options={
                                                  'rhobeg': beta_max[k],
                                                  'maxiter': 5000
                                              })
                if not res.success:
                    raise util.CytoflowOpError(
                        "Peak finding failed for cluster {}: {}".format(
                            k, res.message))


#                 ### The peak-searching algorithm from the paper.  works fine,
#                 ### but slow!  we get similar results with the COBYLA
#                 ### optimization method from scipy, using an appropriate rho
#                 x0 = x = means[k]
#                 k0 = k
#                 b = beta_max[k] / 10.0
#                 Nsuc = 0
#                 n = 0
#
#                 while(n < 1000):
# #                     df = scipy.misc.derivative(density, x, 1e-6)
#                     df = statsmodels.tools.numdiff.approx_fprime(x, density)
#                     if np.linalg.norm(df) < 1e-3:
#                         break
#
#                     y = x + b * df / np.linalg.norm(df)
#                     if density(y) <= density(x):
#                         Nsuc = 0
#                         b = b / 2.0
#                         continue
#
#                     Nsuc += 1
#                     if Nsuc >= 2:
#                         b = min(2*b, beta_max[k])
#
#                     ky = kmeans.predict(y[np.newaxis, :])[0]
#                     if ky == k:
#                         x = y
#                     else:
#                         k = ky
#                         b = beta_max[k] / 10.0
#                         mu = means[k]
#                         if density(mu) > density(y):
#                             x = mu
#                         else:
#                             x = y
#
#                     n += 1
#
#
#
#                 print("{} --> {}, {}".format(x0, x, n))

                merged = False
                for pi, p in enumerate(peaks):
                    if np.linalg.norm(p - res.x) < (1e-2):
                        peak_clusters[pi].append(k)
                        merged = True
                        break

                if not merged:
                    peak_clusters.append([k])
                    peaks.append(res.x)

            self._peaks[data_group] = peaks

            ### merge peaks that are sufficiently close

            groups = [[x] for x in range(len(peaks))]
            peak_groups = [x for x in range(len(peaks))
                           ]  # peak idx --> group idx

            def max_tol(x, y):
                f = lambda a: density(a[np.newaxis, :])
                #                 lx = kmeans.predict(x[np.newaxis, :])[0]
                #                 ly = kmeans.predict(y[np.newaxis, :])[0]
                n = len(x)
                n_scale = 1

                #                 n_scale = np.sqrt(((nx + ny) / 2.0) / (n / num_clusters))

                def tol(t):
                    zt = x + t * (y - x)
                    fhat_zt = f(x) + t * (f(y) - f(x))
                    return -1.0 * abs((f(zt) - fhat_zt) / fhat_zt) * n_scale

                res = scipy.optimize.minimize_scalar(tol,
                                                     bounds=[0, 1],
                                                     method='Bounded')

                if res.status != 0:
                    raise util.CytoflowOpError(
                        "tol optimization failed for {}, {}".format(x, y))
                return -1.0 * res.fun

            def nearest_neighbor_dist(k):
                min_dist = np.inf

                for i in range(num_clusters):
                    if i == k:
                        continue
                    dist = np.linalg.norm(means[k] - means[i])
                    if dist < min_dist:
                        min_dist = dist

                return min_dist

            sk = [nearest_neighbor_dist(x) for x in range(num_clusters)]

            def s(x):
                k = kmeans.predict(x[np.newaxis, :])[0]
                return sk[k]

            def can_merge(g, h):
                for pg in g:
                    for ph in h:
                        vg = peaks[pg]
                        vh = peaks[ph]
                        dist_gh = np.linalg.norm(vg - vh)

                        if max_tol(vg, vh) < self.tol and dist_gh / (
                                s(vg) + s(vh)) <= self.merge_dist:
                            return True

                return False

            while True:
                if len(groups) == 1:
                    break

                # find closest mergable groups
                min_dist = np.inf
                for gi in range(len(groups)):
                    g = groups[gi]

                    for hi in range(gi + 1, len(groups)):
                        h = groups[hi]

                        if can_merge(g, h):
                            dist_gh = np.inf
                            for pg in g:
                                vg = peaks[pg]
                                for ph in h:
                                    vh = peaks[ph]
                                    #                                     print("vg {} vh {}".format(vg, vh))
                                    dist_gh = min(dist_gh,
                                                  np.linalg.norm(vg - vh))

                            if dist_gh < min_dist:
                                min_gi = gi
                                min_hi = hi
                                min_dist = dist_gh

                if min_dist == np.inf:
                    break

                # merge the groups
                groups[min_gi].extend(groups[min_hi])
                for g in groups[min_hi]:
                    peak_groups[g] = min_gi
                del groups[min_hi]

        cluster_group = [0] * num_clusters
        cluster_peaks = [0] * num_clusters

        for gi, g in enumerate(groups):
            for p in g:
                for cluster in peak_clusters[p]:
                    cluster_group[cluster] = gi
                    cluster_peaks[cluster] = p

        self._peaks[data_group] = peaks
        self._cluster_peak[data_group] = cluster_peaks
        self._cluster_group[data_group] = cluster_group
Ejemplo n.º 5
0
    def estimate(self, experiment, subset = None):
        """
        Estimate the Gaussian mixture model parameters
        
        Parameters
        ----------
        experiment : Experiment
            The data to use to estimate the mixture parameters
            
        subset : str (default = None)
            If set, a Python expression to determine the subset of the data
            to use to in the estimation.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")
        
        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")

        if len(self.channels) != len(set(self.channels)):
            raise util.CytoflowOpError('channels', 
                                       "Must not duplicate channels")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError('channels',
                                           "Channel {0} not found in the experiment"
                                      .format(c))
                
        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError('channels',
                                           "Scale set for channel {0}, but it isn't "
                                           "in the experiment"
                                           .format(c))
       
        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))
                
        if subset:
            try:
                experiment = experiment.query(subset)
            except:
                raise util.CytoflowViewError('subset',
                                             "Subset string '{0}' isn't valid"
                                             .format(subset))
                
            if len(experiment) == 0:
                raise util.CytoflowViewError('subset',
                                             "Subset string '{0}' returned no events"
                                             .format(subset))
                
        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)
            
        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        for c in self.channels:
            if c in self.scale:
                self._scale[c] = util.scale_factory(self.scale[c], experiment, channel = c)
            else:
                self._scale[c] = util.scale_factory(util.get_default_scale(), experiment, channel = c)
        
        gmms = {}
            
        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(None,
                                           "Group {} had no data"
                                           .format(group))
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])
            
            # drop data that isn't in the scale range
            for c in self.channels:
                x = x[~(np.isnan(x[c]))]
            x = x.values
            
            gmm = sklearn.mixture.GaussianMixture(n_components = self.num_components,
                                                  covariance_type = "full",
                                                  random_state = 1)
            gmm.fit(x)
            
            if not gmm.converged_:
                raise util.CytoflowOpError(None,
                                           "Estimator didn't converge"
                                           " for group {0}"
                                           .format(group))
                
            # in the 1D version, we sorted the components by the means -- so
            # the first component has the lowest mean, the second component
            # has the next-lowest mean, etc.
            
            # that doesn't work in the general case.  instead, we assume that 
            # the clusters are likely (?) to be arranged along *one* of the 
            # axes, so we take the |norm| of the mean of each cluster and 
            # sort that way.
            
            norms = np.sum(gmm.means_ ** 2, axis = 1) ** 0.5
            sort_idx = np.argsort(norms)
            gmm.means_ = gmm.means_[sort_idx]
            gmm.weights_ = gmm.weights_[sort_idx]
            gmm.covariances_ = gmm.covariances_[sort_idx]
            gmm.precisions_ = gmm.precisions_[sort_idx]
            gmm.precisions_cholesky_ = gmm.precisions_cholesky_[sort_idx]

            
            gmms[group] = gmm
            
        self._gmms = gmms
Ejemplo n.º 6
0
    def default_view(self, **kwargs):
        """
        Returns a diagnostic plot of the Gaussian mixture model.
        
        Parameters
        ----------
        channels : List(Str)
            Which channels to plot?  Must be contain either one or two channels.
            
        scale : List({'linear', 'log', 'logicle'})
            How to scale the channels before plotting them
            
        density : bool
            Should we plot a scatterplot or the estimated density function?
         
        Returns
        -------
        IView
            an IView, call :meth:`plot` to see the diagnostic plot.
        """
        channels = kwargs.pop('channels', self.channels)
        scale = kwargs.pop('scale', self.scale)
        density = kwargs.pop('density', False)

        for c in channels:
            if c not in self.channels:
                raise util.CytoflowViewError(
                    'channels',
                    "Channel {} isn't in the operation's channels".format(c))

        for s in scale:
            if s not in self.channels:
                raise util.CytoflowViewError(
                    'channels',
                    "Channel {} isn't in the operation's channels".format(s))

        for c in channels:
            if c not in scale:
                scale[c] = util.get_default_scale()

        if len(channels) == 0:
            raise util.CytoflowViewError(
                'channels',
                "Must specify at least one channel for a default view")
        elif len(channels) == 1:
            v = FlowPeaks1DView(op=self)
            v.trait_set(channel=channels[0],
                        scale=scale[channels[0]],
                        **kwargs)
            return v

        elif len(channels) == 2:
            if density:
                v = FlowPeaks2DDensityView(op=self)
                v.trait_set(xchannel=channels[0],
                            ychannel=channels[1],
                            xscale=scale[channels[0]],
                            yscale=scale[channels[1]],
                            **kwargs)
                return v

            else:
                v = FlowPeaks2DView(op=self)
                v.trait_set(xchannel=channels[0],
                            ychannel=channels[1],
                            xscale=scale[channels[0]],
                            yscale=scale[channels[1]],
                            **kwargs)
                return v
        else:
            raise util.CytoflowViewError(
                None,
                "Can't specify more than two channels for a default view")
Ejemplo n.º 7
0
    def estimate(self, experiment, subset=None):
        """
        Estimate the decomposition
        
        Parameters
        ----------
        experiment : Experiment
            The :class:`.Experiment` to use to estimate the k-means clusters
            
        subset : str (default = None)
            A Python expression that specifies a subset of the data in 
            ``experiment`` to use to parameterize the operation.

        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError(
                    'channels',
                    "Channel {0} not found in the experiment".format(c))

        if self.num_components > len(self.channels):
            raise util.CytoflowOpError(
                'num_components', "Number of components must be less than "
                "or equal to number of channels.")

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError(
                    'scale', "Scale set for channel {0}, but it isn't "
                    "in `channels`".format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError(
                    'by', "Aggregation metadata {} not found, "
                    "must be one of {}".format(b, experiment.conditions))

        if subset:
            try:
                experiment = experiment.query(subset)
            except:
                raise util.CytoflowOpError(
                    'subset', "Subset string '{0}' isn't valid".format(subset))

            if len(experiment) == 0:
                raise util.CytoflowOpError(
                    'subset',
                    "Subset string '{0}' returned no events".format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        for c in self.channels:
            if c in self.scale:
                self._scale[c] = util.scale_factory(self.scale[c],
                                                    experiment,
                                                    channel=c)
            else:
                self._scale[c] = util.scale_factory(util.get_default_scale(),
                                                    experiment,
                                                    channel=c)

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    'by', "Group {} had no data".format(group))
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # drop data that isn't in the scale range
            for c in self.channels:
                x = x[~(np.isnan(x[c]))]
            x = x.values

            self._pca[group] = pca = \
                sklearn.decomposition.PCA(n_components = self.num_components,
                                          whiten = self.whiten,
                                          random_state = 0)

            pca.fit(x)
Ejemplo n.º 8
0
    def estimate(self, experiment, subset=None):
        """
        Estimate the Gaussian mixture model parameters
        """

        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        if self.num_clusters < 2:
            raise util.CytoflowOpError("num_clusters must be >= 2")

        if len(self.channels) == 0:
            raise util.CytoflowOpError("Must set at least one channel")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError(
                    "Channel {0} not found in the experiment".format(c))

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError(
                    "Scale set for channel {0}, but it isn't "
                    "in the experiment".format(c))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment".format(b))
            if len(experiment.data[b].unique()) > 100:  #WARNING - magic number
                raise util.CytoflowOpError(
                    "More than 100 unique values found for"
                    " aggregation metadata {0}.  Did you"
                    " accidentally specify a data channel?".format(b))

        if subset:
            try:
                experiment = experiment.query(subset)
            except:
                raise util.CytoflowViewError(
                    "Subset string '{0}' isn't valid".format(subset))

            if len(experiment) == 0:
                raise util.CytoflowViewError(
                    "Subset string '{0}' returned no events".format(subset))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # get the scale. estimate the scale params for the ENTIRE data set,
        # not subsets we get from groupby().  And we need to save it so that
        # the data is transformed the same way when we apply()
        for c in self.channels:
            if c in self.scale:
                self._scale[c] = util.scale_factory(self.scale[c],
                                                    experiment,
                                                    channel=c)
            else:
                self._scale[c] = util.scale_factory(util.get_default_scale(),
                                                    experiment,
                                                    channel=c)

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    "Group {} had no data".format(group))
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # drop data that isn't in the scale range
            for c in self.channels:
                x = x[~(np.isnan(x[c]))]
            x = x.values

            self._kmeans[group] = kmeans = \
                sklearn.cluster.MiniBatchKMeans(n_clusters = self.num_clusters)

            kmeans.fit(x)