Beispiel #1
0
    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in :meth:`estimate`.

        Parameters
        ----------
        experiment : Experiment
            The experiment whose events are assigned to mixture components.

        Returns
        -------
        Experiment
            A new :class:`.Experiment` with the new condition variables as
            described in the class documentation:  a categorical condition
            named :attr:`name` if :attr:`num_components` ``> 1``, one boolean
            condition ``{name}_{i}`` per component if :attr:`sigma` is set,
            and one ``{name}_{i}_posterior`` condition per component if
            :attr:`posteriors` is true.  Also adds the following new
            statistics:

            - **mean** : Float
                the mean of the fitted gaussian in each channel for each component.

            - **sigma** : Float
                the standard deviation of the fitted gaussian in each channel
                for each component, passed through the inverse of the
                channel's scale.

            - **interval** : (Float, Float)
                the locations the mean +/- one standard deviation in each channel
                for each component.

            - **correlation** : Float
                the correlation coefficient between each pair of channels for each
                component.

            - **proportion** : Float
                the proportion of events in each component of the mixture model.  only
                added if :attr:`num_components` ``> 1``.

        Raises
        ------
        util.CytoflowOpError
            If no experiment is given, no channels are set, the name is
            missing or is not a sanitized identifier, a column that would be
            created already exists, :meth:`estimate` has not produced a model
            or scale for the channels, or a channel / ``by`` condition is not
            present in the experiment.

        Warns
        -----
        util.CytoflowOpWarning
            If :attr:`num_components` is 1 and :attr:`posteriors` is set.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")

        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError('name',
                                       "You have to set the gate's name "
                                       "before applying it!")

        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError('name',
                                       "Name can only contain letters, numbers and underscores.")

        if self.num_components > 1 and self.name in experiment.data.columns:
            raise util.CytoflowOpError('name',
                                       "Experiment already has a column named {0}"
                                       .format(self.name))

        if self.sigma is not None:
            for i in range(1, self.num_components + 1):
                cname = "{}_{}".format(self.name, i)
                if cname in experiment.data.columns:
                    raise util.CytoflowOpError('name',
                                               "Experiment already has a column named {}"
                                               .format(cname))

        if self.posteriors:
            for i in range(1, self.num_components + 1):
                cname = "{}_{}_posterior".format(self.name, i)
                if cname in experiment.data.columns:
                    raise util.CytoflowOpError('name',
                                               "Experiment already has a column named {}"
                                               .format(cname))

        if not self._gmms:
            raise util.CytoflowOpError(None,
                                       "No components found.  Did you forget to "
                                       "call estimate()?")

        for c in self.channels:
            if c not in self._scale:
                raise util.CytoflowOpError(None,
                                           "Model scale not set.  Did you forget "
                                           "to call estimate()?")

        for c in self.channels:
            if c not in experiment.channels:
                raise util.CytoflowOpError('channels',
                                           "Channel {0} not found in the experiment"
                                           .format(c))

        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))

        if self.num_components == 1 and self.posteriors:
            warn("If num_components == 1, all posteriors will be 1",
                 util.CytoflowOpWarning)

        # per-event outputs, pre-filled with the value used for events that
        # never get a model (groups without a gmm, rows with missing data)
        if self.num_components > 1:
            event_assignments = pd.Series(["{}_None".format(self.name)] * len(experiment), dtype = "object")

        if self.sigma is not None:
            # NOTE(review): the gates are accumulated as doubles but are
            # registered below as "bool" conditions -- confirm that
            # add_condition converts them.
            event_gate = {i : pd.Series([False] * len(experiment), dtype = "double")
                           for i in range(self.num_components)}

            # come up with a threshold based on sigma.  you'll note we
            # don't sqrt the distance below: that's because for a
            # multivariate Gaussian, the square of the Mahalanobis distance
            # is chi-square distributed.  the threshold depends only on
            # sigma, so compute it once instead of per group and component.
            gate_p = (scipy.stats.norm.cdf(self.sigma) - 0.5) * 2
            thresh = scipy.stats.chi2.ppf(gate_p, 1)

        if self.posteriors:
            event_posteriors = {i : pd.Series([0.0] * len(experiment), dtype = "double")
                                for i in range(self.num_components)}

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda _: True)

        # make the statistics.  every statistic is indexed by the `by`
        # conditions and the (1-based) component number; build those shared
        # levels once.
        components = [x + 1 for x in range(self.num_components)]
        by_levels = [experiment[x].unique() for x in self.by]
        by_names = list(self.by)

        prop_idx = pd.MultiIndex.from_product(by_levels + [components],
                                              names = by_names + ["Component"])
        prop_stat = pd.Series(name = "{} : {}".format(self.name, "proportion"),
                              index = prop_idx,
                              dtype = np.dtype(object)).sort_index()

        mean_idx = pd.MultiIndex.from_product(by_levels + [components] + [self.channels],
                                              names = by_names + ["Component"] + ["Channel"])
        mean_stat = pd.Series(name = "{} : {}".format(self.name, "mean"),
                              index = mean_idx,
                              dtype = np.dtype(object)).sort_index()
        sigma_stat = pd.Series(name = "{} : {}".format(self.name, "sigma"),
                               index = mean_idx,
                               dtype = np.dtype(object)).sort_index()
        interval_stat = pd.Series(name = "{} : {}".format(self.name, "interval"),
                                  index = mean_idx,
                                  dtype = np.dtype(object)).sort_index()

        corr_idx = pd.MultiIndex.from_product(by_levels + [components] + [self.channels] + [self.channels],
                                              names = by_names + ["Component"] + ["Channel_1"] + ["Channel_2"])
        corr_stat = pd.Series(name = "{} : {}".format(self.name, "correlation"),
                              index = corr_idx,
                              dtype = np.dtype(object)).sort_index()

        for group, data_subset in groupby:
            if group not in self._gmms:
                # there weren't any events in this group, so we didn't get
                # a gmm.
                continue

            gmm = self._gmms[group]
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # which values are missing?  a row is unusable if any channel
            # is NaN after scaling.
            x = x.values
            x_na = np.isnan(x).any(axis = 1)

            # NOTE(review): group_idx holds index labels from groupby.groups
            # and is used with .iloc below, which assumes experiment.data has
            # a default 0..n-1 index -- TODO confirm.
            group_idx = groupby.groups[group]

            if self.num_components > 1:
                # -1 marks events that could not be classified
                predicted = np.full(len(x), -1, "int")
                predicted[~x_na] = gmm.predict(x[~x_na])

                predicted_str = pd.Series(["(none)"] * len(predicted))
                for c in range(0, self.num_components):
                    predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
                predicted_str[predicted == -1] = "{0}_None".format(self.name)
                predicted_str.index = group_idx

                event_assignments.iloc[group_idx] = predicted_str

            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma is not None:
                for c in range(self.num_components):
                    inv_cov = np.linalg.pinv(gmm.covariances_[c])

                    # squared Mahalanobis distance of every event at once:
                    # row i is (x_i - mu)^T . inv_cov . (x_i - mu).  rows
                    # containing NaN give NaN, which compares as "not in gate".
                    diff = x - gmm.means_[c]
                    dist = np.sum(np.dot(diff, inv_cov) * diff, axis = 1)

                    event_gate[c].iloc[group_idx] = np.less_equal(dist, thresh)

            if self.posteriors:
                # events with missing data keep a posterior of 0.0
                post = np.full((len(x), self.num_components), 0.0)
                post[~x_na] = gmm.predict_proba(x[~x_na])
                for c in range(self.num_components):
                    event_posteriors[c].iloc[group_idx] = post[:, c]

            for c in range(self.num_components):
                # index key: the group's `by` values followed by the
                # 1-based component number
                if len(self.by) == 0:
                    g = tuple([c + 1])
                elif hasattr(group, '__iter__') and not isinstance(group, (str, bytes)):
                    g = tuple(list(group) + [c + 1])
                else:
                    g = tuple([group] + [c + 1])

                prop_stat.at[g] = gmm.weights_[c]

                # standard deviations and correlation matrix depend only on
                # the component, not on the channel being reported
                s, corr = util.cov2corr(gmm.covariances_[c])

                for cidx1, channel1 in enumerate(self.channels):
                    scale = self._scale[channel1]
                    mean = gmm.means_[c, cidx1]

                    g2 = g + (channel1,)
                    mean_stat.at[g2] = scale.inverse(mean)
                    sigma_stat[g2] = (scale.inverse(s[cidx1]))
                    interval_stat.at[g2] = (scale.inverse(mean - s[cidx1]),
                                            scale.inverse(mean + s[cidx1]))

                    for cidx2, channel2 in enumerate(self.channels):
                        g3 = g2 + (channel2,)
                        corr_stat[g3] = corr[cidx1, cidx2]

                    # a channel's correlation with itself carries no
                    # information; remove that entry
                    corr_stat.drop(g2 + (channel1,), inplace = True)

        new_experiment = experiment.clone()

        if self.num_components > 1:
            new_experiment.add_condition(self.name, "category", event_assignments)

        if self.sigma is not None:
            for c in range(self.num_components):
                gate_name = "{}_{}".format(self.name, c + 1)
                new_experiment.add_condition(gate_name, "bool", event_gate[c])

        if self.posteriors:
            for c in range(self.num_components):
                post_name = "{}_{}_posterior".format(self.name, c + 1)
                new_experiment.add_condition(post_name, "double", event_posteriors[c])

        new_experiment.statistics[(self.name, "mean")] = pd.to_numeric(mean_stat)
        new_experiment.statistics[(self.name, "sigma")] = sigma_stat
        new_experiment.statistics[(self.name, "interval")] = interval_stat
        if len(corr_stat) > 0:
            new_experiment.statistics[(self.name, "correlation")] = pd.to_numeric(corr_stat)
        if self.num_components > 1:
            new_experiment.statistics[(self.name, "proportion")] = pd.to_numeric(prop_stat)

        new_experiment.history.append(self.clone_traits(transient = lambda _: True))
        return new_experiment
Beispiel #2
0
    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in `estimate`.

        Parameters
        ----------
        experiment : Experiment
            The experiment whose events are assigned to mixture components.

        Returns
        -------
        Experiment
            A clone of `experiment` with these conditions added:  a
            categorical condition named `name` if `num_components` > 1, one
            boolean condition ``{name}_{i}`` per component if `sigma` > 0,
            and one ``{name}_{i}_posterior`` condition per component if
            `posteriors` is true.  The following statistics are added too,
            keyed by ``(name, <statistic>)``:

            - **mean** : the mean of each component in each channel, passed
              through the inverse of the channel's scale.
            - **sigma** : a ``(low, high)`` tuple holding the mean -/+ one
              standard deviation of each component in each channel.
            - **correlation** : the correlation between each pair of
              distinct channels for each component.
            - **proportion** : the mixture weight of each component; only
              added if `num_components` > 1.

        Raises
        ------
        util.CytoflowOpError
            If no experiment is given, no channels are set, the name is
            missing, a column that would be created already exists,
            `estimate` has not produced a model, a channel or `by` column is
            missing from the experiment, a `by` column has more than 100
            unique values, or `num_components` is 1 together with either
            `sigma` == 0 or `posteriors`.
        """

        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        if len(self.channels) == 0:
            raise util.CytoflowOpError("Must set at least one channel")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError("You have to set the gate's name "
                                       "before applying it!")

        if self.num_components > 1 and self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                "Experiment already has a column named {0}".format(self.name))

        if self.sigma > 0:
            for i in range(1, self.num_components + 1):
                cname = "{}_{}".format(self.name, i)
                if cname in experiment.data.columns:
                    raise util.CytoflowOpError(
                        "Experiment already has a column named {}".format(
                            cname))

        if self.posteriors:
            for i in range(1, self.num_components + 1):
                cname = "{}_{}_posterior".format(self.name, i)
                if cname in experiment.data.columns:
                    raise util.CytoflowOpError(
                        "Experiment already has a column named {}".format(
                            cname))

        if not self._gmms:
            raise util.CytoflowOpError(
                "No components found.  Did you forget to "
                "call estimate()?")

        for c in self.channels:
            if c not in experiment.channels:
                raise util.CytoflowOpError(
                    "Channel {0} not found in the experiment".format(c))

        if self.posteriors:
            col_name = "{0}_Posterior".format(self.name)
            if col_name in experiment.data:
                raise util.CytoflowOpError(
                    "Column {0} already found in the experiment".format(
                        col_name))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                           " in the experiment".format(b))

            if len(experiment.data[b].unique()) > 100:  #WARNING - magic number
                raise util.CytoflowOpError(
                    "More than 100 unique values found for"
                    " aggregation metadata {0}.  Did you"
                    " accidentally specify a data channel?".format(b))

        if self.num_components == 1 and self.sigma == 0.0:
            raise util.CytoflowOpError(
                "if num_components is 1, sigma must be > 0.0")

        if self.num_components == 1 and self.posteriors:
            raise util.CytoflowOpError(
                "If num_components == 1, all posteriors will be 1.")

        # per-event outputs, pre-filled with the value used for events that
        # never get a model (groups without a gmm, rows with missing data)
        if self.num_components > 1:
            event_assignments = pd.Series(["{}_None".format(self.name)] *
                                          len(experiment),
                                          dtype="object")

        if self.sigma > 0:
            # NOTE(review): the gates are accumulated as doubles but are
            # registered below as "bool" conditions -- confirm that
            # add_condition converts them.
            event_gate = {
                i: pd.Series([False] * len(experiment), dtype="double")
                for i in range(self.num_components)
            }

            # come up with a threshold based on sigma.  you'll note we
            # don't sqrt the distance below: that's because for a
            # multivariate Gaussian, the square of the Mahalanobis distance
            # is chi-square distributed.  the threshold depends only on
            # sigma, so compute it once instead of per group and component.
            gate_p = (scipy.stats.norm.cdf(self.sigma) - 0.5) * 2
            thresh = scipy.stats.chi2.ppf(gate_p, 1)

        if self.posteriors:
            event_posteriors = {
                i: pd.Series([0.0] * len(experiment), dtype="double")
                for i in range(self.num_components)
            }

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda _: True)

        # make the statistics.  every statistic is indexed by the `by`
        # conditions and the (1-based) component number; build those shared
        # levels once.
        components = [x + 1 for x in range(self.num_components)]
        by_levels = [experiment[x].unique() for x in self.by]
        by_names = list(self.by)

        prop_idx = pd.MultiIndex.from_product(by_levels + [components],
                                              names=by_names + ["Component"])
        prop_stat = pd.Series(index=prop_idx,
                              dtype=np.dtype(object)).sort_index()

        mean_idx = pd.MultiIndex.from_product(
            by_levels + [components] + [self.channels],
            names=by_names + ["Component"] + ["Channel"])
        mean_stat = pd.Series(index=mean_idx,
                              dtype=np.dtype(object)).sort_index()
        sigma_stat = pd.Series(index=mean_idx,
                               dtype=np.dtype(object)).sort_index()

        corr_idx = pd.MultiIndex.from_product(
            by_levels + [components] + [self.channels] + [self.channels],
            names=by_names + ["Component"] + ["Channel_1"] + ["Channel_2"])
        corr_stat = pd.Series(index=corr_idx,
                              dtype=np.dtype(object)).sort_index()

        for group, data_subset in groupby:
            if group not in self._gmms:
                # there weren't any events in this group, so we didn't get
                # a gmm.
                continue

            gmm = self._gmms[group]
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # which values are missing?  a row is unusable if any channel
            # is NaN after scaling.
            x = x.values
            x_na = np.isnan(x).any(axis=1)

            # NOTE(review): group_idx holds index labels from groupby.groups
            # and is used with .iloc below, which assumes experiment.data has
            # a default 0..n-1 index -- TODO confirm.
            group_idx = groupby.groups[group]

            if self.num_components > 1:
                # -1 marks events that could not be classified
                predicted = np.full(len(x), -1, "int")
                predicted[~x_na] = gmm.predict(x[~x_na])

                predicted_str = pd.Series(["(none)"] * len(predicted))
                for c in range(0, self.num_components):
                    predicted_str[predicted == c] = "{0}_{1}".format(
                        self.name, c + 1)
                predicted_str[predicted == -1] = "{0}_None".format(self.name)
                predicted_str.index = group_idx

                event_assignments.iloc[group_idx] = predicted_str

            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma > 0.0:
                for c in range(self.num_components):
                    inv_cov = np.linalg.pinv(gmm.covariances_[c])

                    # squared Mahalanobis distance of every event at once:
                    # row i is (x_i - mu)^T . inv_cov . (x_i - mu).  rows
                    # containing NaN give NaN, which compares as "not in gate".
                    diff = x - gmm.means_[c]
                    dist = np.sum(np.dot(diff, inv_cov) * diff, axis=1)

                    event_gate[c].iloc[group_idx] = np.less_equal(dist, thresh)

            if self.posteriors:
                # predict_proba gives one column of membership probabilities
                # per component (predict would only give hard labels).
                # events with missing data can't be scored and keep 0.0.
                post = np.full((len(x), self.num_components), 0.0)
                post[~x_na] = gmm.predict_proba(x[~x_na])
                for c in range(self.num_components):
                    event_posteriors[c].iloc[group_idx] = post[:, c]

            for c in range(self.num_components):
                # index key: the group's `by` values followed by the
                # 1-based component number.  strings are iterable too, so
                # they must be excluded or a string-valued group key would be
                # split into characters.
                if len(self.by) == 0:
                    g = [c + 1]
                elif (hasattr(group, '__iter__')
                      and not isinstance(group, (str, bytes))):
                    g = tuple(list(group) + [c + 1])
                else:
                    g = tuple([group] + [c + 1])

                prop_stat.loc[g] = gmm.weights_[c]

                # standard deviations and correlation matrix depend only on
                # the component, not on the channel being reported
                s, corr = util.cov2corr(gmm.covariances_[c])

                for cidx1, channel1 in enumerate(self.channels):
                    scale = self._scale[channel1]
                    mean = gmm.means_[c, cidx1]

                    g2 = tuple(list(g) + [channel1])
                    mean_stat.loc[g2] = scale.inverse(mean)
                    sigma_stat.loc[g2] = (scale.inverse(mean - s[cidx1]),
                                          scale.inverse(mean + s[cidx1]))

                    for cidx2, channel2 in enumerate(self.channels):
                        g3 = tuple(list(g2) + [channel2])
                        corr_stat[g3] = corr[cidx1, cidx2]

                    # a channel's correlation with itself carries no
                    # information; remove that entry
                    corr_stat.drop(tuple(list(g2) + [channel1]), inplace=True)

        new_experiment = experiment.clone()

        if self.num_components > 1:
            new_experiment.add_condition(self.name, "category",
                                         event_assignments)

        if self.sigma > 0:
            for c in range(self.num_components):
                gate_name = "{}_{}".format(self.name, c + 1)
                new_experiment.add_condition(gate_name, "bool", event_gate[c])

        if self.posteriors:
            for c in range(self.num_components):
                post_name = "{}_{}_posterior".format(self.name, c + 1)
                new_experiment.add_condition(post_name, "double",
                                             event_posteriors[c])

        new_experiment.statistics[(self.name,
                                   "mean")] = pd.to_numeric(mean_stat)
        new_experiment.statistics[(self.name, "sigma")] = sigma_stat
        if len(corr_stat) > 0:
            new_experiment.statistics[(
                self.name, "correlation")] = pd.to_numeric(corr_stat)
        if self.num_components > 1:
            new_experiment.statistics[(
                self.name, "proportion")] = pd.to_numeric(prop_stat)

        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))
        return new_experiment