Ejemplo n.º 1
0
 def _get_subset_str(self):
     """Build the subset-expression fragment for this boolean condition."""
     t, f = self.selected_t, self.selected_f
     if t == f:
         # neither (or both) states selected: no constraint on this condition
         return ""
     return "({0} == {1})".format(sanitize_identifier(self.name),
                                  "True" if t else "False")
Ejemplo n.º 2
0
 def _get_str(self):
     """Return the query-string fragment for this boolean condition view."""
     want_true, want_false = self.selected_t, self.selected_f
     if want_true == want_false:
         # both or neither selected -> no filtering
         return ""
     state = "True" if want_true else "False"
     return "({0} == {1})".format(util.sanitize_identifier(self.name), state)
Ejemplo n.º 3
0
 def _get_str(self):
     """Return a query-string fragment for the current range selection."""
     lo, hi = self.low, self.high
     # the full range is selected -> no filtering needed
     if lo == self.values[0] and hi == self.values[-1]:
         return ""
     ident = util.sanitize_identifier(self.name)
     if lo == hi:
         return "({0} == {1})".format(ident, lo)
     return "({0} >= {1} and {0} <= {2})".format(ident, lo, hi)
Ejemplo n.º 4
0
 def _set_subset_str(self, val):
     """Update the view based on a subset string"""
     ident = sanitize_identifier(self.name)
     # each flag is set iff val is exactly the corresponding expression;
     # any other string clears both flags
     self.selected_t = (val == "({0} == True)".format(ident))
     self.selected_f = (val == "({0} == False)".format(ident))
Ejemplo n.º 5
0
 def _set_subset_str(self, val):
     """Update the view based on a subset string"""
     ident = sanitize_identifier(self.name)
     # map the two recognized expressions onto flag pairs; anything
     # else (including "") deselects both
     states = {
         "({0} == True)".format(ident): (True, False),
         "({0} == False)".format(ident): (False, True),
     }
     self.selected_t, self.selected_f = states.get(val, (False, False))
Ejemplo n.º 6
0
    def _on_import(self):
        """
        Import format: CSV, first column is filename, path relative to CSV.
        others are conditions, type is autodetected.  first row is header
        with names.
        """
        # ask the user which CSV file to import
        file_dialog = FileDialog()
        file_dialog.wildcard = "CSV files (*.csv)|*.csv|"
        file_dialog.action = 'open'
        file_dialog.open()

        if file_dialog.return_code != PyfaceOK:
            return

        csv = pandas.read_csv(file_dialog.path)
        # filenames in the first column are resolved relative to the CSV's folder
        csv_folder = Path(file_dialog.path).parent

        # importing replaces the current experiment setup -- confirm first.
        # NOTE(review): the message says the current tubes/conditions will be
        # cleared, but nothing in this method clears them -- presumably that
        # happens elsewhere; verify.
        if self.model.tubes or self.model.tube_traits:
            if confirm(
                    parent=None,
                    message="This will clear the current conditions and tubes! "
                    "Are you sure you want to continue?",
                    title="Clear tubes and conditions?") != YES:
                return

        # every column after the first becomes a condition on the tubes.
        # NOTE(review): all conditions are created as 'category' here, even
        # though the docstring says the type is autodetected -- confirm intent.
        for col in csv.columns[1:]:
            self.model.tube_traits.append(
                TubeTrait(model=self.model,
                          name=util.sanitize_identifier(col),
                          type='category'))

        for _, row in csv.iterrows():
            filename = csv_folder / row[0]

            try:
                # metadata_only: read just the FCS header, not the event data
                metadata, _ = parse_tube(str(filename), metadata_only=True)
            except Exception as e:
                # skip tubes that can't be read, but tell the user why
                warning(
                    None,
                    "Had trouble loading file {}: {}".format(filename, str(e)))
                continue

            metadata['CF_File'] = Path(filename).stem
            new_tube = Tube(file=str(filename),
                            parent=self.model,
                            metadata=sanitize_metadata(metadata))
            self.model.tubes.append(new_tube)

            # copy this row's condition values onto the new tube
            for col in csv.columns[1:]:
                new_tube.trait_set(**{util.sanitize_identifier(col): row[col]})
Ejemplo n.º 7
0
            def validate(self, obj, name, value):
                """Accept *value* only if it is already a sanitized identifier."""
                value = super(ValidPythonIdentifier,
                              self).validate(obj, name, value)
                # reject any value that sanitizing would alter
                if util.sanitize_identifier(value) != value:
                    self.error(obj, name, value)
                else:
                    return value
Ejemplo n.º 8
0
    def query(self, expr, **kwargs):
        """
        Expose pandas.DataFrame.query() to the outside world

        This method "sanitizes" column names first, replacing characters that
        are not valid in a Python identifier with an underscore '_'. So, the
        column name `a column` becomes `a_column`, and can be queried with
        an `a_column == True` or such.

        Parameters
        ----------
        expr : string
            The expression to pass to `pandas.DataFrame.query()`.  Must be
            a valid Python expression, something you could pass to `eval()`.

        **kwargs : dict
            Other named parameters to pass to `pandas.DataFrame.query()`.

        Raises
        ------
        CytoflowError
            If two column names sanitize to the same identifier.
        """

        # map each sanitized column name to its column so `expr` can refer
        # to columns whose raw names aren't valid Python identifiers
        resolvers = {}
        for name, col in self.data.items():  # iteritems() was removed in pandas 2.0
            new_name = util.sanitize_identifier(name)
            if new_name in resolvers:
                # fixed: placeholders were {1}/{2} but format() only received
                # two positional args, so raising this raised IndexError
                raise util.CytoflowError(
                    "Tried to sanitize column name {0} to "
                    "{1} but it already existed in the "
                    " DataFrame.".format(name, new_name))
            else:
                resolvers[new_name] = col

        return self.data.query(expr, resolvers=({}, resolvers), **kwargs)
Ejemplo n.º 9
0
    def query(self, expr, **kwargs):
        """
        Expose pandas.DataFrame.query() to the outside world

        This method "sanitizes" column names first, replacing characters that
        are not valid in a Python identifier with an underscore '_'. So, the
        column name `a column` becomes `a_column`, and can be queried with
        an `a_column == True` or such.

        Parameters
        ----------
        expr : string
            The expression to pass to `pandas.DataFrame.query()`.  Must be
            a valid Python expression, something you could pass to `eval()`.

        **kwargs : dict
            Other named parameters to pass to `pandas.DataFrame.query()`.

        Raises
        ------
        CytoflowError
            If two column names sanitize to the same identifier.
        """

        # map sanitized column names back to their columns for the resolver
        resolvers = {}
        for name, col in self.data.items():  # iteritems() was removed in pandas 2.0
            new_name = util.sanitize_identifier(name)
            if new_name in resolvers:
                # fixed: placeholders were {1}/{2} with only two positional
                # args, which raised IndexError instead of this message
                raise util.CytoflowError("Tried to sanitize column name {0} to "
                                         "{1} but it already existed in the "
                                         " DataFrame."
                                         .format(name, new_name))
            else:
                resolvers[new_name] = col

        return self.data.query(expr, resolvers = ({}, resolvers), **kwargs)
Ejemplo n.º 10
0
    def subset(self, name, value):
        """
        Fast subsetting: clone this object, keeping only the rows where the
        condition *name* equals *value*.

        Condition names are "sanitized" first: characters that are not valid
        in a Python identifier are replaced with an underscore '_', so the
        column `a column` is looked up as `a_column`.

        Parameters
        ----------
        name : Str
            A condition; ie, a key in `self.conditions`.

        value : Any
            The value to look for.  Will be checked with equality, ie `==`

        """
        cond = util.sanitize_identifier(name)

        if cond not in self.conditions:
            raise util.CytoflowError("Can't find condition '{}'"
                                     .format(name))

        ret = self.clone()
        mask = self.data[cond] == value
        ret.data = self.data[mask]
        # renumber the surviving rows from 0
        ret.data.reset_index(drop=True, inplace=True)
        return ret
Ejemplo n.º 11
0
 def subset(self, name, value):
     """
     Return a clone of this object whose data is restricted to the rows
     where the condition *name* equals *value*.

     Condition names are "sanitized" first: characters that are not valid
     in a Python identifier become an underscore '_', so the column
     `a column` is looked up as `a_column`.

     Parameters
     ----------
     name : Str
         A condition; ie, a key in `self.conditions`.

     value : Any
         The value to look for.  Will be checked with equality, ie `==`
     """
     sanitized = util.sanitize_identifier(name)

     if sanitized not in self.conditions:
         raise util.CytoflowError("Can't find condition '{}'"
                                  .format(name))

     clone = self.clone()
     clone.data = self.data[clone.data[sanitized] == value] \
         if False else self.data[self.data[sanitized] == value]
     return clone
Ejemplo n.º 12
0
    def apply(self, experiment):
        """Applies the ratio operation to an experiment
        
        Parameters
        ----------
        experiment : Experiment
            the old experiment to which this op is applied
            
        Returns
        -------
        Experiment
            a new experiment with the new ratio channel
            
            The new channel also has the following new metadata:

            - **numerator** : Str
                What was the numerator channel for the new one?
        
            - **denominator** : Str
                What was the denominator channel for the new one?
    
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if self.numerator not in experiment.channels:
            raise util.CytoflowOpError(
                'numerator',
                "Channel {0} not in the experiment".format(self.numerator))

        if self.denominator not in experiment.channels:
            raise util.CytoflowOpError(
                'denominator',
                "Channel {0} not in the experiment".format(self.denominator))

        if self.name != util.sanitize_identifier(self.name):
            # fixed: the message has no placeholders, so the trailing
            # .format(self.name) was a no-op and has been removed
            raise util.CytoflowOpError(
                'name',
                "Name can only contain letters, numbers and underscores.")

        if self.name in experiment.channels:
            raise util.CytoflowOpError(
                'name', "New channel {0} is already in the experiment".format(
                    self.name))

        new_experiment = experiment.clone()
        new_experiment.add_channel(
            self.name,
            experiment[self.numerator] / experiment[self.denominator])
        # division by zero produces +/-inf; drop those events entirely
        new_experiment.data.replace([np.inf, -np.inf], np.nan, inplace=True)
        new_experiment.data.dropna(inplace=True)
        new_experiment.history.append(
            self.clone_traits(transient=lambda t: True))
        new_experiment.metadata[self.name]['numerator'] = self.numerator
        new_experiment.metadata[self.name]['denominator'] = self.denominator
        return new_experiment
Ejemplo n.º 13
0
def sanitize_metadata(meta):
    """Return a copy of *meta* whose keys are valid Python identifiers."""
    def _clean(key):
        # drop a single leading '$' (FCS keyword convention), then sanitize
        if key.startswith('$'):
            key = key[1:]
        return util.sanitize_identifier(key)

    # on duplicate sanitized keys, the last one wins -- same as the
    # original insertion-order loop
    return {_clean(k): v for k, v in meta.items()}
Ejemplo n.º 14
0
    def apply(self, experiment):
        """Applies the threshold to an experiment.
        
        Parameters
        ----------
        experiment : Experiment
            the experiment to which this operation is applied
            
        Returns
        -------
        Experiment
            a new :class:`~experiment`, the same as the old experiment but with 
            a new column of type ``bool`` with the same name as the operation 
            :attr:`name`.  The new condition is ``True`` if the event's 
            measurement in :attr:`channel` is greater than :attr:`threshold`;
            it is ``False`` otherwise.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError(
                'name', "You have to set the gate's name "
                "before applying it!")

        if self.name != util.sanitize_identifier(self.name):
            # fixed: the message has no placeholders, so the trailing
            # .format(self.name) was a no-op and has been removed
            raise util.CytoflowOpError(
                'name',
                "Name can only contain letters, numbers and underscores.")

        # make sure old_experiment doesn't already have a column named self.name
        if (self.name in experiment.data.columns):
            raise util.CytoflowOpError(
                'name',
                "Experiment already contains a column {0}".format(self.name))

        if self.channel not in experiment.channels:
            raise util.CytoflowOpError(
                'channel',
                "{0} isn't a channel in the experiment".format(self.channel))

        if self.threshold is None:
            raise util.CytoflowOpError('threshold', "must set 'threshold'")

        # events strictly greater than the threshold are gated True
        gate = pd.Series(experiment[self.channel] > self.threshold)

        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name, "bool", gate)
        new_experiment.history.append(
            self.clone_traits(transient=lambda t: True))
        return new_experiment
Ejemplo n.º 15
0
 def _get_str(self):
     """Return a query-string fragment matching any selected category."""
     if not self.selected:
         return ""

     ident = util.sanitize_identifier(self.name)
     clauses = ['{0} == "{1}"'.format(ident, cat) for cat in self.selected]
     return "(" + " or ".join(clauses) + ")"
Ejemplo n.º 16
0
 def _get_subset_str(self):
     """Build a subset expression matching any of the selected categories."""
     if not self.selected:
         return ""

     ident = sanitize_identifier(self.name)
     body = " or ".join('{0} == "{1}"'.format(ident, cat)
                        for cat in self.selected)
     return "({0})".format(body)
Ejemplo n.º 17
0
    def apply(self, experiment):
        """Applies the ratio operation to an experiment
        
        Parameters
        ----------
        experiment : Experiment
            the old experiment to which this op is applied
            
        Returns
        -------
            a new experiment with the new ratio channel
        """

        # fixed: 'if not experiment:' also rejected a valid-but-empty
        # experiment via truthiness; test identity against None, the way
        # the other operations in this codebase do
        if experiment is None:
            raise util.CytoflowOpError("No experiment specified")

        if self.numerator not in experiment.channels:
            raise util.CytoflowOpError(
                "Channel {0} not in the experiment".format(self.numerator))

        if self.denominator not in experiment.channels:
            raise util.CytoflowOpError(
                "Channel {0} not in the experiment".format(self.denominator))

        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError(
                "New channel {0} must be a valid Python identifier".format(
                    self.name))

        if self.name in experiment.channels:
            raise util.CytoflowOpError(
                "New channel {0} is already in the experiment".format(
                    self.name))

        new_experiment = experiment.clone()
        new_experiment.add_channel(
            self.name,
            experiment[self.numerator] / experiment[self.denominator])
        # division by zero yields +/-inf; drop those events entirely
        new_experiment.data.replace([np.inf, -np.inf], np.nan, inplace=True)
        new_experiment.data.dropna(inplace=True)
        new_experiment.history.append(
            self.clone_traits(transient=lambda t: True))
        new_experiment.metadata[self.name]['numerator'] = self.numerator
        new_experiment.metadata[self.name]['denominator'] = self.denominator
        return new_experiment
Ejemplo n.º 18
0
    def query(self, expr, **kwargs):
        """
        Return an experiment whose data is a subset of this one where ``expr``
        evaluates to ``True``.

        This method "sanitizes" column names first, replacing characters that
        are not valid in a Python identifier with an underscore ``_``. So, the
        column name ``a column`` becomes ``a_column``, and can be queried with
        an ``a_column == True`` or such.
        
        Parameters
        ----------
        expr : string
            The expression to pass to :meth:`pandas.DataFrame.query`.  Must be
            a valid Python expression, something you could pass to :func:`eval`.
            
        **kwargs : dict
            Other named parameters to pass to :meth:`pandas.DataFrame.query`.
            
        Returns
        -------
        Experiment
            A new :class:`Experiment`, a clone of this one with the data 
            returned by :meth:`pandas.DataFrame.query()`

        Raises
        ------
        CytoflowError
            If two column names sanitize to the same identifier, or if no
            events match ``expr``.
        """

        # map sanitized column names to their columns so `expr` can refer
        # to columns whose raw names aren't valid Python identifiers
        resolvers = {}
        for name, col in self.data.items():  # iteritems() was removed in pandas 2.0
            new_name = util.sanitize_identifier(name)
            if new_name in resolvers:
                # fixed: placeholders were {1}/{2} but format() only received
                # two positional args, so raising this raised IndexError
                raise util.CytoflowError(
                    "Tried to sanitize column name {0} to "
                    "{1} but it already existed in the "
                    " DataFrame.".format(name, new_name))
            else:
                resolvers[new_name] = col

        ret = self.clone()
        ret.data = self.data.query(expr, resolvers=({}, resolvers), **kwargs)
        ret.data.reset_index(drop=True, inplace=True)

        if len(ret.data) == 0:
            raise util.CytoflowError("No events matched {}".format(expr))

        return ret
Ejemplo n.º 19
0
    def apply(self, experiment):
        """
        Apply the operation to an :class:`.Experiment`.
        
        Parameters
        ----------
        experiment
            The :class:`.Experiment` to apply this operation to.
            
        Returns
        -------
        Experiment
            A new :class:`.Experiment`, containing a new entry in 
            :attr:`.Experiment.statistics`.  The key of the new entry is a 
            tuple ``(name, function)`` (or ``(name, statistic_name)`` if 
            :attr:`statistic_name` is set.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "Must specify an experiment")

        if not self.name:
            raise util.CytoflowOpError('name', "Must specify a name")

        if self.name != util.sanitize_identifier(self.name):
            # fixed: the message has no placeholders, so the trailing
            # .format(self.name) was a no-op and has been removed
            raise util.CytoflowOpError(
                'name',
                "Name can only contain letters, numbers and underscores.")

        if not self.channel:
            raise util.CytoflowOpError('channel', "Must specify a channel")

        if not self.function:
            raise util.CytoflowOpError('function', "Must specify a function")

        if self.channel not in experiment.data:
            raise util.CytoflowOpError(
                'channel',
                "Channel {0} not found in the experiment".format(self.channel))

        if not self.by:
            raise util.CytoflowOpError(
                'by', "Must specify some grouping conditions "
                "in 'by'")

        # the statistic's key: prefer the user-supplied statistic_name,
        # falling back to the aggregation function's __name__
        stat_name = (self.name, self.statistic_name) \
                     if self.statistic_name \
                     else (self.name, self.function.__name__)

        if stat_name in experiment.statistics:
            raise util.CytoflowOpError(
                'name', "{} is already in the experiment's statistics".format(
                    stat_name))

        new_experiment = experiment.clone()
        if self.subset:
            try:
                experiment = experiment.query(self.subset)
            except Exception as exc:
                raise util.CytoflowOpError(
                    'subset', "Subset string '{0}' isn't valid".format(
                        self.subset)) from exc

            if len(experiment) == 0:
                raise util.CytoflowOpError(
                    'subset', "Subset string '{0}' returned no events".format(
                        self.subset))

        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError(
                    'by', "Aggregation metadata {} not found, "
                    "must be one of {}".format(b, experiment.conditions))
            unique = experiment.data[b].unique()

            if len(unique) == 1:
                warn("Only one category for {}".format(b),
                     util.CytoflowOpWarning)

        groupby = experiment.data.groupby(self.by)

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                warn("Group {} had no data".format(group),
                     util.CytoflowOpWarning)

        # this shouldn't be necessary, but see pandas bug #38053
        if len(self.by) == 1:
            idx = pd.Index(experiment[self.by[0]].unique(), name=self.by[0])
        else:
            idx = pd.MultiIndex.from_product(
                [experiment[x].unique() for x in self.by], names=self.by)

        # pre-fill with self.fill so groups with no events keep a value
        stat = pd.Series(data=[self.fill] * len(idx),
                         index=idx,
                         name="{} : {}".format(stat_name[0], stat_name[1]),
                         dtype=np.dtype(object)).sort_index()

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                continue

            if not isinstance(group, tuple):
                group = (group, )

            try:
                v = self.function(data_subset[self.channel])

                stat.at[group] = v

            except Exception as e:
                raise util.CytoflowOpError(
                    None, "Your function threw an error in group {}".format(
                        group)) from e

            # check for, and warn about, NaNs.
            if pd.Series(stat.loc[group]).isna().any():
                warn(
                    "Found NaN in category {} returned {}".format(
                        group, stat.loc[group]), util.CytoflowOpWarning)

        # try to convert to numeric, but if there are non-numeric bits leave
        # the statistic as-is.  (errors='ignore' is deprecated in pandas >=
        # 2.1, so do the fallback explicitly.)
        try:
            stat = pd.to_numeric(stat)
        except (ValueError, TypeError):
            pass

        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))
        new_experiment.statistics[stat_name] = stat

        return new_experiment
Ejemplo n.º 20
0
    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in :meth:`estimate`.
        
        Returns
        -------
        Experiment
            A new :class:`.Experiment` with the new condition variables as
            described in the class documentation.  Also adds the following
            new statistics:
            
            - **mean** : Float
                the mean of the fitted gaussian in each channel for each component.
                
            - **sigma** : (Float, Float)
                the locations the mean +/- one standard deviation in each channel
                for each component.
                
            - **correlation** : Float
                the correlation coefficient between each pair of channels for each
                component.
                
            - **proportion** : Float
                the proportion of events in each component of the mixture model.  only
                added if :attr:`num_components` ``> 1``.
        """
             
        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")
         
        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")
         
        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError('name',
                                       "You have to set the gate's name "
                                       "before applying it!")
            
        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError('name',
                                       "Name can only contain letters, numbers and underscores."
                                       .format(self.name)) 
        
        if self.num_components > 1 and self.name in experiment.data.columns:
            raise util.CytoflowOpError('name',
                                       "Experiment already has a column named {0}"
                                       .format(self.name))
            
        if self.sigma is not None:
            for i in range(1, self.num_components + 1):
                cname = "{}_{}".format(self.name, i)
                if cname in experiment.data.columns:
                    raise util.CytoflowOpError('name',
                                               "Experiment already has a column named {}"
                                               .format(cname))
 
        if self.posteriors:
            for i in range(1, self.num_components + 1):
                cname = "{}_{}_posterior".format(self.name, i)
                if cname in experiment.data.columns:
                    raise util.CytoflowOpError('name',
                                               "Experiment already has a column named {}"
                                               .format(cname))               
         
        if not self._gmms:
            raise util.CytoflowOpError(None, 
                                       "No components found.  Did you forget to "
                                       "call estimate()?")
            
        for c in self.channels:
            if c not in self._scale:
                raise util.CytoflowOpError(None,
                                           "Model scale not set.  Did you forget "
                                           "to call estimate()?")
 
        for c in self.channels:
            if c not in experiment.channels:
                raise util.CytoflowOpError('channels',
                                           "Channel {0} not found in the experiment"
                                           .format(c))
        
        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))
#                             
#         if self.num_components == 1 and self.sigma == 0.0:
#             raise util.CytoflowOpError('sigma',
#                                        "if num_components is 1, sigma must be > 0.0")
        
                
        if self.num_components == 1 and self.posteriors:
            warn("If num_components == 1, all posteriors will be 1",
                 util.CytoflowOpWarning)
#             raise util.CytoflowOpError('posteriors',
#                                        "If num_components == 1, all posteriors will be 1.")
         
        if self.num_components > 1:
            event_assignments = pd.Series(["{}_None".format(self.name)] * len(experiment), dtype = "object")
 
        if self.sigma is not None:
            event_gate = {i : pd.Series([False] * len(experiment), dtype = "double")
                           for i in range(self.num_components)}
 
        if self.posteriors:
            event_posteriors = {i : pd.Series([0.0] * len(experiment), dtype = "double")
                                for i in range(self.num_components)}

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda _: True)   

        # make the statistics       
        components = [x + 1 for x in range(self.num_components)]
         
        prop_idx = pd.MultiIndex.from_product([experiment[x].unique() for x in self.by] + [components], 
                                         names = list(self.by) + ["Component"])
        prop_stat = pd.Series(name = "{} : {}".format(self.name, "proportion"),
                              index = prop_idx, 
                              dtype = np.dtype(object)).sort_index()
                  
        mean_idx = pd.MultiIndex.from_product([experiment[x].unique() for x in self.by] + [components] + [self.channels], 
                                              names = list(self.by) + ["Component"] + ["Channel"])
        mean_stat = pd.Series(name = "{} : {}".format(self.name, "mean"),
                              index = mean_idx, 
                              dtype = np.dtype(object)).sort_index()
        sigma_stat = pd.Series(name = "{} : {}".format(self.name, "sigma"),
                               index = mean_idx,
                               dtype = np.dtype(object)).sort_index()
        interval_stat = pd.Series(name = "{} : {}".format(self.name, "interval"),
                                  index = mean_idx, 
                                  dtype = np.dtype(object)).sort_index()

        corr_idx = pd.MultiIndex.from_product([experiment[x].unique() for x in self.by] + [components] + [self.channels] + [self.channels], 
                                              names = list(self.by) + ["Component"] + ["Channel_1"] + ["Channel_2"])
        corr_stat = pd.Series(name = "{} : {}".format(self.name, "correlation"),
                              index = corr_idx, 
                              dtype = np.dtype(object)).sort_index()  
                 
        for group, data_subset in groupby:
            if group not in self._gmms:
                # there weren't any events in this group, so we didn't get
                # a gmm.
                continue
             
            gmm = self._gmms[group]
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])
                
            # which values are missing?

            x_na = pd.Series([False] * len(x))
            for c in self.channels:
                x_na[np.isnan(x[c]).values] = True
                        
            x = x.values
            x_na = x_na.values
            group_idx = groupby.groups[group]
 
            if self.num_components > 1:
                predicted = np.full(len(x), -1, "int")
                predicted[~x_na] = gmm.predict(x[~x_na])
                
                predicted_str = pd.Series(["(none)"] * len(predicted))
                for c in range(0, self.num_components):
                    predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
                predicted_str[predicted == -1] = "{0}_None".format(self.name)
                predicted_str.index = group_idx
     
                event_assignments.iloc[group_idx] = predicted_str
                
            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma is not None:
                for c in range(self.num_components):
                    s = np.linalg.pinv(gmm.covariances_[c])
                    mu = gmm.means_[c]
                    
                    # compute the Mahalanobis distance

                    f = lambda x, mu, s: np.dot(np.dot((x - mu).T, s), (x - mu))
                    dist = np.apply_along_axis(f, 1, x, mu, s)

                    # come up with a threshold based on sigma.  you'll note we
                    # didn't sqrt dist: that's because for a multivariate 
                    # Gaussian, the square of the Mahalanobis distance is
                    # chi-square distributed
                    
                    p = (scipy.stats.norm.cdf(self.sigma) - 0.5) * 2
                    thresh = scipy.stats.chi2.ppf(p, 1)
                    
                    event_gate[c].iloc[group_idx] = np.less_equal(dist, thresh)
                    
            if self.posteriors:  
                p = np.full((len(x), self.num_components), 0.0)
                p[~x_na] = gmm.predict_proba(x[~x_na])
                for c in range(self.num_components):
                    event_posteriors[c].iloc[group_idx] = p[:, c]
                    
            for c in range(self.num_components):
                if len(self.by) == 0:
                    g = tuple([c + 1])
                elif hasattr(group, '__iter__') and not isinstance(group, (str, bytes)):
                    g = tuple(list(group) + [c + 1])
                else:
                    g = tuple([group] + [c + 1])

                prop_stat.at[g] = gmm.weights_[c]
                
                for cidx1, channel1 in enumerate(self.channels):
                    g2 = tuple(list(g) + [channel1])
                    mean_stat.at[g2] = self._scale[channel1].inverse(gmm.means_[c, cidx1])
                    
                    s, corr = util.cov2corr(gmm.covariances_[c])
                    sigma_stat[g2] = (self._scale[channel1].inverse(s[cidx1]))
                    interval_stat.at[g2] = (self._scale[channel1].inverse(gmm.means_[c, cidx1] - s[cidx1]),
                                             self._scale[channel1].inverse(gmm.means_[c, cidx1] + s[cidx1]))
            
                    for cidx2, channel2 in enumerate(self.channels):
                        g3 = tuple(list(g2) + [channel2])
                        corr_stat[g3] = corr[cidx1, cidx2]
                        
                    corr_stat.drop(tuple(list(g2) + [channel1]), inplace = True)

        new_experiment = experiment.clone()
          
        if self.num_components > 1:
            new_experiment.add_condition(self.name, "category", event_assignments)
            
        if self.sigma is not None:
            for c in range(self.num_components):
                gate_name = "{}_{}".format(self.name, c + 1)
                new_experiment.add_condition(gate_name, "bool", event_gate[c])              
                
        if self.posteriors:
            for c in range(self.num_components):
                post_name = "{}_{}_posterior".format(self.name, c + 1)
                new_experiment.add_condition(post_name, "double", event_posteriors[c])
                
        new_experiment.statistics[(self.name, "mean")] = pd.to_numeric(mean_stat)
        new_experiment.statistics[(self.name, "sigma")] = sigma_stat
        new_experiment.statistics[(self.name, "interval")] = interval_stat
        if len(corr_stat) > 0:
            new_experiment.statistics[(self.name, "correlation")] = pd.to_numeric(corr_stat)
        if self.num_components > 1:
            new_experiment.statistics[(self.name, "proportion")] = pd.to_numeric(prop_stat)

        new_experiment.history.append(self.clone_traits(transient = lambda _: True))
        return new_experiment
Ejemplo n.º 21
0
    def apply(self, experiment):
        """
        Creates a new condition based on membership in the gate that was
        parameterized with :meth:`estimate`.
        
        Parameters
        ----------
        experiment : Experiment
            the :class:`.Experiment` to apply the gate to.
            
        Returns
        -------
        Experiment
            a new :class:`.Experiment` with the new gate applied.
        """
            
        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")
        
        if not self.xchannel:
            raise util.CytoflowOpError('xchannel',
                                       "Must set X channel")

        if not self.ychannel:
            raise util.CytoflowOpError('ychannel',
                                       "Must set Y channel")
        
        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError('name',
                                       "You have to set the gate's name "
                                       "before applying it!")
            
        if self.name != util.sanitize_identifier(self.name):
            # BUGFIX: the message previously called .format(self.name) on a
            # string with no placeholder, silently dropping the bad name.
            raise util.CytoflowOpError('name',
                                       "Name {} can only contain letters, numbers and underscores."
                                       .format(self.name))  

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError('name',
                                       "Experiment already has a column named {0}"
                                       .format(self.name))
        
        if not (self._xbins.size and self._ybins.size and self._keep_xbins):
            raise util.CytoflowOpError(None,
                                       "No gate estimate found.  Did you forget to "
                                       "call estimate()?")

        if not self._xscale:
            raise util.CytoflowOpError(None,
                                       "Couldn't find _xscale.  What happened??")
        
        if not self._yscale:
            raise util.CytoflowOpError(None,
                                       "Couldn't find _yscale.  What happened??")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError('xchannel',
                                       "Column {0} not found in the experiment"
                                       .format(self.xchannel))

        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError('ychannel',
                                       "Column {0} not found in the experiment"
                                       .format(self.ychannel))
       
        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))
        
        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda _: True)
            
        event_assignments = pd.Series([False] * len(experiment), dtype = "bool")
        
        for group, group_data in groupby:
            if group not in self._keep_xbins:
                # there weren't any events in this group, so we didn't get
                # an estimate
                continue
            
            group_idx = groupby.groups[group]
            
            # bin every event in this group on both axes; labels = False
            # gives integer bin indices, aligned on group_data.index
            cX = pd.cut(group_data[self.xchannel], self._xbins, include_lowest = True, labels = False)
            cY = pd.cut(group_data[self.ychannel], self._ybins, include_lowest = True, labels = False)

            # BUGFIX: index by group_data.index so the bitwise-or below aligns
            # label-wise with cX/cY (which carry group_data's index).  With the
            # previous default RangeIndex the combination mis-aligned whenever
            # 'by' was set.  This mirrors the predicted_str.index = group_idx
            # pattern used by the mixture-model operations.
            group_keep = pd.Series([False] * len(group_data), index = group_data.index)
            
            keep_x = self._keep_xbins[group]
            keep_y = self._keep_ybins[group]
            
            # an event is kept if its (xbin, ybin) pair is one of the
            # high-density bins found by estimate()
            for (xbin, ybin) in zip(keep_x, keep_y):
                group_keep = group_keep | ((cX == xbin) & (cY == ybin))
                            
            event_assignments.iloc[group_idx] = group_keep
                    
        new_experiment = experiment.clone()
        
        new_experiment.add_condition(self.name, "bool", event_assignments)

        new_experiment.history.append(self.clone_traits(transient = lambda _: True))
        return new_experiment
Ejemplo n.º 22
0
    def apply(self, experiment):
        """
        Applies the binning to an experiment.
        
        Parameters
        ----------
        experiment : Experiment
            the old_experiment to which this op is applied
            
        Returns
        -------
        Experiment
            A new experiment with a condition column named :attr:`name`, which
            contains the location of the left-most edge of the bin that the
            event is in.  If :attr:`bin_count_name` is set, another column
            is added with that name as well, containing the number of events
            in the same bin as the event.

        """
        if experiment is None:
            raise util.CytoflowOpError('experiment', "no experiment specified")

        if not self.name:
            raise util.CytoflowOpError('name', "Name is not set")

        if self.name != util.sanitize_identifier(self.name):
            # BUGFIX: the message previously called .format(self.name) on a
            # string with no placeholder, silently dropping the bad name.
            raise util.CytoflowOpError(
                'name',
                "Name {} can only contain letters, numbers and underscores.".
                format(self.name))

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                'name',
                "Name {} is in the experiment already".format(self.name))

        if self.bin_count_name and self.bin_count_name in experiment.data.columns:
            raise util.CytoflowOpError(
                'bin_count_name',
                "bin_count_name {} is in the experiment already".format(
                    self.bin_count_name))

        if not self.channel:
            raise util.CytoflowOpError('channel', "channel is not set")

        if self.channel not in experiment.data.columns:
            raise util.CytoflowOpError(
                'channel',
                "channel {} isn't in the experiment".format(self.channel))

        if not self.bin_width:
            raise util.CytoflowOpError('bin_width', "must set bin width")

        if not (self.scale == "linear" or self.scale == "log"):
            raise util.CytoflowOpError(
                'scale', "Can only use binning op with linear or log scale")

        scale = util.scale_factory(self.scale,
                                   experiment,
                                   channel=self.channel)

        # data extents in *scaled* space; clip first so log-scale zeros and
        # negatives don't produce NaN/-inf
        scaled_min = scale(scale.clip(experiment.data[self.channel]).min())
        scaled_max = scale(scale.clip(experiment.data[self.channel]).max())

        # anchor the bin edges: 0 in linear space, 1 (== scale(x) of the
        # scale's unit) in log space
        if self.scale == 'linear':
            start = 0
        else:
            start = 1

        # build the edges below the anchor by stepping in the negative
        # direction, then flip them back into ascending order (dropping the
        # duplicated anchor edge)
        scaled_bins_left = np.arange(start=-1.0 * start,
                                     stop=(-1.0 * scaled_min) + self.bin_width,
                                     step=self.bin_width) * -1.0
        scaled_bins_left = scaled_bins_left[::-1][:-1]

        scaled_bins_right = np.arange(start=start,
                                      stop=scaled_max + self.bin_width,
                                      step=self.bin_width)
        scaled_bins = np.append(scaled_bins_left, scaled_bins_right)

        if len(scaled_bins) > self._max_num_bins:
            raise util.CytoflowOpError(
                None, "Too many bins! To increase this limit, "
                "change _max_num_bins (currently {})".format(
                    self._max_num_bins))

        if len(scaled_bins) < 2:
            raise util.CytoflowOpError('bin_width',
                                       "Must have more than one bin")

        # now, back into data space
        bins = scale.inverse(scaled_bins)

        # reduce to 4 sig figs
        bins = ['%.4g' % x for x in bins]
        bins = [float(x) for x in bins]
        bins = np.array(bins)

        # put the data in bins; dropping the outermost edges makes digitize
        # assign out-of-range events to the first/last bin instead of a
        # non-existent one
        bin_idx = np.digitize(experiment.data[self.channel], bins[1:-1])

        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name, "float64", bins[bin_idx])

        # keep track of the bins we used, for prettier plotting later.
        new_experiment.metadata[self.name]["bin_scale"] = self.scale
        new_experiment.metadata[self.name]["bins"] = bins

        if self.bin_count_name:
            # TODO - this is a HUGE memory hog?!
            # TODO - fix this, then turn it on by default
            agg_count = new_experiment.data.groupby(self.name).count()
            agg_count = agg_count[agg_count.columns[0]]

            # have to make the condition a float64, because if we're in log
            # space there may be events that have NaN as the bin number.

            new_experiment.add_condition(
                self.bin_count_name, "float64",
                new_experiment[self.name].map(agg_count))

        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))
        return new_experiment
Ejemplo n.º 23
0
    def apply(self, experiment):
        """
        Applies :attr:`function` to a statistic.
        
        Parameters
        ----------
        experiment : Experiment
            The experiment to apply the operation to
        
        Returns
        -------
        Experiment
            The same as the old experiment, but with a new statistic that
            results from applying :attr:`function` to the statistic specified
            in :attr:`statistic`.
        """
        
        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "Must specify an experiment")

        if not self.name:
            raise util.CytoflowOpError('name',
                                       "Must specify a name")
        
        if self.name != util.sanitize_identifier(self.name):
            # BUGFIX: the message previously called .format(self.name) on a
            # string with no placeholder, silently dropping the bad name.
            raise util.CytoflowOpError('name',
                                       "Name {} can only contain letters, numbers and underscores."
                                       .format(self.name)) 
        
        if not self.statistic:
            raise util.CytoflowViewError('statistic',
                                         "Statistic not set")
        
        if self.statistic not in experiment.statistics:
            raise util.CytoflowViewError('statistic',
                                         "Can't find the statistic {} in the experiment"
                                         .format(self.statistic))
        else:
            stat = experiment.statistics[self.statistic]

        if not self.function:
            raise util.CytoflowOpError('function',
                                       "Must specify a function")
            
        stat_name = (self.name, self.statistic_name) \
                     if self.statistic_name \
                     else (self.name, self.function.__name__)
                     
        if stat_name in experiment.statistics:
            raise util.CytoflowOpError('name',
                                       "{} is already in the experiment's statistics"
                                       .format(stat_name))

        for b in self.by:
            if b not in stat.index.names:
                raise util.CytoflowOpError('by',
                                           "{} is not a statistic index; "
                                           " must be one of {}"
                                           .format(b, stat.index.names))
                
        data = stat.reset_index()
                
        if self.by:
            idx = pd.MultiIndex.from_product([data[x].unique() for x in self.by], 
                                             names = self.by)
        else:
            idx = stat.index.copy()
                    
        new_stat = pd.Series(data = self.fill,
                             index = idx, 
                             dtype = np.dtype(object)).sort_index()
                    
        if self.by:                         
            for group in data[self.by].itertuples(index = False, name = None):                
                if isinstance(stat.index, pd.MultiIndex):
                    s = stat.xs(group, level = self.by, drop_level = False)
                else:
                    s = stat.loc[list(group)]
                                    
                if len(s) == 0:
                    continue
    
                try:
                    new_stat[group] = self.function(s)
                except Exception as e:
                    raise util.CytoflowOpError('function',
                                               "Your function threw an error in group {}".format(group)) from e
                                        
                # check for, and warn about, NaNs.
                if np.any(np.isnan(new_stat.loc[group])):
                    warn("Category {} returned {}".format(group, new_stat.loc[group]), 
                         util.CytoflowOpWarning)
                    
        else:
            new_stat = self.function(stat)
            
            if not isinstance(new_stat, pd.Series):
                raise util.CytoflowOpError('by',
                                           "Transform function {} does not return a Series; "
                                           "in this case, you must set 'by'"
                                           .format(self.function))
                
        new_stat.name = "{} : {}".format(stat_name[0], stat_name[1])
                                                    
        # Did self.function return a Series for each group whose index matches
        # that group's slice of the original statistic?  If so, we can splice
        # them back together into one long Series below.
        matched_series = True
        # BUGFIX: only run this check when we grouped by something.  Its
        # result is only consumed when len(self.by) > 0, and with an empty
        # 'by' iterating data[[]] yields empty tuples that mis-index 'stat'.
        if self.by:
            for group in data[self.by].itertuples(index = False, name = None):
                if isinstance(stat.index, pd.MultiIndex):
                    s = stat.xs(group, level = self.by, drop_level = False)
                else:
                    s = stat.loc[list(group)]

                if isinstance(new_stat.loc[group], pd.Series) and \
                    s.index.equals(new_stat.loc[group].index):
                    pass
                else:
                    matched_series = False
                    break
            
        if matched_series and len(self.by) > 0:
            new_stat = pd.concat(new_stat.values)
            
        # try to convert to numeric, but if there are non-numeric bits ignore
        new_stat = pd.to_numeric(new_stat, errors = 'ignore')
        
        # sort the index, for performance
        new_stat = new_stat.sort_index()
        
        new_experiment = experiment.clone()
        new_experiment.history.append(self.clone_traits(transient = lambda t: True))
        if self.statistic_name:
            new_experiment.statistics[(self.name, self.statistic_name)] = new_stat
        else:
            new_experiment.statistics[(self.name, self.function.__name__)] = new_stat

        return new_experiment
Ejemplo n.º 24
0
    def apply(self, experiment):
        """
        Apply the KMeans clustering to the data.
        
        Returns
        -------
        Experiment
            a new Experiment with one additional :attr:`~Experiment.condition` 
            named :attr:`name`, of type ``category``.  The new category has 
            values  ``name_1, name_2, etc`` to indicate which k-means cluster 
            an event is a member of.
            
            The new :class:`.Experiment` also has one new statistic called
            ``centers``, which is a list of tuples encoding the centroids of each
            k-means cluster.
        """
 
        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")
         
        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError('name',
                                       "You have to set the gate's name "
                                       "before applying it!")
            
        if self.name != util.sanitize_identifier(self.name):
            # BUGFIX: the message previously called .format(self.name) on a
            # string with no placeholder, silently dropping the bad name.
            raise util.CytoflowOpError('name',
                                       "Name {} can only contain letters, numbers and underscores."
                                       .format(self.name)) 
         
        if self.name in experiment.data.columns:
            raise util.CytoflowOpError('name',
                                       "Experiment already has a column named {0}"
                                       .format(self.name))
            
        if not self._kmeans:
            raise util.CytoflowOpError(None, 
                                       "No components found.  Did you forget to "
                                       "call estimate()?")
         
        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")
 
        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError('channels',
                                           "Channel {0} not found in the experiment"
                                      .format(c))
                 
        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError('scale',
                                           "Scale set for channel {0}, but it isn't "
                                           "in the experiment"
                                           .format(c))
        
        # NOTE(review): this checks experiment.data but the message (and the
        # sibling mixture-model op) refer to experiment.conditions -- confirm
        # whether grouping by a non-condition column is intended.
        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))
        
                 
        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)
                 
        event_assignments = pd.Series(["{}_None".format(self.name)] * len(experiment), dtype = "object")
         
        # make the statistics       
        clusters = [x + 1 for x in range(self.num_clusters)]
          
        idx = pd.MultiIndex.from_product([experiment[x].unique() for x in self.by] + [clusters] + [self.channels], 
                                         names = list(self.by) + ["Cluster"] + ["Channel"])
        centers_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()
                     
        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError('by',
                                           "Group {} had no data"
                                           .format(group))
            
            if group not in self._kmeans:
                raise util.CytoflowOpError('by',
                                           "Group {} not found in the estimated model. "
                                           "Do you need to re-run estimate()?"
                                           .format(group))    
            
            # scale each channel into the space the model was fit in
            x = data_subset.loc[:, self.channels[:]]
            for c in self.channels:
                x[c] = self._scale[c](x[c])
                 
            # which values are missing?
 
            x_na = pd.Series([False] * len(x))
            for c in self.channels:
                x_na[np.isnan(x[c]).values] = True
                         
            x = x.values
            x_na = x_na.values
            group_idx = groupby.groups[group]
            
            kmeans = self._kmeans[group]
  
            # events with missing data stay at cluster -1 ("None")
            predicted = np.full(len(x), -1, "int")
            predicted[~x_na] = kmeans.predict(x[~x_na])
                 
            predicted_str = pd.Series(["(none)"] * len(predicted))
            for c in range(0, self.num_clusters):
                predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
            predicted_str[predicted == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx
      
            event_assignments.iloc[group_idx] = predicted_str
            
            for c in range(self.num_clusters):
                # build the statistic index key for this (group, cluster)
                if len(self.by) == 0:
                    # consistency: was a bare list; every other branch (and
                    # the GMM twin of this op) builds a tuple
                    g = tuple([c + 1])
                elif hasattr(group, '__iter__') and not isinstance(group, (str, bytes)):
                    g = tuple(list(group) + [c + 1])
                else:
                    g = tuple([group] + [c + 1])
                
                for cidx1, channel1 in enumerate(self.channels):
                    g2 = tuple(list(g) + [channel1])
                    # store centroids back in un-scaled (data) space
                    centers_stat.loc[g2] = self._scale[channel1].inverse(kmeans.cluster_centers_[c, cidx1])
         
        new_experiment = experiment.clone()          
        new_experiment.add_condition(self.name, "category", event_assignments)
        
        new_experiment.statistics[(self.name, "centers")] = pd.to_numeric(centers_stat)
 
        new_experiment.history.append(self.clone_traits(transient = lambda _: True))
        return new_experiment
Ejemplo n.º 25
0
    def add_condition(self, name, dtype, data=None):
        """
        Add a new column of per-event metadata to this :class:`Experiment`.
        
        .. note::
            :meth:`add_condition` operates **in place.**
        
        There are two places to call `add_condition`.
        
          - As you're setting up a new :class:`Experiment`, call 
            :meth:`add_condition` with ``data`` set to ``None`` to specify the 
            conditions the new events will have.
          - If you compute some new per-event metadata on an existing 
            :class:`Experiment`, call :meth:`add_condition` to add it. 
        
        Parameters
        ----------
        name : String
            The name of the new column in :attr:`data`.  Must be a valid Python
            identifier: must start with ``[A-Za-z_]`` and contain only the 
            characters ``[A-Za-z0-9_]``.
        
        dtype : String
            The type of the new column in :attr:`data`.  Must be a string that
            :class:`pandas.Series` recognizes as a ``dtype``: common types are 
            ``category``, ``float``, ``int``, and ``bool``.
            
        data : pandas.Series (default = None)
            The :class:`pandas.Series` to add to :attr:`data`.  Must be the same
            length as :attr:`data`, and it must be convertable to a 
            :class:`pandas.Series` of type ``dtype``.  If ``None``, will add an
            empty column to the :class:`Experiment` ... but the 
            :class:`Experiment` must be empty to do so!
             
        Raises
        ------
        :class:`.CytoflowError`
            If the :class:`pandas.Series` passed in ``data`` isn't the same 
            length as :attr:`data`, or isn't convertable to type ``dtype``.          
            
        Examples
        --------
        >>> import cytoflow as flow
        >>> ex = flow.Experiment()
        >>> ex.add_condition("Time", "float")
        >>> ex.add_condition("Strain", "category")      
        
        """

        # validate the name: must be a Python identifier, and must not
        # collide with an existing column
        if name != util.sanitize_identifier(name):
            raise util.CytoflowError(
                "Name '{}' is not a valid Python identifier".format(name))

        if name in self.data:
            raise util.CytoflowError(
                "Already a column named {0} in self.data".format(name))

        # an empty (data is None) condition may only be added before any
        # events are; a real Series must match the event count exactly
        if data is None:
            if len(self) > 0:
                raise util.CytoflowError(
                    "If data is None, self.data must be empty!")
        elif len(data) != len(self):
            raise util.CytoflowError(
                "data must be the same length as self.data")

        try:
            if data is None:
                new_column = pd.Series(dtype=dtype)
            else:
                new_column = data.astype(dtype, copy=True)
            self.data[name] = new_column
        except (ValueError, TypeError) as exc:
            raise util.CytoflowError(
                "Had trouble converting data to type {0}".format(
                    dtype)) from exc

        # register the new column as a condition in the experiment metadata
        self.metadata[name] = {'type': "condition"}
Ejemplo n.º 26
0
    def apply(self, experiment=None, metadata_only=False):
        """
        Load a new :class:`.Experiment`.  
        
        Parameters
        ----------
        experiment : Experiment
            Ignored
            
        metadata_only : bool (default = False)
            Only "import" the metadata, creating an Experiment with all the
            expected metadata and structure but 0 events.
        
        Returns
        -------
        Experiment
            The new :class:`.Experiment`.  New channels have the following
            metadata:
            
            - **voltage** - int
                The voltage that this channel was collected at.  Determined
                by the ``$PnV`` field from the first FCS file.
                
            - **range** - int
                The maximum range of this channel.  Determined by the ``$PnR``
                field from the first FCS file.
                
            New experimental conditions do not have **voltage** or **range**
            metadata, obviously.  Instead, they have **experiment** set to 
            ``True``, to distinguish the experimental variables from the
            conditions that were added by gates, etc.
            
            If :attr:`ignore_v` is set, it is added as a key to the 
            :class:`.Experiment`-wide metadata.
            
        """

        if not self.tubes or len(self.tubes) == 0:
            raise util.CytoflowOpError('tubes', "Must specify some tubes!")

        # if we have channel renaming, make sure the new names are valid
        # python identifiers
        if self.channels:
            for old_name, new_name in self.channels.items():
                if old_name != new_name and new_name != util.sanitize_identifier(
                        new_name):
                    raise util.CytoflowOpError(
                        'channels', "Channel name {} must be a "
                        "valid Python identifier.".format(new_name))

        # make sure each tube has the same conditions
        tube0_conditions = set(self.tubes[0].conditions)
        for tube in self.tubes:
            tube_conditions = set(tube.conditions)
            if len(tube0_conditions ^ tube_conditions) > 0:
                raise util.CytoflowOpError(
                    'tubes', "Tube {0} didn't have the same "
                    "conditions as tube {1}".format(tube.file,
                                                    self.tubes[0].file))

        # make sure experimental conditions are unique
        for idx, i in enumerate(self.tubes[0:-1]):
            for j in self.tubes[idx + 1:]:
                if i.conditions_equal(j):
                    raise util.CytoflowOpError(
                        'tubes', "The same conditions specified for "
                        "tube {0} and tube {1}".format(i.file, j.file))

        experiment = Experiment()

        experiment.metadata["ignore_v"] = self.ignore_v

        for condition, dtype in list(self.conditions.items()):
            experiment.add_condition(condition, dtype)
            experiment.metadata[condition]['experiment'] = True

        try:
            # silence warnings about duplicate channels;
            # we'll figure that out below
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                tube0_meta = fcsparser.parse(self.tubes[0].file,
                                             data_set=self.data_set,
                                             meta_data_only=True,
                                             reformat_meta=True)
        except Exception as e:
            raise util.CytoflowOpError(
                'tubes', "FCS reader threw an error reading metadata "
                "for tube {}: {}".format(self.tubes[0].file, str(e))) from e

        meta_channels = tube0_meta["_channels_"]

        if self.name_metadata:
            experiment.metadata["name_metadata"] = self.name_metadata
        else:
            experiment.metadata["name_metadata"] = autodetect_name_metadata(
                self.tubes[0].file, data_set=self.data_set)

        meta_channels['Index'] = meta_channels.index
        meta_channels.set_index(experiment.metadata["name_metadata"],
                                inplace=True)

        channels = list(self.channels.keys()) if self.channels \
                   else list(meta_channels.index.values)

        # make sure everything in self.channels is in the tube channels
        for channel in channels:
            if channel not in meta_channels.index:
                raise util.CytoflowOpError(
                    'channels', "Channel {0} not in tube {1}".format(
                        channel, self.tubes[0].file))

        # now that we have the metadata, load it into experiment

        for channel in channels:
            experiment.add_channel(channel)

            experiment.metadata[channel]["fcs_name"] = channel

            # keep track of the channel's PMT voltage
            if ("$PnV" in meta_channels.loc[channel]):
                v = meta_channels.loc[channel]['$PnV']
                if v: experiment.metadata[channel]["voltage"] = v

            # add the maximum possible value for this channel.
            data_range = meta_channels.loc[channel]['$PnR']
            data_range = float(data_range)
            experiment.metadata[channel]['range'] = data_range

        experiment.metadata['fcs_metadata'] = {}
        for tube in self.tubes:
            if metadata_only:
                tube_meta, tube_data = parse_tube(tube.file,
                                                  experiment,
                                                  data_set=self.data_set,
                                                  metadata_only=True)
            else:
                tube_meta, tube_data = parse_tube(tube.file,
                                                  experiment,
                                                  data_set=self.data_set)

                if self.events:
                    if self.events <= len(tube_data):
                        tube_data = tube_data.loc[np.random.choice(
                            tube_data.index, self.events, replace=False)]
                    else:
                        warnings.warn(
                            "Only {0} events in tube {1}".format(
                                len(tube_data), tube.file),
                            util.CytoflowWarning)

                experiment.add_events(tube_data[channels], tube.conditions)

            # extract the row and column from wells collected on a
            # BD HTS
            if 'WELL ID' in tube_meta:
                pos = tube_meta['WELL ID']
                tube_meta['CF_Row'] = pos[0]
                tube_meta['CF_Col'] = int(pos[1:3])

            for i, channel in enumerate(channels):
                # remove the PnV tube metadata

                if '$P{}V'.format(i + 1) in tube_meta:
                    del tube_meta['$P{}V'.format(i + 1)]

                # work around a bug where the PnR is sometimes not the detector range
                # but the data range.
                pnr = '$P{}R'.format(i + 1)
                if pnr in tube_meta and float(
                        tube_meta[pnr]
                ) > experiment.metadata[channel]['range']:
                    experiment.metadata[channel]['range'] = float(
                        tube_meta[pnr])

            tube_meta['CF_File'] = Path(tube.file).stem

            experiment.metadata['fcs_metadata'][tube.file] = tube_meta

        for channel in channels:
            if self.channels and channel in self.channels:
                new_name = self.channels[channel]
                if channel == new_name:
                    continue
                experiment.data.rename(columns={channel: new_name},
                                       inplace=True)
                experiment.metadata[new_name] = experiment.metadata[channel]
                experiment.metadata[new_name]["fcs_name"] = channel
                del experiment.metadata[channel]

            # this catches an odd corner case where some instruments store
            # instrument-specific info in the "extra" bits.  we have to
            # clear them out.
            if tube0_meta['$DATATYPE'] == 'I':
                data_bits = int(meta_channels.loc[channel]['$PnB'])
                data_range = float(meta_channels.loc[channel]['$PnR'])
                range_bits = int(math.log(data_range, 2))

                if range_bits < data_bits:
                    mask = 1
                    for _ in range(1, range_bits):
                        mask = mask << 1 | 1

                    experiment.data[channel] = experiment.data[
                        channel].values.astype('int') & mask

            # re-scale the data to linear if if's recorded as log-scaled with
            # integer channels
            data_range = float(meta_channels.loc[channel]['$PnR'])
            f1 = float(meta_channels.loc[channel]['$PnE'][0])
            f2 = float(meta_channels.loc[channel]['$PnE'][1])

            if f1 > 0.0 and f2 == 0.0:
                warnings.warn(
                    'Invalid $PnE = {},{} for channel {}, changing it to {},1.0'
                    .format(f1, f2, channel, f1), util.CytoflowWarning)
                f2 = 1.0

            if f1 > 0.0 and f2 > 0.0 and tube0_meta['$DATATYPE'] == 'I':
                warnings.warn(
                    'Converting channel {} from logarithmic to linear'.format(
                        channel), util.CytoflowWarning)


#                 experiment.data[channel] = 10 ** (f1 * experiment.data[channel] / data_range) * f2

        return experiment
Ejemplo n.º 27
0
    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in :meth:`estimate`.
        
        Parameters
        ----------
        experiment : Experiment
            the :class:`.Experiment` to apply this operation to
        
        Returns
        -------
        Experiment
            A new :class:`.Experiment`, with a new column named :attr:`name`,
            and possibly one named :attr:`name` _Posterior.  Also the following
            new :attr:`~.Experiment.statistics`:
            
            - **mean** : Float
                the mean of the fitted gaussian
            
            - **stdev** : Float
                the inverse-scaled standard deviation of the fitted gaussian.  on a 
                linear scale, this is in the same units as the mean; on a log scale,
                this is a scalar multiple; and on a logicle scale, this is probably
                meaningless!
            
            - **interval** : (Float, Float)
                the inverse-scaled (mean - stdev, mean + stdev) of the fitted gaussian.
                this is likely more meaningful than ``stdev``, especially on the
                ``logicle`` scale.
            
            - **proportion** : Float
                the proportion of events in each component of the mixture model.  only
                set if :attr:`num_components` ``> 1``.
             
        Raises
        ------
        util.CytoflowOpError
            if the operation's parameters are inconsistent with this experiment
        """
        
        warn("GaussianMixture1DOp is DEPRECATED.  Please use GaussianMixtureOp.",
             util.CytoflowOpWarning)
            
        if experiment is None:
            raise util.CytoflowOpError('experiment',
                                       "No experiment specified")

        if not self._gmms:
            raise util.CytoflowOpError(None,
                                       "No model found.  Did you forget to "
                                       "call estimate()?")
        
        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError('name',
                                       "You have to set the gate's name "
                                       "before applying it!")
            
        # the name becomes a DataFrame column, so it must be a valid identifier.
        # NOTE(review): the .format() call below has no {} placeholder, so the
        # name is never actually interpolated into the message.
        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError('name',
                                       "Name can only contain letters, numbers and underscores."
                                       .format(self.name))  

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError('name',
                                       "Experiment already has a column named {0}"
                                       .format(self.name))
            
        # NOTE(review): duplicate of the self._gmms check above
        if not self._gmms:
            raise util.CytoflowOpError(None,
                                       "No components found.  Did you forget to "
                                       "call estimate()?")

        if not self._scale:
            raise util.CytoflowOpError(None,
                                       "Couldn't find _scale.  What happened??")

        if self.channel not in experiment.data:
            raise util.CytoflowOpError('channel',
                                       "Column {0} not found in the experiment"
                                       .format(self.channel))

        if self.posteriors:
            col_name = "{0}_Posterior".format(self.name)
            if col_name in experiment.data:
                raise util.CytoflowOpError('posteriors',
                                           "Column {0} already found in the experiment"
                              .format(col_name))
       
        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError('by',
                                           "Aggregation metadata {} not found, "
                                           "must be one of {}"
                                           .format(b, experiment.conditions))
                           
        if self.sigma < 0.0:
            raise util.CytoflowOpError('sigma',
                                       "sigma must be >= 0.0")

        # group the events by the 'by' conditions; each group was fitted with
        # its own GMM in estimate()
        if self.by:
            by = sorted(self.by)
            groupby = experiment.data.groupby(by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda _: True)

        # per-event component assignment (string labels), filled in per group
        event_assignments = pd.Series([None] * len(experiment), dtype = "object")
                                      
        if self.posteriors:
            event_posteriors = pd.Series([0.0] * len(experiment))
            
        # what we DON'T want to do is iterate through event-by-event.
        # the more of this we can push into numpy, sklearn and pandas,
        # the faster it's going to be.
        
        for group, data_subset in groupby:
            
            # if there weren't any events in this group, there's no gmm
            if group not in self._gmms:
                warn("There wasn't a GMM for data subset {}".format(group),
                     util.CytoflowOpWarning)
                continue
            
            gmm = self._gmms[group]
            x = data_subset[self.channel]
            x = self._scale(x).values
                        
            # which values are missing (e.g. couldn't be scaled)?
            x_na = np.isnan(x)
            
            group_idx = groupby.groups[group]
            
            # make a preliminary assignment; NaN events stay at -1 ("none")
            predicted = np.full(len(x), -1, "int")
            predicted[~x_na] = gmm.predict(x[~x_na, np.newaxis])
            
            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma > 0.0:
                
                # make a quick dataframe with the value and the predicted
                # component
                gate_df = pd.DataFrame({"x" : x, "p" : predicted})

                # for each component, get the low and the high threshold
                for c in range(0, self.num_components):
                    lo = (gmm.means_[c][0]    # @UnusedVariable
                          - self.sigma * np.sqrt(gmm.covariances_[c][0]))
                    hi = (gmm.means_[c][0]    # @UnusedVariable
                          + self.sigma * np.sqrt(gmm.covariances_[c][0]))
                    
                    # and build an expression with numexpr so it evaluates fast!
                    gate_bool = gate_df.eval("p == @c and x >= @lo and x <= @hi").values
                    predicted[np.logical_and(predicted == c, gate_bool == False)] = -1
        
            # convert numeric component indices to string labels
            # "{name}_1" .. "{name}_n"; unassigned events get "{name}_None"
            predicted_str = pd.Series(["(none)"] * len(predicted))
            for c in range(0, self.num_components):
                predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
            predicted_str[predicted == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            event_assignments.iloc[group_idx] = predicted_str
                                
            if self.posteriors:
                # keep only the posterior probability of the ASSIGNED component
                probability = np.full((len(x), self.num_components), 0.0, "float")
                probability[~x_na, :] = gmm.predict_proba(x[~x_na, np.newaxis])
                posteriors = pd.Series([0.0] * len(predicted))
                for i in range(0, self.num_components):
                    posteriors[predicted == i] = probability[predicted == i, i]
                posteriors.index = group_idx
                event_posteriors.iloc[group_idx] = posteriors
                    
        new_experiment = experiment.clone()
        
        # one component + sigma gate -> a boolean "in the gate" condition;
        # multiple components -> a categorical condition
        if self.num_components == 1 and self.sigma > 0:
            new_experiment.add_condition(self.name, "bool", event_assignments == "{0}_1".format(self.name))
        elif self.num_components > 1:
            new_experiment.add_condition(self.name, "category", event_assignments)
            
        if self.posteriors and self.num_components > 1:
            col_name = "{0}_Posterior".format(self.name)
            new_experiment.add_condition(col_name, "float", event_posteriors)

        # add the statistics
        levels = list(self.by)
        if self.num_components > 1:
            levels.append(self.name)
        
        if levels:     
            # index over the cartesian product of all level values
            idx = pd.MultiIndex.from_product([new_experiment[x].unique() for x in levels], 
                                             names = levels)
    
            mean_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()
            stdev_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()
            interval_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()
            prop_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()     
                                   
            for group, _ in groupby:
                gmm = self._gmms[group]
                for c in range(self.num_components):
                    if self.num_components > 1:
                        component_name = "{}_{}".format(self.name, c + 1)

                        # build the MultiIndex key: group value(s) + component
                        if group is True:
                            g = [component_name]
                        elif isinstance(group, tuple):
                            g = list(group)
                            g.append(component_name)
                        else:
                            g = list([group])
                            g.append(component_name)
                        
                        if len(g) > 1:
                            g = tuple(g)
                        else:
                            g = (g[0],)
                    else:
                        g = group

                    # statistics are reported back on the data (inverse) scale
                    mean_stat.at[g] = self._scale.inverse(gmm.means_[c][0])
                    stdev_stat.at[g] = self._scale.inverse(np.sqrt(gmm.covariances_[c][0]))[0]
                    interval_stat.at[g] = (self._scale.inverse(gmm.means_[c][0] - np.sqrt(gmm.covariances_[c][0][0])),
                                            self._scale.inverse(gmm.means_[c][0] + np.sqrt(gmm.covariances_[c][0][0])))
                    prop_stat.at[g] = gmm.weights_[c]
                     
            new_experiment.statistics[(self.name, "mean")] = pd.to_numeric(mean_stat)
            new_experiment.statistics[(self.name, "stdev")] = pd.to_numeric(stdev_stat)
            new_experiment.statistics[(self.name, "interval")] = interval_stat
            if self.num_components > 1:
                new_experiment.statistics[(self.name, "proportion")] = pd.to_numeric(prop_stat)
            
        new_experiment.history.append(self.clone_traits(transient = lambda _: True))
        return new_experiment
Ejemplo n.º 28
0
    def apply(self, experiment):
        """Applies the range gate to an experiment.
        
        Parameters
        ----------
        experiment : Experiment
            the old_experiment to which this op is applied
            
        Returns
        -------
        Experiment
            a new experiment, the same as old :class:`~Experiment` but with a new
            column of type ``bool`` with the same as the operation name.  The 
            bool is ``True`` if the event's measurement in :attr:`channel` is 
            greater than :attr:`low` and less than :attr:`high`; it is ``False`` 
            otherwise.
            
        Raises
        ------
        util.CytoflowOpError
            if the operation's parameters are inconsistent with this experiment
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError(
                'name', "You have to set the gate's name "
                "before applying it!")

        # the name becomes a DataFrame column, so it must be a valid
        # python identifier.
        # BUGFIX: the message previously called .format(self.name) on a
        # string with no placeholder, silently dropping the name.
        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError(
                'name',
                "Name {0} can only contain letters, numbers and underscores."
                .format(self.name))

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                'name',
                "Experiment already has a column named {0}".format(self.name))

        if not self.channel:
            raise util.CytoflowOpError('channel', "Channel not specified")

        if self.channel not in experiment.channels:
            raise util.CytoflowOpError(
                'channel',
                "Channel {0} not in the experiment".format(self.channel))

        if self.high <= self.low:
            raise util.CytoflowOpError('high',
                                       "range high must be > range low")

        # sanity-check the range against the data so an empty gate is
        # caught early rather than producing an all-False condition
        if self.high <= experiment[self.channel].min():
            raise util.CytoflowOpError(
                'high', "range high must be > {0}".format(
                    experiment[self.channel].min()))
        if self.low >= experiment[self.channel].max():
            raise util.CytoflowOpError(
                'low', "range low must be < {0}".format(
                    experiment[self.channel].max()))

        # pandas .between() is inclusive on both ends
        gate = experiment[self.channel].between(self.low, self.high)
        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name, "bool", gate)
        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))

        return new_experiment
Ejemplo n.º 29
0
    def apply(self, experiment):
        """Apply the summary function to subsets of the experiment,
        adding the result as a new statistic.
        
        Parameters
        ----------
        experiment : Experiment
            the experiment to compute the statistic on
            
        Returns
        -------
        Experiment
            a clone of *experiment* with the new statistic added under the
            key ``(name, statistic_name or function.__name__)``.
            
        Raises
        ------
        util.CytoflowOpError
            if the operation's parameters are inconsistent with this experiment
        """
        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if not self.name:
            raise util.CytoflowOpError('name', "Must specify a name")

        # the name becomes part of a statistics key, so it must be a valid
        # python identifier.
        # BUGFIX: the message previously called .format(self.name) on a
        # string with no placeholder, silently dropping the name.
        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError(
                'name',
                "Name {0} can only contain letters, numbers and underscores."
                .format(self.name))

        if not self.function:
            raise util.CytoflowOpError('function', "Must specify a function")

        if not self.by:
            raise util.CytoflowOpError(
                'by', "Must specify some grouping conditions "
                "in 'by'")

        stat_name = (self.name, self.statistic_name) \
                     if self.statistic_name \
                     else (self.name, self.function.__name__)

        if stat_name in experiment.statistics:
            raise util.CytoflowOpError(
                'name', "{} is already in the experiment's statistics".format(
                    stat_name))

        new_experiment = experiment.clone()

        # optionally restrict the data with a subset expression BEFORE grouping
        if self.subset:
            try:
                experiment = experiment.query(self.subset)
            except Exception as e:
                raise util.CytoflowOpError(
                    'subset', "Subset string '{0}' isn't valid".format(
                        self.subset)) from e

            if len(experiment) == 0:
                raise util.CytoflowOpError(
                    'subset', "Subset string '{0}' returned no events".format(
                        self.subset))

        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError(
                    'by', "Aggregation metadata {} not found, "
                    " must be one of {}".format(b, experiment.conditions))
            unique = experiment.data[b].unique()

            if len(unique) == 1:
                warn("Only one category for {}".format(b),
                     util.CytoflowOpWarning)

        groupby = experiment.data.groupby(self.by)

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                warn("Group {} had no data".format(group),
                     util.CytoflowOpWarning)

        # index over the full cartesian product of the 'by' values, so
        # empty groups keep the `fill` default
        idx = pd.MultiIndex.from_product(
            [experiment[x].unique() for x in self.by], names=self.by)

        stat = pd.Series(data=self.fill,
                         index=idx,
                         name="{} : {}".format(stat_name[0], stat_name[1]),
                         dtype=np.dtype(object)).sort_index()

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                continue

            try:
                stat.loc[group] = self.function(data_subset)

            except Exception as e:
                raise util.CytoflowOpError(
                    'function',
                    "Your function threw an error in group {}".format(
                        group)) from e

            # check for, and warn about, NaNs.
            if np.any(np.isnan(stat.loc[group])):
                warn("Category {} returned {}".format(group, stat.loc[group]),
                     util.CytoflowOpWarning)

        # try to convert to numeric, but if there are non-numeric bits ignore.
        # NOTE(review): errors='ignore' is deprecated in pandas >= 2.1 and
        # removed in pandas 3 -- consider a try/except around pd.to_numeric.
        stat = pd.to_numeric(stat, errors='ignore')

        new_experiment.history.append(
            self.clone_traits(transient=lambda t: True))
        new_experiment.statistics[stat_name] = stat

        return new_experiment
Ejemplo n.º 30
0
 def reset_channels(self):
     """Rebuild ``channels_list`` from the original channel names, giving
     each channel a sanitized (identifier-safe) display name."""
     rebuilt = []
     for channel_name in self.original_channels:
         safe_name = util.sanitize_identifier(channel_name)
         rebuilt.append(Channel(channel=channel_name, name=safe_name))
     self.channels_list = rebuilt
Ejemplo n.º 31
0
 def validate(self, obj, name, value):
     """Accept *value* only if it is already a sanitized Python identifier.
     
     Delegates basic validation to the superclass, then rejects any value
     that sanitizing would change."""
     candidate = super(ValidPythonIdentifier, self).validate(obj, name, value)
     if candidate != util.sanitize_identifier(candidate):
         # not a valid identifier -- self.error() raises a TraitError
         self.error(obj, name, candidate)
     return candidate
Ejemplo n.º 32
0
    def apply(self, experiment):
        """Applies the polygon gate to an experiment.
        
        Parameters
        ----------
        experiment : Experiment
            the old :class:`Experiment` to which this op is applied
            
        Returns
        -------
        Experiment
            a new :class:`Experiment`, the same as ``old_experiment`` but with 
            a new column of type `bool` with the same as the operation name.  
            The bool is ``True`` if the event's measurement is within the 
            polygon, and ``False`` otherwise.
            
        Raises
        ------
        util.CytoflowOpError
            if for some reason the operation can't be applied to this
            experiment. The reason is in :attr:`.CytoflowOpError.args`
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                'name', "{} is in the experiment already!".format(self.name))

        # the name becomes a DataFrame column, so it must be a valid
        # python identifier.
        # BUGFIX: the message previously called .format(self.name) on a
        # string with no placeholder, silently dropping the name.
        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError(
                'name',
                "Name {0} can only contain letters, numbers and underscores."
                .format(self.name))

        if not self.xchannel:
            raise util.CytoflowOpError('xchannel', "Must specify an x channel")

        if not self.ychannel:
            raise util.CytoflowOpError('ychannel', "Must specify a y channel")

        if self.xchannel not in experiment.channels:
            raise util.CytoflowOpError(
                'xchannel',
                "xchannel {0} is not in the experiment".format(self.xchannel))

        if self.ychannel not in experiment.channels:
            raise util.CytoflowOpError(
                'ychannel',
                "ychannel {0} is not in the experiment".format(self.ychannel))

        if len(self.vertices) < 3:
            raise util.CytoflowOpError('vertices',
                                       "Must have at least 3 vertices")

        # BUGFIX: this used to *return* the error instead of raising it,
        # so malformed vertices were silently accepted.
        if any(len(x) != 2 for x in self.vertices):
            raise util.CytoflowOpError(
                'vertices', "All vertices must be lists or tuples "
                "of length = 2")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError(
                'name', "You have to set the Polygon gate's name "
                "before applying it!")

        # there's a bit of a subtlety here: if the vertices were
        # selected with an interactive plot, and that plot had scaled
        # axes, we need to apply that scale function to both the
        # vertices and the data before looking for path membership
        xscale = util.scale_factory(self.xscale,
                                    experiment,
                                    channel=self.xchannel)
        yscale = util.scale_factory(self.yscale,
                                    experiment,
                                    channel=self.ychannel)

        vertices = [(xscale(x), yscale(y)) for (x, y) in self.vertices]
        data = experiment.data[[self.xchannel, self.ychannel]].copy()
        data[self.xchannel] = xscale(data[self.xchannel])
        data[self.ychannel] = yscale(data[self.ychannel])

        # use a matplotlib Path because testing for membership is a fast C fn.
        path = mpl.path.Path(np.array(vertices))
        xy_data = data[[self.xchannel, self.ychannel]].values

        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name, "bool",
                                     path.contains_points(xy_data))
        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))

        return new_experiment
Ejemplo n.º 33
0
    def apply(self, experiment):
        """Applies the 2D range gate to an experiment.
        
        Parameters
        ----------
        experiment : Experiment
            the old_experiment to which this op is applied
            
        Returns
        -------
        Experiment
            a new :class:`~Experiment`, the same as the old experiment but with 
            a new column with a data type of ``bool`` and the same as the 
            operation :attr:`name`.  The bool is ``True`` if the event's 
            measurement in :attr:`xchannel` is greater than :attr:`xlow` and
            less than :attr:`high`, and the event's measurement in 
            :attr:`ychannel` is greater than :attr:`ylow` and less than 
            :attr:`yhigh`; it is ``False`` otherwise.
            
        Raises
        ------
        util.CytoflowOpError
            if the operation's parameters are inconsistent with this experiment
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError(
                'name', "You have to set the gate's name "
                "before applying it!")

        # the name becomes a DataFrame column, so it must be a valid
        # python identifier.
        # BUGFIX: the message previously called .format(self.name) on a
        # string with no placeholder, silently dropping the name.
        if self.name != util.sanitize_identifier(self.name):
            raise util.CytoflowOpError(
                'name',
                "Name {0} can only contain letters, numbers and underscores."
                .format(self.name))

        # make sure old_experiment doesn't already have a column named self.name
        if (self.name in experiment.data.columns):
            raise util.CytoflowOpError(
                'name',
                "Experiment already contains a column {0}".format(self.name))

        # BUGFIX: this check used to also test ychannel but always blamed
        # 'xchannel' in the error message; ychannel has its own check below.
        if not self.xchannel:
            raise util.CytoflowOpError('xchannel', "Must specify xchannel")

        if self.xchannel not in experiment.channels:
            raise util.CytoflowOpError('xchannel',
                                       "xchannel isn't in the experiment")

        if not self.ychannel:
            raise util.CytoflowOpError('ychannel', "Must specify ychannel")

        if self.ychannel not in experiment.channels:
            raise util.CytoflowOpError('ychannel',
                                       "ychannel isn't in the experiment")

        # sanity-check each range against the data so an empty gate is
        # caught early
        if self.xhigh <= experiment[self.xchannel].min():
            raise util.CytoflowOpError(
                'xhigh', "x channel range high must be > {0}".format(
                    experiment[self.xchannel].min()))
        if self.xlow >= experiment[self.xchannel].max():
            raise util.CytoflowOpError(
                'xlow', "x channel range low must be < {0}".format(
                    experiment[self.xchannel].max()))

        if self.yhigh <= experiment[self.ychannel].min():
            raise util.CytoflowOpError(
                'yhigh', "y channel range high must be > {0}".format(
                    experiment[self.ychannel].min()))
        if self.ylow >= experiment[self.ychannel].max():
            raise util.CytoflowOpError(
                'ylow', "y channel range low must be < {0}".format(
                    experiment[self.ychannel].max()))

        # an event is in the gate iff it is in range on BOTH channels;
        # .between() is inclusive on both ends
        x = experiment[self.xchannel].between(self.xlow, self.xhigh)
        y = experiment[self.ychannel].between(self.ylow, self.yhigh)
        gate = pd.Series(x & y)

        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name, "bool", gate)
        new_experiment.history.append(
            self.clone_traits(transient=lambda t: True))
        return new_experiment
Ejemplo n.º 34
0
 def _get_subset_str(self):
     """Return a subset-expression string for the current range, or the
     empty string when the whole value range is selected."""
     whole_range = (self.low == self.values[0]
                    and self.high == self.values[-1])
     if whole_range:
         return ""
     ident = sanitize_identifier(self.name)
     return "({0} >= {1} and {0} <= {2})".format(ident, self.low, self.high)
Ejemplo n.º 35
0
    def apply(self, experiment=None):
        """
        Load a new :class:`.Experiment`.  
        
        Returns
        -------
        Experiment
            The new :class:`.Experiment`.  New channels have the following
            metadata:
            
            - **voltage** - int
                The voltage that this channel was collected at.  Determined
                by the ``$PnV`` field from the first FCS file.
                
            - **range** - int
                The maximum range of this channel.  Determined by the ``$PnR``
                field from the first FCS file.
                
            New experimental conditions do not have **voltage** or **range**
            metadata, obviously.  Instead, they have **experiment** set to 
            ``True``, to distinguish the experimental variables from the
            conditions that were added by gates, etc.
            
            If :attr:`ignore_v` is set, it is added as a key to the 
            :class:`.Experiment`-wide metadata.

        Raises
        ------
        CytoflowOpError
            If no tubes are specified, a renamed channel is not a valid
            Python identifier, tubes have mismatched or duplicate
            conditions, the FCS metadata cannot be read, or a requested
            channel is missing from the first tube.
        """

        if not self.tubes or len(self.tubes) == 0:
            raise util.CytoflowOpError('tubes', "Must specify some tubes!")

        # if we have channel renaming, make sure the new names are valid
        # python identifiers
        if self.channels:
            for old_name, new_name in self.channels.items():
                if old_name != new_name and new_name != util.sanitize_identifier(
                        new_name):
                    raise util.CytoflowOpError(
                        'channels', "Channel name {} must be a "
                        "valid Python identifier.".format(new_name))

        # make sure each tube has the same conditions
        # (symmetric difference of the condition-name sets must be empty)
        tube0_conditions = set(self.tubes[0].conditions)
        for tube in self.tubes:
            tube_conditions = set(tube.conditions)
            if len(tube0_conditions ^ tube_conditions) > 0:
                raise util.CytoflowOpError(
                    'tubes', "Tube {0} didn't have the same "
                    "conditions as tube {1}".format(tube.file,
                                                    self.tubes[0].file))

        # make sure experimental conditions are unique
        # (pairwise comparison of every tube against every later tube)
        for idx, i in enumerate(self.tubes[0:-1]):
            for j in self.tubes[idx + 1:]:
                if i.conditions_equal(j):
                    raise util.CytoflowOpError(
                        'tubes', "The same conditions specified for "
                        "tube {0} and tube {1}".format(i.file, j.file))

        experiment = Experiment()

        experiment.metadata["ignore_v"] = self.ignore_v

        # register each experimental condition and tag it so downstream
        # code can tell it apart from gate-added conditions
        for condition, dtype in list(self.conditions.items()):
            experiment.add_condition(condition, dtype)
            experiment.metadata[condition]['experiment'] = True

        try:
            # silence warnings about duplicate channels;
            # we'll figure that out below
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                tube0_meta = fcsparser.parse(self.tubes[0].file,
                                             meta_data_only=True,
                                             reformat_meta=True)
        except Exception as e:
            raise util.CytoflowOpError(
                'tubes', "FCS reader threw an error reading metadata "
                "for tube {}".format(self.tubes[0].file)) from e

        meta_channels = tube0_meta["_channels_"]

        if self.name_metadata:
            # the caller told us which FCS keyword holds the channel names
            experiment.metadata["name_metadata"] = self.name_metadata
        else:
            # try to autodetect the metadata: prefer whichever of $PnN/$PnS
            # is present, then whichever has unique values, then $PnS
            if "$PnN" in meta_channels and not "$PnS" in meta_channels:
                experiment.metadata["name_metadata"] = "$PnN"
            elif "$PnN" not in meta_channels and "$PnS" in meta_channels:
                experiment.metadata["name_metadata"] = "$PnS"
            else:
                PnN = meta_channels["$PnN"]
                PnS = meta_channels["$PnS"]

                # sometimes one is unique and the other isn't
                if (len(set(PnN)) == len(PnN) and len(set(PnS)) != len(PnS)):
                    experiment.metadata["name_metadata"] = "$PnN"
                elif (len(set(PnN)) != len(PnN) and len(set(PnS)) == len(PnS)):
                    experiment.metadata["name_metadata"] = "$PnS"
                else:
                    # as per fcsparser.api, $PnN is the "short name" (like FL-1)
                    # and $PnS is the "actual name" (like "FSC-H").  so let's
                    # use $PnS.
                    experiment.metadata["name_metadata"] = "$PnS"

        # key the channel-metadata table by the chosen name field so we can
        # look up per-channel keywords with .loc[channel] below
        meta_channels.set_index(experiment.metadata["name_metadata"],
                                inplace=True)

        # if the user specified a channel subset/renaming, honor it;
        # otherwise import every channel in the first tube
        channels = list(self.channels.keys()) if self.channels \
                   else list(tube0_meta["_channel_names_"])

        # make sure everything in self.channels is in the tube channels

        for channel in channels:
            if channel not in meta_channels.index:
                raise util.CytoflowOpError(
                    'channels', "Channel {0} not in tube {1}".format(
                        channel, self.tubes[0].file))

        # now that we have the metadata, load it into experiment

        for channel in channels:
            experiment.add_channel(channel)

            experiment.metadata[channel]["fcs_name"] = channel

            # keep track of the channel's PMT voltage
            if ("$PnV" in meta_channels.loc[channel]):
                v = meta_channels.loc[channel]['$PnV']
                if v: experiment.metadata[channel]["voltage"] = v

            # add the maximum possible value for this channel.
            data_range = meta_channels.loc[channel]['$PnR']
            data_range = float(data_range)
            experiment.metadata[channel]['range'] = data_range

        experiment.metadata['fcs_metadata'] = {}
        for tube in self.tubes:
            tube_meta, tube_data = parse_tube(tube.file, experiment)

            if self.events:
                # subsample without replacement when the tube has enough
                # events; otherwise keep everything and warn
                if self.events <= len(tube_data):
                    tube_data = tube_data.loc[np.random.choice(tube_data.index,
                                                               self.events,
                                                               replace=False)]
                else:
                    warnings.warn(
                        "Only {0} events in tube {1}".format(
                            len(tube_data), tube.file), util.CytoflowWarning)

            experiment.add_events(tube_data[channels], tube.conditions)
            experiment.metadata['fcs_metadata'][tube.file] = tube_meta

        # apply channel renames after all tubes are loaded, moving the
        # per-channel metadata to the new name and remembering the original
        # FCS name in "fcs_name"
        for channel in channels:
            if self.channels and channel in self.channels:
                new_name = self.channels[channel]
                if channel == new_name:
                    continue
                experiment.data.rename(columns={channel: new_name},
                                       inplace=True)
                experiment.metadata[new_name] = experiment.metadata[channel]
                experiment.metadata[new_name]["fcs_name"] = channel
                del experiment.metadata[channel]

        return experiment
Ejemplo n.º 36
0
    def apply(self, experiment):
        """
        Assign events to a cluster.
        
        Assigns each event to one of the k-means centroids from :meth:`estimate`,
        then groups together events in the same cluster hierarchy.
        
        Parameters
        ----------
        experiment : Experiment
            the :class:`.Experiment` to apply the gate to.
            
        Returns
        -------
        Experiment
            A new :class:`.Experiment` with the gate applied to it.  
            TODO - document the extra statistics

        Raises
        ------
        CytoflowOpError
            If *experiment* is None, the gate's name is unset/invalid/already
            a column, no channels are set, :meth:`estimate` hasn't been run,
            or a channel, scale or ``by`` variable is missing from the
            experiment.
        """

        if experiment is None:
            raise util.CytoflowOpError('experiment', "No experiment specified")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError(
                'name', "You have to set the gate's name "
                "before applying it!")

        if self.name != util.sanitize_identifier(self.name):
            # BUGFIX: the original message had no {} placeholder, so
            # .format(self.name) silently dropped the offending name.
            raise util.CytoflowOpError(
                'name',
                "Name {} can only contain letters, numbers and underscores.".
                format(self.name))

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError(
                'name',
                "Experiment already has a column named {0}".format(self.name))

        if len(self.channels) == 0:
            raise util.CytoflowOpError('channels',
                                       "Must set at least one channel")

        if not self._peaks:
            raise util.CytoflowOpError(
                None, "No model found.  Did you forget to "
                "call estimate()?")

        for c in self.channels:
            if c not in experiment.data:
                raise util.CytoflowOpError(
                    'channels',
                    "Channel {0} not found in the experiment".format(c))

        for c in self.scale:
            if c not in self.channels:
                raise util.CytoflowOpError(
                    'scale', "Scale set for channel {0}, but it isn't "
                    "in the experiment".format(c))

        for b in self.by:
            if b not in experiment.conditions:
                raise util.CytoflowOpError(
                    'by', "Aggregation metadata {} not found, "
                    "must be one of {}".format(b, experiment.conditions))

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that contains
            # all the events
            groupby = experiment.data.groupby(lambda _: True)

        # default assignment is "<name>_None" (i.e. unassigned / NaN events)
        event_assignments = pd.Series(["{}_None".format(self.name)] *
                                      len(experiment),
                                      dtype="object")

        # make the statistics
        #         clusters = [x + 1 for x in range(self.num_clusters)]
        #
        #         idx = pd.MultiIndex.from_product([experiment[x].unique() for x in self.by] + [clusters] + [self.channels],
        #                                          names = list(self.by) + ["Cluster"] + ["Channel"])
        #         centers_stat = pd.Series(index = idx, dtype = np.dtype(object)).sort_index()

        for group, data_subset in groupby:
            if len(data_subset) == 0:
                raise util.CytoflowOpError(
                    'by', "Group {} had no data".format(group))

            if group not in self._kmeans:
                raise util.CytoflowOpError(
                    'by', "Group {} not found in the estimated "
                    "model.  Do you need to re-run estimate()?".format(group))

            x = data_subset.loc[:, self.channels[:]]

            # apply the per-channel scale learned in estimate()
            for c in self.channels:
                x[c] = self._scale[c](x[c])

            # which values are missing?

            x_na = pd.Series([False] * len(x))
            for c in self.channels:
                x_na[np.isnan(x[c]).values] = True

            x = x.values
            x_na = x_na.values
            group_idx = groupby.groups[group]

            kmeans = self._kmeans[group]

            # -1 marks events we can't predict (NaN in some channel)
            predicted_km = np.full(len(x), -1, "int")
            predicted_km[~x_na] = kmeans.predict(x[~x_na])

            # map each k-means centroid to its cluster-hierarchy group
            groups = np.asarray(self._cluster_group[group])
            predicted_group = np.full(len(x), -1, "int")
            predicted_group[~x_na] = groups[predicted_km[~x_na]]

            # outlier detection code.  this is disabled for the moment
            # because it is really slow.

            #             num_groups = len(set(groups))
            #             if self.find_outliers:
            #                 density = self._density[group]
            #                 max_d = [-1.0 * np.inf] * num_groups
            #
            #                 for xi in range(len(x)):
            #                     if x_na[xi]:
            #                         continue
            #
            #                     x_c = predicted_group[xi]
            #                     d_x_c = density(x[xi])
            #                     if d_x_c > max_d[x_c]:
            #                         max_d[x_c] = d_x_c
            #
            #                 group_density = [None] * num_groups
            #                 group_weight = [0.0] * num_groups
            #
            #                 for c in range(num_groups):
            #                     num_c = np.sum(predicted_group == c)
            #                     clusters = np.argwhere(groups == c).flatten()
            #
            #                     normals = []
            #                     weights = []
            #                     for k in range(len(clusters)):
            #                         num_k = np.sum(predicted_km == k)
            #                         weight_k = num_k / num_c
            #                         group_weight[c] += num_k / len(x)
            #                         weights.append(weight_k)
            #                         normals.append(self._normals[group][k])
            #
            #                     group_density[c] = lambda x, weights = weights, normals = normals: np.sum([w * n(x) for w, n in zip(weights, normals)], axis = 0)
            #
            #                 for xi in range(len(x)):
            #                     if x_na[xi]:
            #                         continue
            #
            #                     x_c = predicted_group[xi]
            #
            #                     if density(x[xi]) / max_d[x_c] < 0.01:
            #                         predicted_group[xi] = -1
            #                         continue
            #
            #                     sum_d = 0
            #                     for c in set(groups):
            #                         sum_d += group_weight[c] * group_density[c](x[xi])
            #
            #                     if group_weight[x_c] * group_density[x_c](x[xi]) / sum_d < 0.8:
            #                         predicted_group[xi] = -1

            #
            #                     max_d = -1.0 * np.inf
            #                     for x_c in x[predicted_group == c]:
            #                         x_c_d = density(x_c)
            #                         if x_c_d > max_d:
            #                             max_d = x_c_d
            #
            #                     for i in range(len(x)):
            #                         if predicted_group[i] == c and density(x[i]) / max_d <= 0.01:
            #                             predicted_group[i] = -1
            #
            #

            # turn numeric group labels into "<name>_<k>" category strings;
            # unassignable events become "<name>_None"
            predicted_str = pd.Series(["(none)"] * len(predicted_group))
            for c in range(len(self._cluster_group[group])):
                predicted_str[predicted_group == c] = "{0}_{1}".format(
                    self.name, c + 1)
            predicted_str[predicted_group == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            # NOTE(review): iloc with groupby labels assumes the experiment's
            # data has a default RangeIndex -- confirm upstream invariant
            event_assignments.iloc[group_idx] = predicted_str

        new_experiment = experiment.clone()
        new_experiment.add_condition(self.name, "category", event_assignments)

        #         new_experiment.statistics[(self.name, "centers")] = pd.to_numeric(centers_stat)

        new_experiment.history.append(
            self.clone_traits(transient=lambda _: True))
        return new_experiment
Ejemplo n.º 37
0
 def _validate_condition_name(self, x):
     """Sanitize a proposed condition name into a valid Python identifier."""
     sanitized = util.sanitize_identifier(x)
     return sanitized
Ejemplo n.º 38
0
 def _validate_condition_name(self, x):
     """Validator: coerce *x* to an identifier-safe condition name."""
     return util.sanitize_identifier(x)