Exemple #1
0
def check_tube(filename, experiment, ignore_v = False):
    """
    Check that an FCS file is compatible with the tubes already in an
    experiment.

    Only the file's metadata is parsed.  The tube must have exactly the same
    set of channel names as ``experiment.channels``, and for every channel
    whose experiment metadata records a ``"voltage"``, the tube must report
    the same ``$PnV`` value.

    Parameters
    ----------
    filename : str
        The FCS file to check; passed straight to ``fcsparser.parse``.

    experiment : Experiment
        The experiment to compare against.
        ``experiment.metadata["name_metadata"]`` selects the FCS keyword
        used to name the channels.

    ignore_v : bool (default = False)
        If true, a voltage that differs from the experiment's is not an
        error.  A channel with *no* ``$PnV`` entry is still an error.

    Raises
    ------
    CytoflowOpError
        If the FCS reader fails while parsing the metadata.

    CytoflowError
        If the channel sets differ, if a channel with a recorded voltage has
        no ``$PnV`` entry in the tube, or if the voltages differ and
        ``ignore_v`` is false.
    """
    try:
        tube_meta = fcsparser.parse( filename,
                                     channel_naming = experiment.metadata["name_metadata"],
                                     meta_data_only = True,
                                     reformat_meta = True)
    except Exception as e:
        # keep the original text (including the reader's message) and chain
        # the cause so the parser's traceback is not lost
        raise util.CytoflowOpError("FCS reader threw an error reading metadata "
                              " for tube {0}: {1}"
                              .format(filename, str(e))) from e

    # first make sure the tube has the right channels
    if set(tube_meta["_channel_names_"]) != set(experiment.channels):
        raise util.CytoflowError("Tube {0} doesn't have the same channels "
                           "as the first tube added".format(filename))

    # re-index the per-channel table by channel name so each channel's
    # parameters can be looked up by label
    tube_channels = tube_meta["_channels_"]
    tube_channels.set_index(experiment.metadata["name_metadata"],
                            inplace = True)

    # next check the per-channel parameters
    for channel in experiment.channels:
        # first check voltage
        if "voltage" not in experiment.metadata[channel]:
            continue

        # label-based lookup; `.ix` no longer exists in current pandas
        channel_params = tube_channels.loc[channel]

        if "$PnV" not in channel_params:
            raise util.CytoflowError("Didn't find a voltage for channel {0} "
                               "in tube {1}".format(channel, filename))

        old_v = experiment.metadata[channel]["voltage"]
        new_v = channel_params['$PnV']

        if old_v != new_v and not ignore_v:
            raise util.CytoflowError("Tube {0} doesn't have the same voltages"
                                .format(filename))
Exemple #2
0
    def add_channel(self, name, data=None):
        """
        Attach a new per-event measurement column to this
        :class:`Experiment`.

        A channel is something measured for each cell (or derived from such
        measurements), in contrast to metadata.

          .. note::

            :meth:`add_channel` operates *in place*.

        Parameters
        ----------
        name : String
            Name of the column to create in :attr:`data`.

        data : pandas.Series
            Values for the new column.  Must have as many entries as
            :attr:`data` has rows and be convertible to ``float64``.  When
            ``None``, an empty ``float64`` column is created, which is only
            permitted while the :class:`Experiment` holds no events.

        Raises
        ------
        :exc:`.CytoflowError`
            If a column called ``name`` already exists, if ``data`` is
            ``None`` for a non-empty experiment, if the lengths differ, or
            if the values cannot be converted to ``float64``.

        Examples
        --------
        >>> ex.add_channel("FSC_over_2", ex.data["FSC-A"] / 2.0)

        """

        if name in self.data:
            raise util.CytoflowError(
                "Already a column named {0} in self.data".format(name))

        n_events = len(self)

        if data is None:
            if n_events > 0:
                raise util.CytoflowError(
                    "If data is None, self.data must be empty!")
        elif n_events != len(data):
            raise util.CytoflowError(
                "data must be the same length as self.data")

        try:
            if data is None:
                column = pd.Series(dtype="float64")
            else:
                column = data.astype("float64", copy=True)
            self.data[name] = column
        except (ValueError, TypeError) as exc:
            raise util.CytoflowError(
                "Had trouble converting data to type \"float64\"") from exc

        self.metadata[name] = {'type': "channel"}
Exemple #3
0
    def subset(self, name, value):
        """
        Return a clone holding only the events where one condition equals
        ``value``.

        The condition name is passed through `util.sanitize_identifier`
        before it is looked up, so a name that is not a valid Python
        identifier (for example `a column`) is matched by its sanitized
        form (`a_column`).

        Parameters
        ----------
        name : Str
            A condition; ie, a key in `self.conditions`.

        value : Any
            The value to select, compared with `==`.

        Returns
        -------
        A clone of this object whose `data` contains only the matching rows,
        re-indexed from zero.

        Raises
        ------
        CytoflowError
            If the sanitized name is not a key of `self.conditions`.

        """
        column = util.sanitize_identifier(name)

        if column not in self.conditions:
            message = "Can't find condition '{}'".format(name)
            raise util.CytoflowError(message)

        result = self.clone()
        matching = self.data[column] == value
        result.data = self.data[matching]
        # renumber the surviving rows 0..n-1
        result.data.reset_index(drop = True, inplace = True)
        return result
Exemple #4
0
def parse_tube(filename, experiment=None, data_set=0, metadata_only=False):
    """
    Read one FCS file and return its metadata and (optionally) its events.

    Parameters
    ----------
    filename : str
        The FCS file to read; passed to ``fcsparser.parse``.

    experiment : Experiment (default = None)
        If given (and truthy), the tube is first validated with
        ``check_tube`` and the experiment's ``"name_metadata"`` setting
        chooses the channel-naming keyword; otherwise ``'$PnS'`` is used.

    data_set : int (default = 0)
        Forwarded to ``fcsparser.parse``.

    metadata_only : bool (default = False)
        If true, only metadata is parsed and the returned data is ``None``.

    Returns
    -------
    (tube_meta, tube_data)
        The parsed metadata with its ``'__header__'`` entry removed, and the
        event data (or ``None``).

    Raises
    ------
    CytoflowError
        If the FCS reader raises while parsing.
    """

    if experiment:
        check_tube(filename, experiment)
        name_metadata = experiment.metadata["name_metadata"]
    else:
        name_metadata = '$PnS'

    try:
        # warnings emitted by the reader are deliberately silenced
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            if metadata_only:
                tube_data = None
                tube_meta = fcsparser.parse(filename,
                                            meta_data_only=True,
                                            data_set=data_set,
                                            channel_naming=name_metadata)
            else:
                tube_meta, tube_data = fcsparser.parse(
                    filename,
                    meta_data_only=metadata_only,
                    data_set=data_set,
                    channel_naming=name_metadata)
    except Exception as e:
        message = "FCS reader threw an error reading data for tube {}".format(
            filename)
        raise util.CytoflowError(message) from e

    del tube_meta['__header__']

    return tube_meta, tube_data
Exemple #5
0
    def query(self, expr, **kwargs):
        """
        Expose pandas.DataFrame.query() to the outside world

        This method "sanitizes" column names first, replacing characters that
        are not valid in a Python identifier with an underscore '_'. So, the
        column name `a column` becomes `a_column`, and can be queried with
        an `a_column == True` or such.

        Parameters
        ----------
        expr : string
            The expression to pass to `pandas.DataFrame.query()`.  Must be
            a valid Python expression, something you could pass to `eval()`.

        **kwargs : dict
            Other named parameters to pass to `pandas.DataFrame.query()`.

        Returns
        -------
        Whatever `pandas.DataFrame.query()` returns for `self.data`.

        Raises
        ------
        CytoflowError
            If two different column names sanitize to the same identifier.
        """

        # map each sanitized name to its column so the expression can refer
        # to columns whose real names aren't valid identifiers
        resolvers = {}
        for name, col in self.data.items():
            new_name = util.sanitize_identifier(name)
            if new_name in resolvers:
                # the placeholders must be {0}/{1}: with only two arguments
                # the original {1}/{2} raised IndexError instead of this error
                raise util.CytoflowError(
                    "Tried to sanitize column name {0} to "
                    "{1} but it already existed in the "
                    " DataFrame.".format(name, new_name))
            resolvers[new_name] = col

        return self.data.query(expr, resolvers=({}, resolvers), **kwargs)
Exemple #6
0
    def subset(self, conditions, values):
        """
        Returns a subset of this experiment including only the events where
        each condition in ``conditions`` equals the corresponding value in
        ``values``.


        Parameters
        ----------
        conditions : Str or Tuple(Str)
            A condition or list of conditions

        values : Any or Tuple(Any)
            The value(s) of the condition(s).  When ``conditions`` is a
            sequence, ``values`` must be a sequence of the same length.

        Returns
        -------
        Experiment
            A new :class:`Experiment` containing only the events specified in
            ``conditions`` and ``values``.

        Raises
        ------
        :exc:`.CytoflowError`
            If a name is not a condition, a value does not occur in its
            condition, no condition is given, or ``conditions`` and
            ``values`` differ in length.

        """

        # normalize both calling conventions to a list of (condition, value)
        if isinstance(conditions, str):
            pairs = [(conditions, values)]
        else:
            conditions = list(conditions)
            values = list(values)
            if len(conditions) != len(values):
                raise util.CytoflowError(
                    "conditions and values must be the same length")
            pairs = list(zip(conditions, values))

        if not pairs:
            raise util.CytoflowError("No conditions specified")

        for c, v in pairs:
            if c not in self.conditions:
                raise util.CytoflowError("{} is not a condition".format(c))
            if v not in list(self.conditions[c]):
                raise util.CytoflowError(
                    "{} is not a value of condition {}".format(v, c))

        if len(pairs) == 1:
            # a single key groups by scalar, so look the group up by scalar
            group_key, group_value = pairs[0]
        else:
            # pandas treats a tuple passed to groupby as ONE key, so the
            # column names must be a list; the group label is a tuple
            group_key = [c for c, _ in pairs]
            group_value = tuple(v for _, v in pairs)

        g = self.data.groupby(group_key)

        ret = self.clone()
        ret.data = g.get_group(group_value).reset_index(drop=True)

        return ret
Exemple #7
0
    def query(self, expr, **kwargs):
        """
        Return an experiment whose data is a subset of this one where ``expr``
        evaluates to ``True``.

        This method "sanitizes" column names first, replacing characters that
        are not valid in a Python identifier with an underscore ``_``. So, the
        column name ``a column`` becomes ``a_column``, and can be queried with
        an ``a_column == True`` or such.

        Parameters
        ----------
        expr : string
            The expression to pass to :meth:`pandas.DataFrame.query`.  Must be
            a valid Python expression, something you could pass to :func:`eval`.

        **kwargs : dict
            Other named parameters to pass to :meth:`pandas.DataFrame.query`.

        Returns
        -------
        Experiment
            A new :class:`Experiment`, a clone of this one with the data
            returned by :meth:`pandas.DataFrame.query()`

        Raises
        ------
        :exc:`.CytoflowError`
            If two column names sanitize to the same identifier, or if no
            events match ``expr``.
        """

        # map each sanitized name to its column so the expression can refer
        # to columns whose real names aren't valid identifiers
        resolvers = {}
        for name, col in self.data.items():
            new_name = util.sanitize_identifier(name)
            if new_name in resolvers:
                # the placeholders must be {0}/{1}: with only two arguments
                # the original {1}/{2} raised IndexError instead of this error
                raise util.CytoflowError(
                    "Tried to sanitize column name {0} to "
                    "{1} but it already existed in the "
                    " DataFrame.".format(name, new_name))
            resolvers[new_name] = col

        ret = self.clone()
        ret.data = self.data.query(expr, resolvers=({}, resolvers), **kwargs)
        ret.data.reset_index(drop=True, inplace=True)

        if len(ret.data) == 0:
            raise util.CytoflowError("No events matched {}".format(expr))

        return ret
Exemple #8
0
def check_tube(filename, experiment, data_set=0):
    """
    Check that an FCS file is compatible with an existing experiment.

    Only the file's metadata is parsed.  Every channel of the experiment
    (identified by its ``"fcs_name"`` metadata) must be present in the tube,
    and for every channel whose metadata records a ``"voltage"`` the tube
    must report the same ``$PnV`` value unless the channel is listed in
    ``experiment.metadata['ignore_v']``.

    Parameters
    ----------
    filename : str
        The FCS file to check; passed to ``fcsparser.parse``.

    experiment : Experiment
        The experiment to compare against.  Must not be ``None``.

    data_set : int (default = 0)
        Forwarded to ``fcsparser.parse``.

    Raises
    ------
    CytoflowError
        If ``experiment`` is ``None``, the FCS reader fails, a channel is
        missing from the tube, a channel with a recorded voltage has no
        ``$PnV`` entry, or a voltage differs for a channel that is not in
        ``ignore_v``.
    """

    if experiment is None:
        raise util.CytoflowError("No experiment specified")

    ignore_v = experiment.metadata['ignore_v']

    try:
        tube_meta = fcsparser.parse(
            filename,
            channel_naming=experiment.metadata["name_metadata"],
            data_set=data_set,
            meta_data_only=True,
            reformat_meta=True)
    except Exception as e:
        raise util.CytoflowError("FCS reader threw an error reading metadata "
                                 "for tube {0}".format(filename)) from e

    # first make sure the tube has the right channels: the tube may carry
    # extra channels, but none of the experiment's may be missing
    expected_names = {experiment.metadata[c]["fcs_name"]
                      for c in experiment.channels}
    if not expected_names <= set(tube_meta["_channel_names_"]):
        raise util.CytoflowError(
            "Tube {0} doesn't have the same channels".format(filename))

    # re-index the per-channel table by channel name for label lookups
    tube_channels = tube_meta["_channels_"]
    tube_channels.set_index(experiment.metadata["name_metadata"], inplace=True)

    # next check the per-channel parameters
    for channel in experiment.channels:
        channel_meta = experiment.metadata[channel]
        fcs_name = channel_meta["fcs_name"]

        # first check voltage
        if "voltage" not in channel_meta:
            continue

        tube_params = tube_channels.loc[fcs_name]

        if "$PnV" not in tube_params:
            raise util.CytoflowError("Didn't find a voltage for channel {0} "
                                     "in tube {1}".format(channel, filename))

        old_v = channel_meta["voltage"]
        new_v = tube_params['$PnV']

        if old_v != new_v and channel not in ignore_v:
            raise util.CytoflowError(
                "Tube {0} doesn't have the same voltages for channel {1}"
                .format(filename, channel))
 def include_condition(self, condition):
     """
     Decide whether ``condition`` passes the ``when`` filter.

     Returns ``True`` when ``self.when`` is empty.  Otherwise ``self.when``
     is evaluated as a Python expression with ``self.metadata[condition]``
     as its local namespace and the result is returned.  Returns ``False``
     when ``condition`` has no entry in ``self.metadata``.

     Raises
     ------
     CytoflowError
         If evaluating ``self.when`` raises an ordinary exception.
     """
     if not self.when:
         return True

     if condition not in self.metadata:
         return False

     try:
         # NOTE(review): eval() runs arbitrary code; this presumes that
         # self.when comes from trusted (developer-supplied) configuration.
         return eval(self.when, globals(), self.metadata[condition])
     except Exception as e:
         # catch Exception rather than everything, so SystemExit and
         # KeyboardInterrupt are not converted into a "bad statement" error
         raise util.CytoflowError("Bad when statement: {}"
                                  .format(self.when)) from e
Exemple #10
0
def parse_tube(filename, experiment):
    """
    Validate an FCS file against ``experiment`` and then read it.

    The tube is first checked with ``check_tube``; it is then parsed with
    ``fcsparser.parse`` using the experiment's ``"name_metadata"`` setting
    for channel naming.

    Returns
    -------
    (tube_meta, tube_data)
        The two values returned by ``fcsparser.parse``.

    Raises
    ------
    CytoflowError
        If the FCS reader raises while parsing.
    """

    check_tube(filename, experiment)

    try:
        parsed = fcsparser.parse(
            filename, channel_naming=experiment.metadata["name_metadata"])
        tube_meta, tube_data = parsed
    except Exception as e:
        message = "FCS reader threw an error reading data for tube {}".format(
            filename)
        raise util.CytoflowError(message) from e

    return tube_meta, tube_data
Exemple #11
0
    def _on_conditions_change(self, obj, name, old, new):
        """
        Bring the subset list in ``self.value`` in line with
        ``self.conditions``.

        Subsets whose condition is gone (or no longer passes
        ``include_condition``) are removed unless the context's status is
        ``"loading"``; a subset is created for every newly included
        condition; subsets that remain get their ``values`` refreshed when
        they differ.  Finally ``self.value`` is re-sorted by subset name.

        The ``obj``, ``name``, ``old`` and ``new`` arguments are not used.
        """
        tracked = {s.name for s in self.value}
        included = {
            cond for cond in list(self.conditions.keys())
            if self.include_condition(cond)
        }

        is_loading = self.ui.context["context"].status == "loading"

        if not is_loading:
            # drop subsets that no longer correspond to a condition
            for stale in tracked - included:
                doomed = next(s for s in self.value if s.name == stale)
                self.value.remove(doomed)

        # create subsets for conditions we aren't tracking yet; the subset
        # class is picked from the dtype kind of the condition's values
        for fresh in included - tracked:
            values = self.conditions[fresh].sort_values()
            dtype = pd.Series(list(values)).dtype
            kind = dtype.kind
            if kind == 'b':
                created = BoolSubset(name=fresh)
            elif kind in "ifu":
                created = RangeSubset(name=fresh, values=list(values))
            elif kind in "OSU":
                created = CategorySubset(name=fresh,
                                         values=sorted(list(values)))
            else:
                raise util.CytoflowError(
                    "Unknown dtype {} in ViewController".format(dtype))

            self.value.append(created)

        # refresh the values of subsets that were already being tracked
        for kept in included & tracked:
            existing = next(s for s in self.value if s.name == kept)
            current = self.conditions[kept]
            if set(existing.values) != set(current):
                existing.values = list(current.sort_values())

        self.value = sorted(self.value, key=lambda s: s.name)
 def _on_conditions_change(self, obj, name, old, new):
     """
     Bring ``self.condition_models`` in line with ``self.conditions``.

     Models whose condition is gone or no longer passes
     ``include_condition`` are removed, remaining models get their
     ``values`` refreshed when they differ, and a model is appended for
     each newly included condition.

     The ``obj``, ``name``, ``old`` and ``new`` arguments are not used.

     Raises
     ------
     CytoflowError
         If a condition's values have a dtype kind that no model handles.
     """

     # to prevent unnecessary updates, be careful about how these are
     # updated

     # first, check current models against the new conditions.  remove any
     # that are no longer present, and update the values for the rest.
     # iterate over a copy because the list is modified in the loop.
     for model in list(self.condition_models):
         if model.name not in self.conditions or not self.include_condition(model.name):
             self.condition_models.remove(model)
         elif set(model.values) != set(self.conditions[model.name]):
             model.values = list(self.conditions[model.name])

     # then, see if there are any new conditions to add.
     # .items() works on Python 3 dicts and current pandas alike, where
     # .iteritems() does not exist.
     for cond_name, values in self.conditions.items():
         if any(m.name == cond_name for m in self.condition_models):
             continue

         if not self.include_condition(cond_name):
             continue

         # pick the model class from the dtype kind of the values
         dtype = pd.Series(list(values)).dtype
         if dtype.kind == 'b':
             model = BoolCondition(name = cond_name)
         elif dtype.kind in "ifu":
             model = RangeCondition(name = cond_name,
                                    values = list(values))
         elif dtype.kind in "OSU":
             model = CategoryCondition(name = cond_name,
                                       values = list(values))
         else:
             raise util.CytoflowError("Unknown dtype {} in SubsetEditor"
                                      .format(dtype))

         self.condition_models.append(model)
Exemple #13
0
    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in `estimate`.

        Parameters
        ----------
        experiment : Experiment
            The experiment to gate.  Its `data` must contain `xchannel`,
            `ychannel` and every column named in `by`, and must not already
            contain a column named `name` or `name + "_Posterior"`.

        Returns
        -------
        Experiment
            A clone of `experiment` with a new condition called `name`
            (type "bool" if `num_components == 1`, otherwise "category")
            and, if `posteriors` is set, a "float" condition called
            `name + "_Posterior"`.  A copy of this operation's traits is
            appended to the clone's `history`.

        Raises
        ------
        CytoflowOpError
            If the experiment, the name, the estimated models or the scales
            are missing; if a required column is absent or a target column
            already exists; if a `by` column has more than 100 unique
            values; or if `sigma` is negative.

        CytoflowError
            If `num_components == 1` and `sigma == 0.0`.
        """
            
        if not experiment:
            raise util.CytoflowOpError("No experiment specified")
        
        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError("You have to set the gate's name "
                                  "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError("Experiment already has a column named {0}"
                                  .format(self.name))
        
        if not self._gmms:
            raise util.CytoflowOpError("No components found.  Did you forget to "
                                  "call estimate()?")
            
        if not self._xscale:
            raise util.CytoflowOpError("Couldn't find _xscale.  What happened??")
        
        if not self._yscale:
            raise util.CytoflowOpError("Couldn't find _yscale.  What happened??")

        if self.xchannel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                  .format(self.xchannel))

        if self.ychannel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                  .format(self.ychannel))
            
        # NOTE: this check runs whether or not posteriors are requested, so
        # the identical check under `if self.posteriors` below can never fire.
        if (self.name + "_Posterior") in experiment.data:
            raise util.CytoflowOpError("Column {0} already found in the experiment"
                                  .format(self.name + "_Posterior"))
            
        # NOTE(review): this raises CytoflowError while every other check
        # here raises CytoflowOpError -- confirm that is intended.
        if self.num_components == 1 and self.sigma == 0.0:
            raise util.CytoflowError("If num_components == 1, sigma must be > 0")

        if self.posteriors:
            col_name = "{0}_Posterior".format(self.name)
            if col_name in experiment.data:
                raise util.CytoflowOpError("Column {0} already found in the experiment"
                              .format(col_name))
       
        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                      " in the experiment"
                                      .format(b))

            if len(experiment.data[b].unique()) > 100: #WARNING - magic number
                raise util.CytoflowOpError("More than 100 unique values found for"
                                      " aggregation metadata {0}.  Did you"
                                      " accidentally specify a data channel?"
                                      .format(b))
                           
        if self.sigma < 0.0:
            raise util.CytoflowOpError("sigma must be >= 0.0")
        
        # one slot per event; filled in group by group below
        event_assignments = pd.Series([None] * len(experiment), dtype = "object")

        if self.posteriors:
            event_posteriors = pd.Series([0.0] * len(experiment))
            
        # what we DON'T want to do is iterate through event-by-event.
        # the more of this we can push into numpy, sklearn and pandas,
        # the faster it's going to be.  for example, this is why
        # we don't use Ellipse.contains().  
        
        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda x: True)
        
        for group, data_subset in groupby:
            # the model stored for this group
            gmm = self._gmms[group]
            # take just the two gated channels and apply the stored scales
            x = data_subset.loc[:, [self.xchannel, self.ychannel]]
            x[self.xchannel] = self._xscale(x[self.xchannel])
            x[self.ychannel] = self._yscale(x[self.ychannel])
            
            # which values are missing?
            x_na = np.isnan(x[self.xchannel]) | np.isnan(x[self.ychannel])
            x_na = x_na.values
            
            x = x.values
            # NOTE(review): groupby.groups presumably holds index *labels*;
            # they are used with .iloc below, which looks like it assumes
            # experiment.data has a default 0..n-1 index -- confirm.
            group_idx = groupby.groups[group]

            # make a preliminary assignment
            # (-1 marks events with no component, e.g. rows containing NaN)
            predicted = np.full(len(x), -1, "int")
            predicted[~x_na] = gmm.predict(x[~x_na])
            
            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma > 0.0:
                
                # make a quick dataframe with the value and the predicted
                # component
                gate_df = pd.DataFrame({"x" : x[:, 0], 
                                        "y" : x[:, 1],
                                        "p" : predicted})

                # for each component, get the ellipse that follows the isoline
                # around the mixture component
                # cf. http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm.html
                # and http://www.mathworks.com/matlabcentral/newsreader/view_thread/298389
                # and http://stackoverflow.com/questions/7946187/point-and-ellipse-rotated-position-test-algorithm
                # i am not proud of how many tries this took me to get right.

                for c in range(0, self.num_components):
                    mean = gmm.means_[c]
                    # NOTE(review): _get_covars() is a private method of the
                    # model object -- confirm it exists in the library
                    # version this runs against.
                    covar = gmm._get_covars()[c]
                    
                    # xc is the center on the x axis
                    # yc is the center on the y axis
                    xc = mean[0]  # @UnusedVariable
                    yc = mean[1]  # @UnusedVariable
                    
                    # NOTE(review): if `linalg` is numpy/scipy, eigh returns
                    # eigenvectors as the *columns* of w, whereas w[0] is a
                    # row -- confirm this picks the intended vector.
                    v, w = linalg.eigh(covar)
                    u = w[0] / linalg.norm(w[0])
                    
                    # xl is the length along the x axis
                    # yl is the length along the y axis
                    xl = np.sqrt(v[0]) * self.sigma  # @UnusedVariable
                    yl = np.sqrt(v[1]) * self.sigma  # @UnusedVariable
                    
                    # t is the rotation in radians (counter-clockwise)
                    t = 2 * np.pi - np.arctan(u[1] / u[0])
                    
                    sin_t = np.sin(t)  # @UnusedVariable
                    cos_t = np.cos(t)  # @UnusedVariable
                                        
                    # and build an expression with numexpr so it evaluates fast!
                    # (the locals flagged @UnusedVariable above are read here
                    # through the @name references in the expression string)

                    gate_bool = gate_df.eval("p == @c and "
                                             "((x - @xc) * @cos_t - (y - @yc) * @sin_t) ** 2 / ((@xl / 2) ** 2) + "
                                             "((x - @xc) * @sin_t + (y - @yc) * @cos_t) ** 2 / ((@yl / 2) ** 2) <= 1").values

                    # events assigned to this component but outside its
                    # ellipse lose their assignment
                    predicted[np.logical_and(predicted == c, gate_bool == False)] = -1
            
            # turn component numbers into labels: "<name>_<c+1>" for
            # component c, "<name>_None" for unassigned events
            predicted_str = pd.Series(["(none)"] * len(predicted))
            for c in range(0, self.num_components):
                predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
            predicted_str[predicted == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            event_assignments.iloc[group_idx] = predicted_str
                    
            if self.posteriors:
                # keep, for each event, the posterior of the component it was
                # assigned to; unassigned events keep 0.0
                probability = np.full((len(x), self.num_components), 0.0, "float")
                probability[~x_na, :] = gmm.predict_proba(x[~x_na, :])
                posteriors = pd.Series([0.0] * len(predicted))
                for c in range(0, self.num_components):
                    posteriors[predicted == c] = probability[predicted == c, c]
                posteriors.index = group_idx
                event_posteriors.iloc[group_idx] = posteriors
                    
        new_experiment = experiment.clone()
        
        # with a single component the gate is a yes/no membership test
        if self.num_components == 1:
            new_experiment.add_condition(self.name, "bool", event_assignments == "{0}_1".format(self.name))
        else:
            new_experiment.add_condition(self.name, "category", event_assignments)
            
        if self.posteriors:
            col_name = "{0}_Posterior".format(self.name)
            new_experiment.add_condition(col_name, "float", event_posteriors)
                    
        new_experiment.history.append(self.clone_traits())
        return new_experiment
Exemple #14
0
    def add_condition(self, name, dtype, data = None):
        """Add a new column of per-event metadata to this `Experiment`.  Operates
           *in place*.

        There are two places to call `add_condition`.
          - As you're setting up a new `Experiment`, call `add_condition()`
            with `data` set to `None` to specify the conditions the new events
            will have.
          - If you compute some new per-event metadata on an existing
            `Experiment`, call `add_condition()` to add it.

        Parameters
        ----------
        name : String
            The name of the new column in `self.data`.

        dtype : String
            The type of the new column in `self.data`.  Must be a string that
            `pandas.Series` recognizes as a `dtype`: common types are
            "category", "float", "int", and "bool".

        data : pandas.Series (default = None)
            The `pandas.Series` to add to `self.data`.  Must be the same
            length as `self.data`, and it must be convertable to a
            `pandas.Series` of type `dtype`.  If `None`, will add an
            empty column to the `Experiment` ... but the `Experiment` must
            be empty to do so!

        Raises
        ------
        CytoflowError
            If a column called `name` already exists, if `data` is `None`
            for a non-empty `Experiment`, if the `pandas.Series` passed in
            `data` isn't the same length as `self.data`, or if it isn't
            convertable to type `dtype`.

        Examples
        --------
        >>> import cytoflow as flow
        >>> ex = flow.Experiment()
        >>> ex.add_condition("Time", "float")
        >>> ex.add_condition("Strain", "category")
        """

        if name in self.data:
            raise util.CytoflowError("Already a column named {0} in self.data"
                                     .format(name))

        if data is None and len(self) > 0:
            raise util.CytoflowError("If data is None, self.data must be empty!")

        if data is not None and len(self) != len(data):
            raise util.CytoflowError("data must be the same length as self.data")

        try:
            if data is not None:
                self.data[name] = data.astype(dtype, copy = True)
            else:
                self.data[name] = pd.Series(dtype = dtype)
        except (ValueError, TypeError) as exc:
            # chain the pandas error so the reason for the failed conversion
            # stays visible in the traceback
            raise util.CytoflowError("Had trouble converting data to type {0}"
                                     .format(dtype)) from exc

        # only record metadata once the column was actually added
        self.metadata[name] = {}
        self.metadata[name]['type'] = dtype
Exemple #15
0
    def add_events(self, data, conditions):
        """
        Add new events to this :class:`Experiment`.

        Each new event in ``data`` is appended to :attr:`data`, and its
        per-event metadata columns will be set with the values specified in
        ``conditions``.  Thus, it is particularly useful for adding tubes of
        data to new experiments, before additional per-event metadata is added
        by gates, etc.

        .. note::

            *Every* column in :attr:`data` must be accounted for.  Each column
            of type ``channel`` must appear in ``data``; each column of
            metadata must have a key:value pair in ``conditions``.

        Parameters
        ----------
        data : pandas.DataFrame
            A single tube or well's worth of data. Must be a DataFrame with
            the same columns as :attr:`channels`.  It is not modified; a
            ``float64`` copy is appended.

        conditions : Dict(Str, Any)
            A dictionary of the tube's metadata.  The keys must match
            :attr:`conditions`, and the values must be coercable to the
            relevant ``numpy`` dtype.

        Raises
        ------
        :exc:`.CytoflowError`
            :meth:`add_events` pukes if:

                - the experiment already has events and the columns in
                  ``data`` differ from the experiment's channels.
                - there are keys in ``conditions`` that aren't conditions in
                  the experiment, or vice versa.
                - there is metadata specified in ``conditions`` that can't be
                  converted to the corresponding metadata ``dtype``.

        Examples
        --------
        >>> import cytoflow as flow
        >>> import fcsparser
        >>> ex = flow.Experiment()
        >>> ex.add_condition("Time", "float")
        >>> ex.add_condition("Strain", "category")
        >>> _, tube1 = fcsparser.parse('CFP_Well_A4.fcs')
        >>> _, tube2 = fcsparser.parse('RFP_Well_A3.fcs')
        >>> ex.add_events(tube1, {"Time" : 1, "Strain" : "BL21"})
        >>> ex.add_events(tube2, {"Time" : 1, "Strain" : "Top10G"})

        """

        # make sure the new tube's channels match the rest of the
        # channels in the Experiment

        if len(self) > 0 and set(data.columns) != set(self.channels):
            raise util.CytoflowError("New events don't have the same channels")

        # check that the conditions for this tube exist in the experiment
        # already

        if (any(k not in self.conditions for k in conditions)
                or any(k not in conditions for k in self.conditions)):
            raise util.CytoflowError(
                "Metadata for this tube should be {}".format(
                    list(self.conditions.keys())))

        # add the conditions to tube's internal data frame.  specify the conditions
        # dtype using self.conditions.  check for errors as we do so.

        # take this chance to up-convert the float32s to float64, so the
        # combined frame has a single, predictable channel dtype

        # TODO - the FCS standard says you can specify the precision.
        # check with int/float/double files!

        new_data = data.astype("float64", copy=True)

        for meta_name, meta_value in conditions.items():
            meta_type = self.conditions[meta_name].dtype
            categorical = isinstance(meta_type, CategoricalDtype)

            if categorical:
                # the new rows carry a single category; it is merged with
                # the experiment's existing categories below
                meta_type = CategoricalDtype([meta_value])

            try:
                new_data[meta_name] = \
                    pd.Series(data = [meta_value] * len(new_data),
                              index = new_data.index,
                              dtype = meta_type)
            except (ValueError, TypeError) as exc:
                raise util.CytoflowError(
                    "Had trouble converting metadata {0} to type {1}"
                    .format(meta_name, meta_type)) from exc

            # if we're categorical, merge the categories.  keep the existing
            # order and append unseen categories, so the result is
            # deterministic (a set has no defined order).
            if categorical and meta_name in self.data:
                cats = list(self.data[meta_name].cat.categories)
                cats.extend(c for c in new_data[meta_name].cat.categories
                            if c not in cats)
                self.data[meta_name] = self.data[meta_name].cat.set_categories(
                    cats)
                new_data[meta_name] = new_data[meta_name].cat.set_categories(
                    cats)

        # DataFrame.append() no longer exists in current pandas; concat is
        # the supported equivalent
        self.data = pd.concat([self.data, new_data], ignore_index=True)
Exemple #16
0
    def apply(self, experiment):
        """
        Assigns new metadata to events using the mixture model estimated
        in `estimate`.

        Parameters
        ----------
        experiment : Experiment
            The experiment whose events are assigned to mixture components.
            New columns are added to ``experiment.clone()``, not to
            ``experiment`` itself.

        Returns
        -------
        Experiment
            The clone, with a new condition named ``self.name``:

            - if ``num_components == 1``, a ``bool`` condition that is
              ``True`` where the event was assigned to the single component;
            - otherwise a ``category`` condition whose values are
              ``"<name>_<i>"`` (``i`` is 1-based) or ``"<name>_None"`` for
              events that were not assigned to any component.

            If ``self.posteriors`` is set, a ``float`` condition named
            ``"<name>_Posterior"`` is added as well.  A copy of this
            operation's traits is appended to the clone's ``history``.

        Raises
        ------
        CytoflowOpError
            If no experiment or name was given, if the name (or the
            ``"<name>_Posterior"`` column) is already in the experiment, if
            the model has not been estimated, if ``channel`` or any entry of
            ``by`` is missing from the experiment, if an entry of ``by`` has
            more than 100 unique values, or if ``sigma`` is negative.

        CytoflowError
            If ``num_components == 1`` and ``sigma == 0.0``.
        """

        if not experiment:
            raise util.CytoflowOpError("No experiment specified")

        # make sure name got set!
        if not self.name:
            raise util.CytoflowOpError("You have to set the gate's name "
                                  "before applying it!")

        if self.name in experiment.data.columns:
            raise util.CytoflowOpError("Experiment already has a column named {0}"
                                  .format(self.name))

        if not self._gmms:
            raise util.CytoflowOpError("No components found.  Did you forget to "
                                  "call estimate()?")

        if not self._scale:
            raise util.CytoflowOpError("Couldn't find _scale.  What happened??")

        if self.channel not in experiment.data:
            raise util.CytoflowOpError("Column {0} not found in the experiment"
                                  .format(self.channel))

        if self.num_components == 1 and self.sigma == 0.0:
            raise util.CytoflowError("If num_components == 1, sigma must be > 0")

        # the posterior column name is rejected whether or not posteriors
        # were requested, so one check covers both cases.
        posterior_name = "{0}_Posterior".format(self.name)
        if posterior_name in experiment.data:
            raise util.CytoflowOpError("Column {0} already found in the experiment"
                                  .format(posterior_name))

        for b in self.by:
            if b not in experiment.data:
                raise util.CytoflowOpError("Aggregation metadata {0} not found"
                                      " in the experiment"
                                      .format(b))

            if len(experiment.data[b].unique()) > 100: #WARNING - magic number
                raise util.CytoflowOpError("More than 100 unique values found for"
                                      " aggregation metadata {0}.  Did you"
                                      " accidentally specify a data channel?"
                                      .format(b))

        if self.sigma < 0.0:
            raise util.CytoflowOpError("sigma must be >= 0.0")

        # one slot per event; filled in group by group below
        event_assignments = pd.Series([None] * len(experiment), dtype = "object")

        if self.posteriors:
            event_posteriors = pd.Series([0.0] * len(experiment))

        # what we DON'T want to do is iterate through event-by-event.
        # the more of this we can push into numpy, sklearn and pandas,
        # the faster it's going to be.

        if self.by:
            groupby = experiment.data.groupby(self.by)
        else:
            # use a lambda expression to return a group that
            # contains all the events
            groupby = experiment.data.groupby(lambda x: True)

        for group, data_subset in groupby:
            gmm = self._gmms[group]
            x = data_subset[self.channel]
            x = self._scale(x)

            # which values are missing?
            x_na = np.isnan(x)

            group_idx = groupby.groups[group]

            # make a preliminary assignment; -1 means "no component", which
            # is what events with a missing value keep.
            predicted = np.full(len(x), -1, "int")
            predicted[~x_na] = gmm.predict(x[~x_na, np.newaxis])

            # if we're doing sigma-based gating, for each component check
            # to see if the event is in the sigma gate.
            if self.sigma > 0.0:

                # make a quick dataframe with the value and the predicted
                # component
                gate_df = pd.DataFrame({"x" : x, "p" : predicted})

                # for each component, get the low and the high threshold:
                # mean -/+ sigma * sqrt(covariance)
                for c in range(0, self.num_components):
                    lo = (gmm.means_[c][0]    # @UnusedVariable
                          - self.sigma * np.sqrt(gmm.covars_[c][0]))
                    hi = (gmm.means_[c][0]    # @UnusedVariable
                          + self.sigma * np.sqrt(gmm.covars_[c][0]))

                    # and build an expression with numexpr so it evaluates fast!
                    gate_bool = gate_df.eval("p == @c and x >= @lo and x <= @hi").values

                    # events predicted for this component but outside its
                    # gate lose their assignment.
                    predicted[(predicted == c) & ~gate_bool] = -1

            # translate component numbers into the labels stored per event
            predicted_str = pd.Series(["(none)"] * len(predicted))
            for c in range(0, self.num_components):
                predicted_str[predicted == c] = "{0}_{1}".format(self.name, c + 1)
            predicted_str[predicted == -1] = "{0}_None".format(self.name)
            predicted_str.index = group_idx

            event_assignments.iloc[group_idx] = predicted_str

            if self.posteriors:
                probability = np.full((len(x), self.num_components), 0.0, "float")
                probability[~x_na, :] = gmm.predict_proba(x[~x_na, np.newaxis])

                # keep only the probability of the component each event was
                # assigned to; unassigned events stay at 0.0.
                posteriors = pd.Series([0.0] * len(predicted))
                for i in range(0, self.num_components):
                    posteriors[predicted == i] = probability[predicted == i, i]
                posteriors.index = group_idx
                event_posteriors.iloc[group_idx] = posteriors

        new_experiment = experiment.clone()

        if self.num_components == 1:
            new_experiment.add_condition(self.name, "bool", event_assignments == "{0}_1".format(self.name))
        else:
            new_experiment.add_condition(self.name, "category", event_assignments)

        if self.posteriors:
            new_experiment.add_condition(posterior_name, "float", event_posteriors)

        new_experiment.history.append(self.clone_traits())
        return new_experiment
Exemple #17
0
    def add_condition(self, name, dtype, data=None):
        """
        Attach a new per-event metadata column to this :class:`Experiment`.

        .. note::
            :meth:`add_condition` operates **in place.**

        It is used in two situations:

          - While building a fresh :class:`Experiment`: call it with ``data``
            left as ``None`` to declare a condition that future events will
            carry.
          - After computing new per-event metadata for an existing
            :class:`Experiment`: call it with that metadata as ``data``.

        Parameters
        ----------
        name : String
            Column name to create in :attr:`data`.  Must be a valid Python
            identifier: it starts with ``[A-Za-z_]`` and contains only
            ``[A-Za-z0-9_]``.

        dtype : String
            Type of the new column; any string that :class:`pandas.Series`
            accepts as a ``dtype``.  Common choices are ``category``,
            ``float``, ``int``, and ``bool``.

        data : pandas.Series (default = None)
            Values for the new column.  Must have the same length as
            :attr:`data` and be convertible to ``dtype``.  When ``None``, an
            empty column is added, which is only allowed while the
            :class:`Experiment` has no events.

        Raises
        ------
        :class:`.CytoflowError`
            If ``name`` is not a valid identifier or is already a column, if
            ``data`` is ``None`` but the experiment has events, if ``data``
            has the wrong length, or if it can't be converted to ``dtype``.

        Examples
        --------
        >>> import cytoflow as flow
        >>> ex = flow.Experiment()
        >>> ex.add_condition("Time", "float")
        >>> ex.add_condition("Strain", "category")

        """

        if util.sanitize_identifier(name) != name:
            raise util.CytoflowError(
                "Name '{}' is not a valid Python identifier".format(name))

        if name in self.data:
            raise util.CytoflowError(
                "Already a column named {0} in self.data".format(name))

        # an empty column is only allowed on an empty experiment; a supplied
        # column has to line up with the existing events.
        if data is None:
            if len(self) > 0:
                raise util.CytoflowError(
                    "If data is None, self.data must be empty!")
        elif len(self) != len(data):
            raise util.CytoflowError(
                "data must be the same length as self.data")

        try:
            if data is None:
                self.data[name] = pd.Series(dtype=dtype)
            else:
                self.data[name] = data.astype(dtype, copy=True)
        except (ValueError, TypeError) as err:
            message = "Had trouble converting data to type {0}".format(dtype)
            raise util.CytoflowError(message) from err

        # record that this column is a condition
        self.metadata[name] = {}
        self.metadata[name]['type'] = "condition"
    def apply(self, experiment = None, metadata_only = False):
        """
        Load a new :class:`.Experiment`.  
        
        Parameters
        ----------
        experiment : Experiment
            Ignored
            
        metadata_only : bool (default = False)
            Only "import" the metadata, creating an Experiment with all the
            expected metadata and structure but 0 events.
        
        Returns
        -------
        Experiment
            The new :class:`.Experiment`.  New channels have the following
            metadata:
            
            - **voltage** - int
                The voltage that this channel was collected at.  Determined
                by the ``$PnV`` field from the first FCS file.
                
            - **range** - int
                The maximum range of this channel.  Determined by the ``$PnR``
                field from the first FCS file.
                
            New experimental conditions do not have **voltage** or **range**
            metadata, obviously.  Instead, they have **experiment** set to 
            ``True``, to distinguish the experimental variables from the
            conditions that were added by gates, etc.
            
            If :attr:`ignore_v` is set, it is added as a key to the 
            :class:`.Experiment`-wide metadata.

        Raises
        ------
        :class:`.CytoflowOpError`
            If no tubes were specified, if a renamed channel is not a valid
            Python identifier, if the tubes have differing or duplicated
            conditions, if the first tube's FCS metadata can't be read, or
            if a requested channel is missing from the first tube.

        :class:`.CytoflowError`
            If a tube specifies both an FCS file and a DataFrame, or neither
            an FCS file nor a non-empty DataFrame.
            
        """
        
        if not self.tubes:
            raise util.CytoflowOpError('tubes',
                                       "Must specify some tubes!")

        # the first tube decides the channels and per-channel metadata
        first_tube = self.tubes[0]
        
        # if we have channel renaming, make sure the new names are valid
        # python identifiers
        if self.channels:
            for old_name, new_name in self.channels.items():
                if old_name != new_name and new_name != util.sanitize_identifier(new_name):
                    raise util.CytoflowOpError('channels',
                                               "Channel name {} must be a "
                                               "valid Python identifier."
                                               .format(new_name))
        
        # make sure each tube has the same conditions
        tube0_conditions = set(first_tube.conditions)
        for tube in self.tubes:
            tube_conditions = set(tube.conditions)
            if len(tube0_conditions ^ tube_conditions) > 0:
                raise util.CytoflowOpError('tubes',
                                           "Tube {0} didn't have the same "
                                           "conditions as tube {1}"
                                           .format(tube.file, first_tube.file))

        # make sure experimental conditions are unique
        for idx, i in enumerate(self.tubes[0:-1]):
            for j in self.tubes[idx+1:]:
                if i.conditions_equal(j):
                    raise util.CytoflowOpError('tubes',
                                               "The same conditions specified for "
                                               "tube {0} and tube {1}"
                                               .format(i.file, j.file))
        
        experiment = Experiment()
        
        experiment.metadata["ignore_v"] = self.ignore_v
            
        for condition, dtype in list(self.conditions.items()):
            experiment.add_condition(condition, dtype)
            experiment.metadata[condition]['experiment'] = True

        if first_tube.file:
            try:
                # silence warnings about duplicate channels;
                # we'll figure that out below
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    tube0_meta = fcsparser.parse(first_tube.file,
                                                 data_set = self.data_set,
                                                 meta_data_only = True,
                                                 reformat_meta = True)
            except Exception as e:
                raise util.CytoflowOpError('tubes',
                                           "FCS reader threw an error reading metadata "
                                           "for tube {}: {}"
                                           .format(first_tube.file, str(e))) from e

            meta_channels = tube0_meta["_channels_"]

            if self.name_metadata:
                experiment.metadata["name_metadata"] = self.name_metadata
            else:
                experiment.metadata["name_metadata"] = autodetect_name_metadata(first_tube.file,
                                                                                data_set = self.data_set)

            # keep the original index as a column, then re-index the channel
            # table by the chosen channel-name field
            meta_channels['Index'] = meta_channels.index
            meta_channels.set_index(experiment.metadata["name_metadata"], 
                                    inplace = True)

            channels = list(self.channels.keys()) if self.channels \
                       else list(meta_channels.index.values)

            # make sure everything in self.channels is in the tube channels
            for channel in channels:
                if channel not in meta_channels.index:
                    raise util.CytoflowOpError('channels',
                                               "Channel {0} not in tube {1}"
                                               .format(channel, first_tube.file))                         
        else:
            # no FCS file on the first tube: take the channels from its
            # DataFrame columns and carry no FCS metadata
            channels = list(self.channels.keys()) if self.channels else list(first_tube.frame)
            meta_channels = DataFrame()
            experiment.metadata["name_metadata"] = None
            tube0_meta = {}

        # now that we have the metadata, load it into experiment

        for channel in channels:
            experiment.add_channel(channel)
            
            experiment.metadata[channel]["fcs_name"] = channel
            
            # without an FCS file there is no $PnR to read, so fall back to
            # a fixed range.  this tests the FIRST tube, the one the channel
            # metadata comes from.
            if not first_tube.file:
                experiment.metadata[channel]['range'] = 1e6

            if (list(meta_channels)):
                # keep track of the channel's PMT voltage
                if("$PnV" in meta_channels.loc[channel]):
                    v = meta_channels.loc[channel]['$PnV']
                    if v: experiment.metadata[channel]["voltage"] = v
                            
                # add the maximum possible value for this channel.
                data_range = meta_channels.loc[channel]['$PnR']
                data_range = float(data_range)
                experiment.metadata[channel]['range'] = data_range
                
                                
        experiment.metadata['fcs_metadata'] = {}
        for tube in self.tubes:
            # identity test on purpose: comparing a DataFrame with None is
            # element-wise and has no single truth value.
            has_frame = tube.frame is not None

            if tube.file and has_frame:
                raise util.CytoflowError("Both a DataFrame and an FCS file were specified, "
                                         "tube with file {0} and conditions {1}".format(tube.file,tube.conditions))
            elif tube.file:
                if metadata_only:
                    tube_meta, tube_data = parse_tube(tube.file,
                                                      experiment,
                                                      data_set = self.data_set,
                                                      metadata_only = True)
                else:
                    tube_meta, tube_data = parse_tube(tube.file, 
                                                      experiment, 
                                                      data_set = self.data_set)
    
            elif has_frame and not tube.frame.empty:
                tube_meta = {} # probably incorrect --tsj
                tube_data = tube.frame
            else:
                # fail loudly; otherwise the previous tube's data would be
                # reused (or the names would be unbound on the first tube)
                raise util.CytoflowError("Neither an FCS file nor a non-empty DataFrame "
                                         "was specified for the tube with conditions {0}"
                                         .format(tube.conditions))
                
            
            if self.events:
                if self.events <= len(tube_data):
                    # random subsample, without replacement
                    tube_data = tube_data.loc[np.random.choice(tube_data.index,
                                                               self.events,
                                                               replace = False)]
                else:
                    warnings.warn("Only {0} events in tube {1}"
                                  .format(len(tube_data), tube.file),
                                  util.CytoflowWarning)
    
            experiment.add_events(tube_data[channels], tube.conditions)
                        
            # extract the row and column from wells collected on a 
            # BD HTS
            if 'WELL ID' in tube_meta:               
                pos = tube_meta['WELL ID']
                tube_meta['CF_Row'] = pos[0]
                tube_meta['CF_Col'] = int(pos[1:3])
                
            for i, channel in enumerate(channels):
                # remove the PnV tube metadata

                if '$P{}V'.format(i+1) in tube_meta:
                    del tube_meta['$P{}V'.format(i+1)]
                    
                # work around a bug where the PnR is sometimes not the detector range
                # but the data range.
                pnr = '$P{}R'.format(i+1)
                if pnr in tube_meta and float(tube_meta[pnr]) > experiment.metadata[channel]['range']:
                    experiment.metadata[channel]['range'] = float(tube_meta[pnr])
            
                
            tube_meta['CF_File'] = Path(tube.file).stem
                             
            # NOTE(review): tubes without a file all share the same falsy
            # key here, so later ones overwrite earlier ones -- confirm
            # whether that is intended.
            experiment.metadata['fcs_metadata'][tube.file] = tube_meta

        # integer-typed FCS data needs the fix-ups below; a missing
        # $DATATYPE (or no FCS file at all) counts as "not integer".
        integer_data = tube0_meta.get('$DATATYPE') == 'I'
                        
        for channel in channels:
            if self.channels and channel in self.channels:
                new_name = self.channels[channel]
                if channel == new_name:
                    # NOTE(review): this also skips the fix-ups below for
                    # channels mapped to their own name -- confirm.
                    continue
                experiment.data.rename(columns = {channel : new_name}, inplace = True)
                experiment.metadata[new_name] = experiment.metadata[channel]
                experiment.metadata[new_name]["fcs_name"] = channel
                del experiment.metadata[channel]
              
            if first_tube.file:
                # this catches an odd corner case where some instruments store
                # instrument-specific info in the "extra" bits.  we have to
                # clear them out.
                if integer_data:
                    data_bits  = int(meta_channels.loc[channel]['$PnB'])
                    data_range = float(meta_channels.loc[channel]['$PnR'])
                    range_bits = int(math.log(data_range, 2))

                    if range_bits < data_bits:
                        # build a mask with the low range_bits bits set
                        mask = 1
                        for _ in range(1, range_bits):
                            mask = mask << 1 | 1

                        # NOTE(review): ``channel`` is still the FCS name
                        # here, even if the column was renamed above --
                        # confirm.
                        experiment.data[channel] = experiment.data[channel].values.astype('int') & mask

                # re-scale the data to linear if if's recorded as log-scaled with
                # integer channels
                data_range = float(meta_channels.loc[channel]['$PnR'])
                f1 = float(meta_channels.loc[channel]['$PnE'][0])
                f2 = float(meta_channels.loc[channel]['$PnE'][1])

                if f1 > 0.0 and f2 == 0.0:
                    warnings.warn('Invalid $PnE = {},{} for channel {}, changing it to {},1.0'
                                  .format(f1, f2, channel, f1),
                                  util.CytoflowWarning)
                    f2 = 1.0

                if f1 > 0.0 and f2 > 0.0 and integer_data:
                    warnings.warn('Converting channel {} from logarithmic to linear'
                                  .format(channel),
                                  util.CytoflowWarning)
                    # NOTE(review): the rescale itself is disabled, so only
                    # the warning is emitted -- confirm this is intended.
    #                 experiment.data[channel] = 10 ** (f1 * experiment.data[channel] / data_range) * f2


        return experiment
Exemple #19
0
    def plot(self, experiment, **kwargs):
        """
        Plot some data from an experiment.  This function takes care of
        checking for facet name validity and subsetting, then passes the
        underlying dataframe to `BaseView.plot`
        
        Parameters
        ----------
        experiment : Experiment
            The experiment to plot.  If ``self.subset`` is set, the result of
            ``experiment.query(self.subset)`` is plotted instead.

        min_quantile : float (>0.0 and <1.0, default = 0.001)
            Clip data that is less than this quantile.
            
        max_quantile : float (>0.0 and <1.0, default = 1.00)
            Clip data that is greater than this quantile.
        
        Other Parameters
        ----------------
        lim : Dict(Str : (float, float))
            Set the range of each channel's axis.  If unspecified, assume
            that the limits are the minimum and maximum of the clipped data.
            Required.  The dictionary is updated in place: missing limits are
            filled from the quantiles and every limit is clipped by the
            channel's scale.
            
        scale : Dict(Str : IScale)
            Scale the data on each axis.  Required.

        Raises
        ------
        CytoflowViewError
            If the experiment or ``lim`` is missing, a facet is not one of
            the experiment's conditions, a quantile is out of range, a facet
            is reused, or the subset is invalid or selects no events.

        CytoflowError
            If an entry of ``lim`` is not ``None`` or a 2-element list or
            tuple.
            
        """
        if experiment is None:
            raise util.CytoflowViewError('experiment',
                                         "No experiment specified")

        if self.xfacet and self.xfacet not in experiment.conditions:
            raise util.CytoflowViewError(
                'xfacet',
                "X facet {0} not in the experiment".format(self.xfacet))

        if self.yfacet and self.yfacet not in experiment.conditions:
            raise util.CytoflowViewError(
                'yfacet',
                "Y facet {0} not in the experiment".format(self.yfacet))

        if self.huefacet and self.huefacet not in experiment.conditions:
            raise util.CytoflowViewError(
                'huefacet',
                "Hue facet {0} not in the experiment".format(self.huefacet))

        # adjust the limits to clip extreme values.  the quantiles are popped
        # so they are not forwarded to the parent's plot().
        min_quantile = kwargs.pop("min_quantile", 0.001)
        max_quantile = kwargs.pop("max_quantile", 1.0)

        if min_quantile < 0.0 or min_quantile > 1:
            raise util.CytoflowViewError(
                'min_quantile', "min_quantile must be between 0 and 1")

        if max_quantile < 0.0 or max_quantile > 1:
            raise util.CytoflowViewError(
                'max_quantile', "max_quantile must be between 0 and 1")

        if min_quantile >= max_quantile:
            raise util.CytoflowViewError(
                'min_quantile', "min_quantile must be less than max_quantile")

        # lim and scale stay in kwargs: the (updated) dicts are forwarded to
        # the parent's plot() below.
        lim = kwargs.get('lim')
        scale = kwargs.get('scale')

        if lim is None:
            raise util.CytoflowViewError('lim', "lim must be specified")

        for c in lim:
            if lim[c] is None:
                lim[c] = (experiment[c].quantile(min_quantile),
                          experiment[c].quantile(max_quantile))
            elif isinstance(lim[c], (list, tuple)):
                if len(lim[c]) != 2:
                    raise util.CytoflowError(
                        'lim', 'Length of lim[{}] must be 2'.format(c))

                # fill in whichever end of the range was left as None
                if lim[c][0] is None:
                    lim[c] = (experiment[c].quantile(min_quantile), lim[c][1])

                if lim[c][1] is None:
                    lim[c] = (lim[c][0], experiment[c].quantile(max_quantile))

            else:
                raise util.CytoflowError(
                    'lim', "lim[{}] is an unknown data type".format(c))

            lim[c] = [scale[c].clip(x) for x in lim[c]]

        facets = [x for x in [self.xfacet, self.yfacet, self.huefacet] if x]

        if len(facets) != len(set(facets)):
            raise util.CytoflowViewError(None, "Can't reuse facets")

        if self.subset:
            try:
                experiment = experiment.query(self.subset)
            except util.CytoflowError as e:
                raise util.CytoflowViewError('subset', str(e)) from e
            except Exception as e:
                raise util.CytoflowViewError(
                    'subset', "Subset string '{0}' isn't valid".format(
                        self.subset)) from e

            if len(experiment) == 0:
                raise util.CytoflowViewError(
                    'subset', "Subset string '{0}' returned no events".format(
                        self.subset))

        super().plot(experiment, experiment.data, **kwargs)
    def plot(self, experiment, plot_name=None, **kwargs):
        """Chart a statistic against the values of a variable.

        The variable named by ``self.variable`` is checked, a scale for it is
        built with ``util.scale_factory``, and the rest of the work is
        delegated to the parent class's ``plot``.

        Parameters
        ----------

        experiment : Experiment
            The experiment to plot; must not be ``None``.

        plot_name : default = None
            Passed through unchanged to the parent class's ``plot``.

        variable_lim : (float, float)
            The limits on the variable axis

        color : a matplotlib color
            The color to plot with.  Overridden if `huefacet` is not `None`

        linewidth : float
            The width of the line, in points

        linestyle : ['solid' | 'dashed', 'dashdot', 'dotted' | (offset, on-off-dash-seq) | '-' | '--' | '-.' | ':' | 'None' | ' ' | '']

        marker : a matplotlib marker style
            See http://matplotlib.org/api/markers_api.html#module-matplotlib.markers

        markersize : int
            The marker size in points

        markerfacecolor : a matplotlib color
            The color to make the markers.  Overridden (?) if `huefacet` is not `None`

        alpha : the alpha blending value, from 0.0 (transparent) to 1.0 (opaque)

        capsize : scalar
            The size of the error bar caps, in points

        shade_error : bool
            If `False` (the default), plot the error statistic as traditional
            "error bars."  If `True`, plot error statistic as a filled, shaded
            region.

        shade_alpha : float
            The transparency of the shaded error region, from 0.0 (transparent)
            to 1.0 (opaque.)  Default is 0.2.

        Raises
        ------
        CytoflowViewError
            If ``experiment`` is ``None``.

        CytoflowError
            If ``self.variable`` is not one of the experiment's conditions,
            or is not numeric.

        Notes
        -----

        Other `kwargs` are passed to `matplotlib.pyplot.plot <https://matplotlib.org/devdocs/api/_as_gen/matplotlib.pyplot.plot.html>`_

        """

        if experiment is None:
            raise util.CytoflowViewError('experiment',
                                         "No experiment specified")

        # the variable has to be a numeric condition of the experiment
        var_name = self.variable

        if var_name not in experiment.conditions:
            message = "Variable {} not in the experiment".format(var_name)
            raise util.CytoflowError('variable', message)

        if not util.is_numeric(experiment[var_name]):
            message = "Variable {} must be numeric".format(var_name)
            raise util.CytoflowError('variable', message)

        scale = util.scale_factory(self.variable_scale,
                                   experiment,
                                   condition=var_name)

        super().plot(experiment, plot_name, variable_scale=scale, **kwargs)