Exemple #1
0
    def get_subsample(self, df=None, compartment="cells", rename_col=True):
        """Subsample single-cell rows within each strata group.

        Parameters
        ----------
        df : pandas.core.frame.DataFrame, optional
            Rows to subsample. When None, the ``TableNumber``, ``ImageNumber``
            and ``ObjectNumber`` columns of `compartment` are read through
            ``self.conn`` instead.
        compartment : str, default "cells"
            Compartment name. It is always validated, but only queried when
            `df` is None.
        rename_col : bool, default True
            Forwarded unchanged to ``self.subsample_profiles``.

        Returns
        -------
        None
            The subsample is stored in ``self.subset_data_df`` and
            ``self.is_subset_computed`` is set to True.

        """
        check_compartments(compartment)

        # Only hit the database when the caller did not hand in a DataFrame
        if df is None:
            object_id_cols = "TableNumber, ImageNumber, ObjectNumber"
            df = pd.read_sql(
                sql="select {} from {}".format(object_id_cols, compartment),
                con=self.conn,
            )

        # Join with the image table on the merge columns before grouping
        merged_df = self.image_df.merge(df, how="inner", on=self.merge_cols)

        def _subsample_group(group_df):
            return self.subsample_profiles(group_df, rename_col=rename_col)

        sampled_df = merged_df.groupby(self.strata).apply(_subsample_group)
        self.subset_data_df = sampled_df.reset_index(drop=True)

        self.is_subset_computed = True
Exemple #2
0
    def aggregate_compartment(self, compartment, compute_subsample=False):
        """
        Aggregate the morphological profiles of a single compartment

        Arguments:
        compartment - str indicating specific compartment to extract
        compute_subsample - [default: False] if True, and subsampling is configured
                            (subsample_frac < 1 or subsample_n != "all"), call
                            get_subsample() for this compartment before aggregating

        Return:
        The output of aggregate() applied to the compartment table merged
        with image_df
        """
        check_compartments(compartment)

        subsampling_configured = self.subsample_frac < 1 or self.subsample_n != "all"
        if subsampling_configured and compute_subsample:
            self.get_subsample(compartment=compartment)

        # Read every column of the compartment table and join it to the images
        sql_text = "select * from {}".format(compartment)
        merged_df = self.image_df.merge(
            pd.read_sql(sql=sql_text, con=self.conn), how="inner", on=self.merge_cols
        )

        return aggregate(
            population_df=merged_df,
            strata=self.strata,
            features=self.features,
            operation=self.operation,
            subset_data_df=self.subset_data_df,
        )
Exemple #3
0
    def count_cells(self, compartment="cells", count_subset=False):
        """
        Count the number of cells measured in each strata group.

        Arguments:
        compartment - [default: "cells"] string indicating the compartment to count;
                      only queried when count_subset is False
        count_subset - [default: False] if True, count the rows of the previously
                       computed subsample instead of querying the compartment

        Return:
        DataFrame holding the strata columns and a "cell_count" column
        """
        check_compartments(compartment)

        # Pick the table to count from; the counting itself is shared below
        if count_subset:
            assert self.is_aggregated, "Make sure to aggregate_profiles() first!"
            assert self.is_subset_computed, "Make sure to get_subsample() first!"
            cells_df = self.subset_data_df
        else:
            id_cols = "TableNumber, ImageNumber, ObjectNumber"
            sql_text = "select {} from {}".format(id_cols, compartment)
            cells_df = self.image_df.merge(
                pd.read_sql(sql=sql_text, con=self.conn),
                how="inner",
                on=self.merge_cols,
            )

        per_group = cells_df.groupby(self.strata)["ObjectNumber"].count()
        return per_group.reset_index().rename(columns={"ObjectNumber": "cell_count"})
Exemple #4
0
    def count_cells(self, compartment="cells", count_subset=False):
        """Count the number of cells measured in each strata group.

        Parameters
        ----------
        compartment : str, default "cells"
            Compartment to count. Only queried when `count_subset` is False.
        count_subset : bool, default False
            If True, count the rows of the previously computed subsample
            (``self.subset_data_df``) instead of querying the compartment.

        Returns
        -------
        pandas.core.frame.DataFrame
            The strata columns plus a ``cell_count`` column.

        """
        check_compartments(compartment)

        if count_subset:
            assert self.is_aggregated, "Make sure to aggregate_profiles() first!"
            assert self.is_subset_computed, "Make sure to get_subsample() first!"
            cells_df = self.subset_data_df
            # The subsample is counted on the "Metadata_"-prefixed object column
            object_col = "Metadata_ObjectNumber"
        else:
            id_cols = "TableNumber, ImageNumber, ObjectNumber"
            sql_text = "select {} from {}".format(id_cols, compartment)
            cells_df = self.image_df.merge(
                pd.read_sql(sql=sql_text, con=self.conn), how="inner", on=self.merge_cols
            )
            object_col = "ObjectNumber"

        counts = cells_df.groupby(self.strata)[object_col].count().reset_index()
        return counts.rename({object_col: "cell_count"}, axis="columns")
Exemple #5
0
    def get_subsample(self, compartment="cells"):
        """
        Subsample the objects of a compartment within each strata group

        Arguments:
        compartment - [default: "cells"] string indicating the compartment to subset

        Return:
        Nothing; the subsample is stored in self.subset_data_df and
        self.is_subset_computed is set to True
        """
        check_compartments(compartment)

        id_cols = "TableNumber, ImageNumber, ObjectNumber"
        sql_text = "select {} from {}".format(id_cols, compartment)

        # Read the object identifiers and join them to the image table
        merged_df = self.image_df.merge(
            pd.read_sql(sql=sql_text, con=self.conn),
            how="inner",
            on=self.merge_cols,
        )

        sampled_df = merged_df.groupby(self.strata).apply(self.subsample_profiles)
        self.subset_data_df = sampled_df.reset_index(drop=True)

        self.is_subset_computed = True
Exemple #6
0
    def __init__(
        self,
        file_or_conn,
        strata=None,
        aggregation_operation="median",
        output_file="none",
        compartments=default_compartments,
        compartment_linking_cols=default_linking_cols,
        merge_cols=None,
        image_cols=None,
        add_image_features=False,
        image_feature_categories=None,
        features="infer",
        load_image_data=True,
        subsample_frac=1,
        subsample_n="all",
        subsampling_random_state="none",
        fields_of_view="all",
        fields_of_view_feature="Metadata_Site",
        object_feature="Metadata_ObjectNumber",
    ):
        """Validate the configuration, connect to the database and load images.

        Parameters
        ----------
        file_or_conn : str or connection
            Passed unchanged to ``create_engine``.
        strata : list of str, optional
            Columns to group by. Defaults to
            ``["Metadata_Plate", "Metadata_Well"]``.
        aggregation_operation : str, default "median"
            Checked by ``check_aggregate_operation`` and stored lower-cased.
        output_file : str, default "none"
            Stored on the instance.
        compartments : list of str, default ``default_compartments``
            Checked by ``check_compartments`` and against
            `compartment_linking_cols`.
        compartment_linking_cols : dict, default ``default_linking_cols``
            Used to build ``self.linking_col_rename``.
        merge_cols : list of str, optional
            Columns used to merge tables. Defaults to
            ``["TableNumber", "ImageNumber"]``.
        image_cols : list of str, optional
            Defaults to ``["TableNumber", "ImageNumber", "Metadata_Site"]``.
        add_image_features : bool, default False
            Stored on the instance.
        image_feature_categories : optional
            Stored on the instance.
        features : str or list of str, default "infer"
            Stored on the instance.
        load_image_data : bool, default True
            If True, ``self.load_image()`` is called at the end of
            construction.
        subsample_frac : float, default 1
            Must satisfy ``0 < subsample_frac <= 1``.
        subsample_n : str or int, default "all"
            Any value other than "all" is passed to ``self.set_subsample_n``.
        subsampling_random_state : default "none"
            Stored on the instance.
        fields_of_view : default "all"
            Checked by ``check_fields_of_view_format``; the returned value is
            stored.
        fields_of_view_feature : str, default "Metadata_Site"
            Stored on the instance.
        object_feature : str, default "Metadata_ObjectNumber"
            Stored on the instance.

        Raises
        ------
        AssertionError
            If `subsample_frac` is outside ``(0, 1]``.

        """
        # List defaults are created per call: a list literal in the signature
        # would be a single object shared by every instance that relies on it.
        if strata is None:
            strata = ["Metadata_Plate", "Metadata_Well"]
        if merge_cols is None:
            merge_cols = ["TableNumber", "ImageNumber"]
        if image_cols is None:
            image_cols = ["TableNumber", "ImageNumber", "Metadata_Site"]

        # Check compartments specified
        check_compartments(compartments)

        # Check if correct operation is specified
        aggregation_operation = check_aggregate_operation(aggregation_operation)

        # Check that the subsample_frac is between 0 and 1
        assert 0 < subsample_frac <= 1, "subsample_frac must be between 0 and 1"

        self.file_or_conn = file_or_conn
        self.strata = strata
        self.load_image_data = load_image_data
        self.aggregation_operation = aggregation_operation.lower()
        self.output_file = output_file
        self.merge_cols = merge_cols
        self.image_cols = image_cols
        self.add_image_features = add_image_features
        self.image_feature_categories = image_feature_categories
        self.features = features
        self.subsample_frac = subsample_frac
        self.subsample_n = subsample_n
        # "none" is the placeholder used until a subsample has been computed
        self.subset_data_df = "none"
        self.subsampling_random_state = subsampling_random_state
        self.is_aggregated = False
        self.is_subset_computed = False
        self.compartments = compartments
        self.compartment_linking_cols = compartment_linking_cols
        self.fields_of_view_feature = fields_of_view_feature
        self.object_feature = object_feature

        # Confirm that the compartments and linking cols are formatted properly
        assert_linking_cols_complete(
            compartments=self.compartments, linking_cols=self.compartment_linking_cols
        )

        # Build a dictionary to update linking column feature names
        self.linking_col_rename = provide_linking_cols_feature_name_update(
            self.compartment_linking_cols
        )

        if self.subsample_n != "all":
            self.set_subsample_n(self.subsample_n)

        # Connect to sqlite engine
        self.engine = create_engine(self.file_or_conn)
        self.conn = self.engine.connect()

        # Throw an error if both subsample_frac and subsample_n is set
        self._check_subsampling()

        # Confirm that the input fields of view is valid
        self.fields_of_view = check_fields_of_view_format(fields_of_view)

        if self.load_image_data:
            self.load_image()
Exemple #7
0
    def aggregate_compartment(
        self,
        compartment,
        compute_subsample=False,
        compute_counts=False,
        add_image_features=False,
        n_aggregation_memory_strata=1,
    ):
        """Aggregate one compartment chunk by chunk with pycytominer.aggregate().

        Parameters
        ----------
        compartment : str
            Compartment to aggregate.
        compute_subsample : bool, default False
            If True, and subsampling is configured (``subsample_frac < 1`` or
            ``subsample_n != "all"``), run ``get_subsample`` first.
        compute_counts : bool, default False
            Forwarded to ``aggregate`` as ``compute_object_count``. When True
            and the fields-of-view feature is not one of the strata, a
            per-strata fields-of-view count is merged into each chunk result.
        add_image_features : bool, default False
            If True, image features are aggregated into the fields-of-view
            counts. Only has an effect when the counts are computed.
        n_aggregation_memory_strata : int, default 1
            Forwarded to ``self._compartment_df_generator``; the number of
            unique strata pulled from the database into memory at once.

        Returns
        -------
        pandas.core.frame.DataFrame
            Row-wise concatenation of the aggregated chunks.

        """
        check_compartments(compartment)

        subsampling_configured = self.subsample_frac < 1 or self.subsample_n != "all"
        if subsampling_configured and compute_subsample:
            self.get_subsample(compartment=compartment)

        # The image table is loaded on first use when it was skipped earlier
        if not self.load_image_data:
            self.load_image()
            self.load_image_data = True

        chunk_iter = self._compartment_df_generator(
            compartment=compartment,
            n_aggregation_memory_strata=n_aggregation_memory_strata,
        )

        aggregated_chunks = []
        for chunk_df in chunk_iter:
            merged_df = self.image_df.merge(chunk_df, how="inner", on=self.merge_cols)
            merged_df = merged_df.rename(self.linking_col_rename, axis="columns")

            if self.features == "infer":
                chunk_features = infer_cp_features(merged_df, compartments=compartment)
            else:
                chunk_features = self.features

            chunk_result = aggregate(
                population_df=merged_df,
                strata=self.strata,
                compute_object_count=compute_counts,
                operation=self.aggregation_operation,
                subset_data_df=self.subset_data_df,
                features=chunk_features,
                object_feature=self.object_feature,
            )

            if compute_counts and self.fields_of_view_feature not in self.strata:
                site_counts_df = aggregate_fields_count(
                    self.image_df, self.strata, self.fields_of_view_feature
                )

                if add_image_features:
                    site_counts_df = aggregate_image_features(
                        site_counts_df,
                        self.image_features_df,
                        self.image_feature_categories,
                        self.image_cols,
                        self.strata,
                        self.aggregation_operation,
                    )

                # Right merge keeps every aggregated row of this chunk
                chunk_result = site_counts_df.merge(
                    chunk_result, on=self.strata, how="right"
                )

                # Reorder so the inferred metadata columns precede the
                # inferred feature columns.
                leading_cols = infer_cp_features(chunk_result, metadata=True)
                trailing_cols = infer_cp_features(chunk_result, image_features=True)
                chunk_result = chunk_result.reindex(
                    columns=leading_cols + trailing_cols
                )

            aggregated_chunks.append(chunk_result)

        # Stack the per-chunk results row-wise and renumber the index
        return pd.concat(aggregated_chunks, axis=0).reset_index(drop=True)
Exemple #8
0
    def aggregate_compartment(
        self,
        compartment,
        compute_subsample=False,
        compute_counts=False,
        aggregate_args=None,
    ):
        """Aggregate morphological profiles. Uses pycytominer.aggregate()

        Parameters
        ----------
        compartment : str
            Compartment to aggregate.
        compute_subsample : bool, default False
            If True, and subsampling is configured (``subsample_frac < 1`` or
            ``subsample_n != "all"``), run ``get_subsample`` first.
        compute_counts : bool, default False
            Forwarded to ``aggregate`` as ``compute_object_count``. When True
            and the fields-of-view feature is not one of the strata, a
            ``Metadata_Site_Count`` column is merged into the result.
        aggregate_args : dict, optional
            Additional arguments passed as input to pycytominer.aggregate().
            A missing ``"features"`` entry, or the value ``"infer"``, is
            replaced by features inferred for `compartment`; a missing
            ``"object_feature"`` entry falls back to ``self.object_feature``.
            The dict passed in is never modified.

        Returns
        -------
        pandas.core.frame.DataFrame
            DataFrame of aggregated profiles.

        """
        check_compartments(compartment)

        if (self.subsample_frac < 1 or self.subsample_n != "all") and compute_subsample:
            self.get_subsample(compartment=compartment)

        # Load image data if not already loaded
        if not self.load_image_data:
            self.load_image()
            self.load_image_data = True

        population_df = self.image_df.merge(
            self.load_compartment(compartment=compartment),
            how="inner",
            on=self.merge_cols,
        ).rename(self.linking_col_rename, axis="columns")

        # Work on a private copy. Writing the inferred features back into the
        # caller's dict would make a later call for another compartment treat
        # this compartment's features as an explicit selection.
        aggregate_args = {} if aggregate_args is None else dict(aggregate_args)

        # Inferring features is tricky with non-canonical data.
        # The isinstance guard keeps array-like selections from being compared
        # element-wise against the "infer" marker.
        requested_features = aggregate_args.get("features", "infer")
        if isinstance(requested_features, str) and requested_features == "infer":
            aggregate_args["features"] = infer_cp_features(
                population_df, compartments=compartment
            )

        aggregate_args.setdefault("object_feature", self.object_feature)

        object_df = aggregate(
            population_df=population_df,
            strata=self.strata,
            compute_object_count=compute_counts,
            operation=self.aggregation_operation,
            subset_data_df=self.subset_data_df,
            **aggregate_args,
        )

        if compute_counts and self.fields_of_view_feature not in self.strata:
            # Count fields-of-view entries per strata group from the image table
            fields_count_df = self.image_df.loc[
                :, list(np.union1d(self.strata, self.fields_of_view_feature))
            ]
            fields_count_df = (
                fields_count_df.groupby(self.strata)[self.fields_of_view_feature]
                .count()
                .reset_index()
                .rename(columns={self.fields_of_view_feature: "Metadata_Site_Count"})
            )

            # Right merge keeps every aggregated row
            object_df = fields_count_df.merge(object_df, on=self.strata, how="right")

        return object_df
Exemple #9
0
    def __init__(
        self,
        sql_file,
        strata=None,
        features="infer",
        operation="median",
        output_file="none",
        compartments=None,
        merge_cols=None,
        load_image_data=True,
        subsample_frac=1,
        subsample_n="all",
        subsampling_random_state="none",
    ):
        """
        Validate the configuration, connect to the database and load images

        Arguments:
        sql_file - string or sqlalchemy connection
        strata - [default: ["Metadata_Plate", "Metadata_Well"]] list indicating the columns to groupby and aggregate
        features - [default: "infer"] or list indicating features that should be aggregated
        operation - [default: "median"] a string indicating how the data is aggregated
                    currently only supports one of ['mean', 'median']
        output_file - [default: "none"] string if specified, write to location
        compartments - [default: ["cells", "cytoplasm", "nuclei"]] list of compartments to process
        merge_cols - [default: ["TableNumber", "ImageNumber"]] column indicating which columns to merge images and compartments
        load_image_data - [default: True] if True, call load_image() at the end of construction
        subsample_frac - [default: 1] float (0 < subsample <= 1) indicating percentage of
                         single cells to select
        subsample_n - [default: "all"] int indicating how many samples to include
        subsampling_random_state - [default: "none"] the random state to init subsample

        Raises:
        AssertionError if subsample_frac is not in (0, 1]
        """
        # List defaults are created per call: a list literal in the signature
        # would be a single object shared by every instance that relies on it.
        if strata is None:
            strata = ["Metadata_Plate", "Metadata_Well"]
        if compartments is None:
            compartments = ["cells", "cytoplasm", "nuclei"]
        if merge_cols is None:
            merge_cols = ["TableNumber", "ImageNumber"]

        # Check compartments specified
        check_compartments(compartments)

        # Check if correct operation is specified
        operation = check_aggregate_operation(operation)

        # Check that the subsample_frac is between 0 and 1
        assert 0 < subsample_frac <= 1, "subsample_frac must be between 0 and 1"

        self.sql_file = sql_file
        self.strata = strata
        self.features = features
        self.operation = operation.lower()
        self.output_file = output_file
        self.compartments = compartments
        self.merge_cols = merge_cols
        self.subsample_frac = subsample_frac
        self.subsample_n = subsample_n
        # "none" is the placeholder used until a subsample has been computed
        self.subset_data_df = "none"
        self.subsampling_random_state = subsampling_random_state
        self.is_aggregated = False
        self.is_subset_computed = False

        if self.subsample_n != "all":
            self.set_subsample_n(self.subsample_n)

        # Connect to sqlite engine
        self.engine = create_engine(self.sql_file)
        self.conn = self.engine.connect()

        # Throw an error if both subsample_frac and subsample_n is set
        self._check_subsampling()

        if load_image_data:
            self.load_image()