def get_subsample(self, df=None, compartment="cells", rename_col=True):
    """Subsample object identifiers within each strata group.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame, optional
        Object-level table to subsample. When omitted, the ``TableNumber``,
        ``ImageNumber`` and ``ObjectNumber`` columns of `compartment` are read
        through ``self.conn`` instead.
    compartment : str, default "cells"
        Compartment table queried when `df` is not supplied.
    rename_col : bool, default True
        Forwarded unchanged to ``self.subsample_profiles``.

    Returns
    -------
    None
        The subsample is stored in ``self.subset_data_df`` and
        ``self.is_subset_computed`` is set to True.
    """
    check_compartments(compartment)

    # Only hit the database when the caller did not hand over a table
    if df is None:
        id_columns = "TableNumber, ImageNumber, ObjectNumber"
        df = pd.read_sql(
            sql="select {} from {}".format(id_columns, compartment),
            con=self.conn,
        )

    def _subsample_group(group_df):
        return self.subsample_profiles(group_df, rename_col=rename_col)

    # Attach image-level metadata, then sample independently per strata group
    merged_df = self.image_df.merge(df, how="inner", on=self.merge_cols)
    sampled_df = merged_df.groupby(self.strata).apply(_subsample_group)

    self.subset_data_df = sampled_df.reset_index(drop=True)
    self.is_subset_computed = True
def aggregate_compartment(self, compartment, compute_subsample=False):
    """
    Aggregate the morphological profiles of a single compartment

    Arguments:
    compartment - str indicating specific compartment to extract
    compute_subsample - [default: False] when True and a subsample is
                        configured (subsample_frac < 1 or subsample_n is not
                        "all"), call get_subsample() before aggregating

    Return:
    The dataframe produced by aggregate() for this compartment
    """
    check_compartments(compartment)

    subsampling_configured = self.subsample_frac < 1 or self.subsample_n != "all"
    if subsampling_configured and compute_subsample:
        self.get_subsample(compartment=compartment)

    # Pull the full compartment table and attach the image-level metadata
    compartment_df = pd.read_sql(
        sql="select * from {}".format(compartment), con=self.conn
    )
    population_df = self.image_df.merge(
        compartment_df, how="inner", on=self.merge_cols
    )

    return aggregate(
        population_df=population_df,
        strata=self.strata,
        features=self.features,
        operation=self.operation,
        subset_data_df=self.subset_data_df,
    )
def count_cells(self, compartment="cells", count_subset=False):
    """
    Count the number of objects in each strata group.

    Arguments:
    compartment - string indicating the compartment to subset
    count_subset - [default: False] count the objects held in the previously
                   computed subsample (self.subset_data_df) instead of the
                   full compartment table

    Return:
    Dataframe with the strata columns and a "cell_count" column
    """
    check_compartments(compartment)

    if count_subset:
        assert self.is_aggregated, "Make sure to aggregate_profiles() first!"
        assert self.is_subset_computed, "Make sure to get_subsample() first!"
        object_df = self.subset_data_df
    else:
        # Read only the identifier columns and attach image-level metadata
        id_columns = "TableNumber, ImageNumber, ObjectNumber"
        compartment_ids = pd.read_sql(
            sql="select {} from {}".format(id_columns, compartment),
            con=self.conn,
        )
        object_df = self.image_df.merge(
            compartment_ids, how="inner", on=self.merge_cols
        )

    # Both branches count the same column, so the tally is shared
    per_group = object_df.groupby(self.strata)["ObjectNumber"].count()
    return per_group.reset_index().rename(
        {"ObjectNumber": "cell_count"}, axis="columns"
    )
def count_cells(self, compartment="cells", count_subset=False):
    """Count the number of objects in each strata group.

    Parameters
    ----------
    compartment : str, default "cells"
        Compartment table to count when `count_subset` is False.
    count_subset : bool, default False
        When True, count the rows of the previously computed subsample
        (``self.subset_data_df``) instead of querying the compartment table.
        Requires both ``self.is_aggregated`` and ``self.is_subset_computed``.

    Returns
    -------
    pandas.core.frame.DataFrame
        One row per strata group with a ``cell_count`` column.
    """
    check_compartments(compartment)

    if count_subset:
        assert self.is_aggregated, "Make sure to aggregate_profiles() first!"
        assert self.is_subset_computed, "Make sure to get_subsample() first!"
        object_df = self.subset_data_df
        # The subsample is counted on the "Metadata_"-prefixed object column
        object_col = "Metadata_ObjectNumber"
    else:
        id_columns = "TableNumber, ImageNumber, ObjectNumber"
        compartment_ids = pd.read_sql(
            sql="select {} from {}".format(id_columns, compartment),
            con=self.conn,
        )
        object_df = self.image_df.merge(
            compartment_ids, how="inner", on=self.merge_cols
        )
        object_col = "ObjectNumber"

    per_group = object_df.groupby(self.strata)[object_col].count()
    return per_group.reset_index().rename(
        {object_col: "cell_count"}, axis="columns"
    )
def get_subsample(self, compartment="cells"):
    """
    Draw a per-strata subsample of object identifiers from the sqlite file

    Arguments:
    compartment - [default: "cells"] string indicating the compartment to subset

    Return:
    Nothing; the subsample is stored in self.subset_data_df and
    self.is_subset_computed is set to True
    """
    check_compartments(compartment)

    # Read only the identifier columns of the compartment table
    id_columns = "TableNumber, ImageNumber, ObjectNumber"
    compartment_ids = pd.read_sql(
        sql="select {} from {}".format(id_columns, compartment),
        con=self.conn,
    )

    # Attach image-level metadata, then sample independently per strata group
    merged_df = self.image_df.merge(
        compartment_ids, how="inner", on=self.merge_cols
    )

    def _subsample_group(group_df):
        return self.subsample_profiles(group_df)

    sampled_df = merged_df.groupby(self.strata).apply(_subsample_group)

    self.subset_data_df = sampled_df.reset_index(drop=True)
    self.is_subset_computed = True
def __init__(
    self,
    file_or_conn,
    strata=["Metadata_Plate", "Metadata_Well"],
    aggregation_operation="median",
    output_file="none",
    compartments=default_compartments,
    compartment_linking_cols=default_linking_cols,
    merge_cols=["TableNumber", "ImageNumber"],
    image_cols=["TableNumber", "ImageNumber", "Metadata_Site"],
    add_image_features=False,
    image_feature_categories=None,
    features="infer",
    load_image_data=True,
    subsample_frac=1,
    subsample_n="all",
    subsampling_random_state="none",
    fields_of_view="all",
    fields_of_view_feature="Metadata_Site",
    object_feature="Metadata_ObjectNumber",
):
    """Validate the configuration, store it, and connect to the database.

    Parameters
    ----------
    file_or_conn : str
        Passed to ``create_engine`` to open the database connection.
    strata : list of str
        Columns stored as ``self.strata`` (grouping columns).
    aggregation_operation : str, default "median"
        Validated by ``check_aggregate_operation`` and stored lower-cased.
    output_file : str, default "none"
        Stored as ``self.output_file``.
    compartments : list of str
        Validated by ``check_compartments``.
    compartment_linking_cols : dict
        Checked against `compartments` by ``assert_linking_cols_complete``.
    merge_cols : list of str
        Stored as ``self.merge_cols`` (merge key columns).
    image_cols : list of str
        Stored as ``self.image_cols``.
    add_image_features : bool, default False
        Stored as ``self.add_image_features``.
    image_feature_categories : optional
        Stored as ``self.image_feature_categories``.
    features : str or list of str, default "infer"
        Stored as ``self.features``.
    load_image_data : bool, default True
        When True, ``self.load_image()`` is called at the end of construction.
    subsample_frac : float, default 1
        Must satisfy ``0 < subsample_frac <= 1``.
    subsample_n : str or int, default "all"
        Any value other than "all" is passed to ``self.set_subsample_n``.
    subsampling_random_state : str or int, default "none"
        Stored as ``self.subsampling_random_state``.
    fields_of_view : default "all"
        Validated by ``check_fields_of_view_format``.
    fields_of_view_feature : str, default "Metadata_Site"
        Stored as ``self.fields_of_view_feature``.
    object_feature : str, default "Metadata_ObjectNumber"
        Stored as ``self.object_feature``.

    Raises
    ------
    AssertionError
        If `subsample_frac` is outside ``(0, 1]``.
    """
    # Validate the inputs before any state is stored
    check_compartments(compartments)
    aggregation_operation = check_aggregate_operation(aggregation_operation)
    assert 0 < subsample_frac <= 1, "subsample_frac must be between 0 and 1"

    # Data source and output
    self.file_or_conn = file_or_conn
    self.output_file = output_file
    self.load_image_data = load_image_data

    # Grouping, merging and compartment layout
    self.strata = strata
    self.merge_cols = merge_cols
    self.image_cols = image_cols
    self.compartments = compartments
    self.compartment_linking_cols = compartment_linking_cols
    self.fields_of_view_feature = fields_of_view_feature
    self.object_feature = object_feature

    # Aggregation settings
    self.aggregation_operation = aggregation_operation.lower()
    self.features = features
    self.add_image_features = add_image_features
    self.image_feature_categories = image_feature_categories

    # Subsampling settings; "none" marks that no subsample exists yet
    self.subsample_frac = subsample_frac
    self.subsample_n = subsample_n
    self.subsampling_random_state = subsampling_random_state
    self.subset_data_df = "none"

    # Progress flags
    self.is_aggregated = False
    self.is_subset_computed = False

    # Confirm that the compartments and linking cols are formatted properly
    assert_linking_cols_complete(
        compartments=self.compartments,
        linking_cols=self.compartment_linking_cols,
    )

    # Build a dictionary to update linking column feature names
    self.linking_col_rename = provide_linking_cols_feature_name_update(
        self.compartment_linking_cols
    )

    if self.subsample_n != "all":
        self.set_subsample_n(self.subsample_n)

    # Connect to sqlite engine
    self.engine = create_engine(self.file_or_conn)
    self.conn = self.engine.connect()

    # Throw an error if both subsample_frac and subsample_n is set
    self._check_subsampling()

    # Confirm that the input fields of view is valid
    self.fields_of_view = check_fields_of_view_format(fields_of_view)

    if self.load_image_data:
        self.load_image()
def aggregate_compartment(
    self,
    compartment,
    compute_subsample=False,
    compute_counts=False,
    add_image_features=False,
    n_aggregation_memory_strata=1,
):
    """Aggregate morphological profiles. Uses pycytominer.aggregate()

    The compartment table is processed in chunks produced by
    ``self._compartment_df_generator``; each chunk is merged with the image
    table, aggregated, and the per-chunk results are concatenated row-wise.

    Parameters
    ----------
    compartment : str
        Compartment to aggregate.
    compute_subsample : bool, default False
        Whether or not to subsample. Subsampling only runs when a subsample
        is configured (``subsample_frac < 1`` or ``subsample_n != "all"``).
    compute_counts : bool, default False
        Whether or not to compute the number of objects in each compartment
        and the number of fields of view per well.
    add_image_features : bool, default False
        Whether or not to add image features. Only takes effect when the
        field-of-view counts are computed.
    n_aggregation_memory_strata : int, default 1
        Number of unique strata to pull from the database into working memory
        at once. Typically 1 is fastest. A larger number uses more memory.
        For example, if aggregating by "well", then
        n_aggregation_memory_strata=1 means that one "well" will be pulled
        from the SQLite database into memory at a time.

    Returns
    -------
    pandas.core.frame.DataFrame
        DataFrame of aggregated profiles.
    """
    check_compartments(compartment)

    # Load image data if not already loaded. This has to happen before
    # get_subsample(), because subsampling merges against self.image_df just
    # like the chunk merge below does.
    if not self.load_image_data:
        self.load_image()
        self.load_image_data = True

    if (self.subsample_frac < 1 or self.subsample_n != "all") and compute_subsample:
        self.get_subsample(compartment=compartment)

    # Field-of-view counts only make sense when the field-of-view column is
    # not itself one of the grouping columns.
    add_field_counts = (
        compute_counts and self.fields_of_view_feature not in self.strata
    )

    # Iteratively call aggregate() on chunks of the full compartment table
    object_dfs = []
    for compartment_df in self._compartment_df_generator(
        compartment=compartment,
        n_aggregation_memory_strata=n_aggregation_memory_strata,
    ):
        population_df = self.image_df.merge(
            compartment_df,
            how="inner",
            on=self.merge_cols,
        ).rename(self.linking_col_rename, axis="columns")

        if self.features == "infer":
            aggregate_features = infer_cp_features(
                population_df, compartments=compartment
            )
        else:
            aggregate_features = self.features

        partial_object_df = aggregate(
            population_df=population_df,
            strata=self.strata,
            compute_object_count=compute_counts,
            operation=self.aggregation_operation,
            subset_data_df=self.subset_data_df,
            features=aggregate_features,
            object_feature=self.object_feature,
        )

        if add_field_counts:
            # NOTE(review): the inputs here do not depend on the current
            # chunk, so this could be hoisted out of the loop if the helpers
            # are side-effect free - confirm before changing.
            fields_count_df = aggregate_fields_count(
                self.image_df, self.strata, self.fields_of_view_feature
            )

            if add_image_features:
                fields_count_df = aggregate_image_features(
                    fields_count_df,
                    self.image_features_df,
                    self.image_feature_categories,
                    self.image_cols,
                    self.strata,
                    self.aggregation_operation,
                )

            # Right merge keeps exactly the strata present in this chunk
            partial_object_df = fields_count_df.merge(
                partial_object_df,
                on=self.strata,
                how="right",
            )

            # Separate all the metadata and feature columns.
            metadata_cols = infer_cp_features(partial_object_df, metadata=True)
            feature_cols = infer_cp_features(partial_object_df, image_features=True)

            partial_object_df = partial_object_df.reindex(
                columns=metadata_cols + feature_cols
            )

        object_dfs.append(partial_object_df)

    # Concatenate one or more aggregated dataframes row-wise into final output
    object_df = pd.concat(object_dfs, axis=0).reset_index(drop=True)

    return object_df
def aggregate_compartment(
    self,
    compartment,
    compute_subsample=False,
    compute_counts=False,
    aggregate_args=None,
):
    """Aggregate morphological profiles. Uses pycytominer.aggregate()

    Parameters
    ----------
    compartment : str
        Compartment to aggregate.
    compute_subsample : bool, default False
        Whether or not to subsample. Subsampling only runs when a subsample
        is configured (``subsample_frac < 1`` or ``subsample_n != "all"``).
    compute_counts : bool, default False
        Whether or not to compute the number of objects in each compartment
        and the number of fields of view per well.
    aggregate_args : dict, optional
        Additional arguments passed as input to pycytominer.aggregate().
        The dictionary is copied and never modified, so the same object can
        be reused across calls for different compartments. A missing
        "features" entry, or the value "infer", triggers feature inference
        for `compartment`; a missing "object_feature" entry falls back to
        ``self.object_feature``.

    Returns
    -------
    pandas.core.frame.DataFrame
        DataFrame of aggregated profiles.
    """
    check_compartments(compartment)

    # Load image data if not already loaded. This has to happen before
    # get_subsample(), because subsampling merges against self.image_df just
    # like the merge below does.
    if not self.load_image_data:
        self.load_image()
        self.load_image_data = True

    if (self.subsample_frac < 1 or self.subsample_n != "all") and compute_subsample:
        self.get_subsample(compartment=compartment)

    population_df = self.image_df.merge(
        self.load_compartment(compartment=compartment),
        how="inner",
        on=self.merge_cols,
    ).rename(self.linking_col_rename, axis="columns")

    # Work on a private copy: writing the inferred features back into the
    # caller's dictionary would make a later call for another compartment
    # reuse this compartment's feature list.
    aggregate_args = {} if aggregate_args is None else dict(aggregate_args)

    # Infering features is tricky with non-canonical data
    features = aggregate_args.get("features", "infer")
    # isinstance guard keeps list/array feature collections out of the
    # string comparison
    if isinstance(features, str) and features == "infer":
        features = infer_cp_features(population_df, compartments=compartment)
    aggregate_args["features"] = features

    aggregate_args.setdefault("object_feature", self.object_feature)

    object_df = aggregate(
        population_df=population_df,
        strata=self.strata,
        compute_object_count=compute_counts,
        operation=self.aggregation_operation,
        subset_data_df=self.subset_data_df,
        **aggregate_args,
    )

    # Field-of-view counts only make sense when the field-of-view column is
    # not itself one of the grouping columns.
    if compute_counts and self.fields_of_view_feature not in self.strata:
        count_cols = list(np.union1d(self.strata, self.fields_of_view_feature))
        fields_count_df = (
            self.image_df.loc[:, count_cols]
            .groupby(self.strata)[self.fields_of_view_feature]
            .count()
            .reset_index()
            .rename(columns={self.fields_of_view_feature: "Metadata_Site_Count"})
        )

        # Right merge keeps exactly the strata present in the aggregate
        object_df = fields_count_df.merge(object_df, on=self.strata, how="right")

    return object_df
def __init__(
    self,
    sql_file,
    strata=["Metadata_Plate", "Metadata_Well"],
    features="infer",
    operation="median",
    output_file="none",
    compartments=["cells", "cytoplasm", "nuclei"],
    merge_cols=["TableNumber", "ImageNumber"],
    load_image_data=True,
    subsample_frac=1,
    subsample_n="all",
    subsampling_random_state="none",
):
    """
    Validate the configuration, store it, and connect to the sqlite file

    Arguments:
    sql_file - string or sqlalchemy connection
    strata - [default: ["Metadata_Plate", "Metadata_Well"]] list indicating the
             columns to groupby and aggregate
    features - [default: "infer"] or list indicating features that should be
               aggregated
    operation - [default: "median"] a string indicating how the data is
                aggregated; validated by check_aggregate_operation()
    output_file - [default: "none"] string if specified, write to location
    compartments - list of compartments to process
    merge_cols - columns used to merge images and compartments
    load_image_data - [default: True] call load_image() at the end of
                      construction
    subsample_frac - [default: 1] float (0 < subsample <= 1) indicating
                     percentage of single cells to select
    subsample_n - [default: "all"] int indicating how many samples to include
    subsampling_random_state - [default: "none"] the random state to init
                               subsample
    """
    # Validate the inputs before any state is stored
    check_compartments(compartments)
    operation = check_aggregate_operation(operation)
    assert 0 < subsample_frac <= 1, "subsample_frac must be between 0 and 1"

    # Data source and output
    self.sql_file = sql_file
    self.output_file = output_file

    # Grouping, merging and aggregation settings
    self.strata = strata
    self.merge_cols = merge_cols
    self.compartments = compartments
    self.features = features
    self.operation = operation.lower()

    # Subsampling settings; "none" marks that no subsample exists yet
    self.subsample_frac = subsample_frac
    self.subsample_n = subsample_n
    self.subsampling_random_state = subsampling_random_state
    self.subset_data_df = "none"

    # Progress flags
    self.is_aggregated = False
    self.is_subset_computed = False

    if self.subsample_n != "all":
        self.set_subsample_n(self.subsample_n)

    # Connect to sqlite engine
    self.engine = create_engine(self.sql_file)
    self.conn = self.engine.connect()

    # Throw an error if both subsample_frac and subsample_n is set
    self._check_subsampling()

    if load_image_data:
        self.load_image()