def aggregate_profiles( self, compute_subsample="False", output_file="none", compression=None, float_format=None, ): """ Aggregate and merge compartments. This is the primary entry to this class. Arguments: compute_subsample - [default: False] boolean if subsample should be computed. NOTE: Must be specified to perform subsampling. Will not apply subsetting if set to False even if subsample is initialized output_file - [default: "none"] if provided, will write annotated profiles to file if not specified, will return the annotated profiles. We recommend that this output file be suffixed with "_augmented.csv". compression - the mechanism to compress [default: None] float_format - decimal precision to use in writing output file [default: None] For example, use "%.3g" for 3 decimal precision. Return: if output_file is set, then write to file. If not then return """ if output_file != "none": self.set_output_file(output_file) aggregated = (self.aggregate_compartment( compartment="cells", compute_subsample=compute_subsample).merge( self.aggregate_compartment(compartment="cytoplasm"), on=self.strata, how="inner", ).merge( self.aggregate_compartment(compartment="nuclei"), on=self.strata, how="inner", )) self.is_aggregated = True if self.output_file != "none": output( df=aggregated, output_filename=self.output_file, compression=compression, float_format=float_format, ) else: return aggregated
def aggregate_profiles(
    self,
    compute_subsample=False,
    output_file="none",
    compression_options=None,
    float_format=None,
    n_aggregation_memory_strata=1,
):
    """Aggregate and merge compartments. This is the primary entry to this class.

    Parameters
    ----------
    compute_subsample : bool, default False
        Whether or not to compute subsample. compute_subsample must be specified
        to perform subsampling. The function aggregate_profiles(compute_subsample=True)
        will apply subsetting even if subsample is initialized.
    output_file : str, optional
        The name of a file to output. We recommend that, if provided, the output
        file be suffixed with "_augmented".
    compression_options : str, optional
        Compression arguments as input to pandas.to_csv() with pandas version >= 1.2.
    float_format : str, optional
        Decimal precision to use in writing output file.
    n_aggregation_memory_strata : int, default 1
        Number of unique strata to pull from the database into working memory at once.
        Typically 1 is fastest. A larger number uses more memory.

    Returns
    -------
    pandas.core.frame.DataFrame
        Either a dataframe (if output_file="none") or will write to file.
    """
    if output_file != "none":
        self.set_output_file(output_file)

    compartment_idx = 0
    for compartment in self.compartments:
        if compartment_idx == 0:
            aggregated = self.aggregate_compartment(
                compartment=compartment,
                compute_subsample=compute_subsample,
                compute_counts=True,
                add_image_features=self.add_image_features,
                n_aggregation_memory_strata=n_aggregation_memory_strata,
            )
        else:
            aggregated = aggregated.merge(
                self.aggregate_compartment(
                    compartment=compartment,
                    n_aggregation_memory_strata=n_aggregation_memory_strata,
                ),
                on=self.strata,
                how="inner",
            )
        compartment_idx += 1

    self.is_aggregated = True

    if self.output_file != "none":
        output(
            df=aggregated,
            output_filename=self.output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return aggregated
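# A minimal usage sketch for aggregate_profiles() above. The construction below
# (SingleCells class name, sql_file URI, strata columns) is an assumption for
# illustration and is not confirmed by the snippet itself; only the
# aggregate_profiles() arguments come from the definition above. Kept as a
# comment because it requires a CellProfiler SQLite backend to run.
#
# from pycytominer.cyto_utils.cells import SingleCells
#
# sc = SingleCells(
#     sql_file="sqlite:///backend/batch1/plate1/plate1.sqlite",
#     strata=["Image_Metadata_Plate", "Image_Metadata_Well"],
# )
# sc.aggregate_profiles(
#     output_file="plate1_augmented.csv.gz",
#     compression_options="gzip",
#     float_format="%.5g",
#     n_aggregation_memory_strata=1,  # raise to trade memory for speed
# )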
def merge_single_cells(
    self,
    compute_subsample=False,
    sc_output_file="none",
    compression_options=None,
    float_format=None,
    single_cell_normalize=False,
    normalize_args=None,
):
    """Given the linking columns, merge single cell data. Normalization is also supported.

    Parameters
    ----------
    compute_subsample : bool, default False
        Whether or not to compute subsample.
    sc_output_file : str, optional
        The name of a file to output.
    compression_options : str, optional
        Compression arguments as input to pandas.to_csv() with pandas version >= 1.2.
    float_format : str, optional
        Decimal precision to use in writing output file.
    single_cell_normalize : bool, default False
        Whether or not to normalize the single cell data.
    normalize_args : dict, optional
        Additional arguments passed as input to pycytominer.normalize().

    Returns
    -------
    pandas.core.frame.DataFrame
        Either a dataframe (if sc_output_file="none") or will write to file.
    """
    # Load the single cell dataframe by merging on the specific linking columns
    sc_df = ""
    linking_check_cols = []
    merge_suffix_rename = []
    for left_compartment in self.compartment_linking_cols:
        for right_compartment in self.compartment_linking_cols[left_compartment]:
            # Make sure only one merge per combination occurs
            linking_check = "-".join(sorted([left_compartment, right_compartment]))
            if linking_check in linking_check_cols:
                continue

            # Specify how to indicate merge suffixes
            merge_suffix = [
                "_{comp_l}".format(comp_l=left_compartment),
                "_{comp_r}".format(comp_r=right_compartment),
            ]
            merge_suffix_rename += merge_suffix
            left_link_col = self.compartment_linking_cols[left_compartment][
                right_compartment
            ]
            right_link_col = self.compartment_linking_cols[right_compartment][
                left_compartment
            ]

            if isinstance(sc_df, str):
                initial_df = self.load_compartment(compartment=left_compartment)

                if compute_subsample:
                    # Sample cells proportionally by self.strata
                    self.get_subsample(df=initial_df, rename_col=False)

                    subset_logic_df = self.subset_data_df.drop(
                        self.image_df.columns, axis="columns"
                    )

                    initial_df = subset_logic_df.merge(
                        initial_df, how="left", on=subset_logic_df.columns.tolist()
                    ).reindex(initial_df.columns, axis="columns")

                sc_df = initial_df.merge(
                    self.load_compartment(compartment=right_compartment),
                    left_on=self.merge_cols + [left_link_col],
                    right_on=self.merge_cols + [right_link_col],
                    suffixes=merge_suffix,
                )
            else:
                sc_df = sc_df.merge(
                    self.load_compartment(compartment=right_compartment),
                    left_on=self.merge_cols + [left_link_col],
                    right_on=self.merge_cols + [right_link_col],
                    suffixes=merge_suffix,
                )

            linking_check_cols.append(linking_check)

    # Add metadata prefix to merged suffixes
    full_merge_suffix_rename = []
    full_merge_suffix_original = []
    for col_name in self.merge_cols + list(self.linking_col_rename.keys()):
        full_merge_suffix_original.append(col_name)
        full_merge_suffix_rename.append("Metadata_{x}".format(x=col_name))

    for col_name in self.merge_cols + list(self.linking_col_rename.keys()):
        for suffix in set(merge_suffix_rename):
            full_merge_suffix_original.append("{x}{y}".format(x=col_name, y=suffix))
            full_merge_suffix_rename.append(
                "Metadata_{x}{y}".format(x=col_name, y=suffix)
            )

    self.full_merge_suffix_rename = dict(
        zip(full_merge_suffix_original, full_merge_suffix_rename)
    )

    # Add image data to single cell dataframe
    if not self.load_image_data:
        self.load_image()
        self.load_image_data = True

    sc_df = (
        self.image_df.merge(sc_df, on=self.merge_cols, how="right")
        .rename(self.linking_col_rename, axis="columns")
        .rename(self.full_merge_suffix_rename, axis="columns")
    )
    if single_cell_normalize:
        # Inferring features is tricky with non-canonical data
        if normalize_args is None:
            normalize_args = {}
            features = infer_cp_features(sc_df, compartments=self.compartments)
        elif "features" not in normalize_args:
            features = infer_cp_features(sc_df, compartments=self.compartments)
        elif normalize_args["features"] == "infer":
            features = infer_cp_features(sc_df, compartments=self.compartments)
        else:
            features = normalize_args["features"]

        normalize_args["features"] = features

        sc_df = normalize(profiles=sc_df, **normalize_args)

    if sc_output_file != "none":
        output(
            df=sc_df,
            output_filename=sc_output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return sc_df
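# A brief sketch of merge_single_cells() with in-flight normalization, assuming
# a SingleCells-style object `sc` as in the aggregate_profiles sketch above
# (hypothetical; requires a SQLite backend, hence kept as a comment). The
# normalize_args keys are forwarded to pycytominer.normalize() per the
# docstring above.
#
# sc_df = sc.merge_single_cells(
#     single_cell_normalize=True,
#     normalize_args={"method": "mad_robustize", "features": "infer"},
# )
# sc_df.to_csv("plate1_single_cell_normalized.csv.gz", index=False)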
def feature_select( profiles, features="infer", samples="all", operation="variance_threshold", output_file="none", na_cutoff=0.05, corr_threshold=0.9, corr_method="pearson", freq_cut=0.05, unique_cut=0.1, compression=None, float_format=None, blocklist_file=None, outlier_cutoff=15, ): """ Performs feature selection based on the given operation Arguments: profiles - either pandas DataFrame or a file that stores profile data features - list of cell painting features [default: "infer"] if "infer", then assume cell painting features are those that start with "Cells", "Nuclei", or "Cytoplasm" samples - if provided, a list of samples to provide operation on [default: "all"] - if "all", use all samples to calculate operation - str or list of given operations to perform on input profiles output_file - [default: "none"] if provided, will write annotated profiles to file if not specified, will return the annotated profiles. We recommend that this output file be suffixed with "_normalized_variable_selected.csv". na_cutoff - proportion of missing values in a column to tolerate before removing corr_threshold - float between (0, 1) to exclude features above [default: 0.9] freq_cut - float of ratio (2nd most common feature val / most common) [default: 0.1] unique_cut - float of ratio (num unique features / num samples) [default: 0.1] compression - the mechanism to compress [default: None] float_format - decimal precision to use in writing output file [default: None] For example, use "%.3g" for 3 decimal precision. blocklist_file - file location of dataframe with features to exclude [default: None] Note that if "blocklist" in operation then will remove standard blocklist outlier_cutoff - the threshold at which the maximum or minimum value of a feature across a full experiment is excluded [default: 15]. Note that this procedure is typically applied (and therefore the default is suitable) for after normalization. """ all_ops = [ "variance_threshold", "correlation_threshold", "drop_na_columns", "blocklist", "drop_outliers", ] # Make sure the user provides a supported operation if isinstance(operation, list): assert all([x in all_ops for x in operation ]), "Some operation(s) {} not supported. Choose {}".format( operation, all_ops) elif isinstance(operation, str): assert operation in all_ops, "{} not supported. 
Choose {}".format( operation, all_ops) operation = operation.split() else: return ValueError("Operation must be a list or string") # Load Data profiles = load_profiles(profiles) if features == "infer": features = infer_cp_features(profiles) excluded_features = [] for op in operation: if op == "variance_threshold": exclude = variance_threshold( population_df=profiles, features=features, samples=samples, freq_cut=freq_cut, unique_cut=unique_cut, ) elif op == "drop_na_columns": exclude = get_na_columns( population_df=profiles, features=features, samples=samples, cutoff=na_cutoff, ) elif op == "correlation_threshold": exclude = correlation_threshold( population_df=profiles, features=features, samples=samples, threshold=corr_threshold, method=corr_method, ) elif op == "blocklist": if blocklist_file: exclude = get_blocklist_features(population_df=profiles, blocklist_file=blocklist_file) else: exclude = get_blocklist_features(population_df=profiles) elif op == "drop_outliers": exclude = drop_outlier_features( population_df=profiles, features=features, samples=samples, outlier_cutoff=outlier_cutoff, ) excluded_features += exclude excluded_features = list(set(excluded_features)) selected_df = profiles.drop(excluded_features, axis="columns") if output_file != "none": output( df=selected_df, output_filename=output_file, compression=compression, float_format=float_format, ) else: return selected_df
def normalize(
    profiles,
    features="infer",
    meta_features="infer",
    samples="all",
    method="standardize",
    output_file="none",
    compression_options=None,
    float_format=None,
    spherize_center=True,
    spherize_method="ZCA-cor",
    spherize_epsilon=1e-6,
):
    """Normalize profiling features

    Parameters
    ----------
    profiles : {pandas.Dataframe, path}
        Either a pandas DataFrame or a file that stores profile data
    features : list
        A list of strings corresponding to feature measurement column names in the
        `profiles` DataFrame. All features listed must be found in `profiles`.
        Defaults to "infer". If "infer", then assume cell painting features are those
        prefixed with "Cells", "Nuclei", or "Cytoplasm".
    meta_features : list
        A list of strings corresponding to metadata column names in the `profiles`
        DataFrame. All features listed must be found in `profiles`. Defaults to
        "infer". If "infer", then assume metadata features are those prefixed with
        "Metadata"
    samples : str
        The metadata column values to use as a normalization reference. We often use
        control samples. The function uses a pd.query() function, so you should
        structure samples in this fashion. An example is
        "Metadata_treatment == 'control'" (include all quotes). Defaults to "all".
    method : str
        How to normalize the dataframe. Defaults to "standardize". Check
        avail_methods for available normalization methods.
    output_file : str
        If provided, will write normalized profiles to file. If not specified, will
        return the normalized profiles as output. We recommend that this output file
        be suffixed with "_normalized.csv". Defaults to "none".
    compression_options : {dict, None}
        Contain compression options as input to
        pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.
        Defaults to None.
    float_format : {str, None}
        Decimal precision to use in writing output file as input to
        pd.DataFrame.to_csv(float_format=float_format). For example, use "%.3g" for
        3 decimal precision. Defaults to None.
    spherize_center : bool
        If the function should center data before sphering (aka whitening). The
        function only uses this variable if method = "spherize". Defaults to True.
    spherize_method : str
        The sphering (aka whitening) normalization selection. The function only uses
        this variable if method = "spherize". Defaults to "ZCA-cor". See
        :py:func:`pycytominer.operations.transform` for available spherize methods.
    spherize_epsilon : float
        The sphering (aka whitening) fudge factor parameter. The function only uses
        this variable if method = "spherize". Defaults to 1e-6.

    Returns
    -------
    pd.DataFrame or None
        The normalized profile DataFrame. If output_file="none", then return the
        DataFrame. If you specify output_file, then write to file and do not return
        data.

    Examples
    --------
    import pandas as pd
    from pycytominer import normalize

    data_df = pd.DataFrame(
        {
            "Metadata_plate": ["a", "a", "a", "a", "b", "b", "b", "b"],
            "Metadata_treatment": [
                "drug",
                "drug",
                "control",
                "control",
                "drug",
                "drug",
                "control",
                "control",
            ],
            "x": [1, 2, 8, 2, 5, 5, 5, 1],
            "y": [3, 1, 7, 4, 5, 9, 6, 1],
            "z": [1, 8, 2, 5, 6, 22, 2, 2],
            "zz": [14, 46, 1, 6, 30, 100, 2, 2],
        }
    ).reset_index(drop=True)

    normalized_df = normalize(
        profiles=data_df,
        features=["x", "y", "z", "zz"],
        meta_features="infer",
        samples="Metadata_treatment == 'control'",
        method="standardize",
    )
    """
    # Load Data
    profiles = load_profiles(profiles)

    # Define which scaler to use
    method = method.lower()

    avail_methods = ["standardize", "robustize", "mad_robustize", "spherize"]
    assert method in avail_methods, "operation must be one of {}".format(avail_methods)

    if method == "standardize":
        scaler = StandardScaler()
    elif method == "robustize":
        scaler = RobustScaler()
    elif method == "mad_robustize":
        scaler = RobustMAD()
    elif method == "spherize":
        scaler = Spherize(
            center=spherize_center, method=spherize_method, epsilon=spherize_epsilon
        )

    if features == "infer":
        features = infer_cp_features(profiles)

    # Separate out the features and meta
    feature_df = profiles.loc[:, features]
    if meta_features == "infer":
        meta_features = infer_cp_features(profiles, metadata=True)

    meta_df = profiles.loc[:, meta_features]

    # Fit the sklearn scaler
    if samples == "all":
        fitted_scaler = scaler.fit(feature_df)
    else:
        # Subset to only the features measured in the sample query
        fitted_scaler = scaler.fit(profiles.query(samples).loc[:, features])

    # Scale the feature dataframe
    feature_df = pd.DataFrame(
        fitted_scaler.transform(feature_df),
        columns=feature_df.columns,
        index=feature_df.index,
    )

    normalized = meta_df.merge(feature_df, left_index=True, right_index=True)

    if output_file != "none":
        output(
            df=normalized,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return normalized
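# A follow-on sketch for method="spherize" using the same shape of data as the
# docstring example above; values are synthetic. The transform is fit on the
# control rows only and then applied to every row.
import pandas as pd

sphere_demo_df = pd.DataFrame(
    {
        "Metadata_treatment": [
            "drug", "drug", "control", "control",
            "drug", "drug", "control", "control",
        ],
        "x": [1, 2, 8, 2, 5, 5, 5, 1],
        "y": [3, 1, 7, 4, 5, 9, 6, 1],
    }
)

spherized_df = normalize(
    profiles=sphere_demo_df,
    features=["x", "y"],
    meta_features="infer",
    samples="Metadata_treatment == 'control'",
    method="spherize",
    spherize_method="ZCA-cor",  # the default shown in the signature above
)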
plate_file = plate_files[plate]
output_file = pathlib.Path(f"{sc_dir}/{plate}_normalized_featureselected.csv.gz")

# Set console output
print(f"Now performing feature selection for... {plate_file}")

sc_df = pd.read_csv(plate_file, low_memory=False)

print("Before feature selection:")
print(sc_df.shape)

sc_df = feature_select(
    profiles=sc_df,
    operation=feature_select_operations,
    na_cutoff=na_cutoff,
)

print("After feature selection:")
print(sc_df.shape)

# Output file to disk
output(
    df=sc_df,
    output_filename=output_file,
    sep=",",
    float_format="%.5f",
    compression_options=compression_options,
)

print("Done.")
print("\n\n")
def normalize(
    profiles,
    features="infer",
    meta_features="infer",
    samples="all",
    method="standardize",
    output_file="none",
    compression=None,
    float_format=None,
    whiten_center=True,
    whiten_method="ZCA",
):
    """
    Normalize features

    Arguments:
    profiles - either pandas DataFrame or a file that stores profile data
    features - list of cell painting features [default: "infer"]
        if "infer", then assume cell painting features are those that start with
        "Cells", "Nuclei", or "Cytoplasm"
    meta_features - if specified, then output these with specified features
        [default: "infer"]
    samples - string indicating which metadata column and values to use to subset;
        the control samples are often used here [default: 'all']
        the format of this variable will be used in a pd.query() function. An
        example is "Metadata_treatment == 'control'" (include all quotes)
    method - string indicating how the dataframe will be normalized
        [default: 'standardize']
    output_file - [default: "none"] if provided, will write normalized profiles
        to file. If not specified, will return the normalized profiles.
        We recommend that this output file be suffixed with "_normalized.csv".
    compression - the mechanism to compress [default: None]
    float_format - decimal precision to use in writing output file [default: None]
        For example, use "%.3g" for 3 decimal precision.
    whiten_center - if data should be centered before whitening transform
        [default: True] (only used if method = "whiten")
    whiten_method - the type of whitening normalization used [default: 'ZCA']
        (only used if method = "whiten")

    Return:
    A normalized DataFrame
    """
    # Load Data
    profiles = load_profiles(profiles)

    # Define which scaler to use
    method = method.lower()

    avail_methods = ["standardize", "robustize", "mad_robustize", "whiten"]
    assert method in avail_methods, "operation must be one of {}".format(avail_methods)

    if method == "standardize":
        scaler = StandardScaler()
    elif method == "robustize":
        scaler = RobustScaler()
    elif method == "mad_robustize":
        scaler = RobustMAD()
    elif method == "whiten":
        scaler = Whiten(center=whiten_center, method=whiten_method)

    if features == "infer":
        features = infer_cp_features(profiles)

    # Separate out the features and meta
    feature_df = profiles.loc[:, features]
    if meta_features == "infer":
        meta_features = infer_cp_features(profiles, metadata=True)

    meta_df = profiles.loc[:, meta_features]

    # Fit the sklearn scaler
    if samples == "all":
        fitted_scaler = scaler.fit(feature_df)
    else:
        # Subset to only the features measured in the sample query
        fitted_scaler = scaler.fit(profiles.query(samples).loc[:, features])

    # Scale the feature dataframe
    feature_df = pd.DataFrame(
        fitted_scaler.transform(feature_df),
        columns=feature_df.columns,
        index=feature_df.index,
    )

    normalized = meta_df.merge(feature_df, left_index=True, right_index=True)

    if output_file != "none":
        output(
            df=normalized,
            output_filename=output_file,
            compression=compression,
            float_format=float_format,
        )
    else:
        return normalized
features = infer_cp_features(df)
meta_features = infer_cp_features(df, metadata=True)

print(df.shape)
df.head(2)


# In[4]:


# Output feature selected file
output_file = pathlib.Path("data/cell_health_merged_feature_select.csv.gz")
output(
    df=df,
    output_filename=output_file,
    sep=",",
    compression_options={"method": "gzip", "mtime": 1},
)


# In[5]:


# Define cell health constants
barcode_col = "Metadata_pert_name"
gene_col = "Metadata_gene_name"

replicate_group_grit = {"replicate_id": barcode_col, "group_id": gene_col}

control_group_cut = ["Chr2", "Luc", "LacZ"]
control_group_pert = ["EMPTY"]
# ## Apply normalization, feature select, and output data

# In[12]:


normalized_df = normalize(
    merged_df,
    features="infer",
    meta_features="infer",
    samples="all",
    method="standardize",
)


# In[13]:


feature_select_df = feature_select(
    normalized_df,
    features="infer",
    operation=feature_select_opts,
    output_file="none",
    na_cutoff=na_cutoff,
    corr_threshold=corr_threshold,
)

print(feature_select_df.shape)
feature_select_df.head()


# In[14]:


output_filename = pathlib.Path(
    f"data/{batch}/{plate}_singlecell_normalized_feature_select.csv.gz"
)
output(feature_select_df, output_filename, compression="gzip", float_format="%.5g")
def consensus(
    profiles,
    replicate_columns=["Metadata_Plate", "Metadata_Well"],
    operation="median",
    features="infer",
    output_file="none",
    compression_options=None,
    float_format=None,
    modz_args={"method": "spearman"},
):
    """Form level 5 consensus profile data.

    :param profiles: A file or pandas DataFrame of profile data
    :type profiles: str, pandas.DataFrame
    :param replicate_columns: Metadata columns indicating which replicates to
        collapse, defaults to ["Metadata_Plate", "Metadata_Well"]
    :type replicate_columns: list
    :param operation: The method used to form consensus profiles, defaults to "median"
    :type operation: str
    :param features: The features to collapse, defaults to "infer"
    :type features: str, list
    :param output_file: If specified, the location to write the file, defaults to "none"
    :type output_file: str
    :param modz_args: Additional custom arguments passed as kwargs if operation="modz".
        See pycytominer.cyto_utils.modz for more details.
    :type modz_args: dict
    :param compression_options: the method to compress output data, defaults to None.
        See pycytominer.cyto_utils.output.py for options
    :type compression_options: str
    :param float_format: decimal precision to use in writing output file, defaults to
        None. For example, use "%.3g" for 3 decimal precision.

    :Example:

    import pandas as pd
    from pycytominer import consensus

    data_df = pd.concat(
        [
            pd.DataFrame(
                {
                    "Metadata_Plate": "X",
                    "Metadata_Well": "a",
                    "Cells_x": [0.1, 0.3, 0.8],
                    "Nuclei_y": [0.5, 0.3, 0.1],
                }
            ),
            pd.DataFrame(
                {
                    "Metadata_Plate": "X",
                    "Metadata_Well": "b",
                    "Cells_x": [0.4, 0.2, -0.5],
                    "Nuclei_y": [-0.8, 1.2, -0.5],
                }
            ),
        ]
    ).reset_index(drop=True)

    consensus_df = consensus(
        profiles=data_df,
        replicate_columns=["Metadata_Plate", "Metadata_Well"],
        operation="median",
        features="infer",
        output_file="none",
    )
    """
    # Confirm that the operation is supported
    check_consensus_operation(operation)

    # Load Data
    profiles = load_profiles(profiles)

    if operation == "modz":
        consensus_df = modz(
            population_df=profiles,
            replicate_columns=replicate_columns,
            features=features,
            **modz_args,
        )
    else:
        consensus_df = aggregate(
            population_df=profiles,
            strata=replicate_columns,
            features=features,
            operation=operation,
            subset_data_df="none",
        )

    if output_file != "none":
        output(
            df=consensus_df,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return consensus_df
consensus_file = pathlib.Path(batch, consensus_file)

consensus_df = all_consensus_dfs[batch][norm_strat][operation][
    "no_feat_select"
]

print(
    f" Now Writing: Feature selection: No; Consensus Operation: {operation}; Norm Strategy: {norm_strat}"
)
print(f" File: {consensus_file}")
print(consensus_df.shape)

output(
    df=consensus_df,
    output_filename=consensus_file,
    sep=",",
    float_format=float_format,
    compression_options=compression_options,
)

# With feature selection
consensus_feat_df = all_consensus_dfs[batch][norm_strat][operation][
    "feat_select"
]

consensus_feat_file = (
    f"{batch}_consensus_{operation}_feature_select{file_suffix}"
)
consensus_feat_file = pathlib.Path(batch, consensus_feat_file)

print(
print( f"Now aggregating by {aggregate_level}...with operation: {aggregate_operation}" ) logging.info( f"Aggregating by {aggregate_level}...with operation: {aggregate_operation}" ) aggregate_df = aggregate( population_df=single_cell_df, strata=aggregate_columns, features=aggregate_features, operation=aggregate_operation, ) # Define a dataset specific file aggregate_dataset_file = pathlib.Path( aggregate_output_dir, aggregate_output_file.name.replace(".csv.gz", f"_{data_split_site}.csv.gz"), ) output( aggregate_df, output_filename=aggregate_dataset_file, compression_options=compression, float_format=float_format, ) print("Finished 1.aggregate.") logging.info(f"Finished 1.aggregate.")
def annotate(
    profiles,
    platemap,
    join_on=["Metadata_well_position", "Metadata_Well"],
    output_file="none",
    add_metadata_id_to_platemap=True,
    format_broad_cmap=False,
    clean_cellprofiler=True,
    external_metadata="none",
    external_join_left="none",
    external_join_right="none",
    compression_options=None,
    float_format=None,
    cmap_args={},
):
    """
    Add metadata to aggregated profiles by merging profiles with platemap information

    Arguments:
    profiles - either pandas DataFrame or a file that stores profile data
    platemap - either pandas DataFrame or a file that stores platemap metadata
    join_on - list of length two indicating which variables to merge profiles and plate
        [default: ["Metadata_well_position", "Metadata_Well"]]. The first element
        indicates variable(s) in platemap and the second element indicates
        variable(s) in profiles to merge using. Note the setting of
        `add_metadata_id_to_platemap`
    output_file - [default: "none"] if provided, will write annotated profiles to file
        if not specified, will return the annotated profiles. We recommend
        that this output file be suffixed with "_augmented.csv".
    add_metadata_id_to_platemap - [default: True] boolean if the platemap variables
        possibly need "Metadata" pre-pended
    format_broad_cmap - [default: False] boolean if we need to add columns to make
        compatible with Broad CMAP naming conventions.
    clean_cellprofiler - [default: True] boolean if specific CellProfiler feature
        names should be cleaned
    external_metadata - [default: "none"] a string indicating a file with additional
        metadata information
    external_join_left - [default: "none"] the merge column in the profile metadata
    external_join_right - [default: "none"] the merge column in the external metadata
    compression_options - the mechanism to compress [default: None]
        See cyto_utils/output.py for options.
    float_format - decimal precision to use in writing output file [default: None]
        For example, use "%.3g" for 3 decimal precision.
    cmap_args - [default: {}] - potential keyword arguments for annotate_cmap().
        See cyto_utils/annotate_cmap.py for more details.

    Return:
    Pandas DataFrame of annotated profiles or written to file
    """
    # Load Data
    profiles = load_profiles(profiles)
    platemap = load_platemap(platemap, add_metadata_id_to_platemap)

    annotated = platemap.merge(
        profiles, left_on=join_on[0], right_on=join_on[1], how="inner"
    ).drop(join_on[0], axis="columns")

    # Add specific Connectivity Map (CMAP) formatting
    if format_broad_cmap:
        annotated = annotate_cmap(annotated, annotate_join_on=join_on[1], **cmap_args)

    if clean_cellprofiler:
        annotated = cp_clean(annotated)

    if not isinstance(external_metadata, pd.DataFrame):
        if external_metadata != "none":
            assert os.path.exists(
                external_metadata
            ), "external metadata at {} does not exist".format(external_metadata)

            external_metadata = pd.read_csv(external_metadata)

    if isinstance(external_metadata, pd.DataFrame):
        external_metadata.columns = [
            "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
            for x in external_metadata.columns
        ]

        annotated = (
            annotated.merge(
                external_metadata,
                left_on=external_join_left,
                right_on=external_join_right,
                how="left",
            )
            .reset_index(drop=True)
            .drop_duplicates()
        )

    # Reorder annotated metadata columns
    meta_cols = infer_cp_features(annotated, metadata=True)
    other_cols = annotated.drop(meta_cols, axis="columns").columns.tolist()

    annotated = annotated.loc[:, meta_cols + other_cols]

    if output_file != "none":
        output(
            df=annotated,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return annotated
def annotate(
    profiles,
    platemap,
    join_on=["Metadata_well_position", "Metadata_Well"],
    output_file="none",
    add_metadata_id_to_platemap=True,
    format_broad_cmap=False,
    clean_cellprofiler=True,
    external_metadata="none",
    external_join_left="none",
    external_join_right="none",
    compression_options=None,
    float_format=None,
    cmap_args={},
):
    """Add metadata to aggregated profiles.

    Parameters
    ----------
    profiles : pandas.core.frame.DataFrame or file
        DataFrame or file path of profiles.
    platemap : pandas.core.frame.DataFrame or file
        Dataframe or file path of platemap metadata.
    join_on : list or str, default: ["Metadata_well_position", "Metadata_Well"]
        Which variables to merge profiles and plate. The first element indicates
        variable(s) in platemap and the second element indicates variable(s) in
        profiles to merge using. Note the setting of `add_metadata_id_to_platemap`
    output_file : str, optional
        If not specified, will return the annotated profiles. We recommend that
        this output file be suffixed with "_augmented.csv".
    add_metadata_id_to_platemap : bool, default True
        Whether the plate map variables possibly need "Metadata" pre-pended
    format_broad_cmap : bool, default False
        Whether we need to add columns to make compatible with Broad CMAP naming
        conventions.
    clean_cellprofiler : bool, default True
        Clean specific CellProfiler feature names.
    external_metadata : str, optional
        File with additional metadata information
    external_join_left : str, optional
        Merge column in the profile metadata.
    external_join_right : str, optional
        Merge column in the external metadata.
    compression_options : str or dict, optional
        Contains compression options as input to
        pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.
    float_format : str, optional
        Decimal precision to use in writing output file as input to
        pd.DataFrame.to_csv(float_format=float_format). For example, use "%.3g" for
        3 decimal precision.
    cmap_args : dict, default {}
        Potential keyword arguments for annotate_cmap(). See
        cyto_utils/annotate_custom.py for more details.

    Returns
    -------
    annotated : pandas.core.frame.DataFrame, optional
        DataFrame of annotated features. If output_file="none", then return the
        DataFrame. If you specify output_file, then write to file and do not
        return data.
    """
    # Load Data
    profiles = load_profiles(profiles)
    platemap = load_platemap(platemap, add_metadata_id_to_platemap)

    annotated = platemap.merge(
        profiles, left_on=join_on[0], right_on=join_on[1], how="inner"
    ).drop(join_on[0], axis="columns")

    # Add specific Connectivity Map (CMAP) formatting
    if format_broad_cmap:
        annotated = annotate_cmap(annotated, annotate_join_on=join_on[1], **cmap_args)

    if clean_cellprofiler:
        annotated = cp_clean(annotated)

    if not isinstance(external_metadata, pd.DataFrame):
        if external_metadata != "none":
            assert os.path.exists(
                external_metadata
            ), "external metadata at {} does not exist".format(external_metadata)

            external_metadata = pd.read_csv(external_metadata)

    if isinstance(external_metadata, pd.DataFrame):
        external_metadata.columns = [
            "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
            for x in external_metadata.columns
        ]

        annotated = (
            annotated.merge(
                external_metadata,
                left_on=external_join_left,
                right_on=external_join_right,
                how="left",
            )
            .reset_index(drop=True)
            .drop_duplicates()
        )

    # Reorder annotated metadata columns
    meta_cols = infer_cp_features(annotated, metadata=True)
    other_cols = annotated.drop(meta_cols, axis="columns").columns.tolist()

    annotated = annotated.loc[:, meta_cols + other_cols]

    if output_file != "none":
        output(
            df=annotated,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return annotated
def aggregate_profiles(
    self,
    compute_subsample=False,
    output_file="none",
    compression_options=None,
    float_format=None,
    aggregate_args=None,
):
    """Aggregate and merge compartments. This is the primary entry to this class.

    Parameters
    ----------
    compute_subsample : bool, default False
        Whether or not to compute subsample. compute_subsample must be specified
        to perform subsampling. The function aggregate_profiles(compute_subsample=True)
        will apply subsetting even if subsample is initialized.
    output_file : str, optional
        The name of a file to output. We recommend that, if provided, the output
        file be suffixed with "_augmented".
    compression_options : str, optional
        Compression arguments as input to pandas.to_csv() with pandas version >= 1.2.
    float_format : str, optional
        Decimal precision to use in writing output file.
    aggregate_args : dict, optional
        Additional arguments passed as input to pycytominer.aggregate().

    Returns
    -------
    pandas.core.frame.DataFrame
        Either a dataframe (if output_file="none") or will write to file.
    """
    if output_file != "none":
        self.set_output_file(output_file)

    compartment_idx = 0
    for compartment in self.compartments:
        if compartment_idx == 0:
            aggregated = self.aggregate_compartment(
                compartment=compartment,
                compute_subsample=compute_subsample,
                compute_counts=True,
            )
        else:
            aggregated = aggregated.merge(
                self.aggregate_compartment(compartment=compartment),
                on=self.strata,
                how="inner",
            )
        compartment_idx += 1

    self.is_aggregated = True

    if self.output_file != "none":
        output(
            df=aggregated,
            output_filename=self.output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return aggregated
def process_profile(sql_file, batch, plate, pipeline):
    """
    Given batch details and a pipeline, process morphology profiles
    """
    assert batch in sql_file, "batch {} not recognized in sql file {}".format(
        batch, sql_file
    )
    assert plate in sql_file, "plate {} not recognized in sql file {}".format(
        plate, sql_file
    )

    # Set output directory information
    pipeline_output = pipeline["output_dir"]
    output_dir = os.path.join(pipeline_output, batch, plate)
    os.makedirs(output_dir, exist_ok=True)

    # Set output file information
    aggregate_out_file = os.path.join(output_dir, "{}.csv.gz".format(plate))
    annotate_out_file = os.path.join(output_dir, "{}_augmented.csv.gz".format(plate))
    normalize_out_file = os.path.join(output_dir, "{}_normalized.csv.gz".format(plate))
    feature_out_file = os.path.join(
        output_dir, "{}_normalized_feature_selected.csv.gz".format(plate)
    )

    # Load pipeline options
    compression = process_pipeline(pipeline["options"], option="compression")
    sc_float_format = process_pipeline(pipeline["options"], option="sc_float_format")
    samples = process_pipeline(pipeline["options"], option="samples")

    # Load and setup platemap info
    workspace_dir = pipeline["workspace_dir"]
    batch_dir = os.path.join(workspace_dir, "backend", batch)
    metadata_dir = os.path.join(workspace_dir, "metadata", batch)

    barcode_plate_map_file = os.path.join(
        metadata_dir, sorted(os.listdir(metadata_dir))[0]
    )
    barcode_plate_map_df = pd.read_csv(barcode_plate_map_file)
    plate_map_name = barcode_plate_map_df.query(
        "Assay_Plate_Barcode == @plate"
    ).Plate_Map_Name.values[0]
    plate_map_file = os.path.join(
        metadata_dir, "platemap", "{}.txt".format(plate_map_name)
    )
    plate_map_df = pd.read_csv(plate_map_file, sep="\t")
    plate_map_df.columns = [
        "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
        for x in plate_map_df.columns
    ]
    platemap_well_column = pipeline["platemap_well_column"]

    # Process Bulk profiles
    # Step 1: Aggregate
    aggregate_steps = pipeline["aggregate"]
    aggregate_features = aggregate_steps["features"]
    aggregate_operation = aggregate_steps["method"]
    aggregate_plate_column = aggregate_steps["plate_column"]
    aggregate_well_column = aggregate_steps["well_column"]

    strata = [aggregate_plate_column, aggregate_well_column]

    if "site_column" in aggregate_steps:
        aggregate_site_column = aggregate_steps["site_column"]
        strata += [aggregate_site_column]

    if aggregate_steps["perform"]:
        ap = AggregateProfiles(
            sql_file,
            strata=strata,
            features=aggregate_features,
            operation=aggregate_operation,
        )
        ap.aggregate_profiles(output_file=aggregate_out_file, compression=compression)

    if pipeline["count"]["perform"]:
        if not aggregate_steps["perform"]:
            ap = AggregateProfiles(
                sql_file,
                strata=strata,
                features=aggregate_features,
                operation=aggregate_operation,
            )

        count_dir = pipeline["count"]["output_dir"]
        os.makedirs(count_dir, exist_ok=True)

        cell_count_file = os.path.join(
            count_dir, "{}_{}_cell_count.tsv".format(batch, plate)
        )

        cell_count_df = ap.count_cells()
        cell_count_df = cell_count_df.merge(
            plate_map_df,
            left_on=aggregate_well_column,
            right_on=platemap_well_column,
        ).drop(platemap_well_column, axis="columns")
        cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

    # Annotate Profiles
    annotate_steps = pipeline["annotate"]
    annotate_well_column = annotate_steps["well_column"]
    if annotate_steps["perform"]:
        annotate(
            profiles=aggregate_out_file,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file=annotate_out_file,
            compression=compression,
        )

    # Normalize Profiles
    normalize_steps = pipeline["normalize"]
    norm_features = normalize_steps["features"]
    norm_method = normalize_steps["method"]
    if normalize_steps["perform"]:
        normalize(
            profiles=annotate_out_file,
            features=norm_features,
            samples=samples,
            method=norm_method,
            output_file=normalize_out_file,
            compression=compression,
        )

    # Apply feature selection
    feature_select_steps = pipeline["feature_select"]
    feature_select_operations = feature_select_steps["operations"]
    feature_select_features = feature_select_steps["features"]
    if feature_select_steps["perform"]:
        feature_select(
            profiles=normalize_out_file,
            features=feature_select_features,
            samples=samples,
            operation=feature_select_operations,
            output_file=feature_out_file,
            compression=compression,
            corr_threshold=0.9,
            corr_method="pearson",
        )

    sc_steps = pipeline["single_cell"]
    if sc_steps["perform"]:
        if not aggregate_steps["perform"]:
            ap = AggregateProfiles(
                sql_file,
                strata=strata,
                features=aggregate_features,
                operation=aggregate_operation,
            )

        # Load cells
        query = "select * from cells"
        cell_df = pd.read_sql(sql=query, con=ap.conn)

        # Load cytoplasm
        query = "select * from cytoplasm"
        cytoplasm_df = pd.read_sql(sql=query, con=ap.conn)

        # Load nuclei
        query = "select * from nuclei"
        nuclei_df = pd.read_sql(sql=query, con=ap.conn)

        # Merge single cells together
        sc_merged_df = (
            cell_df.merge(
                cytoplasm_df.drop("ObjectNumber", axis="columns"),
                left_on=["TableNumber", "ImageNumber", "ObjectNumber"],
                right_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Cells"],
                how="inner",
            )
            .drop("ObjectNumber", axis="columns")
            .merge(
                nuclei_df,
                left_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Nuclei"],
                right_on=["TableNumber", "ImageNumber", "ObjectNumber"],
                how="inner",
            )
        )

        # Merge image data info
        sc_merged_df = ap.image_df.merge(sc_merged_df, how="right", on=ap.merge_cols)

        # Make sure column names are correctly prefixed
        prefix = ["Metadata", "Cells", "Cytoplasm", "Nuclei"]
        cols = []
        for col in sc_merged_df.columns:
            if any([col.startswith(x) for x in prefix]):
                cols.append(col)
            else:
                cols.append(f"Metadata_{col}")

        sc_merged_df.columns = cols

        sc_merged_df = annotate(
            profiles=sc_merged_df,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file="none",
        )

        if sc_steps["normalize"]:
            sc_merged_df = normalize(
                profiles=sc_merged_df,
                features=norm_features,
                samples=samples,
                method=norm_method,
                output_file="none",
            )

        if sc_steps["feature_select"]:
            sc_merged_df = feature_select(
                profiles=sc_merged_df,
                features=feature_select_features,
                samples=samples,
                operation=feature_select_operations,
                output_file="none",
                corr_threshold=0.9,
                corr_method="pearson",
            )

        sc_pipeline_output = pipeline["sc_output_dir"]
        sc_output_dir = os.path.join(sc_pipeline_output, batch, plate)
        os.makedirs(sc_output_dir, exist_ok=True)

        # Set output file information
        sc_out_file = os.path.join(
            sc_output_dir, "{}_single_cell.csv.gz".format(plate)
        )
        output(
            df=sc_merged_df,
            output_filename=sc_out_file,
            compression="gzip",
            float_format=sc_float_format,
        )
# Add dose recoding information
anno_df = anno_df.assign(
    Metadata_dose_recode=(
        anno_df.Metadata_mmoles_per_liter.apply(
            lambda x: recode_dose(x, primary_dose_mapping, return_level=True)
        )
    )
)

# Reorder columns
metadata_cols = cyto_utils.infer_cp_features(anno_df, metadata=True)
cp_cols = cyto_utils.infer_cp_features(anno_df)
reindex_cols = metadata_cols + cp_cols
anno_df = anno_df.reindex(reindex_cols, axis="columns")

# Output annotated file
cyto_utils.output(
    df=anno_df,
    output_filename=anno_file,
    float_format=float_format,
    compression_options=compression,
)

# Normalize Profiles (DMSO Control) - Level 4A Data
norm_dmso_file = pathlib.PurePath(output_dir, f"{plate_name}_normalized_dmso.csv.gz")
normalize(
    profiles=anno_df,
    samples="Metadata_broad_sample == 'DMSO'",
    method=norm_method,
    output_file=norm_dmso_file,
    float_format=float_format,
    compression_options=compression,
)
def pipeline_feature_select(self, steps, suffix=None):
    feature_select_steps = steps
    pipeline_output = self.pipeline["output_dir"]

    level = feature_select_steps["level"]
    gct = feature_select_steps["gct"]
    feature_select_operations = feature_select_steps["operations"]
    feature_select_features = feature_select_steps["features"]

    all_plates_df = pd.DataFrame()

    for batch in self.profile_config:
        batch_df = pd.DataFrame()
        for plate in self.profile_config[batch]:
            output_dir = pathlib.PurePath(".", pipeline_output, batch, plate)
            if suffix:
                normalize_output_file = pathlib.PurePath(
                    output_dir, f"{plate}_normalized_{suffix}.csv.gz"
                )
                feature_select_output_file_plate = pathlib.PurePath(
                    output_dir,
                    f"{plate}_normalized_feature_select_{suffix}_plate.csv.gz",
                )
            else:
                normalize_output_file = pathlib.PurePath(
                    output_dir, f"{plate}_normalized.csv.gz"
                )
                feature_select_output_file_plate = pathlib.PurePath(
                    output_dir, f"{plate}_normalized_feature_select_plate.csv.gz"
                )

            if feature_select_features == "infer" and self.noncanonical:
                feature_select_features = cyto_utils.infer_cp_features(
                    pd.read_csv(normalize_output_file),
                    compartments=self.compartments,
                )

            df = pd.read_csv(normalize_output_file).assign(Metadata_batch=batch)

            if level == "plate":
                df = df.drop(columns=["Metadata_batch"])
                feature_select(
                    profiles=df,
                    features=feature_select_features,
                    operation=feature_select_operations,
                    output_file=feature_select_output_file_plate,
                    compression_options=self.pipeline_options["compression"],
                    float_format=self.pipeline_options["float_format"],
                )
            elif level == "batch":
                batch_df = concat_dataframes(batch_df, df)
            elif level == "all":
                all_plates_df = concat_dataframes(all_plates_df, df)

        if level == "batch":
            fs_df = feature_select(
                profiles=batch_df,
                features=feature_select_features,
                operation=feature_select_operations,
            )
            for plate in self.profile_config[batch]:
                output_dir = pathlib.PurePath(".", pipeline_output, batch, plate)
                if suffix:
                    feature_select_output_file_batch = pathlib.PurePath(
                        output_dir,
                        f"{plate}_normalized_feature_select_{suffix}_batch.csv.gz",
                    )
                else:
                    feature_select_output_file_batch = pathlib.PurePath(
                        output_dir,
                        f"{plate}_normalized_feature_select_batch.csv.gz",
                    )

                if feature_select_features == "infer" and self.noncanonical:
                    feature_select_features = cyto_utils.infer_cp_features(
                        batch_df, compartments=self.compartments
                    )

                df = fs_df.query("Metadata_Plate==@plate").reset_index(drop=True)
                df = df.drop(columns=["Metadata_batch"])
                cyto_utils.output(
                    output_filename=feature_select_output_file_batch,
                    df=df,
                    compression_options=self.pipeline_options["compression"],
                    float_format=self.pipeline_options["float_format"],
                )

            if gct:
                create_gct_directories(batch)
                if suffix:
                    stacked_file = pathlib.PurePath(
                        ".",
                        "gct",
                        batch,
                        f"{batch}_normalized_feature_select_{suffix}_batch.csv.gz",
                    )
                    gct_file = pathlib.PurePath(
                        ".",
                        "gct",
                        batch,
                        f"{batch}_normalized_feature_select_{suffix}_batch.gct",
                    )
                else:
                    stacked_file = pathlib.PurePath(
                        ".",
                        "gct",
                        batch,
                        f"{batch}_normalized_feature_select_batch.csv.gz",
                    )
                    gct_file = pathlib.PurePath(
                        ".",
                        "gct",
                        batch,
                        f"{batch}_normalized_feature_select_batch.gct",
                    )
                cyto_utils.output(
                    output_filename=stacked_file,
                    df=fs_df,
                    compression_options=self.pipeline_options["compression"],
                    float_format=self.pipeline_options["float_format"],
                )
                write_gct(profiles=fs_df, output_file=gct_file)

    if level == "all":
        fs_df = feature_select(
            profiles=all_plates_df,
            features=feature_select_features,
            operation=feature_select_operations,
        )
        for batch in self.profile_config:
            fs_batch_df = fs_df.loc[fs_df.Metadata_batch == batch].reset_index(
                drop=True
            )
            for plate in self.profile_config[batch]:
                output_dir = pathlib.PurePath(".", pipeline_output, batch, plate)
                if suffix:
                    feature_select_output_file_all = pathlib.PurePath(
                        output_dir,
                        f"{plate}_normalized_feature_select_{suffix}_all.csv.gz",
                    )
                else:
                    feature_select_output_file_all = pathlib.PurePath(
                        output_dir, f"{plate}_normalized_feature_select_all.csv.gz"
                    )

                if feature_select_features == "infer" and self.noncanonical:
                    feature_select_features = cyto_utils.infer_cp_features(
                        all_plates_df, compartments=self.compartments
                    )

                df = fs_batch_df.query("Metadata_Plate==@plate").reset_index(drop=True)
                df = df.drop(columns=["Metadata_batch"])
                cyto_utils.output(
                    output_filename=feature_select_output_file_all,
                    df=df,
                    compression_options=self.pipeline_options["compression"],
                    float_format=self.pipeline_options["float_format"],
                )

            if gct:
                create_gct_directories(batch)
                if suffix:
                    stacked_file = pathlib.PurePath(
                        ".",
                        "gct",
                        batch,
                        f"{batch}_normalized_feature_select_{suffix}_all.csv.gz",
                    )
                    gct_file = pathlib.PurePath(
                        ".",
                        "gct",
                        batch,
                        f"{batch}_normalized_feature_select_{suffix}_all.gct",
                    )
                else:
                    stacked_file = pathlib.PurePath(
                        ".",
                        "gct",
                        batch,
                        f"{batch}_normalized_feature_select_all.csv.gz",
                    )
                    gct_file = pathlib.PurePath(
                        ".",
                        "gct",
                        batch,
                        f"{batch}_normalized_feature_select_all.gct",
                    )
                cyto_utils.output(
                    output_filename=stacked_file,
                    df=fs_batch_df,
                    compression_options=self.pipeline_options["compression"],
                    float_format=self.pipeline_options["float_format"],
                )
                write_gct(profiles=fs_batch_df, output_file=gct_file)
else:
    warnings.warn(
        f"{site_file} does not exist. There must have been an error in processing"
    )

single_cell_df = pd.concat(single_cell_df, axis="rows").reset_index(drop=True)

# Perform the aggregation based on the defined levels and columns
aggregate_output_dir.mkdir(parents=True, exist_ok=True)
for aggregate_level, aggregate_columns in aggregate_levels.items():
    aggregate_output_file = aggregate_output_files[aggregate_level]

    print(
        f"Now aggregating by {aggregate_level}...with operation: {aggregate_operation}"
    )

    aggregate_df = aggregate(
        population_df=single_cell_df,
        strata=aggregate_columns,
        features=aggregate_features,
        operation=aggregate_operation,
    )

    output(
        aggregate_df,
        output_filename=aggregate_output_file,
        compression=compression,
        float_format=float_format,
    )
"unknown") # Set a timepoint variable only for batch 1 if batch == "2016_04_01_a549_48hr_batch1": spherized_df = spherized_df.assign(Metadata_time_point="48H") for operation in operations: output_file = pathlib.Path( f"{output_dir}/{batch}{spherized_string}{norm_strat}_consensus_{operation}.csv.gz" ) print(f" with consensus operation: {operation}") spherized_consensus_df = consensus( profiles=spherized_df, replicate_columns=replicate_cols, operation=operation, features=features, ) print(spherized_consensus_df.shape) output( df=spherized_consensus_df, output_filename=output_file, sep=",", float_format=float_format, compression_options=compression_options, ) print(" Done.") print("Batch done.\n")
def feature_select( profiles, features="infer", image_features=False, samples="all", operation="variance_threshold", output_file="none", na_cutoff=0.05, corr_threshold=0.9, corr_method="pearson", freq_cut=0.05, unique_cut=0.1, compression_options=None, float_format=None, blocklist_file=None, outlier_cutoff=15, noise_removal_perturb_groups=None, noise_removal_stdev_cutoff=None, ): """Performs feature selection based on the given operation. Parameters ---------- profiles : pandas.core.frame.DataFrame or file DataFrame or file of profiles. features : list A list of strings corresponding to feature measurement column names in the `profiles` DataFrame. All features listed must be found in `profiles`. Defaults to "infer". If "infer", then assume cell painting features are those prefixed with "Cells", "Nuclei", or "Cytoplasm". image_features: bool, default False Whether the profiles contain image features. samples : list or str, default "all" Samples to provide operation on. operation: list of str or str, default "variance_threshold Operations to perform on the input profiles. output_file : str, optional If provided, will write annotated profiles to file. If not specified, will return the normalized profiles as output. We recommend that this output file be suffixed with "_normalized_variable_selected.csv". na_cutoff : float, default 0.05 Proportion of missing values in a column to tolerate before removing. corr_threshold : float, default 0.1 Value between (0, 1) to exclude features above if any two features are correlated above this threshold. corr_method : str, default "pearson" Correlation type to compute. Allowed methods are "spearman", "kendall" and "pearson". freq_cut : float, default 0.05 Ratio (2nd most common feature val / most common). unique_cut: float, default 0.01 Ratio (num unique features / num samples). compression_options : str or dict, optional Contains compression options as input to pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2. float_format : str, optional Decimal precision to use in writing output file as input to pd.DataFrame.to_csv(float_format=float_format). For example, use "%.3g" for 3 decimal precision. blocklist_file : str, optional File location of datafrmame with with features to exclude. Note that if "blocklist" in operation then will remove standard blocklist outlier_cutoff : float, default 15 The threshold at which the maximum or minimum value of a feature across a full experiment is excluded. Note that this procedure is typically applied (and therefore the default is uitable) for after normalization. noise_removal_perturb_groups: str or list of str, optional Perturbation groups corresponding to rows in profiles or the the name of the metadata column containing this information. noise_removal_stdev_cutoff: float,optional Maximum mean feature standard deviation to be kept for noise removal, grouped by the identity of the perturbation from perturb_list. The data must already be normalized so that this cutoff can apply to all columns. Returns ------- selected_df : pandas.core.frame.DataFrame, optional The feature selected profile DataFrame. If output_file="none", then return the DataFrame. If you specify output_file, then write to file and do not return data. 
""" all_ops = [ "variance_threshold", "correlation_threshold", "drop_na_columns", "blocklist", "drop_outliers", "noise_removal", ] # Make sure the user provides a supported operation if isinstance(operation, list): assert all([x in all_ops for x in operation ]), "Some operation(s) {} not supported. Choose {}".format( operation, all_ops) elif isinstance(operation, str): assert operation in all_ops, "{} not supported. Choose {}".format( operation, all_ops) operation = operation.split() else: return ValueError("Operation must be a list or string") # Load Data profiles = load_profiles(profiles) if features == "infer": features = infer_cp_features(profiles, image_features=image_features) excluded_features = [] for op in operation: if op == "variance_threshold": exclude = variance_threshold( population_df=profiles, features=features, samples=samples, freq_cut=freq_cut, unique_cut=unique_cut, ) elif op == "drop_na_columns": exclude = get_na_columns( population_df=profiles, features=features, samples=samples, cutoff=na_cutoff, ) elif op == "correlation_threshold": exclude = correlation_threshold( population_df=profiles, features=features, samples=samples, threshold=corr_threshold, method=corr_method, ) elif op == "blocklist": if blocklist_file: exclude = get_blocklist_features(population_df=profiles, blocklist_file=blocklist_file) else: exclude = get_blocklist_features(population_df=profiles) elif op == "drop_outliers": exclude = drop_outlier_features( population_df=profiles, features=features, samples=samples, outlier_cutoff=outlier_cutoff, ) elif op == "noise_removal": exclude = noise_removal( population_df=profiles, features=features, noise_removal_perturb_groups=noise_removal_perturb_groups, noise_removal_stdev_cutoff=noise_removal_stdev_cutoff, ) excluded_features += exclude excluded_features = list(set(excluded_features)) selected_df = profiles.drop(excluded_features, axis="columns") if output_file != "none": output( df=selected_df, output_filename=output_file, compression_options=compression_options, float_format=float_format, ) else: return selected_df
def aggregate(
    population_df,
    strata=["Metadata_Plate", "Metadata_Well"],
    features="infer",
    operation="median",
    output_file="none",
    compute_object_count=False,
    object_feature="ObjectNumber",
    subset_data_df="none",
    compression_options=None,
    float_format=None,
):
    """Combine population dataframe variables by strata groups using given operation.

    Parameters
    ----------
    population_df : pandas.core.frame.DataFrame
        DataFrame to group and aggregate.
    strata : list of str, default ["Metadata_Plate", "Metadata_Well"]
        Columns to groupby and aggregate.
    features : list of str or str, default "infer"
        List of features that should be aggregated.
    operation : str, default "median"
        How the data is aggregated. Currently only supports one of
        ['mean', 'median'].
    output_file : str or file handle, optional
        If provided, will write aggregated profiles to file. If not specified,
        will return the aggregated profiles. We recommend naming the file based
        on the plate name.
    compute_object_count : bool, default False
        Whether or not to compute object counts.
    object_feature : str, default "ObjectNumber"
        Object number feature. Only used if compute_object_count=True.
    subset_data_df : pandas.core.frame.DataFrame
        How to subset the input.
    compression_options : str, optional
        The mechanism to compress.
    float_format : str, optional
        Decimal precision to use in writing output file.

    Returns
    -------
    pandas.core.frame.DataFrame
        DataFrame of aggregated features.
    """
    # Check that the operation is supported
    operation = check_aggregate_operation(operation)

    # Subset the data to specified samples
    if isinstance(subset_data_df, pd.DataFrame):
        population_df = subset_data_df.merge(
            population_df, how="left", on=subset_data_df.columns.tolist()
        ).reindex(population_df.columns, axis="columns")

    # Subset dataframe to only specified variables if provided
    strata_df = population_df.loc[:, strata]

    # Only extract single object column in preparation for count
    if compute_object_count:
        count_object_df = population_df.loc[:, np.union1d(strata, [object_feature])]
        count_object_df = (
            count_object_df.groupby(strata)[object_feature]
            .count()
            .reset_index()
            .rename(columns={f"{object_feature}": "Metadata_Object_Count"})
        )

    if features == "infer":
        features = infer_cp_features(population_df)

    population_df = population_df.loc[:, features]

    # Fix dtype of input features (they should all be floats!)
    convert_dict = {x: float for x in features}
    population_df = population_df.astype(convert_dict)

    # Merge back metadata used to aggregate by
    population_df = pd.concat([strata_df, population_df], axis="columns")

    # Perform aggregating function
    population_df = population_df.groupby(strata, dropna=False)

    if operation == "median":
        population_df = population_df.median().reset_index()
    else:
        population_df = population_df.mean().reset_index()

    # Compute object counts
    if compute_object_count:
        population_df = count_object_df.merge(population_df, on=strata, how="right")

    # Aggregated image number and object number do not make sense
    for col in ["ImageNumber", "ObjectNumber"]:
        if col in population_df.columns:
            population_df = population_df.drop([col], axis="columns")

    if output_file != "none":
        output(
            df=population_df,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )

    return population_df
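# A compact aggregate() example with object counting on a synthetic dataframe
# (values fabricated for illustration). Each well collapses to one median row,
# and compute_object_count=True adds a Metadata_Object_Count column.
import pandas as pd

cells_demo_df = pd.DataFrame(
    {
        "Metadata_Plate": ["p1", "p1", "p1", "p1"],
        "Metadata_Well": ["A01", "A01", "A02", "A02"],
        "ObjectNumber": [1, 2, 1, 2],
        "Cells_x": [1.0, 3.0, 2.0, 4.0],
    }
)

well_demo_df = aggregate(
    population_df=cells_demo_df,
    strata=["Metadata_Plate", "Metadata_Well"],
    features=["Cells_x"],
    operation="median",
    compute_object_count=True,
)
# well_demo_df: one row per well, Metadata_Object_Count == 2, Cells_x medians 2.0 and 3.0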
    na_cutoff=na_cut)
else:
    profile_df = feature_select(
        profiles=profile_df,
        operation=feature_select_ops,
        na_cutoff=na_cut,
        corr_threshold=corr_threshold,
        blocklist_file=full_blocklist_file,
    )

# Step 2: Spherize transform
if batch == "2017_12_05_Batch2":
    spherize_df = (
        profile_df.groupby(["Metadata_cell_line", "Metadata_time_point"]).apply(
            lambda x: normalize(
                profiles=x,
                features="infer",
                meta_features="infer",
                samples="Metadata_broad_sample == 'DMSO'",
                method="spherize",
            )
        )
    )
else:
    spherize_df = normalize(
        profiles=profile_df,
        features="infer",
        meta_features="infer",
        samples="Metadata_broad_sample == 'DMSO'",
        method="spherize",
    )

print(spherize_df.shape)
spherize_df.head()

# Step 3: Output profiles
output(df=spherize_df, output_filename=output_file)