def test_load_profiles():
    profiles = load_profiles(output_data_file)
    pd.testing.assert_frame_equal(data_df, profiles)

    profiles_gzip = load_profiles(output_data_gzip_file)
    pd.testing.assert_frame_equal(data_df, profiles_gzip)

    platemap = load_platemap(output_data_comma_file, add_metadata_id=False)
    pd.testing.assert_frame_equal(data_df, platemap)

    profiles_from_frame = load_profiles(data_df)
    pd.testing.assert_frame_equal(data_df, profiles_from_frame)
def normalize(
    profiles,
    features="infer",
    meta_features="infer",
    samples="all",
    method="standardize",
    output_file="none",
    compression=None,
    float_format=None,
    whiten_center=True,
    whiten_method="ZCA",
):
    """
    Normalize features

    Arguments:
    profiles - either pandas DataFrame or a file that stores profile data
    features - list of cell painting features [default: "infer"]
        if "infer", then assume cell painting features are those that start
        with "Cells", "Nuclei", or "Cytoplasm"
    meta_features - if specified, then output these with specified features
        [default: "infer"]
    samples - string indicating which metadata column and values to use to
        subset; the control samples are often used here [default: 'all']
        the format of this variable will be used in a pd.query() function. An
        example is "Metadata_treatment == 'control'" (include all quotes)
    method - string indicating how the dataframe will be normalized
        [default: 'standardize']
    output_file - [default: "none"] if provided, will write normalized
        profiles to file; if not specified, will return the normalized
        profiles. We recommend that this output file be suffixed with
        "_normalized.csv".
    compression - the mechanism to compress [default: None]
    float_format - decimal precision to use in writing output file
        [default: None] For example, use "%.3g" for 3 decimal precision.
    whiten_center - if data should be centered before whitening transform
        [default: True] (only used if method = "whiten")
    whiten_method - the type of whitening normalization used [default: 'ZCA']
        (only used if method = "whiten")

    Return:
    A normalized DataFrame
    """
    # Load Data
    profiles = load_profiles(profiles)

    # Define which scaler to use
    method = method.lower()

    avail_methods = ["standardize", "robustize", "mad_robustize", "whiten"]
    assert method in avail_methods, "operation must be one of {}".format(avail_methods)

    if method == "standardize":
        scaler = StandardScaler()
    elif method == "robustize":
        scaler = RobustScaler()
    elif method == "mad_robustize":
        scaler = RobustMAD()
    elif method == "whiten":
        scaler = Whiten(center=whiten_center, method=whiten_method)

    if features == "infer":
        features = infer_cp_features(profiles)

    # Separate out the features and metadata
    feature_df = profiles.loc[:, features]

    if meta_features == "infer":
        meta_features = infer_cp_features(profiles, metadata=True)

    meta_df = profiles.loc[:, meta_features]

    # Fit the sklearn scaler
    if samples == "all":
        fitted_scaler = scaler.fit(feature_df)
    else:
        # Subset to only the features measured in the sample query
        fitted_scaler = scaler.fit(profiles.query(samples).loc[:, features])

    # Scale the feature dataframe
    feature_df = pd.DataFrame(
        fitted_scaler.transform(feature_df),
        columns=feature_df.columns,
        index=feature_df.index,
    )

    normalized = meta_df.merge(feature_df, left_index=True, right_index=True)

    if output_file != "none":
        output(
            df=normalized,
            output_filename=output_file,
            compression=compression,
            float_format=float_format,
        )
    else:
        return normalized
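
# Example usage of normalize() above -- a minimal, hypothetical sketch, not
# part of the library source. It assumes the function is importable from the
# pycytominer package; the toy column names are illustrative.
import pandas as pd

from pycytominer import normalize

example_df = pd.DataFrame(
    {
        "Metadata_Well": ["A01", "A02", "A03", "A04"],
        "Metadata_treatment": ["control", "control", "drug", "drug"],
        "Cells_a": [1.0, 2.0, 8.0, 4.0],
        "Nuclei_b": [3.0, 1.0, 7.0, 5.0],
    }
)

# Z-score every inferred feature, fitting the scaler on the control wells
# only; treated wells are then transformed with those same parameters
normalized_example_df = normalize(
    profiles=example_df,
    samples="Metadata_treatment == 'control'",
    method="standardize",
)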
def annotate(
    profiles,
    platemap,
    cell_id="unknown",
    join_on=["Metadata_well_position", "Metadata_Well"],
    output_file="none",
    add_metadata_id_to_platemap=True,
    format_broad_cmap=False,
    perturbation_mode="none",
    external_metadata="none",
    external_join_left="none",
    external_join_right="none",
    compression=None,
    float_format=None,
):
    """
    Add metadata to aggregated profiles

    Arguments:
    profiles - either pandas DataFrame or a file that stores profile data
    platemap - either pandas DataFrame or a file that stores platemap metadata
    cell_id - [default: "unknown"] provide a string to annotate cell id column
    join_on - list of length two indicating which variables to merge profiles
        and platemap [default: ["Metadata_well_position", "Metadata_Well"]].
        The first element indicates variable(s) in platemap and the second
        element indicates variable(s) in profiles to merge using.
        Note the setting of `add_metadata_id_to_platemap`
    output_file - [default: "none"] if provided, will write annotated profiles
        to file; if not specified, will return the annotated profiles. We
        recommend that this output file be suffixed with "_augmented.csv".
    add_metadata_id_to_platemap - boolean if the platemap variables should be
        recoded
    format_broad_cmap - [default: False] boolean if we need to add columns to
        make compatible with Broad CMAP naming conventions.
    perturbation_mode - [default: "none"] - either "chemical", "genetic", or
        "none" and only active if format_broad_cmap == True
    external_metadata - [default: "none"] a string indicating a file with
        additional metadata information
    external_join_left - [default: "none"] the merge column in the profile
        metadata
    external_join_right - [default: "none"] the merge column in the external
        metadata
    compression - the mechanism to compress [default: None]
    float_format - decimal precision to use in writing output file
        [default: None] For example, use "%.3g" for 3 decimal precision.

    Return:
    Pandas DataFrame of annotated profiles or written to file
    """
    # Load Data
    profiles = load_profiles(profiles)
    platemap = load_platemap(platemap, add_metadata_id_to_platemap)

    annotated = platemap.merge(
        profiles, left_on=join_on[0], right_on=join_on[1], how="inner"
    ).drop(join_on[0], axis="columns")

    # Add specific Connectivity Map (CMAP) formatting
    if format_broad_cmap:
        pert_opts = ["none", "chemical", "genetic"]
        assert (
            perturbation_mode in pert_opts
        ), "perturbation mode must be one of {}".format(pert_opts)

        assert (
            "Metadata_broad_sample" in annotated.columns
        ), "Are you sure this is a CMAP file? Metadata_broad_sample column not found."

        annotated = annotated.assign(
            Metadata_pert_id=annotated.Metadata_broad_sample.str.extract(
                r"(BRD[-N][A-Z0-9]+)"
            ),
            Metadata_pert_mfc_id=annotated.Metadata_broad_sample,
            Metadata_pert_well=annotated.loc[:, join_on[1]],
            Metadata_pert_id_vendor="",
        )

        if "Metadata_pert_iname" in annotated.columns:
            annotated = annotated.assign(
                Metadata_pert_mfc_desc=annotated.Metadata_pert_iname,
                Metadata_pert_name=annotated.Metadata_pert_iname,
            )

        if "Metadata_cell_id" not in annotated.columns:
            annotated = annotated.assign(Metadata_cell_id=cell_id)

        if perturbation_mode == "chemical":
            annotated = annotated.assign(
                Metadata_broad_sample_type=[
                    "control" if x in ["DMSO", np.nan] else "trt"
                    for x in annotated.Metadata_broad_sample
                ]
            )

            # Standardize the Metadata_broad_sample column for controls
            annotated.loc[
                annotated.Metadata_broad_sample_type == "control",
                "Metadata_broad_sample",
            ] = "DMSO"
            annotated.loc[
                annotated.Metadata_broad_sample == "empty",
                "Metadata_broad_sample_type",
            ] = "empty"

            if "Metadata_mmoles_per_liter" in annotated.columns:
                annotated.loc[
                    annotated.Metadata_broad_sample_type == "control",
                    "Metadata_mmoles_per_liter",
                ] = 0

            if "Metadata_solvent" in annotated.columns:
                annotated = annotated.assign(
                    Metadata_pert_vehicle=annotated.Metadata_solvent
                )
            if "Metadata_mg_per_ml" in annotated.columns:
                annotated.loc[
                    annotated.Metadata_broad_sample_type == "control",
                    "Metadata_mg_per_ml",
                ] = 0

        if perturbation_mode == "genetic":
            if "Metadata_pert_name" in annotated.columns:
                annotated = annotated.assign(
                    Metadata_broad_sample_type=[
                        "control" if x == "EMPTY" else "trt"
                        for x in annotated.Metadata_pert_name
                    ]
                )

        if "Metadata_broad_sample_type" in annotated.columns:
            annotated = annotated.assign(
                Metadata_pert_type=annotated.Metadata_broad_sample_type
            )
        else:
            annotated = annotated.assign(
                Metadata_pert_type="", Metadata_broad_sample_type=""
            )

    # Merge in external metadata, if provided
    if not isinstance(external_metadata, pd.DataFrame):
        if external_metadata != "none":
            assert os.path.exists(
                external_metadata
            ), "external metadata at {} does not exist".format(external_metadata)

            external_metadata = pd.read_csv(external_metadata)

    if isinstance(external_metadata, pd.DataFrame):
        external_metadata.columns = [
            "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
            for x in external_metadata.columns
        ]

        annotated = (
            annotated.merge(
                external_metadata,
                left_on=external_join_left,
                right_on=external_join_right,
                how="left",
            )
            .reset_index(drop=True)
            .drop_duplicates()
        )

    # Reorder annotated metadata columns
    meta_cols = infer_cp_features(annotated, metadata=True)
    other_cols = annotated.drop(meta_cols, axis="columns").columns.tolist()

    annotated = annotated.loc[:, meta_cols + other_cols]

    if output_file != "none":
        output(
            df=annotated,
            output_filename=output_file,
            compression=compression,
            float_format=float_format,
        )
    else:
        return annotated
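
# Example usage of annotate() above -- a hypothetical sketch assuming
# load_platemap() accepts a DataFrame (pass a file path otherwise). With
# add_metadata_id_to_platemap=True, the platemap columns are recoded to
# "Metadata_well_position" and "Metadata_treatment" before the merge.
import pandas as pd

from pycytominer import annotate

example_platemap = pd.DataFrame(
    {"well_position": ["A01", "A02"], "treatment": ["DMSO", "drugX"]}
)
example_profiles = pd.DataFrame(
    {"Metadata_Well": ["A01", "A02"], "Cells_a": [0.1, 0.9]}
)

annotated_example = annotate(
    profiles=example_profiles,
    platemap=example_platemap,
    join_on=["Metadata_well_position", "Metadata_Well"],
)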
def annotate(
    profiles,
    platemap,
    join_on=["Metadata_well_position", "Metadata_Well"],
    output_file="none",
    add_metadata_id_to_platemap=True,
    format_broad_cmap=False,
    clean_cellprofiler=True,
    external_metadata="none",
    external_join_left="none",
    external_join_right="none",
    compression_options=None,
    float_format=None,
    cmap_args={},
):
    """Add metadata to aggregated profiles.

    Parameters
    ----------
    profiles : pandas.core.frame.DataFrame or file
        DataFrame or file path of profiles.
    platemap : pandas.core.frame.DataFrame or file
        DataFrame or file path of platemap metadata.
    join_on : list or str, default: ["Metadata_well_position", "Metadata_Well"]
        Which variables to use when merging profiles and platemap. The first
        element indicates variable(s) in platemap and the second element
        indicates variable(s) in profiles to merge using. Note the setting of
        `add_metadata_id_to_platemap`.
    output_file : str, optional
        If not specified, will return the annotated profiles. We recommend
        that this output file be suffixed with "_augmented.csv".
    add_metadata_id_to_platemap : bool, default True
        Whether the platemap variables need "Metadata" pre-pended.
    format_broad_cmap : bool, default False
        Whether we need to add columns to make compatible with Broad CMAP
        naming conventions.
    clean_cellprofiler : bool, default True
        Clean specific CellProfiler feature names.
    external_metadata : str, optional
        File with additional metadata information.
    external_join_left : str, optional
        Merge column in the profile metadata.
    external_join_right : str, optional
        Merge column in the external metadata.
    compression_options : str or dict, optional
        Contains compression options as input to
        pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.
    float_format : str, optional
        Decimal precision to use in writing output file as input to
        pd.DataFrame.to_csv(float_format=float_format). For example, use
        "%.3g" for 3 decimal precision.
    cmap_args : dict, default {}
        Potential keyword arguments for annotate_cmap(). See
        cyto_utils/annotate_custom.py for more details.

    Returns
    -------
    annotated : pandas.core.frame.DataFrame, optional
        DataFrame of annotated features. If output_file="none", then return
        the DataFrame. If you specify output_file, then write to file and do
        not return data.
    """
    # Load Data
    profiles = load_profiles(profiles)
    platemap = load_platemap(platemap, add_metadata_id_to_platemap)

    annotated = platemap.merge(
        profiles, left_on=join_on[0], right_on=join_on[1], how="inner"
    ).drop(join_on[0], axis="columns")

    # Add specific Connectivity Map (CMAP) formatting
    if format_broad_cmap:
        annotated = annotate_cmap(annotated, annotate_join_on=join_on[1], **cmap_args)

    if clean_cellprofiler:
        annotated = cp_clean(annotated)

    if not isinstance(external_metadata, pd.DataFrame):
        if external_metadata != "none":
            assert os.path.exists(
                external_metadata
            ), "external metadata at {} does not exist".format(external_metadata)

            external_metadata = pd.read_csv(external_metadata)

    if isinstance(external_metadata, pd.DataFrame):
        external_metadata.columns = [
            "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
            for x in external_metadata.columns
        ]

        annotated = (
            annotated.merge(
                external_metadata,
                left_on=external_join_left,
                right_on=external_join_right,
                how="left",
            )
            .reset_index(drop=True)
            .drop_duplicates()
        )

    # Reorder annotated metadata columns
    meta_cols = infer_cp_features(annotated, metadata=True)
    other_cols = annotated.drop(meta_cols, axis="columns").columns.tolist()

    annotated = annotated.loc[:, meta_cols + other_cols]

    if output_file != "none":
        output(
            df=annotated,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return annotated
def feature_select(
    profiles,
    features="infer",
    samples="all",
    operation="variance_threshold",
    output_file="none",
    na_cutoff=0.05,
    corr_threshold=0.9,
    corr_method="pearson",
    freq_cut=0.05,
    unique_cut=0.1,
    compression=None,
    float_format=None,
    blocklist_file=None,
    outlier_cutoff=15,
):
    """
    Performs feature selection based on the given operation

    Arguments:
    profiles - either pandas DataFrame or a file that stores profile data
    features - list of cell painting features [default: "infer"]
        if "infer", then assume cell painting features are those that start
        with "Cells", "Nuclei", or "Cytoplasm"
    samples - list of samples to perform the operation on
        [default: "all"] - if "all", use all samples
    operation - str or list of given operations to perform on input profiles
    output_file - [default: "none"] if provided, will write selected profiles
        to file; if not specified, will return the selected profiles. We
        recommend that this output file be suffixed with
        "_normalized_variable_selected.csv".
    na_cutoff - proportion of missing values in a column to tolerate before
        removing
    corr_threshold - float between (0, 1) to exclude features above
        [default: 0.9]
    corr_method - correlation type to compute [default: "pearson"]
    freq_cut - float of ratio (2nd most common feature val / most common)
        [default: 0.05]
    unique_cut - float of ratio (num unique features / num samples)
        [default: 0.1]
    compression - the mechanism to compress [default: None]
    float_format - decimal precision to use in writing output file
        [default: None] For example, use "%.3g" for 3 decimal precision.
    blocklist_file - file location of dataframe with features to exclude
        [default: None]
        Note that if "blocklist" in operation then will remove standard
        blocklist
    outlier_cutoff - the threshold at which the maximum or minimum value of a
        feature across a full experiment is excluded [default: 15]. Note that
        this procedure is typically applied (and therefore the default is
        suitable) after normalization.
    """
    all_ops = [
        "variance_threshold",
        "correlation_threshold",
        "drop_na_columns",
        "blocklist",
        "drop_outliers",
    ]

    # Make sure the user provides a supported operation
    if isinstance(operation, list):
        assert all(
            [x in all_ops for x in operation]
        ), "Some operation(s) {} not supported. Choose {}".format(operation, all_ops)
    elif isinstance(operation, str):
        assert operation in all_ops, "{} not supported. Choose {}".format(
            operation, all_ops
        )
        operation = operation.split()
    else:
        raise ValueError("Operation must be a list or string")

    # Load Data
    profiles = load_profiles(profiles)

    if features == "infer":
        features = infer_cp_features(profiles)

    excluded_features = []
    for op in operation:
        if op == "variance_threshold":
            exclude = variance_threshold(
                population_df=profiles,
                features=features,
                samples=samples,
                freq_cut=freq_cut,
                unique_cut=unique_cut,
            )
        elif op == "drop_na_columns":
            exclude = get_na_columns(
                population_df=profiles,
                features=features,
                samples=samples,
                cutoff=na_cutoff,
            )
        elif op == "correlation_threshold":
            exclude = correlation_threshold(
                population_df=profiles,
                features=features,
                samples=samples,
                threshold=corr_threshold,
                method=corr_method,
            )
        elif op == "blocklist":
            if blocklist_file:
                exclude = get_blocklist_features(
                    population_df=profiles, blocklist_file=blocklist_file
                )
            else:
                exclude = get_blocklist_features(population_df=profiles)
        elif op == "drop_outliers":
            exclude = drop_outlier_features(
                population_df=profiles,
                features=features,
                samples=samples,
                outlier_cutoff=outlier_cutoff,
            )

        excluded_features += exclude

    excluded_features = list(set(excluded_features))
    selected_df = profiles.drop(excluded_features, axis="columns")

    if output_file != "none":
        output(
            df=selected_df,
            output_filename=output_file,
            compression=compression,
            float_format=float_format,
        )
    else:
        return selected_df
def feature_select(
    profiles,
    features="infer",
    image_features=False,
    samples="all",
    operation="variance_threshold",
    output_file="none",
    na_cutoff=0.05,
    corr_threshold=0.9,
    corr_method="pearson",
    freq_cut=0.05,
    unique_cut=0.1,
    compression_options=None,
    float_format=None,
    blocklist_file=None,
    outlier_cutoff=15,
    noise_removal_perturb_groups=None,
    noise_removal_stdev_cutoff=None,
):
    """Performs feature selection based on the given operation.

    Parameters
    ----------
    profiles : pandas.core.frame.DataFrame or file
        DataFrame or file of profiles.
    features : list
        A list of strings corresponding to feature measurement column names in
        the `profiles` DataFrame. All features listed must be found in
        `profiles`. Defaults to "infer". If "infer", then assume cell painting
        features are those prefixed with "Cells", "Nuclei", or "Cytoplasm".
    image_features : bool, default False
        Whether the profiles contain image features.
    samples : list or str, default "all"
        Samples to perform the operation on.
    operation : list of str or str, default "variance_threshold"
        Operations to perform on the input profiles.
    output_file : str, optional
        If provided, will write the feature-selected profiles to file. If not
        specified, will return the feature-selected profiles as output. We
        recommend that this output file be suffixed with
        "_normalized_variable_selected.csv".
    na_cutoff : float, default 0.05
        Proportion of missing values in a column to tolerate before removing.
    corr_threshold : float, default 0.9
        Value between (0, 1) to exclude features above if any two features are
        correlated above this threshold.
    corr_method : str, default "pearson"
        Correlation type to compute. Allowed methods are "spearman", "kendall"
        and "pearson".
    freq_cut : float, default 0.05
        Ratio (2nd most common feature val / most common).
    unique_cut : float, default 0.1
        Ratio (num unique features / num samples).
    compression_options : str or dict, optional
        Contains compression options as input to
        pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.
    float_format : str, optional
        Decimal precision to use in writing output file as input to
        pd.DataFrame.to_csv(float_format=float_format). For example, use
        "%.3g" for 3 decimal precision.
    blocklist_file : str, optional
        File location of dataframe with features to exclude. Note that if
        "blocklist" in operation then will remove standard blocklist.
    outlier_cutoff : float, default 15
        The threshold at which the maximum or minimum value of a feature
        across a full experiment is excluded. Note that this procedure is
        typically applied (and therefore the default is suitable) after
        normalization.
    noise_removal_perturb_groups : str or list of str, optional
        Perturbation groups corresponding to rows in profiles or the name of
        the metadata column containing this information.
    noise_removal_stdev_cutoff : float, optional
        Maximum mean feature standard deviation to be kept for noise removal,
        grouped by the identity of the perturbation from perturb_list. The
        data must already be normalized so that this cutoff can apply to all
        columns.

    Returns
    -------
    selected_df : pandas.core.frame.DataFrame, optional
        The feature-selected profile DataFrame. If output_file="none", then
        return the DataFrame. If you specify output_file, then write to file
        and do not return data.
    """
    all_ops = [
        "variance_threshold",
        "correlation_threshold",
        "drop_na_columns",
        "blocklist",
        "drop_outliers",
        "noise_removal",
    ]

    # Make sure the user provides a supported operation
    if isinstance(operation, list):
        assert all(
            [x in all_ops for x in operation]
        ), "Some operation(s) {} not supported. Choose {}".format(operation, all_ops)
    elif isinstance(operation, str):
        assert operation in all_ops, "{} not supported. Choose {}".format(
            operation, all_ops
        )
        operation = operation.split()
    else:
        raise ValueError("Operation must be a list or string")

    # Load Data
    profiles = load_profiles(profiles)

    if features == "infer":
        features = infer_cp_features(profiles, image_features=image_features)

    excluded_features = []
    for op in operation:
        if op == "variance_threshold":
            exclude = variance_threshold(
                population_df=profiles,
                features=features,
                samples=samples,
                freq_cut=freq_cut,
                unique_cut=unique_cut,
            )
        elif op == "drop_na_columns":
            exclude = get_na_columns(
                population_df=profiles,
                features=features,
                samples=samples,
                cutoff=na_cutoff,
            )
        elif op == "correlation_threshold":
            exclude = correlation_threshold(
                population_df=profiles,
                features=features,
                samples=samples,
                threshold=corr_threshold,
                method=corr_method,
            )
        elif op == "blocklist":
            if blocklist_file:
                exclude = get_blocklist_features(
                    population_df=profiles, blocklist_file=blocklist_file
                )
            else:
                exclude = get_blocklist_features(population_df=profiles)
        elif op == "drop_outliers":
            exclude = drop_outlier_features(
                population_df=profiles,
                features=features,
                samples=samples,
                outlier_cutoff=outlier_cutoff,
            )
        elif op == "noise_removal":
            exclude = noise_removal(
                population_df=profiles,
                features=features,
                noise_removal_perturb_groups=noise_removal_perturb_groups,
                noise_removal_stdev_cutoff=noise_removal_stdev_cutoff,
            )

        excluded_features += exclude

    excluded_features = list(set(excluded_features))
    selected_df = profiles.drop(excluded_features, axis="columns")

    if output_file != "none":
        output(
            df=selected_df,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return selected_df
def consensus(
    profiles,
    replicate_columns=["Metadata_Plate", "Metadata_Well"],
    operation="median",
    features="infer",
    output_file="none",
    compression_options=None,
    float_format=None,
    modz_args={"method": "spearman"},
):
    """Form level 5 consensus profile data.

    :param profiles: A file or pandas DataFrame of profile data
    :type profiles: str, pandas.DataFrame
    :param replicate_columns: Metadata columns indicating which replicates to
        collapse, defaults to ["Metadata_Plate", "Metadata_Well"]
    :type replicate_columns: list
    :param operation: The method used to form consensus profiles, defaults to
        "median"
    :type operation: str
    :param features: The features to collapse, defaults to "infer"
    :type features: str, list
    :param output_file: If specified, the location to write the file, defaults
        to "none"
    :type output_file: str
    :param modz_args: Additional custom arguments passed as kwargs if
        operation="modz". See pycytominer.cyto_utils.modz for more details.
    :type modz_args: dict
    :param compression_options: the method to compress output data, defaults
        to None. See pycytominer.cyto_utils.output.py for options
    :type compression_options: str
    :param float_format: decimal precision to use in writing output file,
        defaults to None. For example, use "%.3g" for 3 decimal precision.
    :type float_format: str

    :Example:

    import pandas as pd
    from pycytominer import consensus

    data_df = pd.concat(
        [
            pd.DataFrame(
                {
                    "Metadata_Plate": "X",
                    "Metadata_Well": "a",
                    "Cells_x": [0.1, 0.3, 0.8],
                    "Nuclei_y": [0.5, 0.3, 0.1],
                }
            ),
            pd.DataFrame(
                {
                    "Metadata_Plate": "X",
                    "Metadata_Well": "b",
                    "Cells_x": [0.4, 0.2, -0.5],
                    "Nuclei_y": [-0.8, 1.2, -0.5],
                }
            ),
        ]
    ).reset_index(drop=True)

    consensus_df = consensus(
        profiles=data_df,
        replicate_columns=["Metadata_Plate", "Metadata_Well"],
        operation="median",
        features="infer",
        output_file="none",
    )
    """
    # Confirm that the operation is supported
    check_consensus_operation(operation)

    # Load Data
    profiles = load_profiles(profiles)

    if operation == "modz":
        consensus_df = modz(
            population_df=profiles,
            replicate_columns=replicate_columns,
            features=features,
            **modz_args
        )
    else:
        consensus_df = aggregate(
            population_df=profiles,
            strata=replicate_columns,
            features=features,
            operation=operation,
            subset_data_df="none",
        )

    if output_file != "none":
        output(
            df=consensus_df,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return consensus_df
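
# Example of a modz consensus -- a sketch building on the docstring example
# above. Replicates that agree more strongly (by Spearman correlation across
# features) receive higher weight in the consensus signature.
import pandas as pd

from pycytominer import consensus

modz_input_df = pd.DataFrame(
    {
        "Metadata_Plate": ["X"] * 6,
        "Metadata_Well": ["a", "a", "a", "b", "b", "b"],
        "Cells_x": [0.1, 0.3, 0.8, 0.4, 0.2, -0.5],
        "Nuclei_y": [0.5, 0.3, 0.1, -0.8, 1.2, -0.5],
    }
)

modz_consensus_df = consensus(
    profiles=modz_input_df,
    replicate_columns=["Metadata_Plate", "Metadata_Well"],
    operation="modz",
    modz_args={"method": "spearman"},
)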
def annotate(
    profiles,
    platemap,
    join_on=["Metadata_well_position", "Metadata_Well"],
    output_file="none",
    add_metadata_id_to_platemap=True,
    format_broad_cmap=False,
    clean_cellprofiler=True,
    external_metadata="none",
    external_join_left="none",
    external_join_right="none",
    compression_options=None,
    float_format=None,
    cmap_args={},
):
    """
    Add metadata to aggregated profiles

    Arguments:
    profiles - either pandas DataFrame or a file that stores profile data
    platemap - either pandas DataFrame or a file that stores platemap metadata
    join_on - list of length two indicating which variables to merge profiles
        and platemap [default: ["Metadata_well_position", "Metadata_Well"]].
        The first element indicates variable(s) in platemap and the second
        element indicates variable(s) in profiles to merge using.
        Note the setting of `add_metadata_id_to_platemap`
    output_file - [default: "none"] if provided, will write annotated profiles
        to file; if not specified, will return the annotated profiles. We
        recommend that this output file be suffixed with "_augmented.csv".
    add_metadata_id_to_platemap - [default: True] boolean if the platemap
        variables possibly need "Metadata" pre-pended
    format_broad_cmap - [default: False] boolean if we need to add columns to
        make compatible with Broad CMAP naming conventions.
    clean_cellprofiler - [default: True] boolean if specific CellProfiler
        feature names should be cleaned
    external_metadata - [default: "none"] a string indicating a file with
        additional metadata information
    external_join_left - [default: "none"] the merge column in the profile
        metadata
    external_join_right - [default: "none"] the merge column in the external
        metadata
    compression_options - the mechanism to compress [default: None]
        See cyto_utils/output.py for options.
    float_format - decimal precision to use in writing output file
        [default: None] For example, use "%.3g" for 3 decimal precision.
    cmap_args - [default: {}] - potential keyword arguments for
        annotate_cmap(). See cyto_utils/annotate_cmap.py for more details.

    Return:
    Pandas DataFrame of annotated profiles or written to file
    """
    # Load Data
    profiles = load_profiles(profiles)
    platemap = load_platemap(platemap, add_metadata_id_to_platemap)

    annotated = platemap.merge(
        profiles, left_on=join_on[0], right_on=join_on[1], how="inner"
    ).drop(join_on[0], axis="columns")

    # Add specific Connectivity Map (CMAP) formatting
    if format_broad_cmap:
        annotated = annotate_cmap(annotated, annotate_join_on=join_on[1], **cmap_args)

    if clean_cellprofiler:
        annotated = cp_clean(annotated)

    if not isinstance(external_metadata, pd.DataFrame):
        if external_metadata != "none":
            assert os.path.exists(
                external_metadata
            ), "external metadata at {} does not exist".format(external_metadata)

            external_metadata = pd.read_csv(external_metadata)

    if isinstance(external_metadata, pd.DataFrame):
        external_metadata.columns = [
            "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
            for x in external_metadata.columns
        ]

        annotated = (
            annotated.merge(
                external_metadata,
                left_on=external_join_left,
                right_on=external_join_right,
                how="left",
            )
            .reset_index(drop=True)
            .drop_duplicates()
        )

    # Reorder annotated metadata columns
    meta_cols = infer_cp_features(annotated, metadata=True)
    other_cols = annotated.drop(meta_cols, axis="columns").columns.tolist()

    annotated = annotated.loc[:, meta_cols + other_cols]

    if output_file != "none":
        output(
            df=annotated,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return annotated
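
# Example of Broad CMAP formatting via cmap_args -- a hypothetical sketch.
# format_broad_cmap=True routes the annotated profiles through
# annotate_cmap(), and cmap_args forwards extra keyword arguments to it
# (perturbation_mode here mirrors the parameter of the same name in the
# older annotate() version above).
import pandas as pd

from pycytominer import annotate

broad_platemap = pd.DataFrame(
    {
        "well_position": ["A01", "A02"],
        "broad_sample": ["DMSO", "BRD-K12345678"],  # hypothetical Broad ID
    }
)
broad_profiles = pd.DataFrame(
    {"Metadata_Well": ["A01", "A02"], "Cells_a": [0.1, 0.9]}
)

cmap_annotated_df = annotate(
    profiles=broad_profiles,
    platemap=broad_platemap,
    format_broad_cmap=True,
    cmap_args={"perturbation_mode": "chemical"},
)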
def normalize(
    profiles,
    features="infer",
    meta_features="infer",
    samples="all",
    method="standardize",
    output_file="none",
    compression_options=None,
    float_format=None,
    spherize_center=True,
    spherize_method="ZCA-cor",
    spherize_epsilon=1e-6,
):
    """Normalize profiling features

    Parameters
    ----------
    profiles : {pandas.DataFrame, path}
        Either a pandas DataFrame or a file that stores profile data
    features : list
        A list of strings corresponding to feature measurement column names in
        the `profiles` DataFrame. All features listed must be found in
        `profiles`. Defaults to "infer". If "infer", then assume cell painting
        features are those prefixed with "Cells", "Nuclei", or "Cytoplasm".
    meta_features : list
        A list of strings corresponding to metadata column names in the
        `profiles` DataFrame. All features listed must be found in `profiles`.
        Defaults to "infer". If "infer", then assume metadata features are
        those prefixed with "Metadata".
    samples : str
        The metadata column values to use as a normalization reference. We
        often use control samples. The function uses a pd.query() function, so
        you should structure samples in this fashion. An example is
        "Metadata_treatment == 'control'" (include all quotes). Defaults to
        "all".
    method : str
        How to normalize the dataframe. Defaults to "standardize". Check
        avail_methods for available normalization methods.
    output_file : str
        If provided, will write normalized profiles to file. If not specified,
        will return the normalized profiles as output. We recommend that this
        output file be suffixed with "_normalized.csv". Defaults to "none".
    compression_options : {dict, None}
        Contains compression options as input to
        pd.DataFrame.to_csv(compression=compression_options). pandas version
        >= 1.2. Defaults to None.
    float_format : {str, None}
        Decimal precision to use in writing output file as input to
        pd.DataFrame.to_csv(float_format=float_format). For example, use
        "%.3g" for 3 decimal precision. Defaults to None.
    spherize_center : bool
        If the function should center data before sphering (aka whitening).
        The function only uses this variable if method = "spherize". Defaults
        to True.
    spherize_method : str
        The sphering (aka whitening) normalization selection. The function
        only uses this variable if method = "spherize". Defaults to "ZCA-cor".
        See :py:func:`pycytominer.operations.transform` for available spherize
        methods.
    spherize_epsilon : float
        The sphering (aka whitening) fudge factor parameter. The function only
        uses this variable if method = "spherize". Defaults to 1e-6.

    Returns
    -------
    pd.DataFrame or None
        The normalized profile DataFrame. If output_file="none", then return
        the DataFrame. If you specify output_file, then write to file and do
        not return data.

    Examples
    --------
    import pandas as pd
    from pycytominer import normalize

    data_df = pd.DataFrame(
        {
            "Metadata_plate": ["a", "a", "a", "a", "b", "b", "b", "b"],
            "Metadata_treatment": [
                "drug",
                "drug",
                "control",
                "control",
                "drug",
                "drug",
                "control",
                "control",
            ],
            "x": [1, 2, 8, 2, 5, 5, 5, 1],
            "y": [3, 1, 7, 4, 5, 9, 6, 1],
            "z": [1, 8, 2, 5, 6, 22, 2, 2],
            "zz": [14, 46, 1, 6, 30, 100, 2, 2],
        }
    ).reset_index(drop=True)

    normalized_df = normalize(
        profiles=data_df,
        features=["x", "y", "z", "zz"],
        meta_features="infer",
        samples="Metadata_treatment == 'control'",
        method="standardize",
    )
    """
    # Load Data
    profiles = load_profiles(profiles)

    # Define which scaler to use
    method = method.lower()

    avail_methods = ["standardize", "robustize", "mad_robustize", "spherize"]
    assert method in avail_methods, "operation must be one of {}".format(avail_methods)

    if method == "standardize":
        scaler = StandardScaler()
    elif method == "robustize":
        scaler = RobustScaler()
    elif method == "mad_robustize":
        scaler = RobustMAD()
    elif method == "spherize":
        scaler = Spherize(
            center=spherize_center, method=spherize_method, epsilon=spherize_epsilon
        )

    if features == "infer":
        features = infer_cp_features(profiles)

    # Separate out the features and metadata
    feature_df = profiles.loc[:, features]

    if meta_features == "infer":
        meta_features = infer_cp_features(profiles, metadata=True)

    meta_df = profiles.loc[:, meta_features]

    # Fit the sklearn scaler
    if samples == "all":
        fitted_scaler = scaler.fit(feature_df)
    else:
        # Subset to only the features measured in the sample query
        fitted_scaler = scaler.fit(profiles.query(samples).loc[:, features])

    # Scale the feature dataframe
    feature_df = pd.DataFrame(
        fitted_scaler.transform(feature_df),
        columns=feature_df.columns,
        index=feature_df.index,
    )

    normalized = meta_df.merge(feature_df, left_index=True, right_index=True)

    if output_file != "none":
        output(
            df=normalized,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return normalized
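
# Example of the "spherize" method -- a sketch reusing the docstring's toy
# data. ZCA-cor sphering decorrelates the features (fit on the control rows)
# while keeping the transformed data close to the correlation-scaled original.
import pandas as pd

from pycytominer import normalize

sphere_df = pd.DataFrame(
    {
        "Metadata_treatment": ["drug", "drug", "control", "control"] * 2,
        "x": [1, 2, 8, 2, 5, 5, 5, 1],
        "y": [3, 1, 7, 4, 5, 9, 6, 1],
        "z": [1, 8, 2, 5, 6, 22, 2, 2],
        "zz": [14, 46, 1, 6, 30, 100, 2, 2],
    }
)

spherized_df = normalize(
    profiles=sphere_df,
    features=["x", "y", "z", "zz"],
    samples="Metadata_treatment == 'control'",
    method="spherize",
    spherize_method="ZCA-cor",
    spherize_epsilon=1e-6,
)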