def pipeline_normalize(self, batch, plate, steps, samples, suffix=None):
    normalize_steps = steps

    output_dir = pathlib.PurePath(".", self.pipeline_output, batch, plate)
    annotate_output_file = pathlib.PurePath(output_dir, f"{plate}_augmented.csv.gz")
    normalize_output_file = pathlib.PurePath(output_dir, f"{plate}_normalized.csv.gz")
    if suffix:
        normalize_output_file = pathlib.PurePath(
            output_dir, f"{plate}_normalized_{suffix}.csv.gz"
        )

    normalization_features = normalize_steps["features"]
    normalization_method = normalize_steps["method"]

    if normalization_features == "infer" and self.noncanonical:
        normalization_features = cyto_utils.infer_cp_features(
            pd.read_csv(annotate_output_file), compartments=self.compartments
        )

    normalize(
        profiles=annotate_output_file,
        features=normalization_features,
        samples=samples,
        method=normalization_method,
        output_file=normalize_output_file,
        compression_options=self.pipeline_options["compression"],
        float_format=self.pipeline_options["float_format"],
    )
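# Hedged usage sketch for pipeline_normalize: the "features"/"method" keys are
# the ones read above; the concrete values, batch/plate names, and the
# `pipeline` object are illustrative assumptions, not part of the original code.
example_steps = {
    "features": "infer",        # or an explicit list of feature column names
    "method": "mad_robustize",  # any method accepted by pycytominer.normalize()
}
# pipeline.pipeline_normalize(
#     batch="example_batch",
#     plate="example_plate",
#     steps=example_steps,
#     samples="all",
#     suffix="dmso",  # writes {plate}_normalized_dmso.csv.gz instead
# )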
def normalize_profile(plate, output_dir, commit="cd91bd0daacef2b5ea25dcceb62482bb664d9de1"):
    link = f"https://github.com/broadinstitute/cell-health/raw/{commit}/1.generate-profiles/data/profiles/{plate}/{plate}_augmented.csv.gz"
    annotate_df = pd.read_csv(link)

    norm_file = pathlib.Path(f"{output_dir}/{plate}_wholeplate_normalized.csv.gz")
    feat_select_file = pathlib.Path(
        f"{output_dir}/{plate}_wholeplate_normalized_feature_selected.csv.gz"
    )

    normalize(
        profiles=annotate_df,
        features="infer",
        meta_features=meta_features,
        samples="all",
        method="mad_robustize",
        output_file=norm_file,
        compression_options={"method": "gzip", "mtime": 1},
    )
normalize_singlecell_from_single_file = sc_config["output_one_single_cell_file_only"]
normalize_args = config["options"]["profile"]["normalize"]
normalize_levels = normalize_args["levels"]
normalize_by_samples = normalize_args["by_samples"]
normalize_these_features = normalize_args["features"]
normalize_method = normalize_args["method"]

for data_level in normalize_levels:
    if data_level == "single_cell":
        if not normalize_singlecell_from_single_file:
            continue

    file_to_normalize = normalize_input_files[data_level]
    output_file = normalize_output_files[data_level]

    print(f"Now normalizing {data_level}...with operation: {normalize_method}")

    df = pd.read_csv(file_to_normalize)
    normalize(
        profiles=df,
        features=normalize_these_features,
        samples=normalize_by_samples,
        method=normalize_method,
        output_file=output_file,
        compression=compression,
        float_format=float_format,
    )
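# A sketch of the config block driving the loop above, assuming the parsed
# YAML/JSON exposes nested dicts; level names other than "single_cell" and all
# concrete values are illustrative assumptions.
config = {
    "options": {
        "profile": {
            "normalize": {
                "levels": ["single_cell"],  # plus any aggregated levels
                "by_samples": "all",
                "features": "infer",
                "method": "mad_robustize",
            }
        }
    }
}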
def test_merge_single_cells():
    sc_merged_df = ap.merge_single_cells()

    # Assert that the image data was merged
    assert all(x in sc_merged_df.columns for x in ["Metadata_Plate", "Metadata_Well"])

    # Assert that metadata columns were renamed appropriately
    for x in ap.full_merge_suffix_rename:
        assert ap.full_merge_suffix_rename[x] == "Metadata_{x}".format(x=x)

    # Perform a manual merge
    manual_merge = cytoplasm_df.merge(
        cells_df,
        left_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Cells"],
        right_on=["TableNumber", "ImageNumber", "ObjectNumber"],
        suffixes=["_cytoplasm", "_cells"],
    ).merge(
        nuclei_df,
        left_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Nuclei"],
        right_on=["TableNumber", "ImageNumber", "ObjectNumber"],
        suffixes=["_cytoplasm", "_nuclei"],
    )

    manual_merge = image_df.merge(manual_merge, on=ap.merge_cols, how="right").rename(
        ap.full_merge_suffix_rename, axis="columns"
    )

    # Confirm that the merge correctly reversed the object number (opposite from Parent)
    assert (
        sc_merged_df.Metadata_ObjectNumber_cytoplasm.tolist()[::-1]
        == sc_merged_df.Metadata_ObjectNumber.tolist()
    )
    assert (
        manual_merge.Metadata_ObjectNumber_cytoplasm.tolist()[::-1]
        == sc_merged_df.Metadata_ObjectNumber.tolist()
    )
    assert (
        manual_merge.Metadata_ObjectNumber_cells.tolist()
        == sc_merged_df.Metadata_ObjectNumber.tolist()
    )

    # Confirm the merge and adding merge options
    for method in ["standardize", "robustize"]:
        for samples in ["all", "Metadata_ImageNumber == 'x'"]:
            for features in ["infer", ["Cytoplasm_a", "Cells_a"]]:
                norm_method_df = ap.merge_single_cells(
                    single_cell_normalize=True,
                    normalize_args={
                        "method": method,
                        "samples": samples,
                        "features": features,
                    },
                )

                manual_merge_normalize = normalize(
                    manual_merge, method=method, samples=samples, features=features
                )

                pd.testing.assert_frame_equal(
                    norm_method_df.sort_index(axis=1),
                    manual_merge_normalize.sort_index(axis=1),
                )

    # Test non-canonical compartment merging
    new_sc_merge_df = ap_new.merge_single_cells()

    assert sum(new_sc_merge_df.columns.str.startswith("New")) == 4
    assert (
        new_compartment_df.ObjectNumber.tolist()[::-1]
        == new_sc_merge_df.Metadata_ObjectNumber_new.tolist()
    )

    norm_new_method_df = ap_new.merge_single_cells(
        single_cell_normalize=True,
        normalize_args={
            "method": "standardize",
            "samples": "all",
            "features": "infer",
        },
    )
    norm_new_method_no_feature_infer_df = ap_new.merge_single_cells(
        single_cell_normalize=True,
        normalize_args={
            "method": "standardize",
            "samples": "all",
        },
    )
    default_feature_infer_df = ap_new.merge_single_cells(single_cell_normalize=True)

    pd.testing.assert_frame_equal(norm_new_method_df, default_feature_infer_df)
    pd.testing.assert_frame_equal(
        norm_new_method_df, norm_new_method_no_feature_infer_df
    )

    new_compartment_cols = infer_cp_features(
        new_compartment_df, compartments=ap_new.compartments
    )
    traditional_norm_df = normalize(
        ap_new.image_df.merge(new_compartment_df, on=ap.merge_cols),
        features=new_compartment_cols,
        samples="all",
        method="standardize",
    )

    pd.testing.assert_frame_equal(
        norm_new_method_df.loc[:, new_compartment_cols].abs().describe(),
        traditional_norm_df.loc[:, new_compartment_cols].abs().describe(),
    )
def process_profile(sql_file, batch, plate, pipeline):
    """
    Given batch details and a pipeline, process morphology profiles
    """
    assert batch in sql_file, "batch {} not recognized in sql file {}".format(
        batch, sql_file
    )
    assert plate in sql_file, "plate {} not recognized in sql file {}".format(
        plate, sql_file
    )

    # Set output directory information
    pipeline_output = pipeline["output_dir"]
    output_dir = os.path.join(pipeline_output, batch, plate)
    os.makedirs(output_dir, exist_ok=True)

    # Set output file information
    aggregate_out_file = os.path.join(output_dir, "{}.csv.gz".format(plate))
    annotate_out_file = os.path.join(output_dir, "{}_augmented.csv.gz".format(plate))
    normalize_out_file = os.path.join(output_dir, "{}_normalized.csv.gz".format(plate))
    feature_out_file = os.path.join(
        output_dir, "{}_normalized_feature_selected.csv.gz".format(plate)
    )

    # Load pipeline options
    compression = process_pipeline(pipeline["options"], option="compression")
    samples = process_pipeline(pipeline["options"], option="samples")

    # Load and setup platemap info
    workspace_dir = pipeline["workspace_dir"]
    batch_dir = os.path.join(workspace_dir, "backend", batch)
    metadata_dir = os.path.join(workspace_dir, "metadata", batch)

    barcode_plate_map_file = os.path.join(
        metadata_dir, sorted(os.listdir(metadata_dir))[0]
    )
    barcode_plate_map_df = pd.read_csv(barcode_plate_map_file)
    plate_map_name = barcode_plate_map_df.query(
        "Assay_Plate_Barcode == @plate"
    ).Plate_Map_Name.values[0]
    plate_map_file = os.path.join(
        metadata_dir, "platemap", "{}.txt".format(plate_map_name)
    )
    plate_map_df = pd.read_csv(plate_map_file, sep="\t")
    plate_map_df.columns = [
        "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
        for x in plate_map_df.columns
    ]
    platemap_well_column = pipeline["platemap_well_column"]

    # Step 1: Aggregate
    aggregate_steps = pipeline["aggregate"]
    if aggregate_steps["perform"]:
        aggregate_features = aggregate_steps["features"]
        aggregate_operation = aggregate_steps["method"]
        aggregate_plate_column = aggregate_steps["plate_column"]
        aggregate_well_column = aggregate_steps["well_column"]

        strata = [aggregate_plate_column, aggregate_well_column]

        if "site_column" in aggregate_steps:
            aggregate_site_column = aggregate_steps["site_column"]
            strata += [aggregate_site_column]

        ap = AggregateProfiles(
            sql_file,
            strata=strata,
            features=aggregate_features,
            operation=aggregate_operation,
        )

        ap.aggregate_profiles(output_file=aggregate_out_file, compression=compression)

        if pipeline["count"]["perform"]:
            count_dir = pipeline["count"]["output_dir"]
            os.makedirs(count_dir, exist_ok=True)

            cell_count_file = os.path.join(
                count_dir, "{}_{}_cell_count.tsv".format(batch, plate)
            )

            cell_count_df = ap.count_cells()
            cell_count_df = cell_count_df.merge(
                plate_map_df,
                left_on=aggregate_well_column,
                right_on=platemap_well_column,
            ).drop(platemap_well_column, axis="columns")
            cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

    # Annotate Profiles
    annotate_steps = pipeline["annotate"]
    if annotate_steps["perform"]:
        annotate_well_column = annotate_steps["well_column"]
        annotate(
            profiles=aggregate_out_file,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file=annotate_out_file,
            compression=compression,
        )

    # Normalize Profiles
    normalize_steps = pipeline["normalize"]
    if normalize_steps["perform"]:
        norm_features = normalize_steps["features"]
        norm_method = normalize_steps["method"]
        normalize(
            profiles=annotate_out_file,
            features=norm_features,
            samples=samples,
            method=norm_method,
            output_file=normalize_out_file,
            compression=compression,
        )

    # Apply feature selection
    feature_select_steps = pipeline["feature_select"]
    if feature_select_steps["perform"]:
        feature_select_operations = feature_select_steps["operations"]
        feature_select_features = feature_select_steps["features"]
        feature_select(
            profiles=normalize_out_file,
            features=feature_select_features,
            samples="none",
            operation=feature_select_operations,
            output_file=feature_out_file,
            compression=compression,
            corr_threshold=0.9,
            corr_method="pearson",
        )
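# A sketch of the `pipeline` dictionary process_profile() expects, reconstructed
# from the keys read above; every concrete value is an illustrative assumption.
pipeline = {
    "output_dir": "profiles",
    "workspace_dir": "workspace",
    "platemap_well_column": "Metadata_well_position",
    "options": {"compression": "gzip", "samples": "all"},
    "aggregate": {
        "perform": True,
        "features": "infer",
        "method": "median",
        "plate_column": "Metadata_Plate",
        "well_column": "Metadata_Well",
        # "site_column" is optional and extends the aggregation strata
    },
    "count": {"perform": True, "output_dir": "cell_counts"},
    "annotate": {"perform": True, "well_column": "Metadata_Well"},
    "normalize": {"perform": True, "features": "infer", "method": "mad_robustize"},
    "feature_select": {
        "perform": True,
        "features": "infer",
        "operations": ["variance_threshold", "drop_na_columns", "blacklist"],
    },
}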
print(feature_df.shape)
feature_df.head()


# In[6]:


# Perform spherize transform
for file in data_files:
    # Extract plate from file name
    plate = str(file).split("/")[-1].split("_")[0]
    print(f"Now processing {plate}...")

    # Load data and apply feature selection
    df = pd.read_csv(file).reindex(feature_df.index, axis="columns")

    # Get feature names
    metadata_cols = ["Image_Metadata_Well"] + infer_cp_features(df, metadata=True)
    feature_cols = infer_cp_features(df, compartments=["Cells", "Cytoplasm", "Nuclei"])

    output_file = pathlib.Path(f"{data_dir}/{plate}_{output_file_suffix}")

    # Apply spherize transformation and output files
    normalize(
        profiles=df,
        features=feature_cols,
        meta_features=metadata_cols,
        method="spherize",
        spherize_method="ZCA-cor",
        spherize_center=True,
        output_file=output_file,
    )
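# Hedged note: in pycytominer, spherize_method selects the whitening basis
# ("PCA", "ZCA", "PCA-cor", "ZCA-cor") and spherize_center toggles mean-centering
# before the transform. An in-memory variant of the call above, reusing the
# loop's `df`, `feature_cols`, and `metadata_cols` and returning a dataframe
# instead of writing a file:
spherized_df = normalize(
    profiles=df,
    features=feature_cols,
    meta_features=metadata_cols,
    method="spherize",
    spherize_method="ZCA",  # illustrative alternative to "ZCA-cor"
)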
def process_profile(sql_file, batch, plate, pipeline):
    """
    Given batch details and a pipeline, process morphology profiles
    """
    assert batch in sql_file, "batch {} not recognized in sql file {}".format(
        batch, sql_file
    )
    assert plate in sql_file, "plate {} not recognized in sql file {}".format(
        plate, sql_file
    )

    # Set output directory information
    pipeline_output = pipeline["output_dir"]
    output_dir = os.path.join(pipeline_output, batch, plate)
    os.makedirs(output_dir, exist_ok=True)

    # Set output file information
    aggregate_out_file = os.path.join(output_dir, "{}.csv.gz".format(plate))
    annotate_out_file = os.path.join(output_dir, "{}_augmented.csv.gz".format(plate))
    normalize_out_file = os.path.join(output_dir, "{}_normalized.csv.gz".format(plate))
    feature_out_file = os.path.join(
        output_dir, "{}_normalized_feature_selected.csv.gz".format(plate)
    )

    # Load pipeline options
    compression = process_pipeline(pipeline["options"], option="compression")
    sc_float_format = process_pipeline(pipeline["options"], option="sc_float_format")
    samples = process_pipeline(pipeline["options"], option="samples")

    # Load and setup platemap info
    workspace_dir = pipeline["workspace_dir"]
    batch_dir = os.path.join(workspace_dir, "backend", batch)
    metadata_dir = os.path.join(workspace_dir, "metadata", batch)

    barcode_plate_map_file = os.path.join(
        metadata_dir, sorted(os.listdir(metadata_dir))[0]
    )
    barcode_plate_map_df = pd.read_csv(barcode_plate_map_file)
    plate_map_name = barcode_plate_map_df.query(
        "Assay_Plate_Barcode == @plate"
    ).Plate_Map_Name.values[0]
    plate_map_file = os.path.join(
        metadata_dir, "platemap", "{}.txt".format(plate_map_name)
    )
    plate_map_df = pd.read_csv(plate_map_file, sep="\t")
    plate_map_df.columns = [
        "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
        for x in plate_map_df.columns
    ]
    platemap_well_column = pipeline["platemap_well_column"]

    # Process Bulk profiles
    # Step 1: Aggregate
    aggregate_steps = pipeline["aggregate"]
    aggregate_features = aggregate_steps["features"]
    aggregate_operation = aggregate_steps["method"]
    aggregate_plate_column = aggregate_steps["plate_column"]
    aggregate_well_column = aggregate_steps["well_column"]

    strata = [aggregate_plate_column, aggregate_well_column]

    if "site_column" in aggregate_steps:
        aggregate_site_column = aggregate_steps["site_column"]
        strata += [aggregate_site_column]

    if aggregate_steps["perform"]:
        ap = AggregateProfiles(
            sql_file,
            strata=strata,
            features=aggregate_features,
            operation=aggregate_operation,
        )
        ap.aggregate_profiles(output_file=aggregate_out_file, compression=compression)

    if pipeline["count"]["perform"]:
        if not aggregate_steps["perform"]:
            ap = AggregateProfiles(
                sql_file,
                strata=strata,
                features=aggregate_features,
                operation=aggregate_operation,
            )

        count_dir = pipeline["count"]["output_dir"]
        os.makedirs(count_dir, exist_ok=True)

        cell_count_file = os.path.join(
            count_dir, "{}_{}_cell_count.tsv".format(batch, plate)
        )

        cell_count_df = ap.count_cells()
        cell_count_df = cell_count_df.merge(
            plate_map_df,
            left_on=aggregate_well_column,
            right_on=platemap_well_column,
        ).drop(platemap_well_column, axis="columns")
        cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

    # Annotate Profiles
    annotate_steps = pipeline["annotate"]
    annotate_well_column = annotate_steps["well_column"]
    if annotate_steps["perform"]:
        annotate(
            profiles=aggregate_out_file,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file=annotate_out_file,
            compression=compression,
        )

    # Normalize Profiles
    normalize_steps = pipeline["normalize"]
    norm_features = normalize_steps["features"]
    norm_method = normalize_steps["method"]
    if normalize_steps["perform"]:
        normalize(
            profiles=annotate_out_file,
            features=norm_features,
            samples=samples,
            method=norm_method,
            output_file=normalize_out_file,
            compression=compression,
        )

    # Apply feature selection
    feature_select_steps = pipeline["feature_select"]
    feature_select_operations = feature_select_steps["operations"]
    feature_select_features = feature_select_steps["features"]
    if feature_select_steps["perform"]:
        feature_select(
            profiles=normalize_out_file,
            features=feature_select_features,
            samples=samples,
            operation=feature_select_operations,
            output_file=feature_out_file,
            compression=compression,
            corr_threshold=0.9,
            corr_method="pearson",
        )

    # Process single cell profiles
    sc_steps = pipeline["single_cell"]
    if sc_steps["perform"]:
        if not aggregate_steps["perform"]:
            ap = AggregateProfiles(
                sql_file,
                strata=strata,
                features=aggregate_features,
                operation=aggregate_operation,
            )

        # Load cells
        query = "select * from cells"
        cell_df = pd.read_sql(sql=query, con=ap.conn)

        # Load cytoplasm
        query = "select * from cytoplasm"
        cytoplasm_df = pd.read_sql(sql=query, con=ap.conn)

        # Load nuclei
        query = "select * from nuclei"
        nuclei_df = pd.read_sql(sql=query, con=ap.conn)

        # Merge single cells together
        sc_merged_df = (
            cell_df.merge(
                cytoplasm_df.drop("ObjectNumber", axis="columns"),
                left_on=["TableNumber", "ImageNumber", "ObjectNumber"],
                right_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Cells"],
                how="inner",
            )
            .drop("ObjectNumber", axis="columns")
            .merge(
                nuclei_df,
                left_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Nuclei"],
                right_on=["TableNumber", "ImageNumber", "ObjectNumber"],
                how="inner",
            )
        )

        # Merge image data info
        sc_merged_df = ap.image_df.merge(sc_merged_df, how="right", on=ap.merge_cols)

        # Make sure column names are correctly prefixed
        prefix = ["Metadata", "Cells", "Cytoplasm", "Nuclei"]
        cols = []
        for col in sc_merged_df.columns:
            if any([col.startswith(x) for x in prefix]):
                cols.append(col)
            else:
                cols.append(f"Metadata_{col}")
        sc_merged_df.columns = cols

        sc_merged_df = annotate(
            profiles=sc_merged_df,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file="none",
        )

        if sc_steps["normalize"]:
            sc_merged_df = normalize(
                profiles=sc_merged_df,
                features=norm_features,
                samples=samples,
                method=norm_method,
                output_file="none",
            )

        if sc_steps["feature_select"]:
            sc_merged_df = feature_select(
                profiles=sc_merged_df,
                features=feature_select_features,
                samples=samples,
                operation=feature_select_operations,
                output_file="none",
                corr_threshold=0.9,
                corr_method="pearson",
            )

        sc_pipeline_output = pipeline["sc_output_dir"]
        sc_output_dir = os.path.join(sc_pipeline_output, batch, plate)
        os.makedirs(sc_output_dir, exist_ok=True)

        # Set output file information
        sc_out_file = os.path.join(
            sc_output_dir, "{}_single_cell.csv.gz".format(plate)
        )
        output(
            df=sc_merged_df,
            output_filename=sc_out_file,
            compression="gzip",
            float_format=sc_float_format,
        )
def merge_single_cells(
    self,
    compute_subsample=False,
    sc_output_file="none",
    compression_options=None,
    float_format=None,
    single_cell_normalize=False,
    normalize_args=None,
):
    """Given the linking columns, merge single cell data. Normalization is also supported.

    Parameters
    ----------
    compute_subsample : bool, default False
        Whether or not to compute subsample.
    sc_output_file : str, optional
        The name of a file to output.
    compression_options : str, optional
        Compression arguments as input to pandas.to_csv() with pandas version >= 1.2.
    float_format : str, optional
        Decimal precision to use in writing output file.
    single_cell_normalize : bool, default False
        Whether or not to normalize the single cell data.
    normalize_args : dict, optional
        Additional arguments passed as input to pycytominer.normalize().

    Returns
    -------
    pandas.core.frame.DataFrame
        Either a dataframe (if output_file="none") or will write to file.
    """

    # Load the single cell dataframe by merging on the specific linking columns
    sc_df = ""
    linking_check_cols = []
    merge_suffix_rename = []
    for left_compartment in self.compartment_linking_cols:
        for right_compartment in self.compartment_linking_cols[left_compartment]:
            # Make sure only one merge per combination occurs
            linking_check = "-".join(sorted([left_compartment, right_compartment]))
            if linking_check in linking_check_cols:
                continue

            # Specify how to indicate merge suffixes
            merge_suffix = [
                "_{comp_l}".format(comp_l=left_compartment),
                "_{comp_r}".format(comp_r=right_compartment),
            ]
            merge_suffix_rename += merge_suffix
            left_link_col = self.compartment_linking_cols[left_compartment][
                right_compartment
            ]
            right_link_col = self.compartment_linking_cols[right_compartment][
                left_compartment
            ]

            if isinstance(sc_df, str):
                initial_df = self.load_compartment(compartment=left_compartment)

                if compute_subsample:
                    # Sample cells proportionally by self.strata
                    self.get_subsample(df=initial_df, rename_col=False)

                    subset_logic_df = self.subset_data_df.drop(
                        self.image_df.columns, axis="columns"
                    )

                    initial_df = subset_logic_df.merge(
                        initial_df, how="left", on=subset_logic_df.columns.tolist()
                    ).reindex(initial_df.columns, axis="columns")

                sc_df = initial_df.merge(
                    self.load_compartment(compartment=right_compartment),
                    left_on=self.merge_cols + [left_link_col],
                    right_on=self.merge_cols + [right_link_col],
                    suffixes=merge_suffix,
                )
            else:
                sc_df = sc_df.merge(
                    self.load_compartment(compartment=right_compartment),
                    left_on=self.merge_cols + [left_link_col],
                    right_on=self.merge_cols + [right_link_col],
                    suffixes=merge_suffix,
                )

            linking_check_cols.append(linking_check)

    # Add metadata prefix to merged suffixes
    full_merge_suffix_rename = []
    full_merge_suffix_original = []
    for col_name in self.merge_cols + list(self.linking_col_rename.keys()):
        full_merge_suffix_original.append(col_name)
        full_merge_suffix_rename.append("Metadata_{x}".format(x=col_name))

    for col_name in self.merge_cols + list(self.linking_col_rename.keys()):
        for suffix in set(merge_suffix_rename):
            full_merge_suffix_original.append("{x}{y}".format(x=col_name, y=suffix))
            full_merge_suffix_rename.append(
                "Metadata_{x}{y}".format(x=col_name, y=suffix)
            )

    self.full_merge_suffix_rename = dict(
        zip(full_merge_suffix_original, full_merge_suffix_rename)
    )

    # Add image data to single cell dataframe
    if not self.load_image_data:
        self.load_image()
        self.load_image_data = True

    sc_df = (
        self.image_df.merge(sc_df, on=self.merge_cols, how="right")
        .rename(self.linking_col_rename, axis="columns")
        .rename(self.full_merge_suffix_rename, axis="columns")
    )
    if single_cell_normalize:
        # Inferring features is tricky with non-canonical data
        if normalize_args is None:
            normalize_args = {}
            features = infer_cp_features(sc_df, compartments=self.compartments)
        elif "features" not in normalize_args:
            features = infer_cp_features(sc_df, compartments=self.compartments)
        elif normalize_args["features"] == "infer":
            features = infer_cp_features(sc_df, compartments=self.compartments)
        else:
            features = normalize_args["features"]

        normalize_args["features"] = features

        sc_df = normalize(profiles=sc_df, **normalize_args)

    if sc_output_file != "none":
        output(
            df=sc_df,
            output_filename=sc_output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return sc_df
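# Hedged usage sketch for merge_single_cells(), patterned on the calls in the
# test above; the single-cell object `ap` and its constructor are assumed to
# exist already.
sc_df = ap.merge_single_cells(
    single_cell_normalize=True,
    normalize_args={"method": "standardize", "samples": "all", "features": "infer"},
)
# Or write straight to disk instead of returning a dataframe:
# ap.merge_single_cells(sc_output_file="plate_single_cell.csv.gz",
#                       compression_options={"method": "gzip"})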
# Output annotated file
cyto_utils.output(
    df=anno_df,
    output_filename=anno_file,
    float_format=float_format,
    compression_options=compression,
)

# Normalize Profiles (DMSO Control) - Level 4A Data
norm_dmso_file = pathlib.PurePath(output_dir, f"{plate_name}_normalized_dmso.csv.gz")
normalize(
    profiles=anno_df,
    samples="Metadata_broad_sample == 'DMSO'",
    method=norm_method,
    output_file=norm_dmso_file,
    float_format=float_format,
    compression_options=compression,
)

# Normalize Profiles (Whole Plate) - Level 4A Data
norm_file = pathlib.PurePath(output_dir, f"{plate_name}_normalized.csv.gz")
normalize(
    profiles=anno_df,
    samples="all",
    method=norm_method,
    output_file=norm_file,
    float_format=float_format,
    compression_options=compression,
)
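# Hedged note on `samples`: it is a pandas.DataFrame.query() expression that
# selects the rows used to fit the normalization, which is then applied to all
# profiles; "all" fits on every row. A sketch with a hypothetical metadata
# column, returning a dataframe instead of writing a file:
norm_to_controls_df = normalize(
    profiles=anno_df,
    samples="Metadata_cell_line == 'A549'",  # hypothetical fitting subset
    method=norm_method,
)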
# Merge with the image information
merged_df = image_df.merge(merged_df, on=["TableNumber", "ImageNumber"], how="right")

print(merged_df.shape)
merged_df.head()


# ## Apply normalization, feature select, and output data

# In[12]:


normalized_df = normalize(
    merged_df,
    features="infer",
    meta_features="infer",
    samples="all",
    method="standardize",
)


# In[13]:


feature_select_df = feature_select(
    normalized_df,
    features="infer",
    operation=feature_select_opts,
    output_file="none",
    na_cutoff=na_cutoff,
    corr_threshold=corr_threshold,
)

print(feature_select_df.shape)
def get_profiles(plate, backend_dir, metadata_dir, barcode_platemap_df):
    """
    Apply all profiling steps for a given plate.

    Output:
    Will write a series of processed files to disk
    """
    print("Processing {}.....".format(plate))
    sqlite_file = "sqlite:///{}/{}.sqlite".format(backend_dir, plate)

    # Load specific platemap
    platemap = barcode_platemap_df.query(
        "Assay_Plate_Barcode == @plate"
    ).Plate_Map_Name.values[0]
    platemap_file = os.path.join(metadata_dir, "platemap", "{}.csv".format(platemap))
    platemap_df = pd.read_csv(platemap_file)

    # Prepare sql file for processing
    ap = AggregateProfiles(
        sqlite_file, strata=["Image_Metadata_Plate", "Image_Metadata_Well"]
    )

    # Count cells and output
    cell_count_file = os.path.join("results", "{}_cell_count.tsv".format(plate))
    cell_count_df = ap.count_cells()
    cell_count_df = cell_count_df.merge(
        platemap_df, left_on="Image_Metadata_Well", right_on="well_position"
    ).drop(["WellRow", "WellCol", "well_position"], axis="columns")
    cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

    # Begin processing profiles
    output_dir = os.path.join("data", "profiles", plate)
    os.makedirs(output_dir, exist_ok=True)

    # Aggregate single cells into well profiles
    out_file = os.path.join(output_dir, "{}.csv.gz".format(plate))
    ap.aggregate_profiles(output_file=out_file, compression="gzip")

    # Annotate Profiles
    anno_file = os.path.join(output_dir, "{}_augmented.csv.gz".format(plate))
    annotate(
        profiles=out_file,
        platemap=platemap_df,
        join_on=["Metadata_well_position", "Image_Metadata_Well"],
        output_file=anno_file,
        compression="gzip",
    )

    # Define metadata features
    meta_features = [
        "Image_Metadata_Plate",
        "Image_Metadata_Well",
        "Metadata_WellRow",
        "Metadata_WellCol",
        "Metadata_gene_name",
        "Metadata_pert_name",
        "Metadata_broad_sample",
        "Metadata_cell_line",
    ]

    # Normalize Profiles
    norm_file = os.path.join(output_dir, "{}_normalized.csv.gz".format(plate))
    normalize(
        profiles=anno_file,
        features="infer",
        meta_features=meta_features,
        samples="Metadata_pert_name == 'EMPTY'",
        method="mad_robustize",
        output_file=norm_file,
        compression="gzip",
    )

    # Perform feature selection: drop columns with a high degree of missingness,
    # extreme values, and blacklisted features
    feat_file = os.path.join(
        output_dir, "{}_normalized_feature_select.csv.gz".format(plate)
    )
    feature_select(
        profiles=norm_file,
        features="infer",
        samples="none",
        operation=[
            "drop_na_columns",
            "blacklist",
            "variance_threshold",
            "drop_outliers",
        ],
        output_file=feat_file,
        compression="gzip",
    )

    # Perform audits
    profile_df = pd.read_csv(feat_file).drop(
        ["Image_Metadata_Well", "Image_Metadata_Plate"], axis="columns"
    )

    # Audit guide replicability
    audit_file = os.path.join("results", "{}_audit_guide.csv".format(plate))
    audit(
        profiles=profile_df,
        audit_groups=[
            "Metadata_pert_name",
            "Metadata_gene_name",
            "Metadata_cell_line",
        ],
        iterations=10,
        output_file=audit_file,
    )

    # Audit gene replicability
    audit_file = os.path.join("results", "{}_audit_gene.csv".format(plate))
    audit(
        profiles=profile_df,
        audit_groups=["Metadata_gene_name", "Metadata_cell_line"],
        iterations=10,
        output_file=audit_file,
    )
# Set console output
print(f"Now processing... {output_file}")

# Initiate single cell class
sc = cells.SingleCells(
    file_or_conn=sql_file,
    strata=["Image_Metadata_Plate", "Image_Metadata_Well"],
)

# Merge single cells
sc_df = sc.merge_single_cells()

# Normalize data
sc_df = normalize(profiles=sc_df, method="standardize")

# Merge well and plate metadata
sc_df = (
    sc.image_df.merge(
        metadata_df,
        left_on="Image_Metadata_Well",
        right_on="Metadata_well_position",
        how="left",
    ).merge(
        sc_df,
        left_on=["TableNumber", "ImageNumber"],
        right_on=["Metadata_TableNumber", "Metadata_ImageNumber"],
        how="right",
    )
)
for batch in matched_plates:
    compound_matches = matched_plates[batch]
    for compound in compound_matches:
        matched_plates[batch][compound]["data"] = (
            pd.concat(matched_plates[batch][compound]["data"]).reset_index(drop=True)
        )


# In[7]:


# Normalize profiles
for batch in matched_plates:
    compound_matches = matched_plates[batch]
    for compound in compound_matches:
        df = matched_plates[batch][compound]["data"]
        normalized_data = normalize(
            profiles=df,
            features="infer",
            meta_features="infer",
            samples="all",
            method="standardize",
        )
        matched_plates[batch][compound]["normalized_data"] = normalized_data


# In[8]:


# Detect the impact of batch - is it necessary to adjust?
n_components = 20
pca_columns = [f"pca_{x}" for x in range(0, n_components)]
model_formula = "pca_value ~ Metadata_clone_number + Metadata_treatment + Metadata_Plate + Metadata_treatment * Metadata_Plate"

anova_results_full_new_normalized = []
for batch in matched_plates:
    compound_matches = matched_plates[batch]
            na_cutoff=na_cut)
    else:
        profile_df = feature_select(
            profiles=profile_df,
            operation=feature_select_ops,
            na_cutoff=na_cut,
            corr_threshold=corr_threshold,
            blocklist_file=full_blocklist_file,
        )

    # Step 2: Spherize transform
    if batch == "2017_12_05_Batch2":
        spherize_df = (
            profile_df.groupby(["Metadata_cell_line", "Metadata_time_point"]).apply(
                lambda x: normalize(
                    profiles=x,
                    features="infer",
                    meta_features="infer",
                    samples="Metadata_broad_sample == 'DMSO'",
                    method="spherize",
                )
            )
        )
    else:
        spherize_df = normalize(
            profiles=profile_df,
            features="infer",
            meta_features="infer",
            samples="Metadata_broad_sample == 'DMSO'",
            method="spherize",
        )

    print(spherize_df.shape)
    spherize_df.head()

    # Step 3: Output profiles
    output(df=spherize_df, output_filename=output_file)