def feature_selection(dataset_link):
    """
    Perform feature selection by dropping columns with null values or only
    zeros, and highly correlated features, from the data.

    params:
    dataset_link: string of GitHub link to the consensus dataset

    Returns:
    data: consensus dataframe after feature selection
    """
    data = pd.read_csv(dataset_link, compression='gzip', error_bad_lines=False)

    # Drop columns that contain any nulls or consist entirely of zeros
    drop_cols = [
        col for col in data.columns
        if data[col].isnull().sum() or all(val == 0.0 for val in data[col].values)
    ]
    data.drop(drop_cols, axis=1, inplace=True)

    data = feature_select(
        data,
        operation=["correlation_threshold", "variance_threshold", "blocklist"],
        blocklist_file="https://raw.githubusercontent.com/broadinstitute/lincs-cell-painting/1769b32c7cef3385ccc4cea7057386e8a1bde39a/utils/consensus_blocklist.txt",
    )
    return data
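# A hypothetical usage sketch for the function above; the URL is illustrative
# only (any gzipped consensus CSV hosted on GitHub would work).
consensus_link = (
    "https://github.com/broadinstitute/lincs-cell-painting/raw/master/"
    "consensus/example_consensus.csv.gz"  # assumed path, not a real file
)
consensus_df = feature_selection(consensus_link)
print(consensus_df.shape)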
def feature_selection(df_lvl4):
    """
    Perform feature selection by dropping columns with more than 384 null
    values (i.e. equivalent to one plate worth of cell profiles) and highly
    correlated features from the data.
    """
    metadata_columns = [col for col in df_lvl4.columns if col.startswith("Metadata_")]
    df_lvl4_metadata = df_lvl4[metadata_columns].copy()
    df_lvl4_features = df_lvl4.drop(metadata_columns, axis=1)

    # Drop feature columns missing more than one plate's worth of values
    null_cols = [
        col for col in df_lvl4_features.columns
        if df_lvl4_features[col].isnull().sum() > 384
    ]
    df_lvl4_features.drop(null_cols, axis=1, inplace=True)

    df_lvl4_features = feature_select(
        df_lvl4_features,
        operation=["correlation_threshold", "variance_threshold"],
    )

    # Impute any remaining missing values with the column mean
    for col in df_lvl4_features.columns:
        if df_lvl4_features[col].isnull().sum():
            df_lvl4_features[col] = df_lvl4_features[col].fillna(
                df_lvl4_features[col].mean())

    df_meta_info = df_lvl4_metadata[[
        'Metadata_broad_sample', 'Metadata_pert_id', 'Metadata_Plate',
        'Metadata_Well', 'Metadata_broad_id', 'Metadata_moa',
        'Metadata_dose_recode'
    ]].copy()

    df_lvl4_new = pd.concat([df_meta_info, df_lvl4_features], axis=1)
    return df_lvl4_new
df.head(2)


# In[3]:


# Perform feature selection
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blocklist",
    "drop_outliers",
]

df = feature_select(profiles=df, operation=feature_select_ops, na_cutoff=0)

features = infer_cp_features(df)
meta_features = infer_cp_features(df, metadata=True)

print(df.shape)
df.head(2)


# In[4]:


# Output feature selected file
output_file = pathlib.Path("data/cell_health_merged_feature_select.csv.gz")
output(
    df=df,
    output_filename=output_file,
)
def process_profile(sql_file, batch, plate, pipeline):
    """
    Given batch details and a pipeline, process morphology profiles
    """
    assert batch in sql_file, "batch {} not recognized in sql file {}".format(
        batch, sql_file)
    assert plate in sql_file, "plate {} not recognized in sql file {}".format(
        plate, sql_file)

    # Set output directory information
    pipeline_output = pipeline["output_dir"]
    output_dir = os.path.join(pipeline_output, batch, plate)
    os.makedirs(output_dir, exist_ok=True)

    # Set output file information
    aggregate_out_file = os.path.join(output_dir, "{}.csv.gz".format(plate))
    annotate_out_file = os.path.join(output_dir,
                                     "{}_augmented.csv.gz".format(plate))
    normalize_out_file = os.path.join(output_dir,
                                      "{}_normalized.csv.gz".format(plate))
    feature_out_file = os.path.join(
        output_dir, "{}_normalized_feature_selected.csv.gz".format(plate))

    # Load pipeline options
    compression = process_pipeline(pipeline["options"], option="compression")
    samples = process_pipeline(pipeline["options"], option="samples")

    # Load and setup platemap info
    workspace_dir = pipeline["workspace_dir"]
    batch_dir = os.path.join(workspace_dir, "backend", batch)
    metadata_dir = os.path.join(workspace_dir, "metadata", batch)

    barcode_plate_map_file = os.path.join(metadata_dir,
                                          sorted(os.listdir(metadata_dir))[0])
    barcode_plate_map_df = pd.read_csv(barcode_plate_map_file)
    plate_map_name = barcode_plate_map_df.query(
        "Assay_Plate_Barcode == @plate").Plate_Map_Name.values[0]
    plate_map_file = os.path.join(metadata_dir, "platemap",
                                  "{}.txt".format(plate_map_name))
    plate_map_df = pd.read_csv(plate_map_file, sep="\t")
    plate_map_df.columns = [
        "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
        for x in plate_map_df.columns
    ]
    platemap_well_column = pipeline["platemap_well_column"]

    # Step 1: Aggregate
    aggregate_steps = pipeline["aggregate"]
    if aggregate_steps["perform"]:
        aggregate_features = aggregate_steps["features"]
        aggregate_operation = aggregate_steps["method"]
        aggregate_plate_column = aggregate_steps["plate_column"]
        aggregate_well_column = aggregate_steps["well_column"]

        strata = [aggregate_plate_column, aggregate_well_column]

        if "site_column" in aggregate_steps:
            aggregate_site_column = aggregate_steps["site_column"]
            strata += [aggregate_site_column]

        ap = AggregateProfiles(
            sql_file,
            strata=strata,
            features=aggregate_features,
            operation=aggregate_operation,
        )
        ap.aggregate_profiles(output_file=aggregate_out_file,
                              compression=compression)

        if pipeline["count"]["perform"]:
            count_dir = pipeline["count"]["output_dir"]
            os.makedirs(count_dir, exist_ok=True)

            cell_count_file = os.path.join(
                count_dir, "{}_{}_cell_count.tsv".format(batch, plate))

            cell_count_df = ap.count_cells()
            cell_count_df = cell_count_df.merge(
                plate_map_df,
                left_on=aggregate_well_column,
                right_on=platemap_well_column,
            ).drop(platemap_well_column, axis="columns")
            cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

    # Annotate Profiles
    annotate_steps = pipeline["annotate"]
    if annotate_steps["perform"]:
        annotate_well_column = annotate_steps["well_column"]
        annotate(
            profiles=aggregate_out_file,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file=annotate_out_file,
            compression=compression,
        )

    # Normalize Profiles
    normalize_steps = pipeline["normalize"]
    if normalize_steps["perform"]:
        norm_features = normalize_steps["features"]
        norm_method = normalize_steps["method"]
        normalize(
            profiles=annotate_out_file,
            features=norm_features,
            samples=samples,
            method=norm_method,
            output_file=normalize_out_file,
            compression=compression,
        )

    # Apply feature selection
    feature_select_steps = pipeline["feature_select"]
    if feature_select_steps["perform"]:
        feature_select_operations = feature_select_steps["operations"]
        feature_select_features = feature_select_steps["features"]
        feature_select(
            profiles=normalize_out_file,
            features=feature_select_features,
            samples="none",
            operation=feature_select_operations,
            output_file=feature_out_file,
            compression=compression,
            corr_threshold=0.9,
            corr_method="pearson",
        )
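# For reference, a minimal sketch of the `pipeline` dictionary that
# process_profile consumes above. All values here are assumptions meant to
# show the expected structure, not a real configuration (the exact shape of
# "options" depends on process_pipeline).
example_pipeline = {
    "output_dir": "profiles",
    "workspace_dir": "workspace",
    "platemap_well_column": "Metadata_well_position",
    "options": {"compression": "gzip", "samples": "all"},
    "aggregate": {
        "perform": True,
        "features": "infer",
        "method": "median",
        "plate_column": "Metadata_Plate",
        "well_column": "Metadata_Well",
    },
    "count": {"perform": True, "output_dir": "cell_counts"},
    "annotate": {"perform": True, "well_column": "Metadata_Well"},
    "normalize": {"perform": True, "features": "infer", "method": "mad_robustize"},
    "feature_select": {
        "perform": True,
        "features": "infer",
        "operations": ["variance_threshold", "correlation_threshold",
                       "drop_na_columns", "blocklist"],
    },
}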
def process_profile(sql_file, batch, plate, pipeline):
    """
    Given batch details and a pipeline, process morphology profiles
    """
    assert batch in sql_file, "batch {} not recognized in sql file {}".format(
        batch, sql_file)
    assert plate in sql_file, "plate {} not recognized in sql file {}".format(
        plate, sql_file)

    # Set output directory information
    pipeline_output = pipeline["output_dir"]
    output_dir = os.path.join(pipeline_output, batch, plate)
    os.makedirs(output_dir, exist_ok=True)

    # Set output file information
    aggregate_out_file = os.path.join(output_dir, "{}.csv.gz".format(plate))
    annotate_out_file = os.path.join(output_dir,
                                     "{}_augmented.csv.gz".format(plate))
    normalize_out_file = os.path.join(output_dir,
                                      "{}_normalized.csv.gz".format(plate))
    feature_out_file = os.path.join(
        output_dir, "{}_normalized_feature_selected.csv.gz".format(plate))

    # Load pipeline options
    compression = process_pipeline(pipeline["options"], option="compression")
    sc_float_format = process_pipeline(pipeline["options"],
                                       option="sc_float_format")
    samples = process_pipeline(pipeline["options"], option="samples")

    # Load and setup platemap info
    workspace_dir = pipeline["workspace_dir"]
    batch_dir = os.path.join(workspace_dir, "backend", batch)
    metadata_dir = os.path.join(workspace_dir, "metadata", batch)

    barcode_plate_map_file = os.path.join(metadata_dir,
                                          sorted(os.listdir(metadata_dir))[0])
    barcode_plate_map_df = pd.read_csv(barcode_plate_map_file)
    plate_map_name = barcode_plate_map_df.query(
        "Assay_Plate_Barcode == @plate").Plate_Map_Name.values[0]
    plate_map_file = os.path.join(metadata_dir, "platemap",
                                  "{}.txt".format(plate_map_name))
    plate_map_df = pd.read_csv(plate_map_file, sep="\t")
    plate_map_df.columns = [
        "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
        for x in plate_map_df.columns
    ]
    platemap_well_column = pipeline["platemap_well_column"]

    # Process Bulk profiles
    # Step 1: Aggregate
    aggregate_steps = pipeline["aggregate"]
    aggregate_features = aggregate_steps["features"]
    aggregate_operation = aggregate_steps["method"]
    aggregate_plate_column = aggregate_steps["plate_column"]
    aggregate_well_column = aggregate_steps["well_column"]

    strata = [aggregate_plate_column, aggregate_well_column]

    if "site_column" in aggregate_steps:
        aggregate_site_column = aggregate_steps["site_column"]
        strata += [aggregate_site_column]

    if aggregate_steps["perform"]:
        ap = AggregateProfiles(
            sql_file,
            strata=strata,
            features=aggregate_features,
            operation=aggregate_operation,
        )
        ap.aggregate_profiles(output_file=aggregate_out_file,
                              compression=compression)

    if pipeline["count"]["perform"]:
        # The aggregation object is still needed for cell counts even when
        # the aggregate step itself is skipped
        if not aggregate_steps["perform"]:
            ap = AggregateProfiles(
                sql_file,
                strata=strata,
                features=aggregate_features,
                operation=aggregate_operation,
            )

        count_dir = pipeline["count"]["output_dir"]
        os.makedirs(count_dir, exist_ok=True)

        cell_count_file = os.path.join(
            count_dir, "{}_{}_cell_count.tsv".format(batch, plate))

        cell_count_df = ap.count_cells()
        cell_count_df = cell_count_df.merge(
            plate_map_df,
            left_on=aggregate_well_column,
            right_on=platemap_well_column,
        ).drop(platemap_well_column, axis="columns")
        cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

    # Annotate Profiles
    annotate_steps = pipeline["annotate"]
    annotate_well_column = annotate_steps["well_column"]
    if annotate_steps["perform"]:
        annotate(
            profiles=aggregate_out_file,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file=annotate_out_file,
            compression=compression,
        )

    # Normalize Profiles
    normalize_steps = pipeline["normalize"]
    norm_features = normalize_steps["features"]
    norm_method = normalize_steps["method"]
    if normalize_steps["perform"]:
        normalize(
            profiles=annotate_out_file,
            features=norm_features,
            samples=samples,
            method=norm_method,
            output_file=normalize_out_file,
            compression=compression,
        )

    # Apply feature selection
    feature_select_steps = pipeline["feature_select"]
    feature_select_operations = feature_select_steps["operations"]
    feature_select_features = feature_select_steps["features"]
    if feature_select_steps["perform"]:
        feature_select(
            profiles=normalize_out_file,
            features=feature_select_features,
            samples=samples,
            operation=feature_select_operations,
            output_file=feature_out_file,
            compression=compression,
            corr_threshold=0.9,
            corr_method="pearson",
        )

    # Process single cell profiles
    sc_steps = pipeline["single_cell"]
    if sc_steps["perform"]:
        if not aggregate_steps["perform"]:
            ap = AggregateProfiles(
                sql_file,
                strata=strata,
                features=aggregate_features,
                operation=aggregate_operation,
            )

        # Load cells
        query = "select * from cells"
        cell_df = pd.read_sql(sql=query, con=ap.conn)

        # Load cytoplasm
        query = "select * from cytoplasm"
        cytoplasm_df = pd.read_sql(sql=query, con=ap.conn)

        # Load nuclei
        query = "select * from nuclei"
        nuclei_df = pd.read_sql(sql=query, con=ap.conn)

        # Merge single cells together
        sc_merged_df = (cell_df.merge(
            cytoplasm_df.drop("ObjectNumber", axis="columns"),
            left_on=["TableNumber", "ImageNumber", "ObjectNumber"],
            right_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Cells"],
            how="inner",
        ).drop("ObjectNumber", axis="columns").merge(
            nuclei_df,
            left_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Nuclei"],
            right_on=["TableNumber", "ImageNumber", "ObjectNumber"],
            how="inner",
        ))

        # Merge image data info
        sc_merged_df = ap.image_df.merge(sc_merged_df,
                                         how="right",
                                         on=ap.merge_cols)

        # Make sure column names are correctly prefixed
        prefix = ["Metadata", "Cells", "Cytoplasm", "Nuclei"]
        cols = []
        for col in sc_merged_df.columns:
            if any([col.startswith(x) for x in prefix]):
                cols.append(col)
            else:
                cols.append(f"Metadata_{col}")
        sc_merged_df.columns = cols

        sc_merged_df = annotate(
            profiles=sc_merged_df,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file="none",
        )

        if sc_steps["normalize"]:
            sc_merged_df = normalize(
                profiles=sc_merged_df,
                features=norm_features,
                samples=samples,
                method=norm_method,
                output_file="none",
            )

        if sc_steps["feature_select"]:
            sc_merged_df = feature_select(
                profiles=sc_merged_df,
                features=feature_select_features,
                samples=samples,
                operation=feature_select_operations,
                output_file="none",
                corr_threshold=0.9,
                corr_method="pearson",
            )

        sc_pipeline_output = pipeline["sc_output_dir"]
        sc_output_dir = os.path.join(sc_pipeline_output, batch, plate)
        os.makedirs(sc_output_dir, exist_ok=True)

        # Set output file information
        sc_out_file = os.path.join(sc_output_dir,
                                   "{}_single_cell.csv.gz".format(plate))
        output(
            df=sc_merged_df,
            output_filename=sc_out_file,
            compression="gzip",
            float_format=sc_float_format,
        )
# In[4]:


for plate in plate_files:
    plate_file = plate_files[plate]
    output_file = pathlib.Path(
        f"{sc_dir}/{plate}_normalized_featureselected.csv.gz")

    # Set console output
    print(f"Now performing feature selection for... {plate_file}")

    sc_df = pd.read_csv(plate_file, low_memory=False)

    print("Before feature selection:")
    print(sc_df.shape)

    sc_df = feature_select(
        profiles=sc_df,
        operation=feature_select_operations,
        na_cutoff=na_cutoff,
    )

    print("After feature selection:")
    print(sc_df.shape)

    # Output file to disk
    output(
        df=sc_df,
        output_filename=output_file,
        sep=",",
        float_format="%.5f",
        compression_options=compression_options,
    )
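# The loop above depends on variables defined in earlier notebook cells; a
# plausible setup is sketched here (paths, plate-name parsing, and option
# values are all assumptions).
import pathlib

sc_dir = pathlib.Path("data/single_cell_profiles")
plate_files = {
    path.name.split("_")[0]: path
    for path in sc_dir.glob("*_normalized.csv.gz")
}
feature_select_operations = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blocklist",
]
na_cutoff = 0
compression_options = {"method": "gzip", "mtime": 1}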
complete_consensus_df = complete_consensus_df.assign(
    Metadata_unique_id=complete_consensus_df.Metadata_broad_sample +
    "_dose_" + complete_consensus_df.Metadata_dose_recode.astype(str))

print(complete_consensus_df.shape)
complete_consensus_df.head(2)


# In[16]:


# Perform feature selection
complete_consensus_df = feature_select(
    profiles=complete_consensus_df,
    features="infer",
    samples="none",
    operation=feature_select_opts,
    output_file="none",
    na_cutoff=0,
    corr_threshold=0.9,
    corr_method="pearson",
    freq_cut=0.05,
    unique_cut=0.1,
)

print(complete_consensus_df.shape)


# In[17]:


# Zero One Normalize Data
complete_consensus_df = transform(complete_consensus_df)


# In[18]:
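# `transform` is defined elsewhere in this notebook; a minimal sketch of a
# zero-one (min-max) normalization consistent with the comment above,
# assuming only morphology features (not metadata columns) should be scaled:
from pycytominer.cyto_utils import infer_cp_features

def transform(df):
    # Scale each feature column to the [0, 1] range; metadata passes through
    features = infer_cp_features(df)
    feature_df = df.loc[:, features]
    df.loc[:, features] = (feature_df - feature_df.min()) / (
        feature_df.max() - feature_df.min())
    return df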
def get_profiles(plate, backend_dir, metadata_dir, barcode_platemap_df):
    """
    Apply all profiling steps for a given plate.

    Output:
    Will write a series of processed files to disk
    """
    print("Processing {}.....".format(plate))
    sqlite_file = "sqlite:///{}/{}.sqlite".format(backend_dir, plate)

    # Load specific platemap
    platemap = barcode_platemap_df.query(
        "Assay_Plate_Barcode == @plate").Plate_Map_Name.values[0]
    platemap_file = os.path.join(metadata_dir, "platemap",
                                 "{}.csv".format(platemap))
    platemap_df = pd.read_csv(platemap_file)

    # Prepare sql file for processing
    ap = AggregateProfiles(
        sqlite_file, strata=["Image_Metadata_Plate", "Image_Metadata_Well"])

    # Count cells and output
    cell_count_file = os.path.join("results",
                                   "{}_cell_count.tsv".format(plate))
    cell_count_df = ap.count_cells()
    cell_count_df = cell_count_df.merge(
        platemap_df, left_on="Image_Metadata_Well",
        right_on="well_position").drop(
            ["WellRow", "WellCol", "well_position"], axis="columns")
    cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

    # Begin processing profiles
    output_dir = os.path.join("data", "profiles", plate)
    os.makedirs(output_dir, exist_ok=True)

    # Aggregate single cells into well profiles
    out_file = os.path.join(output_dir, "{}.csv.gz".format(plate))
    ap.aggregate_profiles(output_file=out_file, compression="gzip")

    # Annotate Profiles
    anno_file = os.path.join(output_dir, "{}_augmented.csv.gz".format(plate))
    annotate(
        profiles=out_file,
        platemap=platemap_df,
        join_on=["Metadata_well_position", "Image_Metadata_Well"],
        output_file=anno_file,
        compression="gzip",
    )

    # Define metadata features
    meta_features = [
        "Image_Metadata_Plate",
        "Image_Metadata_Well",
        "Metadata_WellRow",
        "Metadata_WellCol",
        "Metadata_gene_name",
        "Metadata_pert_name",
        "Metadata_broad_sample",
        "Metadata_cell_line",
    ]

    # Normalize Profiles
    norm_file = os.path.join(output_dir, "{}_normalized.csv.gz".format(plate))
    normalize(
        profiles=anno_file,
        features="infer",
        meta_features=meta_features,
        samples="Metadata_pert_name == 'EMPTY'",
        method="mad_robustize",
        output_file=norm_file,
        compression="gzip",
    )

    # Perform feature selection: drop columns with a high number of missing
    # values, extreme outlier values, low variance, and blacklisted features
    feat_file = os.path.join(
        output_dir, "{}_normalized_feature_select.csv.gz".format(plate))
    feature_select(
        profiles=norm_file,
        features="infer",
        samples="none",
        operation=[
            "drop_na_columns",
            "blacklist",
            "variance_threshold",
            "drop_outliers",
        ],
        output_file=feat_file,
        compression="gzip",
    )

    # Perform audits
    profile_df = pd.read_csv(feat_file).drop(
        ["Image_Metadata_Well", "Image_Metadata_Plate"], axis="columns")

    # Audit guide replicability
    audit_file = os.path.join("results", "{}_audit_guide.csv".format(plate))
    audit(
        profiles=profile_df,
        audit_groups=[
            "Metadata_pert_name", "Metadata_gene_name", "Metadata_cell_line"
        ],
        iterations=10,
        output_file=audit_file,
    )

    # Audit gene replicability
    audit_file = os.path.join("results", "{}_audit_gene.csv".format(plate))
    audit(
        profiles=profile_df,
        audit_groups=["Metadata_gene_name", "Metadata_cell_line"],
        iterations=10,
        output_file=audit_file,
    )
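# Hypothetical driver for the function above: load the barcode platemap and
# process every plate it lists. File locations are assumptions.
import os
import pandas as pd

backend_dir = "backend"
metadata_dir = "metadata"
barcode_platemap_df = pd.read_csv(
    os.path.join(metadata_dir, "barcode_platemap.csv"))

for plate in barcode_platemap_df.Assay_Plate_Barcode.unique():
    get_profiles(plate, backend_dir, metadata_dir, barcode_platemap_df)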
# ## Apply normalization, feature select, and output data

# In[12]:


normalized_df = normalize(
    merged_df,
    features="infer",
    meta_features="infer",
    samples="all",
    method="standardize",
)


# In[13]:


feature_select_df = feature_select(
    normalized_df,
    features="infer",
    operation=feature_select_opts,
    output_file="none",
    na_cutoff=na_cutoff,
    corr_threshold=corr_threshold,
)

print(feature_select_df.shape)
feature_select_df.head()


# In[14]:


output_filename = pathlib.Path(
    f"data/{batch}/{plate}_singlecell_normalized_feature_select.csv.gz")
output(feature_select_df,
       output_filename,
       compression="gzip",
       float_format="%.5g")
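# The cells above reference feature selection options defined earlier in the
# notebook; example definitions are sketched here (values are assumptions):
feature_select_opts = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blocklist",
]
na_cutoff = 0
corr_threshold = 0.9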
"{}_feature_select.gct".format(batch)) # Load the profile data and add cell counts df = load_data(batch=batch, suffix=suffix, profile_dir=profile_dir, combine_dfs=True, add_cell_count=True, cell_count_dir=cell_count_dir) # Save normalized and non-feature selected data profile_batches[batch] = df # Apply feature selection again - this is particularly important for batches # with multiple plates df = feature_select(df, operation=feature_select_ops) # Write the dataframe as a gct file for input into Morpheus write_gct(profiles=df, output_file=output_gct_file) # ## Merge Profiles Together and Output # In[4]: all_profiles_df = pd.concat(profile_batches.values(), sort=True).reset_index(drop=True) meta_features = infer_cp_features(all_profiles_df, metadata=True) cp_cols = infer_cp_features(all_profiles_df, metadata=False) all_profiles_df = all_profiles_df.reindex(meta_features + cp_cols,
    operation=operation,
    features=cp_norm_features,
)

# How many consensus profiles per operation?
print(
    f" There are {consensus_profiles[operation]['no_feat_select'].shape[0]} {operation} consensus profiles for {norm_strat} normalization"
)

# Perform feature selection
print(
    f" Now feature selecting on {operation} consensus for {norm_strat} normalization"
)

consensus_profiles[operation]["feat_select"] = feature_select(
    profiles=consensus_profiles[operation]["no_feat_select"],
    features="infer",
    operation=feature_select_ops,
    blocklist_file=full_blocklist_file,
)

# How many features in feature selected profile?
print(
    f" There are {consensus_profiles[operation]['feat_select'].shape[1]} features in {operation} consensus profiles for {norm_strat} normalization"
)

all_consensus_dfs[batch][norm_strat] = consensus_profiles

print("\n")


# ## Merge and output consensus signatures
#
# Output with and without feature selection.
.assign(Metadata_Dataset="FourClone") ) cloneAE_data_recode_df = ( cloneAE_data_df.assign(Metadata_treatment="bortezomib") .assign(Metadata_Dataset="CloneAE") ) cloneAE_data_recode_df.loc[cloneAE_data_recode_df.Metadata_Dosage == 0, "Metadata_treatment"] = "DMSO" # In[17]: combined_df = pd.concat([fourclone_data_recode_df, cloneAE_data_recode_df], sort=True).reset_index(drop=True) combined_df = feature_select(combined_df, operation="drop_na_columns") print(combined_df.shape) combined_df.head() # In[18]: embedding_combined_df = process_umap(combined_df) embedding_combined_df.head() # In[19]:
dataset_a_df.head()


# In[6]:


pd.crosstab(dataset_a_df.Metadata_CellLine, dataset_a_df.Metadata_Dosage)


# In[7]:


dataset_a_name = "combined_cloneAcloneE_dataset"


# In[8]:


output_file = os.path.join(output_dir, "{}.csv.gz".format(dataset_a_name))
dataset_a_df.to_csv(output_file, index=False, compression="gzip")

dataset_a_featureselect_df = feature_select(dataset_a_df,
                                            operation=feature_select_ops)

output_file = os.path.join(output_dir,
                           "{}_feature_select.csv.gz".format(dataset_a_name))
dataset_a_featureselect_df.to_csv(output_file,
                                  index=False,
                                  compression="gzip")

output_gct_file = os.path.join(
    gct_dir, "{}_feature_select.gct".format(dataset_a_name))
write_gct(profiles=dataset_a_featureselect_df, output_file=output_gct_file)

print(dataset_a_featureselect_df.shape)
dataset_a_featureselect_df.head()


# ## Process and Output Dataset B

# In[9]:
"{}_feature_select.gct".format(batch)) # Load the profile data and add cell counts df = load_data(batch=batch, suffix=suffix, profile_dir=profile_dir, combine_dfs=True, add_cell_count=True, harmonize_cols=True, cell_count_dir=cell_count_dir) # Save normalized and non-feature selected data profile_batches[batch] = df # Apply feature selection feature_select_df = feature_select(df, operation=feature_select_ops) # Write the dataframe as a gct file for input into Morpheus write_gct(profiles=feature_select_df, output_file=output_gct_file) # ## Merge Profiles Together and Output # In[4]: all_profiles_df = pd.concat(profile_batches.values(), sort=True).reset_index(drop=True) all_profiles_df = all_profiles_df.assign(Metadata_clone_type="resistant") all_profiles_df.loc[all_profiles_df.Metadata_clone_number.str.contains("WT"), "Metadata_clone_type"] = "wildtype"
def pipeline_feature_select(self, steps, suffix=None):
    feature_select_steps = steps
    pipeline_output = self.pipeline["output_dir"]

    level = feature_select_steps["level"]
    gct = feature_select_steps["gct"]
    feature_select_operations = feature_select_steps["operations"]
    feature_select_features = feature_select_steps["features"]

    all_plates_df = pd.DataFrame()

    for batch in self.profile_config:
        batch_df = pd.DataFrame()
        for plate in self.profile_config[batch]:
            output_dir = pathlib.PurePath(".", pipeline_output, batch, plate)
            if suffix:
                normalize_output_file = pathlib.PurePath(
                    output_dir, f"{plate}_normalized_{suffix}.csv.gz")
                feature_select_output_file_plate = pathlib.PurePath(
                    output_dir,
                    f"{plate}_normalized_feature_select_{suffix}_plate.csv.gz",
                )
            else:
                normalize_output_file = pathlib.PurePath(
                    output_dir, f"{plate}_normalized.csv.gz")
                feature_select_output_file_plate = pathlib.PurePath(
                    output_dir,
                    f"{plate}_normalized_feature_select_plate.csv.gz")

            if feature_select_features == "infer" and self.noncanonical:
                feature_select_features = cyto_utils.infer_cp_features(
                    pd.read_csv(normalize_output_file),
                    compartments=self.compartments,
                )

            df = pd.read_csv(normalize_output_file).assign(
                Metadata_batch=batch)

            if level == "plate":
                df = df.drop(columns=["Metadata_batch"])
                feature_select(
                    profiles=df,
                    features=feature_select_features,
                    operation=feature_select_operations,
                    output_file=feature_select_output_file_plate,
                    compression_options=self.pipeline_options["compression"],
                    float_format=self.pipeline_options["float_format"],
                )
            elif level == "batch":
                batch_df = concat_dataframes(batch_df, df)
            elif level == "all":
                all_plates_df = concat_dataframes(all_plates_df, df)

        if level == "batch":
            fs_df = feature_select(
                profiles=batch_df,
                features=feature_select_features,
                operation=feature_select_operations,
            )
            for plate in self.profile_config[batch]:
                output_dir = pathlib.PurePath(".", pipeline_output, batch,
                                              plate)
                if suffix:
                    feature_select_output_file_batch = pathlib.PurePath(
                        output_dir,
                        f"{plate}_normalized_feature_select_{suffix}_batch.csv.gz",
                    )
                else:
                    feature_select_output_file_batch = pathlib.PurePath(
                        output_dir,
                        f"{plate}_normalized_feature_select_batch.csv.gz",
                    )

                if feature_select_features == "infer" and self.noncanonical:
                    feature_select_features = cyto_utils.infer_cp_features(
                        batch_df, compartments=self.compartments)

                df = fs_df.query("Metadata_Plate==@plate").reset_index(
                    drop=True)
                df = df.drop(columns=["Metadata_batch"])
                cyto_utils.output(
                    output_filename=feature_select_output_file_batch,
                    df=df,
                    compression_options=self.pipeline_options["compression"],
                    float_format=self.pipeline_options["float_format"],
                )

            if gct:
                create_gct_directories(batch)
                if suffix:
                    stacked_file = pathlib.PurePath(
                        ".", "gct", batch,
                        f"{batch}_normalized_feature_select_{suffix}_batch.csv.gz",
                    )
                    gct_file = pathlib.PurePath(
                        ".", "gct", batch,
                        f"{batch}_normalized_feature_select_{suffix}_batch.gct",
                    )
                else:
                    stacked_file = pathlib.PurePath(
                        ".", "gct", batch,
                        f"{batch}_normalized_feature_select_batch.csv.gz",
                    )
                    gct_file = pathlib.PurePath(
                        ".", "gct", batch,
                        f"{batch}_normalized_feature_select_batch.gct",
                    )
                cyto_utils.output(
                    output_filename=stacked_file,
                    df=fs_df,
                    compression_options=self.pipeline_options["compression"],
                    float_format=self.pipeline_options["float_format"],
                )
                write_gct(profiles=fs_df, output_file=gct_file)

    if level == "all":
        fs_df = feature_select(
            profiles=all_plates_df,
            features=feature_select_features,
            operation=feature_select_operations,
        )
        for batch in self.profile_config:
            fs_batch_df = fs_df.loc[fs_df.Metadata_batch ==
                                    batch].reset_index(drop=True)
            for plate in self.profile_config[batch]:
                output_dir = pathlib.PurePath(".", pipeline_output, batch,
                                              plate)
                if suffix:
                    feature_select_output_file_all = pathlib.PurePath(
                        output_dir,
                        f"{plate}_normalized_feature_select_{suffix}_all.csv.gz",
                    )
                else:
                    feature_select_output_file_all = pathlib.PurePath(
                        output_dir,
                        f"{plate}_normalized_feature_select_all.csv.gz")

                if feature_select_features == "infer" and self.noncanonical:
                    feature_select_features = cyto_utils.infer_cp_features(
                        all_plates_df, compartments=self.compartments)

                df = fs_batch_df.query(
                    "Metadata_Plate==@plate").reset_index(drop=True)
                df = df.drop(columns=["Metadata_batch"])
                cyto_utils.output(
                    output_filename=feature_select_output_file_all,
                    df=df,
                    compression_options=self.pipeline_options["compression"],
                    float_format=self.pipeline_options["float_format"],
                )

            if gct:
                create_gct_directories(batch)
                if suffix:
                    stacked_file = pathlib.PurePath(
                        ".", "gct", batch,
                        f"{batch}_normalized_feature_select_{suffix}_all.csv.gz",
                    )
                    gct_file = pathlib.PurePath(
                        ".", "gct", batch,
                        f"{batch}_normalized_feature_select_{suffix}_all.gct",
                    )
                else:
                    stacked_file = pathlib.PurePath(
                        ".", "gct", batch,
                        f"{batch}_normalized_feature_select_all.csv.gz",
                    )
                    gct_file = pathlib.PurePath(
                        ".", "gct", batch,
                        f"{batch}_normalized_feature_select_all.gct",
                    )
                cyto_utils.output(
                    output_filename=stacked_file,
                    df=fs_batch_df,
                    compression_options=self.pipeline_options["compression"],
                    float_format=self.pipeline_options["float_format"],
                )
                write_gct(profiles=fs_batch_df, output_file=gct_file)
normalize(
    profiles=anno_df,
    samples="all",
    method=norm_method,
    output_file=norm_file,
    float_format=float_format,
    compression_options=compression,
)

# Feature Selection (DMSO Control) - Level 4B Data
feat_dmso_file = pathlib.PurePath(
    output_dir, f"{plate_name}_normalized_feature_select_dmso.csv.gz")

feature_select(
    profiles=norm_dmso_file,
    features="infer",
    operation=feature_select_ops,
    output_file=feat_dmso_file,
    float_format=float_format,
    compression_options=compression,
)

# Feature Selection (Whole Plate) - Level 4B Data
feat_file = pathlib.PurePath(
    output_dir, f"{plate_name}_normalized_feature_select.csv.gz")

feature_select(
    profiles=norm_file,
    features="infer",
    operation=feature_select_ops,
    output_file=feat_file,
    float_format=float_format,
    compression_options=compression,
)
feature_select_corr_threshold = feature_select_args["corr_threshold"]

for data_level in feature_select_levels:
    if data_level == "single_cell":
        if not singlecell_from_single_file:
            warnings.warn(
                "Feature select operation is not enabled for site-specific single cell files. Skipping."
            )
            continue

    input_file = feature_select_input_files[data_level]
    output_file = feature_select_output_files[data_level]

    print(
        f"Now performing feature selection for {data_level}...with operations: {feature_select_operations}"
    )

    df = pd.read_csv(input_file)
    feature_select(
        profiles=df,
        features=feature_select_features,
        samples=feature_select_drop_samples,
        operation=feature_select_operations,
        na_cutoff=feature_select_nacutoff,
        corr_threshold=feature_select_corr_threshold,
        output_file=output_file,
        compression=compression,
        float_format=float_format,
    )
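# The loop above assumes per-level input/output file mappings built earlier
# in the script; a sketch of their expected shape. "single_cell" appears in
# the loop itself; the other level name and all paths are assumptions.
feature_select_input_files = {
    "single_cell": "plate_normalized_sc.csv.gz",
    "aggregate_profiles": "plate_normalized.csv.gz",
}
feature_select_output_files = {
    "single_cell": "plate_feature_selected_sc.csv.gz",
    "aggregate_profiles": "plate_feature_selected.csv.gz",
}
feature_select_levels = list(feature_select_input_files)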
# In[10]:


# We see a very large difference in cell count across profiles
# Remember that profiles were generated by averaging feature values for all single cells
full_df.Metadata_cell_count.hist()


# In[11]:


selected_features = []
for dataset in datasets:
    # Apply feature selection on the training split only
    feature_select_df = feature_select(
        profiles=(full_df.query("Metadata_dataset == @dataset").query(
            "Metadata_model_split == 'training'")),
        operation=feature_select_opts,
        na_cutoff=na_cutoff,
        corr_threshold=corr_threshold,
    )

    dataset_features = infer_cp_features(feature_select_df)
    selected_features.append(
        pd.DataFrame(dataset_features,
                     columns=["features"]).assign(dataset=dataset))

# Output results of feature selection
all_selected_features = pd.concat(selected_features).reset_index(drop=True)

output_file = pathlib.Path(f"{output_dir}/dataset_features_selected.tsv")
all_selected_features.to_csv(output_file, sep="\t", index=False)
# ## Apply Feature Selection

# In[18]:


meta_features = infer_cp_features(train_df, metadata=True)
meta_features


# In[19]:


train_df = feature_select(
    train_df,
    operation=feature_select_opts,
    na_cutoff=na_cutoff,
    corr_threshold=corr_threshold,
)

selected_features = infer_cp_features(train_df)
reindex_features = meta_features + selected_features

# Align every data split to the features selected on the training data
test_df = test_df.reindex(reindex_features, axis="columns")
train_df = train_df.reindex(reindex_features, axis="columns")
holdout_df = holdout_df.reindex(reindex_features, axis="columns")
other_df = other_df.reindex(reindex_features, axis="columns")


# In[20]:
f"{output_dir}/{batch}_dmso_spherized_profiles_with_input_normalized_by_{suffix}.csv.gz" ) print(f"Now processing {output_file}...") profile_df = pd.concat([pd.read_csv(x) for x in files[batch][suffix] ]).reset_index(drop=True) print(profile_df.shape) # Step 1: Perform feature selection if batch == "2017_12_05_Batch2": profile_df = (profile_df.groupby([ "Metadata_cell_line", "Metadata_time_point" ]).apply( lambda x: feature_select(profiles=x, operation=feature_select_ops, na_cutoff=na_cut, corr_threshold=corr_threshold, blocklist_file=full_blocklist_file))) # Drop features that weren't selected in the grouped splits profile_df = feature_select(profiles=profile_df, operation="drop_na_columns", na_cutoff=na_cut) else: profile_df = feature_select(profiles=profile_df, operation=feature_select_ops, na_cutoff=na_cut, corr_threshold=corr_threshold, blocklist_file=full_blocklist_file) # Step 2: Spherize transform