def test_annotate_cp_clean():
    """Check how ``clean_cellprofiler`` affects the plate/well column names.

    The profiles are given ``Image_Metadata_Well`` and ``Image_Metadata_Plate``
    columns. With ``clean_cellprofiler=False`` both columns are expected in
    the annotated output unchanged; with ``clean_cellprofiler=True`` the
    output is expected to expose ``Metadata_Well`` and ``Metadata_Plate``.
    """
    image_prefixed_df = data_df.rename(
        columns={"Metadata_Well": "Image_Metadata_Well"}
    ).assign(Image_Metadata_Plate="test")

    # Without cleaning, the Image_-prefixed columns must survive annotation.
    untouched = annotate(
        profiles=image_prefixed_df,
        platemap=broad_platemap_df,
        clean_cellprofiler=False,
        join_on=["Metadata_well_position", "Image_Metadata_Well"],
    )
    for column in ("Image_Metadata_Well", "Image_Metadata_Plate"):
        assert column in untouched.columns

    # With cleaning, the prefix-free column names must be present.
    cleaned = annotate(
        profiles=image_prefixed_df,
        platemap=broad_platemap_df,
        clean_cellprofiler=True,
        join_on=["Metadata_well_position", "Image_Metadata_Well"],
    )
    for column in ("Metadata_Well", "Metadata_Plate"):
        assert column in cleaned.columns
def test_annotate_cmap_pertchemical():
    """Check CMAP-formatted annotation in chemical perturbation mode.

    Two scenarios are covered:

    1. The plain broad platemap: the standard CMAP metadata columns must be
       added to the annotated profiles.
    2. A platemap with a DMSO well plus dose and solvent columns: the DMSO
       well must be typed ``"control"`` with a dose of ``0``, the remaining
       wells must be typed ``"trt"`` with their doses unchanged, and the
       dose/solvent/vehicle columns must be present.
    """
    anno_result = annotate(
        profiles=data_df,
        platemap=broad_platemap_df,
        join_on=["Metadata_well_position", "Metadata_Well"],
        format_broad_cmap=True,
        # This test targets chemical mode; genetic mode has its own test.
        perturbation_mode="chemical",
    )

    added_cols = [
        "Metadata_pert_id",
        "Metadata_pert_mfc_id",
        "Metadata_pert_well",
        "Metadata_pert_id_vendor",
        "Metadata_cell_id",
        "Metadata_pert_type",
        "Metadata_broad_sample_type",
    ]
    assert all(x in anno_result.columns for x in added_cols)

    # The first well becomes a DMSO control carrying a deliberately large
    # dose, so the test can verify that it is reset to zero.
    some_doses = [1000, 2, 1, 1, 1, 1]
    chemical_platemap = broad_platemap_df.copy()
    chemical_platemap.loc[0, "Metadata_broad_sample"] = "DMSO"
    chemical_platemap = chemical_platemap.assign(
        Metadata_mmoles_per_liter=some_doses,
        Metadata_mg_per_ml=some_doses,
        Metadata_solvent="DMSO",
    )

    anno_result = annotate(
        profiles=data_df,
        platemap=chemical_platemap,
        join_on=["Metadata_well_position", "Metadata_Well"],
        format_broad_cmap=True,
        perturbation_mode="chemical",
    )

    expected_Metadata_pert_type = ["control", "trt", "trt", "trt", "trt", "trt"]
    assert anno_result.Metadata_pert_type.tolist() == expected_Metadata_pert_type
    assert (
        anno_result.Metadata_broad_sample_type.tolist()
        == expected_Metadata_pert_type
    )

    expected_dose = [0, 2, 1, 1, 1, 1]
    assert anno_result.Metadata_mmoles_per_liter.tolist() == expected_dose
    assert anno_result.Metadata_mg_per_ml.tolist() == expected_dose

    added_cols += [
        "Metadata_mmoles_per_liter",
        "Metadata_mg_per_ml",
        "Metadata_solvent",
        "Metadata_pert_vehicle",
    ]
    assert all(x in anno_result.columns for x in added_cols)
def test_annotate_cmap_externalmetadata():
    """Check that external metadata read from a file is merged in.

    A one-row external table is written to ``output_file`` as CSV and handed
    to ``annotate`` by path. The annotated result must carry the external
    info column (with a ``Metadata_`` prefix) on the first row, and the cell
    id supplied through the platemap must come through unchanged.
    """
    external_table = pd.DataFrame(
        {"test_well_join": ["A01"], "test_info_col": ["DMSO is cool"]}
    ).reset_index(drop=True)
    external_table.to_csv(output_file, index=False, sep=",")

    # Chemical platemap: the first well is a DMSO control, and every well
    # carries dose, solvent and cell-line metadata.
    doses = [1000, 2, 1, 1, 1, 1]
    platemap_with_doses = broad_platemap_df.copy()
    platemap_with_doses.loc[0, "Metadata_broad_sample"] = "DMSO"
    platemap_with_doses = platemap_with_doses.assign(
        Metadata_mmoles_per_liter=doses,
        Metadata_mg_per_ml=doses,
        Metadata_solvent="DMSO",
        Metadata_cell_id="A549",
    )

    annotated = annotate(
        profiles=data_df,
        platemap=platemap_with_doses,
        join_on=["Metadata_well_position", "Metadata_Well"],
        format_broad_cmap=True,
        perturbation_mode="chemical",
        external_metadata=output_file,
        external_join_left="Metadata_Well",
        external_join_right="Metadata_test_well_join",
    )

    assert annotated.loc[0, "Metadata_test_info_col"] == "DMSO is cool"
    assert annotated["Metadata_cell_id"].unique()[0] == "A549"
def test_annotate_cmap_assert():
    """Check that CMAP formatting rejects a platemap that is not a CMAP file.

    ``platemap_df`` is passed together with ``format_broad_cmap=True``;
    ``annotate`` must raise an ``AssertionError`` whose message asks whether
    the input really is a CMAP file.
    """
    with pytest.raises(AssertionError) as nocmap:
        # The call is expected to raise, so its return value is not bound.
        annotate(
            profiles=data_df,
            platemap=platemap_df,
            join_on=["Metadata_well_position", "Metadata_Well"],
            format_broad_cmap=True,
            perturbation_mode="none",
        )

    # Checked after the context manager exits: statements placed after the
    # raising call inside the ``with`` body would never run.
    assert "Are you sure this is a CMAP file?" in str(nocmap.value)
def test_annotate_cmap_pertgenetic():
    """Check CMAP-formatted annotation in genetic perturbation mode.

    The platemap is extended with ``Metadata_pert_name`` taken from
    ``example_genetic_perts``. The first four wells are expected to be typed
    ``"trt"`` and the last two ``"control"``, in both the perturbation-type
    and the broad-sample-type columns, and the perturbation ids must equal
    ``expected_pert_ids``.
    """
    genetic_platemap = broad_platemap_df.assign(
        Metadata_pert_name=example_genetic_perts
    )

    annotated = annotate(
        profiles=data_df,
        platemap=genetic_platemap,
        join_on=["Metadata_well_position", "Metadata_Well"],
        format_broad_cmap=True,
        perturbation_mode="genetic",
    )

    expected_types = ["trt"] * 4 + ["control"] * 2
    observed_pert_types = annotated.Metadata_pert_type.tolist()
    observed_sample_types = annotated.Metadata_broad_sample_type.tolist()

    assert observed_pert_types == expected_types
    assert observed_sample_types == expected_types
    assert annotated.Metadata_pert_id.tolist() == expected_pert_ids
def test_annotate_cmap_pertnone():
    """Check CMAP-formatted annotation with ``perturbation_mode="none"``.

    The standard CMAP metadata columns must be present in the annotated
    output, and the perturbation ids must equal ``expected_pert_ids``.
    """
    annotated = annotate(
        profiles=data_df,
        platemap=broad_platemap_df,
        join_on=["Metadata_well_position", "Metadata_Well"],
        format_broad_cmap=True,
        perturbation_mode="none",
    )

    cmap_columns = (
        "Metadata_pert_id",
        "Metadata_pert_mfc_id",
        "Metadata_pert_well",
        "Metadata_pert_id_vendor",
        "Metadata_cell_id",
        "Metadata_pert_type",
        "Metadata_broad_sample_type",
    )
    for column in cmap_columns:
        assert column in annotated.columns

    assert annotated.Metadata_pert_id.tolist() == expected_pert_ids
def pipeline_annotate(self, batch, plate):
    """Annotate one plate's aggregated profiles with platemap metadata.

    Reads ``<pipeline_output>/<batch>/<plate>/<plate>.csv.gz``, looks up the
    plate's platemap through ``metadata/platemaps/<batch>/barcode_platemap.csv``
    and writes ``<plate>_augmented.csv.gz`` next to the input. When the
    ``annotate.external.perform`` setting is truthy, a tab-separated table
    from ``metadata/external_metadata`` is merged in as well.

    Parameters
    ----------
    batch : str
        Batch name, used to build the output and metadata directories.
    plate : str
        Plate barcode, matched against ``Assay_Plate_Barcode`` (read as
        strings) and used to build the profile file names.
    """
    steps = self.pipeline["annotate"]

    plate_dir = pathlib.PurePath(".", self.pipeline_output, batch, plate)
    profiles_path = pathlib.PurePath(plate_dir, f"{plate}.csv.gz")
    augmented_path = pathlib.PurePath(plate_dir, f"{plate}_augmented.csv.gz")

    # Resolve which platemap file belongs to this plate barcode.
    platemap_root = pathlib.PurePath(".", "metadata", "platemaps", batch)
    barcode_df = pd.read_csv(
        pathlib.PurePath(platemap_root, "barcode_platemap.csv"),
        dtype={"Assay_Plate_Barcode": str},
    )
    matching_rows = barcode_df.query("Assay_Plate_Barcode == @plate")
    platemap_name = matching_rows.Plate_Map_Name.values[0]

    platemap_df = pd.read_csv(
        pathlib.PurePath(platemap_root, "platemap", f"{platemap_name}.txt"),
        sep="\t",
    )
    # Every platemap column gets a "Metadata_" prefix unless it has one.
    platemap_df.columns = [
        name if name.startswith("Metadata_") else f"Metadata_{name}"
        for name in platemap_df.columns
    ]

    well_join = [self.pipeline["platemap_well_column"], steps["well_column"]]

    # Optional external metadata: collected as extra keyword arguments so a
    # single annotate call serves both configurations.
    external_kwargs = {}
    external_cfg = steps["external"]
    if external_cfg["perform"]:
        external_table = pd.read_csv(
            pathlib.PurePath(
                ".",
                "metadata",
                "external_metadata",
                external_cfg["file"],
            ),
            sep="\t",
        )
        merge_column = external_cfg["merge_column"]
        if not merge_column.startswith("Metadata"):
            merge_column = "Metadata_" + merge_column
        # The same join-column list is used on both sides of the merge.
        join_columns = [merge_column]
        external_kwargs = {
            "external_metadata": external_table,
            "external_join_left": join_columns,
            "external_join_right": join_columns,
        }

    annotate(
        profiles=profiles_path,
        platemap=platemap_df,
        join_on=well_join,
        output_file=augmented_path,
        compression_options=self.pipeline_options["compression"],
        float_format=self.pipeline_options["float_format"],
        clean_cellprofiler=True,
        **external_kwargs,
    )
def process_profile(sql_file, batch, plate, pipeline):
    """Process the morphology profiles of one plate according to a pipeline.

    Depending on the ``perform`` flags in ``pipeline``, this aggregates the
    single-cell data in ``sql_file`` (optionally writing cell counts),
    annotates the aggregated profiles with the plate's platemap, normalizes
    them and applies feature selection. Each step reads the file written by
    the previous one under ``<output_dir>/<batch>/<plate>/``.

    Parameters
    ----------
    sql_file : str
        Location of the plate's SQLite backend; must contain both ``batch``
        and ``plate`` as substrings.
    batch : str
        Batch name.
    plate : str
        Plate barcode, matched against ``Assay_Plate_Barcode``.
    pipeline : dict
        Pipeline configuration. Keys read here: ``output_dir``, ``options``,
        ``workspace_dir``, ``platemap_well_column``, ``aggregate``,
        ``count``, ``annotate``, ``normalize`` and ``feature_select``.

    Raises
    ------
    AssertionError
        If ``batch`` or ``plate`` does not occur in ``sql_file``.
    """
    assert batch in sql_file, f"batch {batch} not recognized in sql file {sql_file}"
    assert plate in sql_file, f"plate {plate} not recognized in sql file {sql_file}"

    # Set output directory information
    pipeline_output = pipeline["output_dir"]
    output_dir = os.path.join(pipeline_output, batch, plate)
    os.makedirs(output_dir, exist_ok=True)

    # Set output file information
    aggregate_out_file = os.path.join(output_dir, f"{plate}.csv.gz")
    annotate_out_file = os.path.join(output_dir, f"{plate}_augmented.csv.gz")
    normalize_out_file = os.path.join(output_dir, f"{plate}_normalized.csv.gz")
    feature_out_file = os.path.join(
        output_dir, f"{plate}_normalized_feature_selected.csv.gz"
    )

    # Load pipeline options
    compression = process_pipeline(pipeline["options"], option="compression")
    samples = process_pipeline(pipeline["options"], option="samples")

    # Load and setup platemap info
    workspace_dir = pipeline["workspace_dir"]
    metadata_dir = os.path.join(workspace_dir, "metadata", batch)
    # The barcode map is taken to be the alphabetically first entry of the
    # batch metadata directory.
    barcode_plate_map_file = os.path.join(
        metadata_dir, sorted(os.listdir(metadata_dir))[0]
    )
    # Barcodes are read as strings: ``plate`` is a string (it is searched for
    # in ``sql_file``), and numeric-looking barcodes parsed as integers would
    # never compare equal to it in the query below.
    barcode_plate_map_df = pd.read_csv(
        barcode_plate_map_file, dtype={"Assay_Plate_Barcode": str}
    )
    plate_map_name = barcode_plate_map_df.query(
        "Assay_Plate_Barcode == @plate"
    ).Plate_Map_Name.values[0]
    plate_map_file = os.path.join(metadata_dir, "platemap", f"{plate_map_name}.txt")
    plate_map_df = pd.read_csv(plate_map_file, sep="\t")
    plate_map_df.columns = [
        x if x.startswith("Metadata_") else f"Metadata_{x}"
        for x in plate_map_df.columns
    ]
    platemap_well_column = pipeline["platemap_well_column"]

    # Step 1: Aggregate
    aggregate_steps = pipeline["aggregate"]
    if aggregate_steps["perform"]:
        aggregate_features = aggregate_steps["features"]
        aggregate_operation = aggregate_steps["method"]
        aggregate_plate_column = aggregate_steps["plate_column"]
        aggregate_well_column = aggregate_steps["well_column"]

        strata = [aggregate_plate_column, aggregate_well_column]
        if "site_column" in aggregate_steps:
            aggregate_site_column = aggregate_steps["site_column"]
            strata += [aggregate_site_column]

        ap = AggregateProfiles(
            sql_file,
            strata=strata,
            features=aggregate_features,
            operation=aggregate_operation,
        )
        ap.aggregate_profiles(
            output_file=aggregate_out_file, compression=compression
        )

        # Cell counting reuses the aggregator and the well column set up
        # above, so it runs only when aggregation is performed.
        if pipeline["count"]["perform"]:
            count_dir = pipeline["count"]["output_dir"]
            os.makedirs(count_dir, exist_ok=True)

            cell_count_file = os.path.join(
                count_dir, f"{batch}_{plate}_cell_count.tsv"
            )
            cell_count_df = ap.count_cells()
            cell_count_df = cell_count_df.merge(
                plate_map_df,
                left_on=aggregate_well_column,
                right_on=platemap_well_column,
            ).drop(platemap_well_column, axis="columns")
            cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

    # Annotate Profiles
    annotate_steps = pipeline["annotate"]
    if annotate_steps["perform"]:
        annotate_well_column = annotate_steps["well_column"]
        annotate(
            profiles=aggregate_out_file,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file=annotate_out_file,
            compression=compression,
        )

    # Normalize Profiles
    normalize_steps = pipeline["normalize"]
    if normalize_steps["perform"]:
        norm_features = normalize_steps["features"]
        norm_method = normalize_steps["method"]
        normalize(
            profiles=annotate_out_file,
            features=norm_features,
            samples=samples,
            method=norm_method,
            output_file=normalize_out_file,
            compression=compression,
        )

    # Apply feature selection
    feature_select_steps = pipeline["feature_select"]
    if feature_select_steps["perform"]:
        feature_select_operations = feature_select_steps["operations"]
        feature_select_features = feature_select_steps["features"]
        feature_select(
            profiles=normalize_out_file,
            features=feature_select_features,
            samples="none",
            operation=feature_select_operations,
            output_file=feature_out_file,
            compression=compression,
            corr_threshold=0.9,
            corr_method="pearson",
        )
def process_profile(sql_file, batch, plate, pipeline):
    """Process bulk and single-cell morphology profiles of one plate.

    Depending on the ``perform`` flags in ``pipeline``, this aggregates the
    single-cell data in ``sql_file``, writes cell counts, annotates the
    aggregated profiles with the plate's platemap, normalizes them and
    applies feature selection. When the ``single_cell`` step is enabled, the
    cells, cytoplasm and nuclei tables are additionally merged into one
    single-cell table that is annotated, optionally normalized and
    feature-selected, and written to ``<sc_output_dir>/<batch>/<plate>/``.

    Parameters
    ----------
    sql_file : str
        Location of the plate's SQLite backend; must contain both ``batch``
        and ``plate`` as substrings.
    batch : str
        Batch name.
    plate : str
        Plate barcode, matched against ``Assay_Plate_Barcode``.
    pipeline : dict
        Pipeline configuration. Keys read here: ``output_dir``, ``options``,
        ``workspace_dir``, ``platemap_well_column``, ``aggregate``,
        ``count``, ``annotate``, ``normalize``, ``feature_select``,
        ``single_cell`` and, for the single-cell step, ``sc_output_dir``.

    Raises
    ------
    AssertionError
        If ``batch`` or ``plate`` does not occur in ``sql_file``.
    """
    assert batch in sql_file, f"batch {batch} not recognized in sql file {sql_file}"
    assert plate in sql_file, f"plate {plate} not recognized in sql file {sql_file}"

    # Set output directory information
    pipeline_output = pipeline["output_dir"]
    output_dir = os.path.join(pipeline_output, batch, plate)
    os.makedirs(output_dir, exist_ok=True)

    # Set output file information
    aggregate_out_file = os.path.join(output_dir, f"{plate}.csv.gz")
    annotate_out_file = os.path.join(output_dir, f"{plate}_augmented.csv.gz")
    normalize_out_file = os.path.join(output_dir, f"{plate}_normalized.csv.gz")
    feature_out_file = os.path.join(
        output_dir, f"{plate}_normalized_feature_selected.csv.gz"
    )

    # Load pipeline options
    compression = process_pipeline(pipeline["options"], option="compression")
    sc_float_format = process_pipeline(pipeline["options"], option="sc_float_format")
    samples = process_pipeline(pipeline["options"], option="samples")

    # Load and setup platemap info
    workspace_dir = pipeline["workspace_dir"]
    metadata_dir = os.path.join(workspace_dir, "metadata", batch)
    # The barcode map is taken to be the alphabetically first entry of the
    # batch metadata directory.
    barcode_plate_map_file = os.path.join(
        metadata_dir, sorted(os.listdir(metadata_dir))[0]
    )
    # Barcodes are read as strings: ``plate`` is a string (it is searched for
    # in ``sql_file``), and numeric-looking barcodes parsed as integers would
    # never compare equal to it in the query below.
    barcode_plate_map_df = pd.read_csv(
        barcode_plate_map_file, dtype={"Assay_Plate_Barcode": str}
    )
    plate_map_name = barcode_plate_map_df.query(
        "Assay_Plate_Barcode == @plate"
    ).Plate_Map_Name.values[0]
    plate_map_file = os.path.join(metadata_dir, "platemap", f"{plate_map_name}.txt")
    plate_map_df = pd.read_csv(plate_map_file, sep="\t")
    plate_map_df.columns = [
        x if x.startswith("Metadata_") else f"Metadata_{x}"
        for x in plate_map_df.columns
    ]
    platemap_well_column = pipeline["platemap_well_column"]

    # Process Bulk profiles
    # Step 1: Aggregate
    aggregate_steps = pipeline["aggregate"]
    aggregate_features = aggregate_steps["features"]
    aggregate_operation = aggregate_steps["method"]
    aggregate_plate_column = aggregate_steps["plate_column"]
    aggregate_well_column = aggregate_steps["well_column"]

    strata = [aggregate_plate_column, aggregate_well_column]
    if "site_column" in aggregate_steps:
        aggregate_site_column = aggregate_steps["site_column"]
        strata += [aggregate_site_column]

    def _new_aggregator():
        # One place to build the aggregator; it is needed by the aggregate,
        # count and single-cell steps, any of which may run without the others.
        return AggregateProfiles(
            sql_file,
            strata=strata,
            features=aggregate_features,
            operation=aggregate_operation,
        )

    if aggregate_steps["perform"]:
        ap = _new_aggregator()
        ap.aggregate_profiles(
            output_file=aggregate_out_file, compression=compression
        )

    if pipeline["count"]["perform"]:
        if not aggregate_steps["perform"]:
            ap = _new_aggregator()

        count_dir = pipeline["count"]["output_dir"]
        os.makedirs(count_dir, exist_ok=True)

        cell_count_file = os.path.join(count_dir, f"{batch}_{plate}_cell_count.tsv")
        cell_count_df = ap.count_cells()
        cell_count_df = cell_count_df.merge(
            plate_map_df,
            left_on=aggregate_well_column,
            right_on=platemap_well_column,
        ).drop(platemap_well_column, axis="columns")
        cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

    # Annotate Profiles
    annotate_steps = pipeline["annotate"]
    annotate_well_column = annotate_steps["well_column"]
    if annotate_steps["perform"]:
        annotate(
            profiles=aggregate_out_file,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file=annotate_out_file,
            compression=compression,
        )

    # Normalize Profiles
    normalize_steps = pipeline["normalize"]
    norm_features = normalize_steps["features"]
    norm_method = normalize_steps["method"]
    if normalize_steps["perform"]:
        normalize(
            profiles=annotate_out_file,
            features=norm_features,
            samples=samples,
            method=norm_method,
            output_file=normalize_out_file,
            compression=compression,
        )

    # Apply feature selection
    feature_select_steps = pipeline["feature_select"]
    feature_select_operations = feature_select_steps["operations"]
    feature_select_features = feature_select_steps["features"]
    if feature_select_steps["perform"]:
        feature_select(
            profiles=normalize_out_file,
            features=feature_select_features,
            samples=samples,
            operation=feature_select_operations,
            output_file=feature_out_file,
            compression=compression,
            corr_threshold=0.9,
            corr_method="pearson",
        )

    # Process single-cell profiles
    sc_steps = pipeline["single_cell"]
    if sc_steps["perform"]:
        if not aggregate_steps["perform"]:
            ap = _new_aggregator()

        # Load the three compartment tables through the aggregator's connection
        cell_df = pd.read_sql(sql="select * from cells", con=ap.conn)
        cytoplasm_df = pd.read_sql(sql="select * from cytoplasm", con=ap.conn)
        nuclei_df = pd.read_sql(sql="select * from nuclei", con=ap.conn)

        # Merge single cells together: cells join cytoplasm through the
        # cytoplasm's parent-cell number, and the result joins nuclei through
        # the cytoplasm's parent-nucleus number.
        sc_merged_df = (
            cell_df.merge(
                cytoplasm_df.drop("ObjectNumber", axis="columns"),
                left_on=["TableNumber", "ImageNumber", "ObjectNumber"],
                right_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Cells"],
                how="inner",
            )
            .drop("ObjectNumber", axis="columns")
            .merge(
                nuclei_df,
                left_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Nuclei"],
                right_on=["TableNumber", "ImageNumber", "ObjectNumber"],
                how="inner",
            )
        )

        # Merge image data info
        sc_merged_df = ap.image_df.merge(
            sc_merged_df, how="right", on=ap.merge_cols
        )

        # Make sure column names are correctly prefixed: anything that is not
        # already metadata or a compartment feature becomes metadata.
        known_prefixes = ("Metadata", "Cells", "Cytoplasm", "Nuclei")
        sc_merged_df.columns = [
            col if col.startswith(known_prefixes) else f"Metadata_{col}"
            for col in sc_merged_df.columns
        ]

        sc_merged_df = annotate(
            profiles=sc_merged_df,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file="none",
        )

        if sc_steps["normalize"]:
            sc_merged_df = normalize(
                profiles=sc_merged_df,
                features=norm_features,
                samples=samples,
                method=norm_method,
                output_file="none",
            )

        if sc_steps["feature_select"]:
            sc_merged_df = feature_select(
                profiles=sc_merged_df,
                features=feature_select_features,
                samples=samples,
                operation=feature_select_operations,
                output_file="none",
                corr_threshold=0.9,
                corr_method="pearson",
            )

        sc_pipeline_output = pipeline["sc_output_dir"]
        sc_output_dir = os.path.join(sc_pipeline_output, batch, plate)
        os.makedirs(sc_output_dir, exist_ok=True)

        # Set output file information
        sc_out_file = os.path.join(sc_output_dir, f"{plate}_single_cell.csv.gz")
        output(
            df=sc_merged_df,
            output_filename=sc_out_file,
            compression="gzip",
            float_format=sc_float_format,
        )
# Count cells count_file = pathlib.PurePath(cell_count_dir, f"{plate_name}_cell_count.csv") cell_count_df = sc.count_cells() cell_count_df.to_csv(count_file, sep=",", index=False) del sc # Annotate profiles - Level 3 Data anno_file = pathlib.PurePath(output_dir, f"{plate_name}_augmented.csv.gz") anno_df = annotate( profiles=out_file, platemap=platemap_file, join_on=["Metadata_well_position", well_col], format_broad_cmap=True, external_metadata=moa_df, external_join_left=["Metadata_broad_sample"], external_join_right=["Metadata_broad_sample"], cmap_args={ "cell_id": cell_id, "perturbation_mode": "chemical" }, ) # Rename columns anno_df = anno_df.rename( { "Image_Metadata_Plate": "Metadata_Plate", "Image_Metadata_Well": "Metadata_Well" }, axis="columns", )
def get_profiles(plate, backend_dir, metadata_dir, barcode_platemap_df):
    """Apply all profiling steps for a given plate.

    The steps are, in order: count cells, aggregate single cells into well
    profiles, annotate with the platemap, normalize, select features, and
    audit replicate reproducibility at the guide and gene level.

    Parameters
    ----------
    plate : str
        Plate barcode; names the SQLite backend and every output file.
    backend_dir : str
        Directory holding ``<plate>.sqlite``.
    metadata_dir : str
        Directory holding ``platemap/<platemap name>.csv``.
    barcode_platemap_df : pandas.DataFrame
        Table with ``Assay_Plate_Barcode`` and ``Plate_Map_Name`` columns,
        used to find the platemap that belongs to ``plate``.

    Output
    ------
    Nothing is returned. A series of processed files is written to disk:
    the cell counts and audits under ``results/`` and the profiles under
    ``data/profiles/<plate>/``.
    """
    print("Processing {}.....".format(plate))
    sqlite_file = "sqlite:///{}/{}.sqlite".format(backend_dir, plate)

    # Load specific platemap
    platemap = barcode_platemap_df.query(
        "Assay_Plate_Barcode == @plate"
    ).Plate_Map_Name.values[0]
    platemap_file = os.path.join(
        metadata_dir, "platemap", "{}.csv".format(platemap)
    )
    platemap_df = pd.read_csv(platemap_file)

    # Prepare sql file for processing
    ap = AggregateProfiles(
        sqlite_file, strata=["Image_Metadata_Plate", "Image_Metadata_Well"]
    )

    # Count cells and output
    # The results directory is created on demand, like the profile output
    # directory below, so a fresh checkout does not fail on the first write.
    results_dir = "results"
    os.makedirs(results_dir, exist_ok=True)
    cell_count_file = os.path.join(
        results_dir, "{}_cell_count.tsv".format(plate)
    )
    cell_count_df = ap.count_cells()
    cell_count_df = cell_count_df.merge(
        platemap_df, left_on="Image_Metadata_Well", right_on="well_position"
    ).drop(["WellRow", "WellCol", "well_position"], axis="columns")
    cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

    # Begin processing profiles
    output_dir = os.path.join("data", "profiles", plate)
    os.makedirs(output_dir, exist_ok=True)

    # Aggregate single cells into well profiles
    out_file = os.path.join(output_dir, "{}.csv.gz".format(plate))
    ap.aggregate_profiles(output_file=out_file, compression="gzip")

    # Annotate Profiles
    anno_file = os.path.join(output_dir, "{}_augmented.csv.gz".format(plate))
    annotate(
        profiles=out_file,
        platemap=platemap_df,
        join_on=["Metadata_well_position", "Image_Metadata_Well"],
        output_file=anno_file,
        compression="gzip",
    )

    # Define metadata features
    meta_features = [
        "Image_Metadata_Plate",
        "Image_Metadata_Well",
        "Metadata_WellRow",
        "Metadata_WellCol",
        "Metadata_gene_name",
        "Metadata_pert_name",
        "Metadata_broad_sample",
        "Metadata_cell_line",
    ]

    # Normalize Profiles, using the wells whose perturbation name is 'EMPTY'
    # as the reference samples.
    norm_file = os.path.join(output_dir, "{}_normalized.csv.gz".format(plate))
    normalize(
        profiles=anno_file,
        features="infer",
        meta_features=meta_features,
        samples="Metadata_pert_name == 'EMPTY'",
        method="mad_robustize",
        output_file=norm_file,
        compression="gzip",
    )

    # Perform feature selection with the four operations listed below.
    feat_file = os.path.join(
        output_dir, "{}_normalized_feature_select.csv.gz".format(plate)
    )
    feature_select(
        profiles=norm_file,
        features="infer",
        samples="none",
        operation=[
            "drop_na_columns",
            "blacklist",
            "variance_threshold",
            "drop_outliers",
        ],
        output_file=feat_file,
        compression="gzip",
    )

    # Perform audits on the feature-selected profiles, without the
    # plate and well identifier columns.
    profile_df = pd.read_csv(feat_file).drop(
        ["Image_Metadata_Well", "Image_Metadata_Plate"], axis="columns"
    )

    # Audit guide replicability
    audit_file = os.path.join(results_dir, "{}_audit_guide.csv".format(plate))
    audit(
        profiles=profile_df,
        audit_groups=[
            "Metadata_pert_name",
            "Metadata_gene_name",
            "Metadata_cell_line",
        ],
        iterations=10,
        output_file=audit_file,
    )

    # Audit gene replicability
    audit_file = os.path.join(results_dir, "{}_audit_gene.csv".format(plate))
    audit(
        profiles=profile_df,
        audit_groups=["Metadata_gene_name", "Metadata_cell_line"],
        iterations=10,
        output_file=audit_file,
    )
# Count cells count_file = pathlib.PurePath(cell_count_dir, f"{plate_name}_cell_count.tsv") cell_count_df = ap.count_cells() cell_count_df.to_csv(count_file, sep="\t") del ap # Annotate profiles - Level 3 Data anno_file = pathlib.PurePath(output_dir, f"{plate_name}_augmented.csv.gz") anno_df = annotate( profiles=out_file, platemap=platemap_file, join_on=["Metadata_well_position", "Image_Metadata_Well"], cell_id=cell_id, format_broad_cmap=True, perturbation_mode="chemical", external_metadata=moa_df, external_join_left=["Metadata_broad_sample"], external_join_right=["Metadata_broad_sample"], ) # Rename columns anno_df = anno_df.rename( { "Image_Metadata_Plate": "Metadata_Plate", "Image_Metadata_Well": "Metadata_Well" }, axis="columns", )