def process_profile(sql_file, batch, plate, pipeline):
    """Given batch details and a pipeline, process morphology profiles.

    Depending on the per-step "perform" flags in ``pipeline``, this runs the
    bulk chain aggregate -> annotate -> normalize -> feature-select, writing
    each stage to ``<output_dir>/<batch>/<plate>/``; optionally writes a
    per-well cell-count table; and optionally builds an annotated (and
    optionally normalized / feature-selected) single-cell table.

    Parameters
    ----------
    sql_file : str
        Connection string for the plate's single-cell SQLite backend.
        Must contain both ``batch`` and ``plate`` as substrings (asserted).
    batch : str
        Batch identifier, used for output paths and metadata lookup.
    plate : str
        Plate barcode, used for output filenames and platemap lookup.
    pipeline : dict
        Configuration with sub-dicts "aggregate", "annotate", "normalize",
        "feature_select", "count", "single_cell", plus "options",
        "output_dir", "sc_output_dir", "workspace_dir", and
        "platemap_well_column".

    Returns
    -------
    None
        All results are written to disk.
    """
    # NOTE(review): assert is stripped under `python -O`; these are sanity
    # checks that the sql file path matches the requested batch/plate.
    assert batch in sql_file, "batch {} not recognized in sql file {}".format(
        batch, sql_file)
    assert plate in sql_file, "plate {} not recognized in sql file {}".format(
        plate, sql_file)

    # Set output directory information
    pipeline_output = pipeline["output_dir"]
    output_dir = os.path.join(pipeline_output, batch, plate)
    os.makedirs(output_dir, exist_ok=True)

    # Set output file information (one file per processing stage)
    aggregate_out_file = os.path.join(output_dir, "{}.csv.gz".format(plate))
    annotate_out_file = os.path.join(output_dir,
                                     "{}_augmented.csv.gz".format(plate))
    normalize_out_file = os.path.join(output_dir,
                                      "{}_normalized.csv.gz".format(plate))
    feature_out_file = os.path.join(
        output_dir, "{}_normalized_feature_selected.csv.gz".format(plate))

    # Load pipeline options
    compression = process_pipeline(pipeline["options"], option="compression")
    sc_float_format = process_pipeline(pipeline["options"],
                                       option="sc_float_format")
    samples = process_pipeline(pipeline["options"], option="samples")

    # Load and setup platemap info
    workspace_dir = pipeline["workspace_dir"]
    # NOTE(review): batch_dir is computed but never used in this function.
    batch_dir = os.path.join(workspace_dir, "backend", batch)
    metadata_dir = os.path.join(workspace_dir, "metadata", batch)
    # The barcode platemap is assumed to be the alphabetically first file in
    # the batch metadata directory -- TODO confirm this layout convention.
    barcode_plate_map_file = os.path.join(metadata_dir,
                                          sorted(os.listdir(metadata_dir))[0])
    barcode_plate_map_df = pd.read_csv(barcode_plate_map_file)
    plate_map_name = barcode_plate_map_df.query(
        "Assay_Plate_Barcode == @plate").Plate_Map_Name.values[0]
    plate_map_file = os.path.join(metadata_dir, "platemap",
                                  "{}.txt".format(plate_map_name))
    plate_map_df = pd.read_csv(plate_map_file, sep="\t")
    # Ensure every platemap column carries the "Metadata_" prefix
    plate_map_df.columns = [
        "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
        for x in plate_map_df.columns
    ]
    platemap_well_column = pipeline["platemap_well_column"]

    # Process Bulk profiles
    # Step 1: Aggregate
    aggregate_steps = pipeline["aggregate"]
    aggregate_features = aggregate_steps["features"]
    aggregate_operation = aggregate_steps["method"]
    aggregate_plate_column = aggregate_steps["plate_column"]
    aggregate_well_column = aggregate_steps["well_column"]

    # Aggregation strata: plate + well, optionally + site
    strata = [aggregate_plate_column, aggregate_well_column]

    if "site_column" in aggregate_steps:
        aggregate_site_column = aggregate_steps["site_column"]
        strata += [aggregate_site_column]

    if aggregate_steps["perform"]:
        ap = AggregateProfiles(
            sql_file,
            strata=strata,
            features=aggregate_features,
            operation=aggregate_operation,
        )
        ap.aggregate_profiles(output_file=aggregate_out_file,
                              compression=compression)

    if pipeline["count"]["perform"]:
        # If aggregation was skipped, build the AggregateProfiles object
        # here anyway so cells can still be counted.
        if not aggregate_steps["perform"]:
            ap = AggregateProfiles(
                sql_file,
                strata=strata,
                features=aggregate_features,
                operation=aggregate_operation,
            )
        count_dir = pipeline["count"]["output_dir"]
        os.makedirs(count_dir, exist_ok=True)
        cell_count_file = os.path.join(
            count_dir, "{}_{}_cell_count.tsv".format(batch, plate))
        cell_count_df = ap.count_cells()
        # Attach platemap metadata to the counts, then drop the redundant
        # platemap join column.
        cell_count_df = cell_count_df.merge(
            plate_map_df,
            left_on=aggregate_well_column,
            right_on=platemap_well_column,
        ).drop(platemap_well_column, axis="columns")
        cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

    # Annotate Profiles
    annotate_steps = pipeline["annotate"]
    # well_column is read unconditionally because the single-cell section
    # below also needs it even when bulk annotation is skipped.
    annotate_well_column = annotate_steps["well_column"]
    if annotate_steps["perform"]:
        annotate(
            profiles=aggregate_out_file,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file=annotate_out_file,
            compression=compression,
        )

    # Normalize Profiles
    normalize_steps = pipeline["normalize"]
    norm_features = normalize_steps["features"]
    norm_method = normalize_steps["method"]
    if normalize_steps["perform"]:
        normalize(
            profiles=annotate_out_file,
            features=norm_features,
            samples=samples,
            method=norm_method,
            output_file=normalize_out_file,
            compression=compression,
        )

    # Apply feature selection
    feature_select_steps = pipeline["feature_select"]
    feature_select_operations = feature_select_steps["operations"]
    feature_select_features = feature_select_steps["features"]
    if feature_select_steps["perform"]:
        feature_select(
            profiles=normalize_out_file,
            features=feature_select_features,
            samples=samples,
            operation=feature_select_operations,
            output_file=feature_out_file,
            compression=compression,
            corr_threshold=0.9,
            corr_method="pearson",
        )

    # Optionally build a merged single-cell table (Level 2 style data)
    sc_steps = pipeline["single_cell"]
    if sc_steps["perform"]:
        # Need an AggregateProfiles object for its sqlite connection and
        # image metadata, even if aggregation itself was skipped.
        if not aggregate_steps["perform"]:
            ap = AggregateProfiles(
                sql_file,
                strata=strata,
                features=aggregate_features,
                operation=aggregate_operation,
            )
        # Load cells
        query = "select * from cells"
        cell_df = pd.read_sql(sql=query, con=ap.conn)

        # Load cytoplasm
        query = "select * from cytoplasm"
        cytoplasm_df = pd.read_sql(sql=query, con=ap.conn)

        # Load nuclei
        query = "select * from nuclei"
        nuclei_df = pd.read_sql(sql=query, con=ap.conn)

        # Merge single cells together: cells join cytoplasm on the
        # cytoplasm's parent-cell pointer, then join nuclei on the
        # cytoplasm's parent-nucleus pointer.
        sc_merged_df = (cell_df.merge(
            cytoplasm_df.drop("ObjectNumber", axis="columns"),
            left_on=["TableNumber", "ImageNumber", "ObjectNumber"],
            right_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Cells"],
            how="inner",
        ).drop("ObjectNumber", axis="columns").merge(
            nuclei_df,
            left_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Nuclei"],
            right_on=["TableNumber", "ImageNumber", "ObjectNumber"],
            how="inner",
        ))

        # Merge image data info (right merge keeps every single cell)
        sc_merged_df = ap.image_df.merge(sc_merged_df,
                                         how="right",
                                         on=ap.merge_cols)

        # Make sure column names are correctly prefixed; anything without a
        # compartment prefix is treated as metadata.
        prefix = ["Metadata", "Cells", "Cytoplasm", "Nuclei"]
        cols = []
        for col in sc_merged_df.columns:
            if any([col.startswith(x) for x in prefix]):
                cols.append(col)
            else:
                cols.append(f"Metadata_{col}")
        sc_merged_df.columns = cols

        # output_file="none" makes annotate/normalize/feature_select return
        # the DataFrame instead of writing it -- presumably the pycytominer
        # sentinel convention; verify against the library version in use.
        sc_merged_df = annotate(
            profiles=sc_merged_df,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file="none",
        )

        if sc_steps["normalize"]:
            sc_merged_df = normalize(
                profiles=sc_merged_df,
                features=norm_features,
                samples=samples,
                method=norm_method,
                output_file="none",
            )

        if sc_steps["feature_select"]:
            sc_merged_df = feature_select(
                profiles=sc_merged_df,
                features=feature_select_features,
                samples=samples,
                operation=feature_select_operations,
                output_file="none",
                corr_threshold=0.9,
                corr_method="pearson",
            )

        sc_pipeline_output = pipeline["sc_output_dir"]
        sc_output_dir = os.path.join(sc_pipeline_output, batch, plate)
        os.makedirs(sc_output_dir, exist_ok=True)

        # Set output file information
        sc_out_file = os.path.join(sc_output_dir,
                                   "{}_single_cell.csv.gz".format(plate))
        output(
            df=sc_merged_df,
            output_filename=sc_out_file,
            compression="gzip",
            float_format=sc_float_format,
        )
def process_profile(sql_file, batch, plate, pipeline):
    """Given batch details and a pipeline, process morphology profiles.

    Bulk-profile variant: runs aggregate (+ optional cell counting) ->
    annotate -> normalize -> feature-select according to the per-step
    "perform" flags in ``pipeline``, writing each stage under
    ``<output_dir>/<batch>/<plate>/``.

    Parameters
    ----------
    sql_file : str
        Connection string for the plate's single-cell SQLite backend.
        Must contain both ``batch`` and ``plate`` as substrings (asserted).
    batch : str
        Batch identifier, used for output paths and metadata lookup.
    plate : str
        Plate barcode, used for output filenames and platemap lookup.
    pipeline : dict
        Configuration with sub-dicts "aggregate", "annotate", "normalize",
        "feature_select", "count", plus "options", "output_dir",
        "workspace_dir", and "platemap_well_column".

    Returns
    -------
    None
        All results are written to disk.
    """
    assert batch in sql_file, "batch {} not recognized in sql file {}".format(
        batch, sql_file)
    assert plate in sql_file, "plate {} not recognized in sql file {}".format(
        plate, sql_file)

    # Set output directory information
    pipeline_output = pipeline["output_dir"]
    output_dir = os.path.join(pipeline_output, batch, plate)
    os.makedirs(output_dir, exist_ok=True)

    # Set output file information (one file per processing stage)
    aggregate_out_file = os.path.join(output_dir, "{}.csv.gz".format(plate))
    annotate_out_file = os.path.join(output_dir,
                                     "{}_augmented.csv.gz".format(plate))
    normalize_out_file = os.path.join(output_dir,
                                      "{}_normalized.csv.gz".format(plate))
    feature_out_file = os.path.join(
        output_dir, "{}_normalized_feature_selected.csv.gz".format(plate))

    # Load pipeline options
    compression = process_pipeline(pipeline["options"], option="compression")
    samples = process_pipeline(pipeline["options"], option="samples")

    # Load and setup platemap info
    workspace_dir = pipeline["workspace_dir"]
    # NOTE(review): batch_dir is computed but never used in this function.
    batch_dir = os.path.join(workspace_dir, "backend", batch)
    metadata_dir = os.path.join(workspace_dir, "metadata", batch)
    # The barcode platemap is assumed to be the alphabetically first file in
    # the batch metadata directory -- TODO confirm this layout convention.
    barcode_plate_map_file = os.path.join(metadata_dir,
                                          sorted(os.listdir(metadata_dir))[0])
    barcode_plate_map_df = pd.read_csv(barcode_plate_map_file)
    plate_map_name = barcode_plate_map_df.query(
        "Assay_Plate_Barcode == @plate").Plate_Map_Name.values[0]
    plate_map_file = os.path.join(metadata_dir, "platemap",
                                  "{}.txt".format(plate_map_name))
    plate_map_df = pd.read_csv(plate_map_file, sep="\t")
    # Ensure every platemap column carries the "Metadata_" prefix
    plate_map_df.columns = [
        "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
        for x in plate_map_df.columns
    ]
    platemap_well_column = pipeline["platemap_well_column"]

    # Step 1: Aggregate
    aggregate_steps = pipeline["aggregate"]
    if aggregate_steps["perform"]:
        aggregate_features = aggregate_steps["features"]
        aggregate_operation = aggregate_steps["method"]
        aggregate_plate_column = aggregate_steps["plate_column"]
        aggregate_well_column = aggregate_steps["well_column"]
        # Aggregation strata: plate + well, optionally + site
        strata = [aggregate_plate_column, aggregate_well_column]
        if "site_column" in aggregate_steps:
            aggregate_site_column = aggregate_steps["site_column"]
            strata += [aggregate_site_column]
        ap = AggregateProfiles(
            sql_file,
            strata=strata,
            features=aggregate_features,
            operation=aggregate_operation,
        )
        ap.aggregate_profiles(output_file=aggregate_out_file,
                              compression=compression)

        # Cell counting reuses `ap` and `aggregate_well_column`, so it only
        # happens when the aggregate step is performed.
        if pipeline["count"]["perform"]:
            count_dir = pipeline["count"]["output_dir"]
            os.makedirs(count_dir, exist_ok=True)
            cell_count_file = os.path.join(
                count_dir, "{}_{}_cell_count.tsv".format(batch, plate))
            cell_count_df = ap.count_cells()
            # Attach platemap metadata to the counts, then drop the
            # redundant platemap join column.
            cell_count_df = cell_count_df.merge(
                plate_map_df,
                left_on=aggregate_well_column,
                right_on=platemap_well_column,
            ).drop(platemap_well_column, axis="columns")
            cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

    # Annotate Profiles
    annotate_steps = pipeline["annotate"]
    if annotate_steps["perform"]:
        annotate_well_column = annotate_steps["well_column"]
        annotate(
            profiles=aggregate_out_file,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file=annotate_out_file,
            compression=compression,
        )

    # Normalize Profiles
    normalize_steps = pipeline["normalize"]
    if normalize_steps["perform"]:
        norm_features = normalize_steps["features"]
        norm_method = normalize_steps["method"]
        normalize(
            profiles=annotate_out_file,
            features=norm_features,
            samples=samples,
            method=norm_method,
            output_file=normalize_out_file,
            compression=compression,
        )

    # Apply feature selection
    feature_select_steps = pipeline["feature_select"]
    if feature_select_steps["perform"]:
        feature_select_operations = feature_select_steps["operations"]
        feature_select_features = feature_select_steps["features"]
        # NOTE(review): samples="none" here (all rows), unlike the
        # normalize step above which uses the configured `samples` option.
        feature_select(
            profiles=normalize_out_file,
            features=feature_select_features,
            samples="none",
            operation=feature_select_operations,
            output_file=feature_out_file,
            compression=compression,
            corr_threshold=0.9,
            corr_method="pearson",
        )
def get_profiles(plate, backend_dir, metadata_dir, barcode_platemap_df):
    """Apply all profiling steps for a given plate.

    Pipeline: count cells -> aggregate -> annotate -> normalize ->
    feature-select -> audit (guide- and gene-level replicability).

    Parameters
    ----------
    plate : str
        Plate barcode; selects the sqlite backend and the platemap.
    backend_dir : str
        Directory containing ``<plate>.sqlite`` single-cell backends.
    metadata_dir : str
        Directory containing the ``platemap/`` subdirectory of CSVs.
    barcode_platemap_df : pandas.DataFrame
        Mapping with columns ``Assay_Plate_Barcode`` and ``Plate_Map_Name``.

    Output: Will write a series of processed files to disk (profiles under
    ``data/profiles/<plate>/``, counts and audits under ``results/``).
    """
    print("Processing {}.....".format(plate))
    sqlite_file = "sqlite:///{}/{}.sqlite".format(backend_dir, plate)

    # Load specific platemap
    platemap = barcode_platemap_df.query(
        "Assay_Plate_Barcode == @plate").Plate_Map_Name.values[0]
    platemap_file = os.path.join(metadata_dir, "platemap",
                                 "{}.csv".format(platemap))
    platemap_df = pd.read_csv(platemap_file)

    # Prepare sql file for processing
    ap = AggregateProfiles(
        sqlite_file, strata=["Image_Metadata_Plate", "Image_Metadata_Well"])

    # Count cells and output
    cell_count_file = os.path.join("results",
                                   "{}_cell_count.tsv".format(plate))
    cell_count_df = ap.count_cells()
    # Join platemap metadata onto the counts, dropping the join key and
    # the redundant well-layout columns.
    cell_count_df = cell_count_df.merge(
        platemap_df,
        left_on="Image_Metadata_Well",
        right_on="well_position").drop(["WellRow", "WellCol", "well_position"],
                                       axis="columns")
    cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

    # Begin processing profiles
    output_dir = os.path.join("data", "profiles", plate)
    os.makedirs(output_dir, exist_ok=True)

    # Aggregate single cells into well profiles
    out_file = os.path.join(output_dir, "{}.csv.gz".format(plate))
    ap.aggregate_profiles(output_file=out_file, compression="gzip")

    # Annotate Profiles
    anno_file = os.path.join(output_dir, "{}_augmented.csv.gz".format(plate))
    annotate(
        profiles=out_file,
        platemap=platemap_df,
        join_on=["Metadata_well_position", "Image_Metadata_Well"],
        output_file=anno_file,
        compression="gzip",
    )

    # Define metadata features (columns excluded from normalization)
    meta_features = [
        "Image_Metadata_Plate",
        "Image_Metadata_Well",
        "Metadata_WellRow",
        "Metadata_WellCol",
        "Metadata_gene_name",
        "Metadata_pert_name",
        "Metadata_broad_sample",
        "Metadata_cell_line",
    ]

    # Normalize Profiles against the 'EMPTY' perturbation wells only
    norm_file = os.path.join(output_dir, "{}_normalized.csv.gz".format(plate))
    normalize(
        profiles=anno_file,
        features="infer",
        meta_features=meta_features,
        samples="Metadata_pert_name == 'EMPTY'",
        method="mad_robustize",
        output_file=norm_file,
        compression="gzip",
    )

    # Perform feature selection (just drop columns with high number of missingness)
    # Drop columns with high number of missingness, extreme values, and blacklist
    feat_file = os.path.join(
        output_dir, "{}_normalized_feature_select.csv.gz".format(plate))
    feature_select(
        profiles=norm_file,
        features="infer",
        samples="none",
        operation=[
            "drop_na_columns",
            "blacklist",
            "variance_threshold",
            "drop_outliers",
        ],
        output_file=feat_file,
        compression="gzip",
    )

    # Perform audits on the feature-selected profiles (plate/well columns
    # removed so only perturbation metadata groups remain)
    profile_df = pd.read_csv(feat_file).drop(
        ["Image_Metadata_Well", "Image_Metadata_Plate"], axis="columns")

    # Audit guide replicability
    audit_file = os.path.join("results", "{}_audit_guide.csv".format(plate))
    audit(
        profiles=profile_df,
        audit_groups=[
            "Metadata_pert_name", "Metadata_gene_name", "Metadata_cell_line"
        ],
        iterations=10,
        output_file=audit_file,
    )

    # Audit gene replicability
    audit_file = os.path.join("results", "{}_audit_gene.csv".format(plate))
    audit(
        profiles=profile_df,
        audit_groups=["Metadata_gene_name", "Metadata_cell_line"],
        iterations=10,
        output_file=audit_file,
    )
feature_select_ops = [ "drop_na_columns", "variance_threshold", "correlation_threshold", "blacklist", ] # Define external metadata to add to annotation moa_df = pd.read_csv(moa_file, sep="\t") barcode_platemap_df = pd.read_csv(barcode_platemap_file).query( "Assay_Plate_Barcode == @plate_name") # Aggregate profiles out_file = pathlib.PurePath(output_dir, f"{plate_name}.csv.gz") ap = AggregateProfiles(sql_file=sql_file, strata=strata, operation=aggregate_method) ap.aggregate_profiles(output_file=out_file, float_format=float_format, compression="gzip") # Count cells count_file = pathlib.PurePath(cell_count_dir, f"{plate_name}_cell_count.tsv") cell_count_df = ap.count_cells() cell_count_df.to_csv(count_file, sep="\t") del ap # Annotate profiles - Level 3 Data anno_file = pathlib.PurePath(output_dir, f"{plate_name}_augmented.csv.gz") anno_df = annotate(
def test_AggregateProfiles_reset_variables():
    """Exercise the subsampling setters and their validation errors."""
    # A fresh object has subsampling disabled by default.
    profile_agg = AggregateProfiles(sql_file=file)
    assert profile_agg.subsample_frac == 1
    assert profile_agg.subsample_n == "all"
    assert profile_agg.subsampling_random_state == "none"

    # The fraction setter takes effect immediately.
    profile_agg.set_subsample_frac(0.8)
    assert profile_agg.subsample_frac == 0.8

    # Reset the fraction, then switch to count-based subsampling.
    profile_agg.set_subsample_frac(1)
    profile_agg.set_subsample_n(4)
    assert profile_agg.subsample_n == 4

    # The random state is stored as given.
    profile_agg.set_subsample_random_state(42)
    assert profile_agg.subsampling_random_state == 42

    # Setting a fraction while subsample_n is active must be rejected.
    with pytest.raises(AssertionError) as err:
        profile_agg.set_subsample_frac(0.8)
    assert "Do not set both subsample_frac and subsample_n" in str(
        err.value.args[0])

    # A non-coercible subsample_n must raise a ValueError.
    with pytest.raises(ValueError) as err:
        profile_agg.set_subsample_frac(1)
        profile_agg.set_subsample_n("wont work")
    assert "subsample n must be an integer or coercable" in str(
        err.value.args[0])
def test_aggregate_count_cells_multiple_strata():
    """count_cells() should honor plate/well/site strata, before and after
    subsampling (count_subset=True reflects the subsampled cells)."""
    # Launch a sqlite connection
    file = "sqlite:///{}/test_strata.sqlite".format(tmpdir)
    test_engine = create_engine(file)
    test_conn = test_engine.connect()

    # Setup data: 100 cells split across 2 images x 2 table hashes, so each
    # (plate, well, site) stratum ends up with 25 cells.
    base_image_number = sorted(["x", "y"] * 50)
    base_table_number = sorted(
        ["x_hash_a", "x_hash_b", "y_hash_a", "y_hash_b"] * 25)
    cells_df = build_random_data(
        compartment="cells",
        ImageNumber=base_image_number,
        TableNumber=base_table_number,
    )
    cytoplasm_df = build_random_data(
        compartment="cytoplasm",
        ImageNumber=base_image_number,
        TableNumber=base_table_number,
    )
    nuclei_df = build_random_data(
        compartment="nuclei",
        ImageNumber=base_image_number,
        TableNumber=base_table_number,
    )
    image_df = pd.DataFrame({
        "TableNumber": ["x_hash_a", "x_hash_b", "y_hash_a", "y_hash_b"],
        "ImageNumber": ["x", "x", "y", "y"],
        "Metadata_Plate": ["plate"] * 4,
        "Metadata_Well": ["A01", "A02"] * 2,
        "Metadata_Site": [1, 1, 2, 2],
    }).sort_values(by="Metadata_Well")

    # Ingest data into temporary sqlite file
    image_df.to_sql("image", con=test_engine, index=False, if_exists="replace")
    cells_df.to_sql("cells", con=test_engine, index=False, if_exists="replace")
    cytoplasm_df.to_sql("cytoplasm",
                        con=test_engine,
                        index=False,
                        if_exists="replace")
    nuclei_df.to_sql("nuclei", con=test_engine, index=False,
                     if_exists="replace")

    # Setup AggregateProfiles Class
    # NOTE(review): subsample_n is passed as the string "4" -- presumably
    # relying on integer coercion inside AggregateProfiles; verify.
    ap_strata = AggregateProfiles(
        sql_file=file,
        subsample_n="4",
        strata=["Metadata_Plate", "Metadata_Well", "Metadata_Site"],
    )

    # Full counts: 25 cells in each of the 4 strata
    count_df = ap_strata.count_cells()
    expected_count = pd.DataFrame({
        "Metadata_Plate": ["plate"] * 4,
        "Metadata_Well": sorted(["A01", "A02"] * 2),
        "Metadata_Site": [1, 2] * 2,
        "cell_count": [25] * 4,
    })
    pd.testing.assert_frame_equal(count_df, expected_count, check_names=False)

    # After aggregating with subsample_n=4, the subset count per stratum is 4
    profiles = ap_strata.aggregate_profiles()
    count_df = ap_strata.count_cells(count_subset=True)
    expected_count = pd.DataFrame({
        "Metadata_Plate": ["plate"] * 4,
        "Metadata_Well": sorted(["A01", "A02"] * 2),
        "Metadata_Site": [1, 2] * 2,
        "cell_count": [4] * 4,
    })
    pd.testing.assert_frame_equal(count_df, expected_count, check_names=False)
"ImageNumber": ["x", "y"], "Metadata_Plate": ["plate", "plate"], "Metadata_Well": ["A01", "A02"], }) # Ingest data into temporary sqlite file image_df.to_sql("image", con=test_engine, index=False, if_exists="replace") cells_df.to_sql("cells", con=test_engine, index=False, if_exists="replace") cytoplasm_df.to_sql("cytoplasm", con=test_engine, index=False, if_exists="replace") nuclei_df.to_sql("nuclei", con=test_engine, index=False, if_exists="replace") # Setup AggregateProfiles Class ap = AggregateProfiles(sql_file=file) ap_subsample = AggregateProfiles(sql_file=file, subsample_n=2, subsampling_random_state=123) def test_AggregateProfiles_init(): """ Testing initialization of AggregateProfiles """ assert ap.sql_file == file assert ap.strata == ["Metadata_Plate", "Metadata_Well"] assert ap.merge_cols == ["TableNumber", "ImageNumber"] assert ap.features == "infer" pd.testing.assert_frame_equal(image_df, ap.image_df) assert ap.subsample_frac == 1