def test_write_gct_infer_features():
    """Explicit (non-canonical) feature and meta-feature lists are echoed
    verbatim into the .gct body, with 'nan' filling the metadata columns."""
    out_path = os.path.join(tmpdir, "test_gct_nocp.gct")
    write_gct(
        profiles=data_nocpfeatures_df,
        output_file=out_path,
        features=["x", "y", "z"],
        meta_features=["g", "h"],
        version="#1.3",
    )

    # Slurp the whole tab-separated file at once instead of row-by-row append
    with open(out_path, "r") as gct_handle:
        observed_rows = list(csv.reader(gct_handle, delimiter="\t"))

    sample_cols = ["SAMPLE_{}".format(ix) for ix in range(6)]
    expected_rows = [
        ["#1.3"],
        ["3", "6", "1", "2"],  # n_features, n_samples, n_feature_meta, n_sample_meta
        ["id", "cp_feature_name"] + sample_cols,
        ["g", "nan", "a", "a", "a", "b", "b", "b"],
        ["h", "nan", "c", "c", "c", "d", "d", "d"],
        ["x", "x", "1", "1", "-1", "1", "3", "5"],
        ["y", "y", "5", "5", "-5", "8", "3", "1"],
        ["z", "z", "2", "2", "-2", "5", "-2", "1"],
    ]
    for row_ix, expected_row in enumerate(expected_rows):
        assert observed_rows[row_ix] == expected_row
def test_write_gct():
    """features='infer' picks up the compartment-prefixed columns
    (Cells_/Cytoplasm_/Nuclei_) and writes them as the .gct data rows."""
    out_path = os.path.join(tmpdir, "test_gct.gct")
    write_gct(
        profiles=data_replicate_df,
        output_file=out_path,
        features="infer",
        version="#1.3",
    )

    with open(out_path, "r") as gct_handle:
        observed_rows = list(csv.reader(gct_handle, delimiter="\t"))

    sample_cols = ["SAMPLE_{}".format(ix) for ix in range(6)]
    expected_rows = [
        ["#1.3"],
        ["3", "6", "1", "3"],  # three metadata rows this time (g, t, h)
        ["id", "cp_feature_name"] + sample_cols,
        ["g", "nan", "a", "a", "a", "b", "b", "b"],
        ["t", "nan", "t", "t", "t", "u", "u", "u"],
        ["h", "nan", "c", "c", "c", "d", "d", "d"],
        ["Cells_x", "Cells_x", "1", "1", "-1", "1", "3", "5"],
        ["Cytoplasm_y", "Cytoplasm_y", "5", "5", "-5", "8", "3", "1"],
        ["Nuclei_z", "Nuclei_z", "2", "2", "-2", "5", "-2", "1"],
    ]
    for row_ix, expected_row in enumerate(expected_rows):
        assert observed_rows[row_ix] == expected_row
def test_write_gct_assert_error():
    """Feature metadata without an 'id' row must make write_gct fail loudly."""
    # Build the deliberately malformed metadata (no 'id' row) up front;
    # only the write_gct call itself is expected to raise.
    out_path = os.path.join(tmpdir, "test_gct_feature_meta_fail.gct")
    bad_feature_metadata = pd.DataFrame(
        {
            "Cells_x": ["blue", "triangle"],
            "Cytoplasm_y": ["red", "square"],
            "Nuclei_z": ["green", "oval"],
        }
    ).transpose()

    with pytest.raises(AssertionError) as ae:
        write_gct(
            profiles=data_replicate_df,
            output_file=out_path,
            features="infer",
            meta_features="infer",
            feature_metadata=bad_feature_metadata,
            version="#1.3",
        )
    assert "make sure feature metadata has row named 'id'" in str(ae.value)
def pipeline_feature_select(self, steps, suffix=None):
    """Run the feature-selection stage of the profiling pipeline.

    Reads the previously written normalized per-plate csv.gz profiles and
    applies pycytominer-style feature selection at one of three scopes,
    taken from ``steps["level"]``:

    * ``"plate"`` - select features independently per plate and write one
      output file per plate.
    * ``"batch"`` - concatenate all plates of a batch, select once on the
      stacked frame, then split the result back out per plate.
    * ``"all"`` - concatenate every plate of every batch, select once
      globally, then split back out per batch and plate.

    Parameters
    ----------
    steps : dict
        Feature-selection configuration; this method reads the keys
        "level", "gct", "operations" and "features".
    suffix : str, optional
        When given, inserted into every input/output filename
        (e.g. ``{plate}_normalized_{suffix}.csv.gz``).

    Side effects: writes csv.gz files under the pipeline output directory,
    and, when ``steps["gct"]`` is truthy, stacked csv.gz + .gct files under
    ``./gct/{batch}/``. Returns nothing.
    """
    feature_select_steps = steps
    pipeline_output = self.pipeline["output_dir"]
    level = feature_select_steps["level"]
    gct = feature_select_steps["gct"]
    feature_select_operations = feature_select_steps["operations"]
    feature_select_features = feature_select_steps["features"]
    all_plates_df = pd.DataFrame()
    for batch in self.profile_config:
        batch_df = pd.DataFrame()
        for plate in self.profile_config[batch]:
            output_dir = pathlib.PurePath(".", pipeline_output, batch, plate)
            # Input (normalized) and plate-level output paths, with or
            # without the optional filename suffix.
            if suffix:
                normalize_output_file = pathlib.PurePath(
                    output_dir, f"{plate}_normalized_{suffix}.csv.gz")
                feature_select_output_file_plate = pathlib.PurePath(
                    output_dir,
                    f"{plate}_normalized_feature_select_{suffix}_plate.csv.gz",
                )
            else:
                normalize_output_file = pathlib.PurePath(
                    output_dir, f"{plate}_normalized.csv.gz")
                feature_select_output_file_plate = pathlib.PurePath(
                    output_dir, f"{plate}_normalized_feature_select_plate.csv.gz")
            # For non-canonical compartments, resolve "infer" into a concrete
            # feature list from the first plate read here.
            # NOTE(review): this reassignment is sticky -- once resolved, every
            # subsequent plate/batch reuses the first plate's feature list
            # rather than re-inferring; confirm that is intended.
            if feature_select_features == "infer" and self.noncanonical:
                feature_select_features = cyto_utils.infer_cp_features(
                    pd.read_csv(normalize_output_file),
                    compartments=self.compartments,
                )
            # Tag each plate's rows with its batch for later regrouping.
            df = pd.read_csv(normalize_output_file).assign(
                Metadata_batch=batch)
            if level == "plate":
                # Plate scope: the batch tag is not needed; select and write
                # immediately.
                df = df.drop(columns=["Metadata_batch"])
                feature_select(
                    profiles=df,
                    features=feature_select_features,
                    operation=feature_select_operations,
                    output_file=feature_select_output_file_plate,
                    compression_options=self.pipeline_options["compression"],
                    float_format=self.pipeline_options["float_format"],
                )
            elif level == "batch":
                # Accumulate plates of this batch for a single selection pass.
                batch_df = concat_dataframes(batch_df, df)
            elif level == "all":
                # Accumulate every plate of every batch.
                all_plates_df = concat_dataframes(all_plates_df, df)
        if level == "batch":
            # Select once on the whole batch, then fan the result back out
            # into one file per plate.
            fs_df = feature_select(
                profiles=batch_df,
                features=feature_select_features,
                operation=feature_select_operations,
            )
            for plate in self.profile_config[batch]:
                output_dir = pathlib.PurePath(".", pipeline_output, batch, plate)
                if suffix:
                    feature_select_output_file_batch = pathlib.PurePath(
                        output_dir,
                        f"{plate}_normalized_feature_select_{suffix}_batch.csv.gz",
                    )
                else:
                    feature_select_output_file_batch = pathlib.PurePath(
                        output_dir,
                        f"{plate}_normalized_feature_select_batch.csv.gz",
                    )
                # NOTE(review): feature selection already ran above; this
                # re-inference only affects later iterations (sticky, as above).
                if feature_select_features == "infer" and self.noncanonical:
                    feature_select_features = cyto_utils.infer_cp_features(
                        batch_df, compartments=self.compartments)
                # Extract this plate's rows and drop the helper batch column.
                df = fs_df.query("Metadata_Plate==@plate").reset_index(
                    drop=True)
                df = df.drop(columns=["Metadata_batch"])
                cyto_utils.output(
                    output_filename=feature_select_output_file_batch,
                    df=df,
                    compression_options=self.pipeline_options["compression"],
                    float_format=self.pipeline_options["float_format"],
                )
            if gct:
                # Also emit the stacked batch frame as csv.gz and .gct
                # (Morpheus-compatible) under ./gct/{batch}/.
                create_gct_directories(batch)
                if suffix:
                    stacked_file = pathlib.PurePath(
                        ".", "gct", batch,
                        f"{batch}_normalized_feature_select_{suffix}_batch.csv.gz",
                    )
                    gct_file = pathlib.PurePath(
                        ".", "gct", batch,
                        f"{batch}_normalized_feature_select_{suffix}_batch.gct",
                    )
                else:
                    stacked_file = pathlib.PurePath(
                        ".", "gct", batch,
                        f"{batch}_normalized_feature_select_batch.csv.gz",
                    )
                    gct_file = pathlib.PurePath(
                        ".", "gct", batch,
                        f"{batch}_normalized_feature_select_batch.gct",
                    )
                cyto_utils.output(
                    output_filename=stacked_file,
                    df=fs_df,
                    compression_options=self.pipeline_options["compression"],
                    float_format=self.pipeline_options["float_format"],
                )
                write_gct(profiles=fs_df, output_file=gct_file)
    if level == "all":
        # Select once on the global stack, then fan back out per batch/plate.
        fs_df = feature_select(
            profiles=all_plates_df,
            features=feature_select_features,
            operation=feature_select_operations,
        )
        for batch in self.profile_config:
            fs_batch_df = fs_df.loc[fs_df.Metadata_batch == batch].reset_index(drop=True)
            for plate in self.profile_config[batch]:
                output_dir = pathlib.PurePath(".", pipeline_output, batch, plate)
                if suffix:
                    feature_select_output_file_all = pathlib.PurePath(
                        output_dir,
                        f"{plate}_normalized_feature_select_{suffix}_all.csv.gz",
                    )
                else:
                    feature_select_output_file_all = pathlib.PurePath(
                        output_dir,
                        f"{plate}_normalized_feature_select_all.csv.gz")
                # NOTE(review): as above, selection already ran; sticky
                # reassignment only influences later iterations.
                if feature_select_features == "infer" and self.noncanonical:
                    feature_select_features = cyto_utils.infer_cp_features(
                        all_plates_df, compartments=self.compartments)
                df = fs_batch_df.query(
                    "Metadata_Plate==@plate").reset_index(drop=True)
                df = df.drop(columns=["Metadata_batch"])
                cyto_utils.output(
                    output_filename=feature_select_output_file_all,
                    df=df,
                    compression_options=self.pipeline_options["compression"],
                    float_format=self.pipeline_options["float_format"],
                )
            if gct:
                # Per-batch stacked csv.gz + .gct outputs for the global run.
                create_gct_directories(batch)
                if suffix:
                    stacked_file = pathlib.PurePath(
                        ".", "gct", batch,
                        f"{batch}_normalized_feature_select_{suffix}_all.csv.gz",
                    )
                    gct_file = pathlib.PurePath(
                        ".", "gct", batch,
                        f"{batch}_normalized_feature_select_{suffix}_all.gct",
                    )
                else:
                    stacked_file = pathlib.PurePath(
                        ".", "gct", batch,
                        f"{batch}_normalized_feature_select_all.csv.gz",
                    )
                    gct_file = pathlib.PurePath(
                        ".", "gct", batch,
                        f"{batch}_normalized_feature_select_all.gct",
                    )
                cyto_utils.output(
                    output_filename=stacked_file,
                    df=fs_batch_df,
                    compression_options=self.pipeline_options["compression"],
                    float_format=self.pipeline_options["float_format"],
                )
                write_gct(profiles=fs_batch_df, output_file=gct_file)
def test_write_gct_with_feature_metadata():
    """A well-formed feature_metadata frame (with an 'id' row) adds extra
    per-feature columns after 'id' in the .gct output."""
    out_path = os.path.join(tmpdir, "test_gct_feature_meta.gct")
    feature_metadata = pd.DataFrame(
        {
            "id": ["color", "shape"],
            "Cells_x": ["blue", "triangle"],
            "Cytoplasm_y": ["red", "square"],
            "Nuclei_z": ["green", "oval"],
        }
    ).transpose()

    write_gct(
        profiles=data_replicate_df,
        output_file=out_path,
        features="infer",
        meta_features="infer",
        feature_metadata=feature_metadata,
        version="#1.3",
    )

    with open(out_path, "r") as gct_handle:
        observed_rows = list(csv.reader(gct_handle, delimiter="\t"))

    sample_cols = ["SAMPLE_{}".format(ix) for ix in range(6)]
    expected_rows = [
        ["#1.3"],
        ["3", "6", "2", "3"],  # two feature-metadata columns: color, shape
        ["id", "color", "shape"] + sample_cols,
        ["g", "nan", "nan", "a", "a", "a", "b", "b", "b"],
        ["t", "nan", "nan", "t", "t", "t", "u", "u", "u"],
        ["h", "nan", "nan", "c", "c", "c", "d", "d", "d"],
        ["Cells_x", "blue", "triangle", "1", "1", "-1", "1", "3", "5"],
        ["Cytoplasm_y", "red", "square", "5", "5", "-5", "8", "3", "1"],
        ["Nuclei_z", "green", "oval", "2", "2", "-2", "5", "-2", "1"],
    ]
    for row_ix, expected_row in enumerate(expected_rows):
        assert observed_rows[row_ix] == expected_row
df = load_data(batch=batch, suffix=suffix, profile_dir=profile_dir, combine_dfs=True, add_cell_count=True, harmonize_cols=True, cell_count_dir=cell_count_dir) # Save normalized and non-feature selected data profile_batches[batch] = df # Apply feature selection feature_select_df = feature_select(df, operation=feature_select_ops) # Write the dataframe as a gct file for input into Morpheus write_gct(profiles=feature_select_df, output_file=output_gct_file) # ## Merge Profiles Together and Output # In[4]: all_profiles_df = pd.concat(profile_batches.values(), sort=True).reset_index(drop=True) all_profiles_df = all_profiles_df.assign(Metadata_clone_type="resistant") all_profiles_df.loc[all_profiles_df.Metadata_clone_number.str.contains("WT"), "Metadata_clone_type"] = "wildtype" meta_features = infer_cp_features(all_profiles_df, metadata=True) cp_cols = infer_cp_features(all_profiles_df, metadata=False)