コード例 #1
0
ファイル: test_write_gct.py プロジェクト: shntnu/pycytominer
def test_write_gct_infer_features():
    """Write a gct from non-CellProfiler data using explicit feature lists."""
    out_path = os.path.join(tmpdir, "test_gct_nocp.gct")

    write_gct(
        profiles=data_nocpfeatures_df,
        output_file=out_path,
        features=["x", "y", "z"],
        meta_features=["g", "h"],
        version="#1.3",
    )

    # Read the written file back as tab-separated rows.
    with open(out_path, "r") as gct_file:
        rows = list(csv.reader(gct_file, delimiter="\t"))

    # Expected file contents, one entry per output row.
    expected_rows = [
        ["#1.3"],
        ["3", "6", "1", "2"],
        [
            "id",
            "cp_feature_name",
            "SAMPLE_0",
            "SAMPLE_1",
            "SAMPLE_2",
            "SAMPLE_3",
            "SAMPLE_4",
            "SAMPLE_5",
        ],
        ["g", "nan", "a", "a", "a", "b", "b", "b"],
        ["h", "nan", "c", "c", "c", "d", "d", "d"],
        ["x", "x", "1", "1", "-1", "1", "3", "5"],
        ["y", "y", "5", "5", "-5", "8", "3", "1"],
        ["z", "z", "2", "2", "-2", "5", "-2", "1"],
    ]
    for row_index, expected_row in enumerate(expected_rows):
        assert rows[row_index] == expected_row
コード例 #2
0
ファイル: test_write_gct.py プロジェクト: shntnu/pycytominer
def test_write_gct():
    """Write a gct with inferred CellProfiler features and check every row."""
    out_path = os.path.join(tmpdir, "test_gct.gct")
    write_gct(
        profiles=data_replicate_df,
        output_file=out_path,
        features="infer",
        version="#1.3",
    )

    # Read the written file back as tab-separated rows.
    with open(out_path, "r") as gct_file:
        rows = list(csv.reader(gct_file, delimiter="\t"))

    # Expected file contents, one entry per output row.
    expected_rows = [
        ["#1.3"],
        ["3", "6", "1", "3"],
        [
            "id",
            "cp_feature_name",
            "SAMPLE_0",
            "SAMPLE_1",
            "SAMPLE_2",
            "SAMPLE_3",
            "SAMPLE_4",
            "SAMPLE_5",
        ],
        ["g", "nan", "a", "a", "a", "b", "b", "b"],
        ["t", "nan", "t", "t", "t", "u", "u", "u"],
        ["h", "nan", "c", "c", "c", "d", "d", "d"],
        ["Cells_x", "Cells_x", "1", "1", "-1", "1", "3", "5"],
        ["Cytoplasm_y", "Cytoplasm_y", "5", "5", "-5", "8", "3", "1"],
        ["Nuclei_z", "Nuclei_z", "2", "2", "-2", "5", "-2", "1"],
    ]
    for row_index, expected_row in enumerate(expected_rows):
        assert rows[row_index] == expected_row
コード例 #3
0
ファイル: test_write_gct.py プロジェクト: shntnu/pycytominer
def test_write_gct_assert_error():
    """write_gct must reject feature metadata lacking a row named 'id'."""
    with pytest.raises(AssertionError) as ae:
        out_path = os.path.join(tmpdir,
                                "test_gct_feature_meta_fail.gct")
        # Deliberately omit the required "id" row from the metadata frame.
        bad_feature_metadata = pd.DataFrame({
            "Cells_x": ["blue", "triangle"],
            "Cytoplasm_y": ["red", "square"],
            "Nuclei_z": ["green", "oval"],
        }).transpose()

        write_gct(
            profiles=data_replicate_df,
            output_file=out_path,
            features="infer",
            meta_features="infer",
            feature_metadata=bad_feature_metadata,
            version="#1.3",
        )
    assert "make sure feature metadata has row named 'id'" in str(ae.value)
    def pipeline_feature_select(self, steps, suffix=None):
        """Run the feature-selection stage of the profiling pipeline.

        Reads each plate's normalized CSV, applies the configured
        feature-selection operations, and writes per-plate output files.
        ``steps["level"]`` controls the scope of selection:

        * ``"plate"`` — select features independently for every plate.
        * ``"batch"`` — pool all plates in a batch, select once, then write
          each plate's slice of the result.
        * ``"all"`` — pool every plate from every batch, select once, then
          write each plate's slice of the result.

        When ``steps["gct"]`` is truthy (batch/all levels only), a stacked
        CSV and a .gct file are also written under ``./gct/<batch>/``.

        Parameters
        ----------
        steps : dict
            Feature-selection configuration; keys read here are "level",
            "gct", "operations", and "features".
        suffix : str, optional
            Extra filename infix used in both the normalized input names and
            the feature-selected output names (default None).
        """
        feature_select_steps = steps
        pipeline_output = self.pipeline["output_dir"]

        # Unpack the configuration values used throughout this method.
        level = feature_select_steps["level"]
        gct = feature_select_steps["gct"]
        feature_select_operations = feature_select_steps["operations"]
        feature_select_features = feature_select_steps["features"]

        # Accumulates profiles across every batch when level == "all".
        all_plates_df = pd.DataFrame()

        for batch in self.profile_config:
            # Accumulates profiles across plates when level == "batch".
            batch_df = pd.DataFrame()
            for plate in self.profile_config[batch]:
                output_dir = pathlib.PurePath(".", pipeline_output, batch,
                                              plate)
                # Input/output paths depend on whether a suffix is configured.
                if suffix:
                    normalize_output_file = pathlib.PurePath(
                        output_dir, f"{plate}_normalized_{suffix}.csv.gz")
                    feature_select_output_file_plate = pathlib.PurePath(
                        output_dir,
                        f"{plate}_normalized_feature_select_{suffix}_plate.csv.gz",
                    )
                else:
                    normalize_output_file = pathlib.PurePath(
                        output_dir, f"{plate}_normalized.csv.gz")
                    feature_select_output_file_plate = pathlib.PurePath(
                        output_dir,
                        f"{plate}_normalized_feature_select_plate.csv.gz")
                # NOTE(review): once inferred, feature_select_features is no
                # longer the string "infer", so this inference only runs for
                # the first plate encountered — confirm that reusing the first
                # plate's inferred features for all later plates is intended.
                if feature_select_features == "infer" and self.noncanonical:
                    feature_select_features = cyto_utils.infer_cp_features(
                        pd.read_csv(normalize_output_file),
                        compartments=self.compartments,
                    )

                # Tag this plate's profiles with its batch so pooled frames
                # can later be split back out by batch.
                df = pd.read_csv(normalize_output_file).assign(
                    Metadata_batch=batch)

                if level == "plate":
                    # Per-plate selection: drop the helper column and write
                    # the feature-selected profiles immediately.
                    df = df.drop(columns=["Metadata_batch"])
                    feature_select(
                        profiles=df,
                        features=feature_select_features,
                        operation=feature_select_operations,
                        output_file=feature_select_output_file_plate,
                        compression_options=self.
                        pipeline_options["compression"],
                        float_format=self.pipeline_options["float_format"],
                    )
                elif level == "batch":
                    batch_df = concat_dataframes(batch_df, df)
                elif level == "all":
                    all_plates_df = concat_dataframes(all_plates_df, df)

            if level == "batch":
                # Select features once over the pooled batch, ...
                fs_df = feature_select(
                    profiles=batch_df,
                    features=feature_select_features,
                    operation=feature_select_operations,
                )
                # ... then write each plate's slice of the result.
                for plate in self.profile_config[batch]:
                    output_dir = pathlib.PurePath(".", pipeline_output, batch,
                                                  plate)
                    if suffix:
                        feature_select_output_file_batch = pathlib.PurePath(
                            output_dir,
                            f"{plate}_normalized_feature_select_{suffix}_batch.csv.gz",
                        )
                    else:
                        feature_select_output_file_batch = pathlib.PurePath(
                            output_dir,
                            f"{plate}_normalized_feature_select_batch.csv.gz",
                        )
                    # NOTE(review): this inference happens after fs_df was
                    # already computed above, so its result only affects later
                    # batches — confirm intended.
                    if feature_select_features == "infer" and self.noncanonical:
                        feature_select_features = cyto_utils.infer_cp_features(
                            batch_df, compartments=self.compartments)

                    df = fs_df.query("Metadata_Plate==@plate").reset_index(
                        drop=True)
                    df = df.drop(columns=["Metadata_batch"])

                    cyto_utils.output(
                        output_filename=feature_select_output_file_batch,
                        df=df,
                        compression_options=self.
                        pipeline_options["compression"],
                        float_format=self.pipeline_options["float_format"],
                    )

                if gct:
                    # Also emit the pooled batch result as a stacked CSV and
                    # a .gct file (e.g. for Morpheus) under ./gct/<batch>/.
                    create_gct_directories(batch)
                    if suffix:
                        stacked_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_{suffix}_batch.csv.gz",
                        )
                        gct_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_{suffix}_batch.gct",
                        )
                    else:
                        stacked_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_batch.csv.gz",
                        )
                        gct_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_batch.gct",
                        )
                    cyto_utils.output(
                        output_filename=stacked_file,
                        df=fs_df,
                        compression_options=self.
                        pipeline_options["compression"],
                        float_format=self.pipeline_options["float_format"],
                    )
                    write_gct(profiles=fs_df, output_file=gct_file)

        if level == "all":
            # Select features once over every plate from every batch, then
            # write per-plate outputs, splitting first by batch.
            fs_df = feature_select(
                profiles=all_plates_df,
                features=feature_select_features,
                operation=feature_select_operations,
            )
            for batch in self.profile_config:
                fs_batch_df = fs_df.loc[fs_df.Metadata_batch ==
                                        batch].reset_index(drop=True)
                for plate in self.profile_config[batch]:
                    output_dir = pathlib.PurePath(".", pipeline_output, batch,
                                                  plate)
                    if suffix:
                        feature_select_output_file_all = pathlib.PurePath(
                            output_dir,
                            f"{plate}_normalized_feature_select_{suffix}_all.csv.gz",
                        )
                    else:
                        feature_select_output_file_all = pathlib.PurePath(
                            output_dir,
                            f"{plate}_normalized_feature_select_all.csv.gz")
                    # NOTE(review): as above, this inference runs after fs_df
                    # was computed and so cannot affect it — confirm intended.
                    if feature_select_features == "infer" and self.noncanonical:
                        feature_select_features = cyto_utils.infer_cp_features(
                            all_plates_df, compartments=self.compartments)

                    df = fs_batch_df.query(
                        "Metadata_Plate==@plate").reset_index(drop=True)

                    df = df.drop(columns=["Metadata_batch"])

                    cyto_utils.output(
                        output_filename=feature_select_output_file_all,
                        df=df,
                        compression_options=self.
                        pipeline_options["compression"],
                        float_format=self.pipeline_options["float_format"],
                    )

                if gct:
                    # Emit this batch's slice of the all-plates result as a
                    # stacked CSV and a .gct file under ./gct/<batch>/.
                    create_gct_directories(batch)
                    if suffix:
                        stacked_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_{suffix}_all.csv.gz",
                        )
                        gct_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_{suffix}_all.gct",
                        )
                    else:
                        stacked_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_all.csv.gz",
                        )
                        gct_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_all.gct",
                        )
                    cyto_utils.output(
                        output_filename=stacked_file,
                        df=fs_batch_df,
                        compression_options=self.
                        pipeline_options["compression"],
                        float_format=self.pipeline_options["float_format"],
                    )
                    write_gct(profiles=fs_batch_df, output_file=gct_file)
コード例 #5
0
ファイル: test_write_gct.py プロジェクト: shntnu/pycytominer
def test_write_gct_with_feature_metadata():
    """Write a gct with extra per-feature metadata and check every row."""
    out_path = os.path.join(tmpdir, "test_gct_feature_meta.gct")

    # Metadata frame keyed by feature, with the required "id" row naming the
    # two metadata columns.
    feature_metadata = pd.DataFrame({
        "id": ["color", "shape"],
        "Cells_x": ["blue", "triangle"],
        "Cytoplasm_y": ["red", "square"],
        "Nuclei_z": ["green", "oval"],
    }).transpose()

    write_gct(
        profiles=data_replicate_df,
        output_file=out_path,
        features="infer",
        meta_features="infer",
        feature_metadata=feature_metadata,
        version="#1.3",
    )

    # Read the written file back as tab-separated rows.
    with open(out_path, "r") as gct_file:
        rows = list(csv.reader(gct_file, delimiter="\t"))

    # Expected file contents, one entry per output row.
    expected_rows = [
        ["#1.3"],
        ["3", "6", "2", "3"],
        [
            "id",
            "color",
            "shape",
            "SAMPLE_0",
            "SAMPLE_1",
            "SAMPLE_2",
            "SAMPLE_3",
            "SAMPLE_4",
            "SAMPLE_5",
        ],
        ["g", "nan", "nan", "a", "a", "a", "b", "b", "b"],
        ["t", "nan", "nan", "t", "t", "t", "u", "u", "u"],
        ["h", "nan", "nan", "c", "c", "c", "d", "d", "d"],
        ["Cells_x", "blue", "triangle", "1", "1", "-1", "1", "3", "5"],
        ["Cytoplasm_y", "red", "square", "5", "5", "-5", "8", "3", "1"],
        ["Nuclei_z", "green", "oval", "2", "2", "-2", "5", "-2", "1"],
    ]
    for row_index, expected_row in enumerate(expected_rows):
        assert rows[row_index] == expected_row
    # Load and combine this batch's profile files; the loop variables (batch,
    # suffix, profile_dir, cell_count_dir) and the collections used below are
    # defined above this visible chunk.
    # NOTE(review): assumes load_data returns one combined DataFrame when
    # combine_dfs=True — confirm against its definition.
    df = load_data(batch=batch,
                   suffix=suffix,
                   profile_dir=profile_dir,
                   combine_dfs=True,
                   add_cell_count=True,
                   harmonize_cols=True,
                   cell_count_dir=cell_count_dir)

    # Save normalized and non-feature selected data
    profile_batches[batch] = df

    # Apply feature selection
    feature_select_df = feature_select(df, operation=feature_select_ops)

    # Write the dataframe as a gct file for input into Morpheus
    write_gct(profiles=feature_select_df, output_file=output_gct_file)

# ## Merge Profiles Together and Output

# In[4]:

# Stack every batch's normalized profiles into one frame; sort=True orders
# the union of columns when batches disagree on column order.
all_profiles_df = pd.concat(profile_batches.values(),
                            sort=True).reset_index(drop=True)

# Label every row "resistant" by default, then mark clones whose
# Metadata_clone_number contains "WT" as wild type.
all_profiles_df = all_profiles_df.assign(Metadata_clone_type="resistant")
all_profiles_df.loc[all_profiles_df.Metadata_clone_number.str.contains("WT"),
                    "Metadata_clone_type"] = "wildtype"

# Partition columns into metadata columns vs. CellProfiler feature columns.
meta_features = infer_cp_features(all_profiles_df, metadata=True)
cp_cols = infer_cp_features(all_profiles_df, metadata=False)