Python annotateの例、pycytominer.annotate Pythonの例

コード例 #1

0

ファイルを表示

ファイル: test_annotate_custom.py プロジェクト: michaelbornholdt/pycytominer

def test_annotate_cp_clean():
    """Check the well/plate column names present with and without ``clean_cellprofiler``."""
    # Give the profiles ``Image_``-prefixed well and plate metadata columns.
    cp_named_df = data_df.rename(
        columns={"Metadata_Well": "Image_Metadata_Well"}
    ).assign(Image_Metadata_Plate="test")

    # clean_cellprofiler=False: the prefixed names must be present in the output.
    uncleaned = annotate(
        profiles=cp_named_df,
        platemap=broad_platemap_df,
        clean_cellprofiler=False,
        join_on=["Metadata_well_position", "Image_Metadata_Well"],
    )
    for column in ("Image_Metadata_Well", "Image_Metadata_Plate"):
        assert column in uncleaned.columns

    # clean_cellprofiler=True: the un-prefixed names must be present instead.
    cleaned = annotate(
        profiles=cp_named_df,
        platemap=broad_platemap_df,
        clean_cellprofiler=True,
        join_on=["Metadata_well_position", "Image_Metadata_Well"],
    )
    for column in ("Metadata_Well", "Metadata_Plate"):
        assert column in cleaned.columns

コード例 #2

0

ファイルを表示

def test_annotate_cmap_pertchemical():
    """Check the CMAP columns and the dose/pert-type values produced in chemical mode."""
    # First pass: plain broad platemap, only the set of added columns is checked.
    first_pass = annotate(
        profiles=data_df,
        platemap=broad_platemap_df,
        join_on=["Metadata_well_position", "Metadata_Well"],
        format_broad_cmap=True,
        perturbation_mode="genetic",
    )

    expected_columns = [
        "Metadata_pert_id",
        "Metadata_pert_mfc_id",
        "Metadata_pert_well",
        "Metadata_pert_id_vendor",
        "Metadata_cell_id",
        "Metadata_pert_type",
        "Metadata_broad_sample_type",
    ]
    missing = [col for col in expected_columns if col not in first_pass.columns]
    assert not missing

    # Second pass: turn the first well into a DMSO well and attach dose columns.
    doses = [1000, 2, 1, 1, 1, 1]
    dosed_platemap = broad_platemap_df.copy()
    dosed_platemap.loc[0, "Metadata_broad_sample"] = "DMSO"
    dosed_platemap = dosed_platemap.assign(
        Metadata_mmoles_per_liter=doses,
        Metadata_mg_per_ml=doses,
        Metadata_solvent="DMSO",
    )

    chemical_result = annotate(
        profiles=data_df,
        platemap=dosed_platemap,
        join_on=["Metadata_well_position", "Metadata_Well"],
        format_broad_cmap=True,
        perturbation_mode="chemical",
    )

    # The DMSO well is expected to be the only control.
    expected_types = ["control"] + ["trt"] * 5
    assert chemical_result["Metadata_pert_type"].tolist() == expected_types
    assert chemical_result["Metadata_broad_sample_type"].tolist() == expected_types

    # The DMSO well's dose of 1000 is expected to come back as 0.
    expected_doses = [0, 2, 1, 1, 1, 1]
    assert chemical_result["Metadata_mmoles_per_liter"].tolist() == expected_doses
    assert chemical_result["Metadata_mg_per_ml"].tolist() == expected_doses

    expected_columns.extend([
        "Metadata_mmoles_per_liter",
        "Metadata_mg_per_ml",
        "Metadata_solvent",
        "Metadata_pert_vehicle",
    ])
    missing = [
        col for col in expected_columns if col not in chemical_result.columns
    ]
    assert not missing

コード例 #3

0

ファイルを表示

def test_annotate_cmap_externalmetadata():
    """Check that external metadata read from a CSV file is joined onto the annotation."""
    # Write a one-row external metadata table to ``output_file``.
    external_table = pd.DataFrame(
        {"test_well_join": ["A01"], "test_info_col": ["DMSO is cool"]}
    ).reset_index(drop=True)
    external_table.to_csv(output_file, index=False, sep=",")

    # Platemap with a DMSO well, dose columns and a preset cell id.
    doses = [1000, 2, 1, 1, 1, 1]
    dosed_platemap = broad_platemap_df.copy()
    dosed_platemap.loc[0, "Metadata_broad_sample"] = "DMSO"
    dosed_platemap = dosed_platemap.assign(
        Metadata_mmoles_per_liter=doses,
        Metadata_mg_per_ml=doses,
        Metadata_solvent="DMSO",
        Metadata_cell_id="A549",
    )

    result = annotate(
        profiles=data_df,
        platemap=dosed_platemap,
        join_on=["Metadata_well_position", "Metadata_Well"],
        format_broad_cmap=True,
        perturbation_mode="chemical",
        external_metadata=output_file,
        external_join_left="Metadata_Well",
        external_join_right="Metadata_test_well_join",
    )

    # The external column shows up with a ``Metadata_`` prefix on the first row,
    # and the cell id supplied through the platemap is preserved.
    assert result.loc[0, "Metadata_test_info_col"] == "DMSO is cool"
    assert result["Metadata_cell_id"].unique()[0] == "A549"

コード例 #4

0

ファイルを表示

def test_annotate_cmap_assert():
    """Expect ``annotate`` to raise an ``AssertionError`` mentioning CMAP for ``platemap_df``."""
    with pytest.raises(AssertionError) as nocmap:
        annotate(
            profiles=data_df,
            platemap=platemap_df,
            join_on=["Metadata_well_position", "Metadata_Well"],
            format_broad_cmap=True,
            perturbation_mode="none",
        )

    # The message check must live outside the ``with`` block: statements placed
    # after the raising call inside the block are never executed, and
    # ``nocmap.value`` is only populated once the context manager has exited.
    assert "Are you sure this is a CMAP file?" in str(nocmap.value)

コード例 #5

0

ファイルを表示

def test_annotate_cmap_pertgenetic():
    """Check pert types and pert ids produced in genetic perturbation mode."""
    # Attach the example perturbation names to the broad platemap.
    genetic_platemap = broad_platemap_df.assign(
        Metadata_pert_name=example_genetic_perts
    )
    result = annotate(
        profiles=data_df,
        platemap=genetic_platemap,
        join_on=["Metadata_well_position", "Metadata_Well"],
        format_broad_cmap=True,
        perturbation_mode="genetic",
    )

    # Four treatment wells followed by two control wells.
    expected_types = ["trt"] * 4 + ["control"] * 2
    assert result["Metadata_pert_type"].tolist() == expected_types
    assert result["Metadata_broad_sample_type"].tolist() == expected_types
    assert result["Metadata_pert_id"].tolist() == expected_pert_ids

コード例 #6

0

ファイルを表示

def test_annotate_cmap_pertnone():
    """Check the CMAP columns and pert ids produced with ``perturbation_mode="none"``."""
    result = annotate(
        profiles=data_df,
        platemap=broad_platemap_df,
        join_on=["Metadata_well_position", "Metadata_Well"],
        format_broad_cmap=True,
        perturbation_mode="none",
    )

    # Every one of these CMAP columns must be present in the output.
    for column in (
        "Metadata_pert_id",
        "Metadata_pert_mfc_id",
        "Metadata_pert_well",
        "Metadata_pert_id_vendor",
        "Metadata_cell_id",
        "Metadata_pert_type",
        "Metadata_broad_sample_type",
    ):
        assert column in result.columns

    assert result["Metadata_pert_id"].tolist() == expected_pert_ids

コード例 #7

0

ファイルを表示

ファイル: profile.py プロジェクト: broadinstitute/2019_06_04_Cardiomyocytes-profiling-recipe

    def pipeline_annotate(self, batch, plate):
        """Annotate one plate's aggregated profiles with its platemap.

        Looks up the platemap assigned to ``plate`` in the batch's
        ``barcode_platemap.csv``, prefixes its columns with ``Metadata_`` and
        hands it to ``annotate`` together with the aggregated profile path.
        When the ``external`` section of the annotate config has ``perform``
        set, a tab-separated external metadata table is passed along as well.
        ``<plate>_augmented.csv.gz`` is passed to ``annotate`` as the output
        file.
        """
        step_cfg = self.pipeline["annotate"]

        # Input and output locations for this plate.
        plate_dir = pathlib.PurePath(".", self.pipeline_output, batch, plate)
        aggregated_path = pathlib.PurePath(plate_dir, f"{plate}.csv.gz")
        augmented_path = pathlib.PurePath(plate_dir,
                                          f"{plate}_augmented.csv.gz")

        # Resolve which platemap belongs to this plate barcode.
        platemap_root = pathlib.PurePath(".", "metadata", "platemaps", batch)
        barcode_df = pd.read_csv(
            pathlib.PurePath(platemap_root, "barcode_platemap.csv"),
            dtype={"Assay_Plate_Barcode": str},
        )
        # ``@plate`` refers to this method's ``plate`` argument.
        matching_rows = barcode_df.query("Assay_Plate_Barcode == @plate")
        platemap_name = matching_rows.Plate_Map_Name.values[0]

        platemap_df = pd.read_csv(
            pathlib.PurePath(platemap_root, "platemap",
                             f"{platemap_name}.txt"),
            sep="\t",
        )
        platemap_df.columns = [
            col if col.startswith("Metadata_") else f"Metadata_{col}"
            for col in platemap_df.columns
        ]

        join_columns = [
            self.pipeline["platemap_well_column"],
            step_cfg["well_column"],
        ]

        # Optional external metadata; stays empty when the step is disabled.
        external_kwargs = {}
        external_cfg = step_cfg["external"]
        if external_cfg["perform"]:
            external_df = pd.read_csv(
                pathlib.PurePath(
                    ".",
                    "metadata",
                    "external_metadata",
                    external_cfg["file"],
                ),
                sep="\t",
            )

            merge_column = external_cfg["merge_column"]
            if not merge_column.startswith("Metadata"):
                merge_column = "Metadata_" + merge_column
            # The same column list is used on both sides of the external join.
            external_join_column = [merge_column]

            external_kwargs = {
                "external_metadata": external_df,
                "external_join_left": external_join_column,
                "external_join_right": external_join_column,
            }

        annotate(
            profiles=aggregated_path,
            platemap=platemap_df,
            join_on=join_columns,
            **external_kwargs,
            output_file=augmented_path,
            compression_options=self.pipeline_options["compression"],
            float_format=self.pipeline_options["float_format"],
            clean_cellprofiler=True,
        )

コード例 #8

0

ファイルを表示

ファイル: profile_util.py プロジェクト: DavidStirling/profiling-resistance-mechanisms

def process_profile(sql_file, batch, plate, pipeline):
    """
    Run the enabled profiling stages of ``pipeline`` for a single plate.

    ``batch`` and ``plate`` must both occur in ``sql_file``. The aggregate,
    annotate, normalize and feature select stages each run only when their
    ``perform`` flag is set; each writes a file under
    ``<output_dir>/<batch>/<plate>`` and each later stage reads the file path
    written by the one before. Cell counting only happens inside the
    aggregate stage.
    """
    assert batch in sql_file, (
        "batch {} not recognized in sql file {}".format(batch, sql_file))
    assert plate in sql_file, (
        "plate {} not recognized in sql file {}".format(plate, sql_file))

    # Per-plate output directory
    plate_out_dir = os.path.join(pipeline["output_dir"], batch, plate)
    os.makedirs(plate_out_dir, exist_ok=True)

    # One output file per stage
    aggregated_path = os.path.join(plate_out_dir, "{}.csv.gz".format(plate))
    annotated_path = os.path.join(plate_out_dir,
                                  "{}_augmented.csv.gz".format(plate))
    normalized_path = os.path.join(plate_out_dir,
                                   "{}_normalized.csv.gz".format(plate))
    selected_path = os.path.join(
        plate_out_dir,
        "{}_normalized_feature_selected.csv.gz".format(plate))

    # Pipeline-wide options
    options = pipeline["options"]
    compression = process_pipeline(options, option="compression")
    samples = process_pipeline(options, option="samples")

    # Locate the metadata for this batch
    workspace_dir = pipeline["workspace_dir"]
    # Kept from the original implementation; nothing below reads it.
    batch_dir = os.path.join(workspace_dir, "backend", batch)
    metadata_dir = os.path.join(workspace_dir, "metadata", batch)

    # The barcode platemap is the first entry of the sorted directory listing.
    first_metadata_entry = sorted(os.listdir(metadata_dir))[0]
    barcode_df = pd.read_csv(os.path.join(metadata_dir, first_metadata_entry))
    # ``@plate`` refers to this function's ``plate`` argument.
    plate_rows = barcode_df.query("Assay_Plate_Barcode == @plate")
    plate_map_name = plate_rows.Plate_Map_Name.values[0]

    plate_map_df = pd.read_csv(
        os.path.join(metadata_dir, "platemap",
                     "{}.txt".format(plate_map_name)),
        sep="\t",
    )
    plate_map_df.columns = [
        col if col.startswith("Metadata_") else "Metadata_{}".format(col)
        for col in plate_map_df.columns
    ]
    platemap_well_column = pipeline["platemap_well_column"]

    # Step 1: Aggregate (and, optionally, count cells)
    aggregate_cfg = pipeline["aggregate"]
    if aggregate_cfg["perform"]:
        features = aggregate_cfg["features"]
        operation = aggregate_cfg["method"]
        plate_column = aggregate_cfg["plate_column"]
        well_column = aggregate_cfg["well_column"]

        strata = [plate_column, well_column]
        if "site_column" in aggregate_cfg:
            strata.append(aggregate_cfg["site_column"])

        ap = AggregateProfiles(
            sql_file,
            strata=strata,
            features=features,
            operation=operation,
        )
        ap.aggregate_profiles(output_file=aggregated_path,
                              compression=compression)

        count_cfg = pipeline["count"]
        if count_cfg["perform"]:
            count_dir = count_cfg["output_dir"]
            os.makedirs(count_dir, exist_ok=True)
            count_path = os.path.join(
                count_dir, "{}_{}_cell_count.tsv".format(batch, plate))

            # Attach platemap metadata to the per-well counts, then drop the
            # platemap's own well column.
            counts = ap.count_cells()
            counts = counts.merge(
                plate_map_df,
                left_on=well_column,
                right_on=platemap_well_column,
            )
            counts = counts.drop(platemap_well_column, axis="columns")
            counts.to_csv(count_path, sep="\t", index=False)

    # Step 2: Annotate
    annotate_cfg = pipeline["annotate"]
    if annotate_cfg["perform"]:
        annotate(
            profiles=aggregated_path,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_cfg["well_column"]],
            output_file=annotated_path,
            compression=compression,
        )

    # Step 3: Normalize
    normalize_cfg = pipeline["normalize"]
    if normalize_cfg["perform"]:
        normalize_features = normalize_cfg["features"]
        normalize_method = normalize_cfg["method"]
        normalize(
            profiles=annotated_path,
            features=normalize_features,
            samples=samples,
            method=normalize_method,
            output_file=normalized_path,
            compression=compression,
        )

    # Step 4: Feature selection
    select_cfg = pipeline["feature_select"]
    if select_cfg["perform"]:
        select_operations = select_cfg["operations"]
        select_features = select_cfg["features"]
        feature_select(
            profiles=normalized_path,
            features=select_features,
            samples="none",
            operation=select_operations,
            output_file=selected_path,
            compression=compression,
            corr_threshold=0.9,
            corr_method="pearson",
        )

コード例 #9

0

ファイルを表示

def process_profile(sql_file, batch, plate, pipeline):
    """
    Given batch details and a pipeline, process morphology profiles

    Bulk stages (aggregate, count, annotate, normalize, feature select) each
    run only when their ``perform`` flag is set in ``pipeline`` and write
    their result under ``<output_dir>/<batch>/<plate>``. When the
    ``single_cell`` stage is enabled, the cells, cytoplasm and nuclei tables
    are additionally loaded, merged, annotated and (optionally) normalized and
    feature selected in memory, then written as one gzip-compressed file
    under ``<sc_output_dir>/<batch>/<plate>``.

    Unlike the stage flags, the configuration values of every bulk stage are
    read unconditionally, so the corresponding keys must exist in ``pipeline``
    even for stages that are switched off.
    """
    # Both identifiers must appear in the sql file string
    assert batch in sql_file, "batch {} not recognized in sql file {}".format(
        batch, sql_file)
    assert plate in sql_file, "plate {} not recognized in sql file {}".format(
        plate, sql_file)

    # Set output directory information
    pipeline_output = pipeline["output_dir"]
    output_dir = os.path.join(pipeline_output, batch, plate)
    os.makedirs(output_dir, exist_ok=True)

    # Set output file information
    aggregate_out_file = os.path.join(output_dir, "{}.csv.gz".format(plate))
    annotate_out_file = os.path.join(output_dir,
                                     "{}_augmented.csv.gz".format(plate))
    normalize_out_file = os.path.join(output_dir,
                                      "{}_normalized.csv.gz".format(plate))
    feature_out_file = os.path.join(
        output_dir, "{}_normalized_feature_selected.csv.gz".format(plate))

    # Load pipeline options
    # (process_pipeline is defined outside this block; presumably it resolves
    # the named option from the options mapping - confirm against its source)
    compression = process_pipeline(pipeline["options"], option="compression")
    sc_float_format = process_pipeline(pipeline["options"],
                                       option="sc_float_format")
    samples = process_pipeline(pipeline["options"], option="samples")

    # Load and setup platemap info
    workspace_dir = pipeline["workspace_dir"]
    # NOTE(review): batch_dir is assigned but never read in this function
    batch_dir = os.path.join(workspace_dir, "backend", batch)
    metadata_dir = os.path.join(workspace_dir, "metadata", batch)

    # The barcode platemap is taken to be the first entry of the sorted
    # listing of the metadata directory
    barcode_plate_map_file = os.path.join(metadata_dir,
                                          sorted(os.listdir(metadata_dir))[0])
    barcode_plate_map_df = pd.read_csv(barcode_plate_map_file)
    # "@plate" refers to this function's plate argument
    plate_map_name = barcode_plate_map_df.query(
        "Assay_Plate_Barcode == @plate").Plate_Map_Name.values[0]
    plate_map_file = os.path.join(metadata_dir, "platemap",
                                  "{}.txt".format(plate_map_name))
    plate_map_df = pd.read_csv(plate_map_file, sep="\t")
    # Prefix every platemap column with "Metadata_" unless it already has it
    plate_map_df.columns = [
        "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
        for x in plate_map_df.columns
    ]
    platemap_well_column = pipeline["platemap_well_column"]

    # Process Bulk profiles
    # Step 1: Aggregate
    # The aggregate settings are read regardless of the perform flag because
    # the count and single cell stages below reuse them to build their own
    # AggregateProfiles instance
    aggregate_steps = pipeline["aggregate"]
    aggregate_features = aggregate_steps["features"]
    aggregate_operation = aggregate_steps["method"]
    aggregate_plate_column = aggregate_steps["plate_column"]
    aggregate_well_column = aggregate_steps["well_column"]

    strata = [aggregate_plate_column, aggregate_well_column]

    # The site column is optional
    if "site_column" in aggregate_steps:
        aggregate_site_column = aggregate_steps["site_column"]
        strata += [aggregate_site_column]

    if aggregate_steps["perform"]:
        ap = AggregateProfiles(
            sql_file,
            strata=strata,
            features=aggregate_features,
            operation=aggregate_operation,
        )

        ap.aggregate_profiles(output_file=aggregate_out_file,
                              compression=compression)

    if pipeline["count"]["perform"]:
        # ap only exists if the aggregate step ran; build it here otherwise
        if not aggregate_steps["perform"]:
            ap = AggregateProfiles(
                sql_file,
                strata=strata,
                features=aggregate_features,
                operation=aggregate_operation,
            )
        count_dir = pipeline["count"]["output_dir"]
        os.makedirs(count_dir, exist_ok=True)

        cell_count_file = os.path.join(
            count_dir, "{}_{}_cell_count.tsv".format(batch, plate))

        cell_count_df = ap.count_cells()

        # Attach platemap metadata to the counts, then drop the platemap's
        # own well column
        cell_count_df = cell_count_df.merge(
            plate_map_df,
            left_on=aggregate_well_column,
            right_on=platemap_well_column,
        ).drop(platemap_well_column, axis="columns")

        cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

    # Annotate Profiles
    annotate_steps = pipeline["annotate"]
    # Read unconditionally: the single cell stage also joins on this column
    annotate_well_column = annotate_steps["well_column"]
    if annotate_steps["perform"]:
        annotate(
            profiles=aggregate_out_file,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file=annotate_out_file,
            compression=compression,
        )

    # Normalize Profiles
    normalize_steps = pipeline["normalize"]
    # Read unconditionally: reused by the single cell normalization
    norm_features = normalize_steps["features"]
    norm_method = normalize_steps["method"]
    if normalize_steps["perform"]:
        normalize(
            profiles=annotate_out_file,
            features=norm_features,
            samples=samples,
            method=norm_method,
            output_file=normalize_out_file,
            compression=compression,
        )

    # Apply feature selection
    feature_select_steps = pipeline["feature_select"]
    # Read unconditionally: reused by the single cell feature selection
    feature_select_operations = feature_select_steps["operations"]
    feature_select_features = feature_select_steps["features"]
    if feature_select_steps["perform"]:
        feature_select(
            profiles=normalize_out_file,
            features=feature_select_features,
            samples=samples,
            operation=feature_select_operations,
            output_file=feature_out_file,
            compression=compression,
            corr_threshold=0.9,
            corr_method="pearson",
        )

    # Process single cell profiles
    sc_steps = pipeline["single_cell"]
    if sc_steps["perform"]:
        # NOTE(review): this guard only checks the aggregate flag, so when
        # aggregate is off but count is on, ap is constructed a second time
        if not aggregate_steps["perform"]:
            ap = AggregateProfiles(
                sql_file,
                strata=strata,
                features=aggregate_features,
                operation=aggregate_operation,
            )

        # Load cells
        query = "select * from cells"
        cell_df = pd.read_sql(sql=query, con=ap.conn)

        # Load cytoplasm
        query = "select * from cytoplasm"
        cytoplasm_df = pd.read_sql(sql=query, con=ap.conn)

        # Load nuclei
        query = "select * from nuclei"
        nuclei_df = pd.read_sql(sql=query, con=ap.conn)

        # Merge single cells together
        # Cells are matched to cytoplasm rows through Cytoplasm_Parent_Cells,
        # and the result is matched to nuclei through Cytoplasm_Parent_Nuclei;
        # both are inner joins, so unmatched objects are dropped
        sc_merged_df = (cell_df.merge(
            cytoplasm_df.drop("ObjectNumber", axis="columns"),
            left_on=["TableNumber", "ImageNumber", "ObjectNumber"],
            right_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Cells"],
            how="inner",
        ).drop("ObjectNumber", axis="columns").merge(
            nuclei_df,
            left_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Nuclei"],
            right_on=["TableNumber", "ImageNumber", "ObjectNumber"],
            how="inner",
        ))

        # Merge image data info
        # (right join: every merged single cell row is kept)
        sc_merged_df = ap.image_df.merge(sc_merged_df,
                                         how="right",
                                         on=ap.merge_cols)

        # Make sure column names are correctly prefixed
        # Anything not starting with a known prefix is treated as metadata
        prefix = ["Metadata", "Cells", "Cytoplasm", "Nuclei"]
        cols = []
        for col in sc_merged_df.columns:
            if any([col.startswith(x) for x in prefix]):
                cols.append(col)
            else:
                cols.append(f"Metadata_{col}")
        sc_merged_df.columns = cols

        # The single cell stages pass output_file="none" and use the return
        # value instead of an intermediate file
        sc_merged_df = annotate(
            profiles=sc_merged_df,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file="none",
        )

        if sc_steps["normalize"]:
            sc_merged_df = normalize(
                profiles=sc_merged_df,
                features=norm_features,
                samples=samples,
                method=norm_method,
                output_file="none",
            )

        if sc_steps["feature_select"]:
            sc_merged_df = feature_select(
                profiles=sc_merged_df,
                features=feature_select_features,
                samples=samples,
                operation=feature_select_operations,
                output_file="none",
                corr_threshold=0.9,
                corr_method="pearson",
            )

        sc_pipeline_output = pipeline["sc_output_dir"]
        sc_output_dir = os.path.join(sc_pipeline_output, batch, plate)
        os.makedirs(sc_output_dir, exist_ok=True)

        # Set output file information
        sc_out_file = os.path.join(sc_output_dir,
                                   "{}_single_cell.csv.gz".format(plate))
        # Compression is fixed to gzip here rather than taken from the
        # pipeline's compression option
        output(
            df=sc_merged_df,
            output_filename=sc_out_file,
            compression="gzip",
            float_format=sc_float_format,
        )

コード例 #10

0

ファイルを表示

# Count cells
# (sc, cell_count_dir and plate_name are defined earlier in the script,
# outside this excerpt)
count_file = pathlib.PurePath(cell_count_dir, f"{plate_name}_cell_count.csv")
cell_count_df = sc.count_cells()
cell_count_df.to_csv(count_file, sep=",", index=False)

# Drop the reference to sc; it is not used again in this excerpt
del sc

# Annotate profiles - Level 3 Data
# NOTE(review): anno_file is built here but not passed to annotate() in this
# excerpt - presumably it is used for writing further down; confirm
anno_file = pathlib.PurePath(output_dir, f"{plate_name}_augmented.csv.gz")
# The platemap is joined on the well column and the external MOA table on
# Metadata_broad_sample; the CMAP-specific settings go through cmap_args
anno_df = annotate(
    profiles=out_file,
    platemap=platemap_file,
    join_on=["Metadata_well_position", well_col],
    format_broad_cmap=True,
    external_metadata=moa_df,
    external_join_left=["Metadata_broad_sample"],
    external_join_right=["Metadata_broad_sample"],
    cmap_args={
        "cell_id": cell_id,
        "perturbation_mode": "chemical"
    },
)

# Rename columns
# Strip the "Image_" prefix from the plate and well metadata columns
anno_df = anno_df.rename(
    {
        "Image_Metadata_Plate": "Metadata_Plate",
        "Image_Metadata_Well": "Metadata_Well"
    },
    axis="columns",
)

コード例 #11

0

ファイルを表示

def get_profiles(plate, backend_dir, metadata_dir, barcode_platemap_df):
    """
    Run every profiling step for one plate.

    The steps run in a fixed order: cell counting, aggregation, annotation,
    normalization, feature selection and two replicability audits. Each step
    after aggregation reads the file written by the previous one.

    Output:
    Cell counts and audit tables are written under ``results``; the profile
    files are written under ``data/profiles/<plate>``.
    """
    print("Processing {}.....".format(plate))
    sqlite_uri = "sqlite:///{}/{}.sqlite".format(backend_dir, plate)

    # Find and load the platemap assigned to this plate barcode.
    # ``@plate`` refers to this function's ``plate`` argument.
    barcode_rows = barcode_platemap_df.query("Assay_Plate_Barcode == @plate")
    platemap_name = barcode_rows.Plate_Map_Name.values[0]
    platemap_df = pd.read_csv(
        os.path.join(metadata_dir, "platemap", "{}.csv".format(platemap_name)))

    # Open the single cell backend, stratified by plate and well
    ap = AggregateProfiles(
        sqlite_uri, strata=["Image_Metadata_Plate", "Image_Metadata_Well"])

    # Cell counts, annotated with the platemap minus its positional columns
    counts_path = os.path.join("results", "{}_cell_count.tsv".format(plate))
    counts = ap.count_cells().merge(
        platemap_df,
        left_on="Image_Metadata_Well",
        right_on="well_position",
    )
    counts = counts.drop(["WellRow", "WellCol", "well_position"],
                         axis="columns")
    counts.to_csv(counts_path, sep="\t", index=False)

    # All profile files for the plate share one directory
    profile_dir = os.path.join("data", "profiles", plate)
    os.makedirs(profile_dir, exist_ok=True)

    # Aggregate single cells into well profiles
    aggregated_path = os.path.join(profile_dir, "{}.csv.gz".format(plate))
    ap.aggregate_profiles(output_file=aggregated_path, compression="gzip")

    # Annotate the well profiles with the platemap
    annotated_path = os.path.join(profile_dir,
                                  "{}_augmented.csv.gz".format(plate))
    annotate(
        profiles=aggregated_path,
        platemap=platemap_df,
        join_on=["Metadata_well_position", "Image_Metadata_Well"],
        output_file=annotated_path,
        compression="gzip",
    )

    # Normalize, declaring the metadata columns explicitly
    metadata_columns = [
        "Image_Metadata_Plate",
        "Image_Metadata_Well",
        "Metadata_WellRow",
        "Metadata_WellCol",
        "Metadata_gene_name",
        "Metadata_pert_name",
        "Metadata_broad_sample",
        "Metadata_cell_line",
    ]
    normalized_path = os.path.join(profile_dir,
                                   "{}_normalized.csv.gz".format(plate))
    normalize(
        profiles=annotated_path,
        features="infer",
        meta_features=metadata_columns,
        samples="Metadata_pert_name == 'EMPTY'",
        method="mad_robustize",
        output_file=normalized_path,
        compression="gzip",
    )

    # Feature selection, applying the four operations listed below
    selected_path = os.path.join(
        profile_dir, "{}_normalized_feature_select.csv.gz".format(plate))
    selection_operations = [
        "drop_na_columns",
        "blacklist",
        "variance_threshold",
        "drop_outliers",
    ]
    feature_select(
        profiles=normalized_path,
        features="infer",
        samples="none",
        operation=selection_operations,
        output_file=selected_path,
        compression="gzip",
    )

    # The audits run on the selected profiles without the plate/well columns
    audit_input_df = pd.read_csv(selected_path)
    audit_input_df = audit_input_df.drop(
        ["Image_Metadata_Well", "Image_Metadata_Plate"], axis="columns")

    # Audit guide replicability
    guide_audit_path = os.path.join("results",
                                    "{}_audit_guide.csv".format(plate))
    audit(
        profiles=audit_input_df,
        audit_groups=[
            "Metadata_pert_name", "Metadata_gene_name", "Metadata_cell_line"
        ],
        iterations=10,
        output_file=guide_audit_path,
    )

    # Audit gene replicability
    gene_audit_path = os.path.join("results",
                                   "{}_audit_gene.csv".format(plate))
    audit(
        profiles=audit_input_df,
        audit_groups=["Metadata_gene_name", "Metadata_cell_line"],
        iterations=10,
        output_file=gene_audit_path,
    )

コード例 #12

0

ファイルを表示

# Count cells
# (ap, cell_count_dir and plate_name are defined earlier in the script,
# outside this excerpt)
count_file = pathlib.PurePath(cell_count_dir, f"{plate_name}_cell_count.tsv")
cell_count_df = ap.count_cells()
# NOTE(review): no index=False here, so the DataFrame index is written as the
# first column - confirm this is intended
cell_count_df.to_csv(count_file, sep="\t")

# Drop the reference to ap; it is not used again in this excerpt
del ap

# Annotate profiles - Level 3 Data
# NOTE(review): anno_file is built here but not passed to annotate() in this
# excerpt - presumably it is used for writing further down; confirm
anno_file = pathlib.PurePath(output_dir, f"{plate_name}_augmented.csv.gz")
# The platemap is joined on the well column and the external MOA table on
# Metadata_broad_sample; cell_id and perturbation_mode are passed directly
# as keyword arguments
anno_df = annotate(
    profiles=out_file,
    platemap=platemap_file,
    join_on=["Metadata_well_position", "Image_Metadata_Well"],
    cell_id=cell_id,
    format_broad_cmap=True,
    perturbation_mode="chemical",
    external_metadata=moa_df,
    external_join_left=["Metadata_broad_sample"],
    external_join_right=["Metadata_broad_sample"],
)

# Rename columns
# Strip the "Image_" prefix from the plate and well metadata columns
anno_df = anno_df.rename(
    {
        "Image_Metadata_Plate": "Metadata_Plate",
        "Image_Metadata_Well": "Metadata_Well"
    },
    axis="columns",
)