def process_profile(sql_file, batch, plate, pipeline):
    """Given batch details and a pipeline, process morphology profiles.

    Depending on the per-step "perform" flags in ``pipeline``, this runs the
    bulk chain aggregate -> annotate -> normalize -> feature-select, writing
    each stage to ``<output_dir>/<batch>/<plate>/``; optionally writes a
    per-well cell-count table; and optionally builds an annotated (and
    optionally normalized / feature-selected) single-cell table.

    Parameters
    ----------
    sql_file : str
        Connection string for the plate's single-cell SQLite backend.
        Must contain both ``batch`` and ``plate`` as substrings (asserted).
    batch : str
        Batch identifier, used for output paths and metadata lookup.
    plate : str
        Plate barcode, used for output filenames and platemap lookup.
    pipeline : dict
        Configuration with sub-dicts "aggregate", "annotate", "normalize",
        "feature_select", "count", "single_cell", plus "options",
        "output_dir", "sc_output_dir", "workspace_dir", and
        "platemap_well_column".

    Returns
    -------
    None
        All results are written to disk.
    """
    # NOTE(review): assert is stripped under `python -O`; these are sanity
    # checks that the sql file path matches the requested batch/plate.
    assert batch in sql_file, "batch {} not recognized in sql file {}".format(
        batch, sql_file)
    assert plate in sql_file, "plate {} not recognized in sql file {}".format(
        plate, sql_file)

    # Set output directory information
    pipeline_output = pipeline["output_dir"]
    output_dir = os.path.join(pipeline_output, batch, plate)
    os.makedirs(output_dir, exist_ok=True)

    # Set output file information (one file per processing stage)
    aggregate_out_file = os.path.join(output_dir, "{}.csv.gz".format(plate))
    annotate_out_file = os.path.join(output_dir,
                                     "{}_augmented.csv.gz".format(plate))
    normalize_out_file = os.path.join(output_dir,
                                      "{}_normalized.csv.gz".format(plate))
    feature_out_file = os.path.join(
        output_dir, "{}_normalized_feature_selected.csv.gz".format(plate))

    # Load pipeline options
    compression = process_pipeline(pipeline["options"], option="compression")
    sc_float_format = process_pipeline(pipeline["options"],
                                       option="sc_float_format")
    samples = process_pipeline(pipeline["options"], option="samples")

    # Load and setup platemap info
    workspace_dir = pipeline["workspace_dir"]
    # NOTE(review): batch_dir is computed but never used in this function.
    batch_dir = os.path.join(workspace_dir, "backend", batch)
    metadata_dir = os.path.join(workspace_dir, "metadata", batch)
    # The barcode platemap is assumed to be the alphabetically first file in
    # the batch metadata directory -- TODO confirm this layout convention.
    barcode_plate_map_file = os.path.join(metadata_dir,
                                          sorted(os.listdir(metadata_dir))[0])
    barcode_plate_map_df = pd.read_csv(barcode_plate_map_file)
    plate_map_name = barcode_plate_map_df.query(
        "Assay_Plate_Barcode == @plate").Plate_Map_Name.values[0]
    plate_map_file = os.path.join(metadata_dir, "platemap",
                                  "{}.txt".format(plate_map_name))
    plate_map_df = pd.read_csv(plate_map_file, sep="\t")
    # Ensure every platemap column carries the "Metadata_" prefix
    plate_map_df.columns = [
        "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
        for x in plate_map_df.columns
    ]
    platemap_well_column = pipeline["platemap_well_column"]

    # Process Bulk profiles
    # Step 1: Aggregate
    aggregate_steps = pipeline["aggregate"]
    aggregate_features = aggregate_steps["features"]
    aggregate_operation = aggregate_steps["method"]
    aggregate_plate_column = aggregate_steps["plate_column"]
    aggregate_well_column = aggregate_steps["well_column"]

    # Aggregation strata: plate + well, optionally + site
    strata = [aggregate_plate_column, aggregate_well_column]

    if "site_column" in aggregate_steps:
        aggregate_site_column = aggregate_steps["site_column"]
        strata += [aggregate_site_column]

    if aggregate_steps["perform"]:
        ap = AggregateProfiles(
            sql_file,
            strata=strata,
            features=aggregate_features,
            operation=aggregate_operation,
        )
        ap.aggregate_profiles(output_file=aggregate_out_file,
                              compression=compression)

    if pipeline["count"]["perform"]:
        # If aggregation was skipped, build the AggregateProfiles object
        # here anyway so cells can still be counted.
        if not aggregate_steps["perform"]:
            ap = AggregateProfiles(
                sql_file,
                strata=strata,
                features=aggregate_features,
                operation=aggregate_operation,
            )
        count_dir = pipeline["count"]["output_dir"]
        os.makedirs(count_dir, exist_ok=True)
        cell_count_file = os.path.join(
            count_dir, "{}_{}_cell_count.tsv".format(batch, plate))
        cell_count_df = ap.count_cells()
        # Attach platemap metadata to the counts, then drop the redundant
        # platemap join column.
        cell_count_df = cell_count_df.merge(
            plate_map_df,
            left_on=aggregate_well_column,
            right_on=platemap_well_column,
        ).drop(platemap_well_column, axis="columns")
        cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

    # Annotate Profiles
    annotate_steps = pipeline["annotate"]
    # well_column is read unconditionally because the single-cell section
    # below also needs it even when bulk annotation is skipped.
    annotate_well_column = annotate_steps["well_column"]
    if annotate_steps["perform"]:
        annotate(
            profiles=aggregate_out_file,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file=annotate_out_file,
            compression=compression,
        )

    # Normalize Profiles
    normalize_steps = pipeline["normalize"]
    norm_features = normalize_steps["features"]
    norm_method = normalize_steps["method"]
    if normalize_steps["perform"]:
        normalize(
            profiles=annotate_out_file,
            features=norm_features,
            samples=samples,
            method=norm_method,
            output_file=normalize_out_file,
            compression=compression,
        )

    # Apply feature selection
    feature_select_steps = pipeline["feature_select"]
    feature_select_operations = feature_select_steps["operations"]
    feature_select_features = feature_select_steps["features"]
    if feature_select_steps["perform"]:
        feature_select(
            profiles=normalize_out_file,
            features=feature_select_features,
            samples=samples,
            operation=feature_select_operations,
            output_file=feature_out_file,
            compression=compression,
            corr_threshold=0.9,
            corr_method="pearson",
        )

    # Optionally build a merged single-cell table (Level 2 style data)
    sc_steps = pipeline["single_cell"]
    if sc_steps["perform"]:
        # Need an AggregateProfiles object for its sqlite connection and
        # image metadata, even if aggregation itself was skipped.
        if not aggregate_steps["perform"]:
            ap = AggregateProfiles(
                sql_file,
                strata=strata,
                features=aggregate_features,
                operation=aggregate_operation,
            )
        # Load cells
        query = "select * from cells"
        cell_df = pd.read_sql(sql=query, con=ap.conn)

        # Load cytoplasm
        query = "select * from cytoplasm"
        cytoplasm_df = pd.read_sql(sql=query, con=ap.conn)

        # Load nuclei
        query = "select * from nuclei"
        nuclei_df = pd.read_sql(sql=query, con=ap.conn)

        # Merge single cells together: cells join cytoplasm on the
        # cytoplasm's parent-cell pointer, then join nuclei on the
        # cytoplasm's parent-nucleus pointer.
        sc_merged_df = (cell_df.merge(
            cytoplasm_df.drop("ObjectNumber", axis="columns"),
            left_on=["TableNumber", "ImageNumber", "ObjectNumber"],
            right_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Cells"],
            how="inner",
        ).drop("ObjectNumber", axis="columns").merge(
            nuclei_df,
            left_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Nuclei"],
            right_on=["TableNumber", "ImageNumber", "ObjectNumber"],
            how="inner",
        ))

        # Merge image data info (right merge keeps every single cell)
        sc_merged_df = ap.image_df.merge(sc_merged_df,
                                         how="right",
                                         on=ap.merge_cols)

        # Make sure column names are correctly prefixed; anything without a
        # compartment prefix is treated as metadata.
        prefix = ["Metadata", "Cells", "Cytoplasm", "Nuclei"]
        cols = []
        for col in sc_merged_df.columns:
            if any([col.startswith(x) for x in prefix]):
                cols.append(col)
            else:
                cols.append(f"Metadata_{col}")
        sc_merged_df.columns = cols

        # output_file="none" makes annotate/normalize/feature_select return
        # the DataFrame instead of writing it -- presumably the pycytominer
        # sentinel convention; verify against the library version in use.
        sc_merged_df = annotate(
            profiles=sc_merged_df,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file="none",
        )

        if sc_steps["normalize"]:
            sc_merged_df = normalize(
                profiles=sc_merged_df,
                features=norm_features,
                samples=samples,
                method=norm_method,
                output_file="none",
            )

        if sc_steps["feature_select"]:
            sc_merged_df = feature_select(
                profiles=sc_merged_df,
                features=feature_select_features,
                samples=samples,
                operation=feature_select_operations,
                output_file="none",
                corr_threshold=0.9,
                corr_method="pearson",
            )

        sc_pipeline_output = pipeline["sc_output_dir"]
        sc_output_dir = os.path.join(sc_pipeline_output, batch, plate)
        os.makedirs(sc_output_dir, exist_ok=True)

        # Set output file information
        sc_out_file = os.path.join(sc_output_dir,
                                   "{}_single_cell.csv.gz".format(plate))
        output(
            df=sc_merged_df,
            output_filename=sc_out_file,
            compression="gzip",
            float_format=sc_float_format,
        )
def process_profile(sql_file, batch, plate, pipeline):
    """Given batch details and a pipeline, process morphology profiles.

    Bulk-profile variant: runs aggregate (+ optional cell counting) ->
    annotate -> normalize -> feature-select according to the per-step
    "perform" flags in ``pipeline``, writing each stage under
    ``<output_dir>/<batch>/<plate>/``.

    Parameters
    ----------
    sql_file : str
        Connection string for the plate's single-cell SQLite backend.
        Must contain both ``batch`` and ``plate`` as substrings (asserted).
    batch : str
        Batch identifier, used for output paths and metadata lookup.
    plate : str
        Plate barcode, used for output filenames and platemap lookup.
    pipeline : dict
        Configuration with sub-dicts "aggregate", "annotate", "normalize",
        "feature_select", "count", plus "options", "output_dir",
        "workspace_dir", and "platemap_well_column".

    Returns
    -------
    None
        All results are written to disk.
    """
    assert batch in sql_file, "batch {} not recognized in sql file {}".format(
        batch, sql_file)
    assert plate in sql_file, "plate {} not recognized in sql file {}".format(
        plate, sql_file)

    # Set output directory information
    pipeline_output = pipeline["output_dir"]
    output_dir = os.path.join(pipeline_output, batch, plate)
    os.makedirs(output_dir, exist_ok=True)

    # Set output file information (one file per processing stage)
    aggregate_out_file = os.path.join(output_dir, "{}.csv.gz".format(plate))
    annotate_out_file = os.path.join(output_dir,
                                     "{}_augmented.csv.gz".format(plate))
    normalize_out_file = os.path.join(output_dir,
                                      "{}_normalized.csv.gz".format(plate))
    feature_out_file = os.path.join(
        output_dir, "{}_normalized_feature_selected.csv.gz".format(plate))

    # Load pipeline options
    compression = process_pipeline(pipeline["options"], option="compression")
    samples = process_pipeline(pipeline["options"], option="samples")

    # Load and setup platemap info
    workspace_dir = pipeline["workspace_dir"]
    # NOTE(review): batch_dir is computed but never used in this function.
    batch_dir = os.path.join(workspace_dir, "backend", batch)
    metadata_dir = os.path.join(workspace_dir, "metadata", batch)
    # The barcode platemap is assumed to be the alphabetically first file in
    # the batch metadata directory -- TODO confirm this layout convention.
    barcode_plate_map_file = os.path.join(metadata_dir,
                                          sorted(os.listdir(metadata_dir))[0])
    barcode_plate_map_df = pd.read_csv(barcode_plate_map_file)
    plate_map_name = barcode_plate_map_df.query(
        "Assay_Plate_Barcode == @plate").Plate_Map_Name.values[0]
    plate_map_file = os.path.join(metadata_dir, "platemap",
                                  "{}.txt".format(plate_map_name))
    plate_map_df = pd.read_csv(plate_map_file, sep="\t")
    # Ensure every platemap column carries the "Metadata_" prefix
    plate_map_df.columns = [
        "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
        for x in plate_map_df.columns
    ]
    platemap_well_column = pipeline["platemap_well_column"]

    # Step 1: Aggregate
    aggregate_steps = pipeline["aggregate"]
    if aggregate_steps["perform"]:
        aggregate_features = aggregate_steps["features"]
        aggregate_operation = aggregate_steps["method"]
        aggregate_plate_column = aggregate_steps["plate_column"]
        aggregate_well_column = aggregate_steps["well_column"]
        # Aggregation strata: plate + well, optionally + site
        strata = [aggregate_plate_column, aggregate_well_column]
        if "site_column" in aggregate_steps:
            aggregate_site_column = aggregate_steps["site_column"]
            strata += [aggregate_site_column]
        ap = AggregateProfiles(
            sql_file,
            strata=strata,
            features=aggregate_features,
            operation=aggregate_operation,
        )
        ap.aggregate_profiles(output_file=aggregate_out_file,
                              compression=compression)

        # Cell counting reuses `ap` and `aggregate_well_column`, so it only
        # happens when the aggregate step is performed.
        if pipeline["count"]["perform"]:
            count_dir = pipeline["count"]["output_dir"]
            os.makedirs(count_dir, exist_ok=True)
            cell_count_file = os.path.join(
                count_dir, "{}_{}_cell_count.tsv".format(batch, plate))
            cell_count_df = ap.count_cells()
            # Attach platemap metadata to the counts, then drop the
            # redundant platemap join column.
            cell_count_df = cell_count_df.merge(
                plate_map_df,
                left_on=aggregate_well_column,
                right_on=platemap_well_column,
            ).drop(platemap_well_column, axis="columns")
            cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

    # Annotate Profiles
    annotate_steps = pipeline["annotate"]
    if annotate_steps["perform"]:
        annotate_well_column = annotate_steps["well_column"]
        annotate(
            profiles=aggregate_out_file,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file=annotate_out_file,
            compression=compression,
        )

    # Normalize Profiles
    normalize_steps = pipeline["normalize"]
    if normalize_steps["perform"]:
        norm_features = normalize_steps["features"]
        norm_method = normalize_steps["method"]
        normalize(
            profiles=annotate_out_file,
            features=norm_features,
            samples=samples,
            method=norm_method,
            output_file=normalize_out_file,
            compression=compression,
        )

    # Apply feature selection
    feature_select_steps = pipeline["feature_select"]
    if feature_select_steps["perform"]:
        feature_select_operations = feature_select_steps["operations"]
        feature_select_features = feature_select_steps["features"]
        # NOTE(review): samples="none" here (all rows), unlike the
        # normalize step above which uses the configured `samples` option.
        feature_select(
            profiles=normalize_out_file,
            features=feature_select_features,
            samples="none",
            operation=feature_select_operations,
            output_file=feature_out_file,
            compression=compression,
            corr_threshold=0.9,
            corr_method="pearson",
        )
def get_profiles(plate, backend_dir, metadata_dir, barcode_platemap_df):
    """Apply all profiling steps for a given plate.

    Pipeline: count cells -> aggregate -> annotate -> normalize ->
    feature-select -> audit (guide- and gene-level replicability).

    Parameters
    ----------
    plate : str
        Plate barcode; selects the sqlite backend and the platemap.
    backend_dir : str
        Directory containing ``<plate>.sqlite`` single-cell backends.
    metadata_dir : str
        Directory containing the ``platemap/`` subdirectory of CSVs.
    barcode_platemap_df : pandas.DataFrame
        Mapping with columns ``Assay_Plate_Barcode`` and ``Plate_Map_Name``.

    Output: Will write a series of processed files to disk (profiles under
    ``data/profiles/<plate>/``, counts and audits under ``results/``).
    """
    print("Processing {}.....".format(plate))
    sqlite_file = "sqlite:///{}/{}.sqlite".format(backend_dir, plate)

    # Load specific platemap
    platemap = barcode_platemap_df.query(
        "Assay_Plate_Barcode == @plate").Plate_Map_Name.values[0]
    platemap_file = os.path.join(metadata_dir, "platemap",
                                 "{}.csv".format(platemap))
    platemap_df = pd.read_csv(platemap_file)

    # Prepare sql file for processing
    ap = AggregateProfiles(
        sqlite_file, strata=["Image_Metadata_Plate", "Image_Metadata_Well"])

    # Count cells and output
    cell_count_file = os.path.join("results",
                                   "{}_cell_count.tsv".format(plate))
    cell_count_df = ap.count_cells()
    # Join platemap metadata onto the counts, dropping the join key and
    # the redundant well-layout columns.
    cell_count_df = cell_count_df.merge(
        platemap_df,
        left_on="Image_Metadata_Well",
        right_on="well_position").drop(["WellRow", "WellCol", "well_position"],
                                       axis="columns")
    cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

    # Begin processing profiles
    output_dir = os.path.join("data", "profiles", plate)
    os.makedirs(output_dir, exist_ok=True)

    # Aggregate single cells into well profiles
    out_file = os.path.join(output_dir, "{}.csv.gz".format(plate))
    ap.aggregate_profiles(output_file=out_file, compression="gzip")

    # Annotate Profiles
    anno_file = os.path.join(output_dir, "{}_augmented.csv.gz".format(plate))
    annotate(
        profiles=out_file,
        platemap=platemap_df,
        join_on=["Metadata_well_position", "Image_Metadata_Well"],
        output_file=anno_file,
        compression="gzip",
    )

    # Define metadata features (columns excluded from normalization)
    meta_features = [
        "Image_Metadata_Plate",
        "Image_Metadata_Well",
        "Metadata_WellRow",
        "Metadata_WellCol",
        "Metadata_gene_name",
        "Metadata_pert_name",
        "Metadata_broad_sample",
        "Metadata_cell_line",
    ]

    # Normalize Profiles against the 'EMPTY' perturbation wells only
    norm_file = os.path.join(output_dir, "{}_normalized.csv.gz".format(plate))
    normalize(
        profiles=anno_file,
        features="infer",
        meta_features=meta_features,
        samples="Metadata_pert_name == 'EMPTY'",
        method="mad_robustize",
        output_file=norm_file,
        compression="gzip",
    )

    # Perform feature selection (just drop columns with high number of missingness)
    # Drop columns with high number of missingness, extreme values, and blacklist
    feat_file = os.path.join(
        output_dir, "{}_normalized_feature_select.csv.gz".format(plate))
    feature_select(
        profiles=norm_file,
        features="infer",
        samples="none",
        operation=[
            "drop_na_columns",
            "blacklist",
            "variance_threshold",
            "drop_outliers",
        ],
        output_file=feat_file,
        compression="gzip",
    )

    # Perform audits on the feature-selected profiles (plate/well columns
    # removed so only perturbation metadata groups remain)
    profile_df = pd.read_csv(feat_file).drop(
        ["Image_Metadata_Well", "Image_Metadata_Plate"], axis="columns")

    # Audit guide replicability
    audit_file = os.path.join("results", "{}_audit_guide.csv".format(plate))
    audit(
        profiles=profile_df,
        audit_groups=[
            "Metadata_pert_name", "Metadata_gene_name", "Metadata_cell_line"
        ],
        iterations=10,
        output_file=audit_file,
    )

    # Audit gene replicability
    audit_file = os.path.join("results", "{}_audit_gene.csv".format(plate))
    audit(
        profiles=profile_df,
        audit_groups=["Metadata_gene_name", "Metadata_cell_line"],
        iterations=10,
        output_file=audit_file,
    )
feature_select_ops = [ "drop_na_columns", "variance_threshold", "correlation_threshold", "blacklist", ] # Define external metadata to add to annotation moa_df = pd.read_csv(moa_file, sep="\t") barcode_platemap_df = pd.read_csv(barcode_platemap_file).query( "Assay_Plate_Barcode == @plate_name") # Aggregate profiles out_file = pathlib.PurePath(output_dir, f"{plate_name}.csv.gz") ap = AggregateProfiles(sql_file=sql_file, strata=strata, operation=aggregate_method) ap.aggregate_profiles(output_file=out_file, float_format=float_format, compression="gzip") # Count cells count_file = pathlib.PurePath(cell_count_dir, f"{plate_name}_cell_count.tsv") cell_count_df = ap.count_cells() cell_count_df.to_csv(count_file, sep="\t") del ap # Annotate profiles - Level 3 Data anno_file = pathlib.PurePath(output_dir, f"{plate_name}_augmented.csv.gz") anno_df = annotate(
def test_AggregateProfiles_reset_variables():
    """Exercise the subsampling setters and their validation errors."""
    # A fresh object has subsampling disabled by default.
    profile_agg = AggregateProfiles(sql_file=file)
    assert profile_agg.subsample_frac == 1
    assert profile_agg.subsample_n == "all"
    assert profile_agg.subsampling_random_state == "none"

    # The fraction setter takes effect immediately.
    profile_agg.set_subsample_frac(0.8)
    assert profile_agg.subsample_frac == 0.8

    # Reset the fraction, then switch to count-based subsampling.
    profile_agg.set_subsample_frac(1)
    profile_agg.set_subsample_n(4)
    assert profile_agg.subsample_n == 4

    # The random state is stored as given.
    profile_agg.set_subsample_random_state(42)
    assert profile_agg.subsampling_random_state == 42

    # Setting a fraction while subsample_n is active must be rejected.
    with pytest.raises(AssertionError) as err:
        profile_agg.set_subsample_frac(0.8)
    assert "Do not set both subsample_frac and subsample_n" in str(
        err.value.args[0])

    # A non-coercible subsample_n must raise a ValueError.
    with pytest.raises(ValueError) as err:
        profile_agg.set_subsample_frac(1)
        profile_agg.set_subsample_n("wont work")
    assert "subsample n must be an integer or coercable" in str(
        err.value.args[0])
def test_aggregate_count_cells_multiple_strata():
    """count_cells() should honor plate/well/site strata, before and after
    subsampling (count_subset=True reflects the subsampled cells)."""
    # Launch a sqlite connection
    file = "sqlite:///{}/test_strata.sqlite".format(tmpdir)
    test_engine = create_engine(file)
    test_conn = test_engine.connect()

    # Setup data: 100 cells split across 2 images x 2 table hashes, so each
    # (plate, well, site) stratum ends up with 25 cells.
    base_image_number = sorted(["x", "y"] * 50)
    base_table_number = sorted(
        ["x_hash_a", "x_hash_b", "y_hash_a", "y_hash_b"] * 25)
    cells_df = build_random_data(
        compartment="cells",
        ImageNumber=base_image_number,
        TableNumber=base_table_number,
    )
    cytoplasm_df = build_random_data(
        compartment="cytoplasm",
        ImageNumber=base_image_number,
        TableNumber=base_table_number,
    )
    nuclei_df = build_random_data(
        compartment="nuclei",
        ImageNumber=base_image_number,
        TableNumber=base_table_number,
    )
    image_df = pd.DataFrame({
        "TableNumber": ["x_hash_a", "x_hash_b", "y_hash_a", "y_hash_b"],
        "ImageNumber": ["x", "x", "y", "y"],
        "Metadata_Plate": ["plate"] * 4,
        "Metadata_Well": ["A01", "A02"] * 2,
        "Metadata_Site": [1, 1, 2, 2],
    }).sort_values(by="Metadata_Well")

    # Ingest data into temporary sqlite file
    image_df.to_sql("image", con=test_engine, index=False, if_exists="replace")
    cells_df.to_sql("cells", con=test_engine, index=False, if_exists="replace")
    cytoplasm_df.to_sql("cytoplasm",
                        con=test_engine,
                        index=False,
                        if_exists="replace")
    nuclei_df.to_sql("nuclei", con=test_engine, index=False,
                     if_exists="replace")

    # Setup AggregateProfiles Class
    # NOTE(review): subsample_n is passed as the string "4" -- presumably
    # relying on integer coercion inside AggregateProfiles; verify.
    ap_strata = AggregateProfiles(
        sql_file=file,
        subsample_n="4",
        strata=["Metadata_Plate", "Metadata_Well", "Metadata_Site"],
    )

    # Full counts: 25 cells in each of the 4 strata
    count_df = ap_strata.count_cells()
    expected_count = pd.DataFrame({
        "Metadata_Plate": ["plate"] * 4,
        "Metadata_Well": sorted(["A01", "A02"] * 2),
        "Metadata_Site": [1, 2] * 2,
        "cell_count": [25] * 4,
    })
    pd.testing.assert_frame_equal(count_df, expected_count, check_names=False)

    # After aggregating with subsample_n=4, the subset count per stratum is 4
    profiles = ap_strata.aggregate_profiles()
    count_df = ap_strata.count_cells(count_subset=True)
    expected_count = pd.DataFrame({
        "Metadata_Plate": ["plate"] * 4,
        "Metadata_Well": sorted(["A01", "A02"] * 2),
        "Metadata_Site": [1, 2] * 2,
        "cell_count": [4] * 4,
    })
    pd.testing.assert_frame_equal(count_df, expected_count, check_names=False)
"ImageNumber": ["x", "y"], "Metadata_Plate": ["plate", "plate"], "Metadata_Well": ["A01", "A02"], }) # Ingest data into temporary sqlite file image_df.to_sql("image", con=test_engine, index=False, if_exists="replace") cells_df.to_sql("cells", con=test_engine, index=False, if_exists="replace") cytoplasm_df.to_sql("cytoplasm", con=test_engine, index=False, if_exists="replace") nuclei_df.to_sql("nuclei", con=test_engine, index=False, if_exists="replace") # Setup AggregateProfiles Class ap = AggregateProfiles(sql_file=file) ap_subsample = AggregateProfiles(sql_file=file, subsample_n=2, subsampling_random_state=123) def test_AggregateProfiles_init(): """ Testing initialization of AggregateProfiles """ assert ap.sql_file == file assert ap.strata == ["Metadata_Plate", "Metadata_Well"] assert ap.merge_cols == ["TableNumber", "ImageNumber"] assert ap.features == "infer" pd.testing.assert_frame_equal(image_df, ap.image_df) assert ap.subsample_frac == 1