Python aggregateの例、pycytominer.aggregate Pythonの例

コード例 #1

0

ファイルを表示

ファイル: test_aggregate.py プロジェクト: michaelbornholdt/pycytominer

def test_aggregate_median_allvar():
    """
    Median-aggregate all inferred features per "g" stratum, in memory and via CSV.

    The in-memory result is compared with ``DataFrame.equals``; the frame
    written through ``output_file`` is read back with ``pd.read_csv`` and
    compared with ``assert_frame_equal``.
    """
    in_memory = aggregate(
        population_df=data_df,
        strata=["g"],
        features="infer",
        operation="median",
    )

    # One expected row per stratum; both groups carry the same values.
    per_group_rows = [
        pd.DataFrame({"g": group_label, "Cells_x": [3], "Nuclei_y": [3]})
        for group_label in ("a", "b")
    ]
    expected = pd.concat(per_group_rows).reset_index(drop=True)
    expected = expected.astype(dtype_convert_dict)

    assert in_memory.equals(expected)

    # Run the same aggregation again, this time writing the result to disk.
    aggregate(
        population_df=data_df,
        strata=["g"],
        features="infer",
        operation="median",
        output_file=test_output_file,
    )

    round_tripped = pd.read_csv(test_output_file)
    pd.testing.assert_frame_equal(round_tripped, expected)

コード例 #2

0

ファイルを表示

ファイル: test_aggregate.py プロジェクト: shntnu/pycytominer

def test_aggregate_median_with_missing_values():
    """
    Median-aggregate ``data_missing_df`` after casting ``Cells_x`` to strings.

    The result must still equal the two-row expected frame converted with
    ``dtype_convert_dict``.
    """

    # Work on a copy and turn one feature column into string values.
    data_dtype_df = data_missing_df.copy()
    data_dtype_df["Cells_x"] = data_dtype_df["Cells_x"].astype(str)

    aggregate_result = aggregate(
        population_df=data_dtype_df,
        strata=["g"],
        features="infer",
        operation="median",
    )
    print(aggregate_result)

    # One expected row per stratum; both groups carry the same values.
    per_group_rows = [
        pd.DataFrame({"g": group_label, "Cells_x": [3], "Nuclei_y": [3]})
        for group_label in ("a", "b")
    ]
    expected = pd.concat(per_group_rows).reset_index(drop=True)
    expected = expected.astype(dtype_convert_dict)

    assert aggregate_result.equals(expected)

コード例 #3

0

ファイルを表示

ファイル: test_aggregate.py プロジェクト: michaelbornholdt/pycytominer

def test_aggregate_mean_subsetvar():
    """
    Mean-aggregate only the ``Cells_x`` feature per "g" stratum.

    The expected frame holds one float ``Cells_x`` value per group.
    """
    subset_means = aggregate(
        population_df=data_df,
        strata=["g"],
        features=["Cells_x"],
        operation="mean",
    )

    # Build with integer literals, then cast the feature column to float.
    expected = pd.DataFrame({"g": ["a", "b"], "Cells_x": [4, 3]}).astype(
        {"Cells_x": float}
    )

    assert subset_means.equals(expected)

コード例 #4

0

ファイルを表示

ファイル: test_aggregate.py プロジェクト: shntnu/pycytominer

def test_aggregate_incorrect_object_feature():
    """
    Testing aggregate pycytominer function

    Two behaviours are checked:

    1. Requesting an object count with an ``object_feature`` that is not a
       column of the input raises ``KeyError`` and the error names the
       missing column.
    2. Rows whose strata value is NaN are kept as their own group instead of
       being dropped.
    """

    incorrect_object_feature = "DOES NOT EXIST"

    with pytest.raises(KeyError) as err:
        aggregate(
            population_df=data_df,
            strata=["g"],
            features="infer",
            operation="median",
            compute_object_count=True,
            object_feature=incorrect_object_feature,
        )

    # The check has to live outside the ``with`` block: any statement placed
    # after the raising call inside the block is never executed. The raised
    # exception itself is available as ``err.value``.
    # NOTE(review): the exact KeyError wording appears to depend on the pandas
    # version, so only the offending label is checked here -- confirm.
    assert incorrect_object_feature in str(err.value)

    # Test that aggregate doesn't drop samples if strata is na
    data_missing_group_df = pd.concat([
        data_df,
        pd.DataFrame({
            "g": np.nan,
            "Cells_x": [1, 3, 8],
            "Nuclei_y": [5, 3, 1]
        }),
    ])

    result = aggregate(
        population_df=data_missing_group_df,
        strata=["g"],
        features="infer",
        operation="median",
    )
    # There should be three total groups
    assert result.shape[0] == 3

コード例 #5

0

ファイルを表示

ファイル: test_cells.py プロジェクト: shntnu/pycytominer

def test_aggregate_comparment():
    """
    Aggregating the merged image/cells table must match ``ap.aggregate_compartment``.

    Both results are also compared against a fixed expected frame.
    """
    merged_df = image_df.merge(
        cells_df, how="inner", on=["TableNumber", "ImageNumber"]
    )
    direct_result = aggregate(merged_df)
    compartment_result = ap.aggregate_compartment("cells")

    expected_columns = {
        "Metadata_Plate": ["plate", "plate"],
        "Metadata_Well": ["A01", "A02"],
        "Cells_a": [368.0, 583.5],
        "Cells_b": [482.0, 478.5],
        "Cells_c": [531.0, 461.5],
        "Cells_d": [585.5, 428.0],
    }
    expected = pd.DataFrame(expected_columns)

    # Pairwise comparisons, in the same order as the original assertions.
    comparisons = (
        (direct_result, expected),
        (direct_result, compartment_result),
        (compartment_result, expected),
    )
    for left, right in comparisons:
        pd.testing.assert_frame_equal(left, right)

コード例 #6

0

ファイルを表示

def test_custom_objectnumber_feature():
    """
    Object counting must work when the object-number column has a custom name.

    ``Metadata_ObjectNumber`` is renamed on a copy of ``data_df`` and the new
    name is passed through ``object_feature``.
    """

    renamed_df = data_df.copy().rename(
        columns={'Metadata_ObjectNumber': 'Custom_ObjectNumber_Feature'}
    )

    counted_result = aggregate(
        population_df=renamed_df,
        strata=["g"],
        features="infer",
        operation="median",
        compute_object_count=True,
        object_feature='Custom_ObjectNumber_Feature',
    )

    # One expected row per stratum; both groups carry the same values.
    per_group_rows = [
        pd.DataFrame(
            {
                "g": group_label,
                "Metadata_Object_Count": [3],
                "Cells_x": [3],
                "Nuclei_y": [3],
            }
        )
        for group_label in ("a", "b")
    ]
    expected = pd.concat(per_group_rows).reset_index(drop=True)
    expected = expected.astype(dtype_convert_dict)

    assert counted_result.equals(expected)

コード例 #7

0

ファイルを表示

ファイル: test_cells.py プロジェクト: michaelbornholdt/pycytominer

def test_aggregate_profiles():
    """
    Check ``ap.aggregate_profiles`` against fixed values and a single-cell path.

    The profiles are first compared with a hard-coded expected frame
    (column order ignored), then with the result of calling ``aggregate`` on
    ``ap.merge_single_cells()`` with object counting enabled.
    """
    profiles = ap.aggregate_profiles()

    expected_columns = {
        "Metadata_Plate": ["plate", "plate"],
        "Metadata_Well": ["A01", "A02"],
        "Metadata_Object_Count": [50, 50],
        "Metadata_Site_Count": [1, 1],
        "Cells_a": [368.0, 583.5],
        "Cells_b": [482.0, 478.5],
        "Cells_c": [531.0, 461.5],
        "Cells_d": [585.5, 428.0],
        "Cytoplasm_a": [479.5, 495.5],
        "Cytoplasm_b": [445.5, 459.0],
        "Cytoplasm_c": [407.5, 352.0],
        "Cytoplasm_d": [533.0, 545.0],
        "Nuclei_a": [591.5, 435.5],
        "Nuclei_b": [574.0, 579.0],
        "Nuclei_c": [588.5, 538.5],
        "Nuclei_d": [483.0, 560.0],
    }
    expected = pd.DataFrame(expected_columns)

    # Sort columns on both sides so column order does not matter.
    pd.testing.assert_frame_equal(
        profiles.sort_index(axis=1), expected.sort_index(axis=1)
    )

    # Confirm aggregation after merging single cells
    merged_cells = ap.merge_single_cells()
    from_single_cells = aggregate(merged_cells, compute_object_count=True)
    from_single_cells = from_single_cells.sort_index(axis="columns")

    # Metadata_Site_Count is dropped from the profiles before this comparison.
    profiles_without_site_count = profiles.sort_index(axis="columns").drop(
        "Metadata_Site_Count", axis="columns"
    )
    pd.testing.assert_frame_equal(profiles_without_site_count, from_single_cells)

コード例 #8

0

ファイルを表示

    def aggregate_compartment(
        self,
        compartment,
        compute_subsample=False,
        compute_counts=False,
        add_image_features=False,
        n_aggregation_memory_strata=1,
    ):
        """Aggregate one compartment chunk by chunk with pycytominer.aggregate().

        Parameters
        ----------
        compartment : str
            Compartment to aggregate.
        compute_subsample : bool, default False
            Whether or not to subsample.
        compute_counts : bool, default False
            Whether or not to compute the number of objects in each compartment
            and the number of fields of view per well.
        add_image_features : bool, default False
            Whether or not to add image features. Only consulted when the
            field-of-view counts are computed.
        n_aggregation_memory_strata : int, default 1
            Number of unique strata to pull from the database into working
            memory at once.  Typically 1 is fastest.  A larger number uses
            more memory.  For example, if aggregating by "well", then
            n_aggregation_memory_strata=1 means that one "well" will be pulled
            from the SQLite database into memory at a time.

        Returns
        -------
        pandas.core.frame.DataFrame
            DataFrame of aggregated profiles, one chunk result stacked below
            the other with a fresh index.

        """

        check_compartments(compartment)

        if (self.subsample_frac < 1 or self.subsample_n != "all") and compute_subsample:
            self.get_subsample(compartment=compartment)

        # The image table is loaded on first use only.
        if not self.load_image_data:
            self.load_image()
            self.load_image_data = True

        aggregated_chunks = []
        chunk_stream = self._compartment_df_generator(
            compartment=compartment,
            n_aggregation_memory_strata=n_aggregation_memory_strata,
        )

        # Each chunk of the compartment table is aggregated independently.
        for chunk_df in chunk_stream:
            merged_df = self.image_df.merge(
                chunk_df, how="inner", on=self.merge_cols
            )
            merged_df = merged_df.rename(self.linking_col_rename, axis="columns")

            features_to_aggregate = (
                infer_cp_features(merged_df, compartments=compartment)
                if self.features == "infer"
                else self.features
            )

            chunk_profiles = aggregate(
                population_df=merged_df,
                strata=self.strata,
                compute_object_count=compute_counts,
                operation=self.aggregation_operation,
                subset_data_df=self.subset_data_df,
                features=features_to_aggregate,
                object_feature=self.object_feature,
            )

            # No field-of-view counts requested, or the field-of-view feature
            # is itself a stratum: keep the chunk result as is.
            if not compute_counts or self.fields_of_view_feature in self.strata:
                aggregated_chunks.append(chunk_profiles)
                continue

            site_counts_df = aggregate_fields_count(
                self.image_df, self.strata, self.fields_of_view_feature
            )

            if add_image_features:
                site_counts_df = aggregate_image_features(
                    site_counts_df,
                    self.image_features_df,
                    self.image_feature_categories,
                    self.image_cols,
                    self.strata,
                    self.aggregation_operation,
                )

            # Right merge keeps every row of the aggregated chunk.
            chunk_profiles = site_counts_df.merge(
                chunk_profiles,
                on=self.strata,
                how="right",
            )

            # Put metadata columns first, feature columns after them.
            metadata_cols = infer_cp_features(chunk_profiles, metadata=True)
            feature_cols = infer_cp_features(chunk_profiles, image_features=True)
            chunk_profiles = chunk_profiles.reindex(
                columns=metadata_cols + feature_cols
            )

            aggregated_chunks.append(chunk_profiles)

        # Stack the per-chunk results row-wise into the final output.
        return pd.concat(aggregated_chunks, axis=0).reset_index(drop=True)

コード例 #9

0

ファイルを表示

ファイル: 1.aggregate.py プロジェクト: gwaygenomics/pooled-cell-painting-profiling-recipe

        else:
            warnings.warn(
                f"{site_file} does not exist. There must have been an error in processing"
            )

    single_cell_df = pd.concat(single_cell_df,
                               axis="rows").reset_index(drop=True)

# Perform the aggregation based on the defined levels and columns
# Make sure the output directory exists before any file is written.
aggregate_output_dir.mkdir(parents=True, exist_ok=True)
# aggregate_levels maps a level name to the columns used as strata for that
# level; aggregate_output_files is indexed by the same level names.
for aggregate_level, aggregate_columns in aggregate_levels.items():
    aggregate_output_file = aggregate_output_files[aggregate_level]

    print(
        f"Now aggregating by {aggregate_level}...with operation: {aggregate_operation}"
    )

    # Every level aggregates the same single cell table with the same
    # features and operation; only the strata differ.
    aggregate_df = aggregate(
        population_df=single_cell_df,
        strata=aggregate_columns,
        features=aggregate_features,
        operation=aggregate_operation,
    )

    # Write the aggregated profiles for this level to its own output file.
    output(
        aggregate_df,
        output_filename=aggregate_output_file,
        compression=compression,
        float_format=float_format,
    )

コード例 #10

0

ファイルを表示

ファイル: cells.py プロジェクト: shntnu/pycytominer

    def aggregate_compartment(
        self,
        compartment,
        compute_subsample=False,
        compute_counts=False,
        aggregate_args=None,
    ):
        """Aggregate morphological profiles. Uses pycytominer.aggregate()

        Parameters
        ----------
        compartment : str
            Compartment to aggregate.
        compute_subsample : bool, default False
            Whether or not to subsample.
        compute_counts : bool, default False
            Whether or not to compute the number of objects in each compartment
            and the number of fields of view per well.
        aggregate_args : dict, optional
            Additional arguments passed as input to pycytominer.aggregate().
            The mapping is copied before "features" and "object_feature" are
            filled in, so the caller's object is never modified.

        Returns
        -------
        pandas.core.frame.DataFrame
            DataFrame of aggregated profiles. When counts are requested and
            the field-of-view feature is not one of the strata, a
            "Metadata_Site_Count" column is merged in.

        """

        check_compartments(compartment)

        if (self.subsample_frac < 1 or self.subsample_n != "all") and compute_subsample:
            self.get_subsample(compartment=compartment)

        # Load image data if not already loaded
        if not self.load_image_data:
            self.load_image()
            self.load_image_data = True

        population_df = self.image_df.merge(
            self.load_compartment(compartment=compartment),
            how="inner",
            on=self.merge_cols,
        ).rename(self.linking_col_rename, axis="columns")

        # Work on a private copy: writing the resolved values back into the
        # caller's dict would leak this compartment's features into later
        # calls that reuse the same dict.
        aggregate_args = {} if aggregate_args is None else dict(aggregate_args)

        # Inferring features is tricky with non-canonical data, so resolve
        # them here, restricted to the requested compartment. A missing
        # "features" entry is treated like "infer". The isinstance guard keeps
        # array-like feature collections from being compared element-wise.
        features = aggregate_args.get("features", "infer")
        if isinstance(features, str) and features == "infer":
            features = infer_cp_features(population_df, compartments=compartment)

        aggregate_args["features"] = features
        if "object_feature" not in aggregate_args:
            aggregate_args["object_feature"] = self.object_feature

        object_df = aggregate(
            population_df=population_df,
            strata=self.strata,
            compute_object_count=compute_counts,
            operation=self.aggregation_operation,
            subset_data_df=self.subset_data_df,
            **aggregate_args,
        )

        if compute_counts and self.fields_of_view_feature not in self.strata:
            # Count field-of-view entries per strata group in the image table.
            fields_count_df = self.image_df.loc[
                :, list(np.union1d(self.strata, self.fields_of_view_feature))
            ]
            fields_count_df = (
                fields_count_df.groupby(self.strata)[self.fields_of_view_feature]
                .count()
                .reset_index()
                .rename(
                    columns={f"{self.fields_of_view_feature}": "Metadata_Site_Count"}
                )
            )

            # Right merge keeps every aggregated profile row.
            object_df = fields_count_df.merge(object_df, on=self.strata, how="right")

        return object_df

コード例 #11

0

ファイルを表示

def consensus(
    profiles,
    replicate_columns=["Metadata_Plate", "Metadata_Well"],
    operation="median",
    features="infer",
    output_file="none",
    compression_options=None,
    float_format=None,
    modz_args={"method": "spearman"},
):
    """Collapse replicate profiles into level 5 consensus profiles.

    :param profiles: A file or pandas DataFrame of profile data
    :type profiles: str, pandas.DataFrame
    :param replicate_columns: Metadata columns indicating which replicates to collapse, defaults to ["Metadata_Plate", "Metadata_Well"]
    :type replicate_columns: list
    :param operation: The method used to form consensus profiles, defaults to "median"
    :type operation: str
    :param features: The features to collapse, defaults to "infer"
    :type features: str, list
    :param output_file: If specified, the location to write the file, defaults to "none"
    :type output_file: str
    :param compression_options: the method to compress output data, defaults to None. See pycytominer.cyto_utils.output.py for options
    :type compression_options: str
    :param float_format: decimal precision to use in writing output file, defaults to None. For example, use "%.3g" for 3 decimal precision.
    :type float_format: str
    :param modz_args: Additional custom arguments passed as kwargs if operation="modz". See pycytominer.cyto_utils.modz for more details.
    :type modz_args: dict
    :return: The consensus DataFrame when output_file is "none"; otherwise the profiles are written to output_file and nothing is returned.

    :Example:

    import pandas as pd
    from pycytominer import consensus

    data_df = pd.concat(
        [
            pd.DataFrame(
                {
                    "Metadata_Plate": "X",
                    "Metadata_Well": "a",
                    "Cells_x": [0.1, 0.3, 0.8],
                    "Nuclei_y": [0.5, 0.3, 0.1],
                }
            ),
            pd.DataFrame(
                {
                    "Metadata_Plate": "X",
                    "Metadata_Well": "b",
                    "Cells_x": [0.4, 0.2, -0.5],
                    "Nuclei_y": [-0.8, 1.2, -0.5],
                }
            ),
        ]
    ).reset_index(drop=True)

    consensus_df = consensus(
        profiles=data_df,
        replicate_columns=["Metadata_Plate", "Metadata_Well"],
        operation="median",
        features="infer",
        output_file="none",
    )
    """
    # Validate the requested operation before any data is loaded.
    check_consensus_operation(operation)

    profiles = load_profiles(profiles)

    # "modz" has its own implementation; every other operation goes through
    # aggregate().
    if operation != "modz":
        consensus_df = aggregate(
            population_df=profiles,
            strata=replicate_columns,
            features=features,
            operation=operation,
            subset_data_df="none",
        )
    else:
        consensus_df = modz(
            population_df=profiles,
            replicate_columns=replicate_columns,
            features=features,
            **modz_args,
        )

    # Without an output file the consensus profiles go back to the caller.
    if output_file == "none":
        return consensus_df

    output(
        df=consensus_df,
        output_filename=output_file,
        compression_options=compression_options,
        float_format=float_format,
    )

コード例 #12

0

ファイルを表示

ファイル: 2.build-consensus-signatures.py プロジェクト: broadinstitute/cell-health

                            axis="columns"))

# Write the full measurement table to a tab-separated file under "results",
# without the index column.
file = os.path.join("results", "all_profile_metadata.tsv")
all_measurements_df.to_csv(file, sep='\t', index=False)

print(all_measurements_df.shape)
# NOTE(review): the "# In[..]" markers suggest this file was exported from a
# notebook; as a plain script the bare .head() result is discarded.
all_measurements_df.head()

# ## A. Apply Median Consensus Aggregation
#
# ### 1) To the Cell Painting Data

# In[10]:

# Median of every inferred feature per (cell line, perturbation) pair.
x_median_df = aggregate(x_df,
                        strata=["Metadata_cell_line", "Metadata_pert_name"],
                        features="infer",
                        operation="median")

# Keep only rows whose perturbation name and cell line also occur in
# all_measurements_df ("@" makes query() read the Python variable). After
# filtering, the fresh 0..n-1 index is turned into a column named
# "Metadata_profile_id".
x_median_df = (x_median_df.query(
    "Metadata_pert_name in @all_measurements_df.Metadata_pert_name.unique()"
).query(
    "Metadata_cell_line in @all_measurements_df.Metadata_cell_line.unique()"
).reset_index(drop=True).reset_index().rename({"index": "Metadata_profile_id"},
                                              axis='columns'))
# Replace the numeric ids with string labels of the form "profile_<n>".
x_median_df.Metadata_profile_id = [
    "profile_{}".format(x) for x in x_median_df.Metadata_profile_id
]

print(x_median_df.shape)
x_median_df.head()

コード例 #13

0

ファイルを表示

ファイル: 3.finalize-perturbseq.py プロジェクト: broadinstitute/grit-benchmark

    output_file,
    index=False,
    sep="\t",
    compression={"method": "gzip", "mtime": 1}
)


# ## Calculate bulk perturbseq data

# In[7]:


# Perform single cell aggregation into bulk
# (median of each feature in gene_features per guide identity)
bulk_df = aggregate(
    population_df=sc_df,
    strata=["Metadata_guide_identity"],
    features=gene_features,
    operation="median"
)

# Create a column for the gene: the text before the first "_" of the guide
# identity. Rows whose gene comes out as "*" are then dropped.
bulk_df = (
    bulk_df
    .assign(Metadata_gene_identity=[x.split("_")[0] for x in bulk_df.Metadata_guide_identity])
    .query("Metadata_gene_identity != '*'")
)

# Order the columns: the two metadata columns first, then the gene features.
# NOTE(review): the "+" concatenation assumes gene_features is a list -- confirm.
bulk_df = bulk_df.reindex(["Metadata_guide_identity", "Metadata_gene_identity"] + gene_features, axis="columns")

print(bulk_df.shape)
# NOTE(review): looks like a notebook export; as a plain script the bare
# .head() result is discarded.
bulk_df.head()