def pipeline_normalize(self, batch, plate, steps, samples, suffix=None):
        normalize_steps = steps
        output_dir = pathlib.PurePath(".", self.pipeline_output, batch, plate)
        annotate_output_file = pathlib.PurePath(output_dir,
                                                f"{plate}_augmented.csv.gz")
        normalize_output_file = pathlib.PurePath(output_dir,
                                                 f"{plate}_normalized.csv.gz")
        if suffix:
            normalize_output_file = pathlib.PurePath(
                output_dir, f"{plate}_normalized_{suffix}.csv.gz")

        normalization_features = normalize_steps["features"]
        normalization_method = normalize_steps["method"]

        if normalization_features == "infer" and self.noncanonical:
            normalization_features = cyto_utils.infer_cp_features(
                pd.read_csv(annotate_output_file),
                compartments=self.compartments)

        normalize(
            profiles=annotate_output_file,
            features=normalization_features,
            samples=samples,
            method=normalization_method,
            output_file=normalize_output_file,
            compression_options=self.pipeline_options["compression"],
            float_format=self.pipeline_options["float_format"],
        )
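As a rough illustration (not part of the original source), `pipeline_normalize` expects a small dictionary of normalization options plus a pycytominer-style `samples` selector; all names below are placeholders.

# Hedged sketch of the arguments; batch and plate names are hypothetical.
example_normalize_steps = {
    "features": "infer",        # or an explicit list of feature columns
    "method": "mad_robustize",  # any pycytominer normalize method
}

# `self` stands for the pipeline object providing pipeline_output,
# pipeline_options, noncanonical, and compartments:
# self.pipeline_normalize(
#     batch="example_batch",
#     plate="example_plate",
#     steps=example_normalize_steps,
#     samples="all",
#     suffix="dmso",
# )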
def normalize_profile(plate,
                      output_dir,
                      commit="cd91bd0daacef2b5ea25dcceb62482bb664d9de1"):
    link = f"https://github.com/broadinstitute/cell-health/raw/{commit}/1.generate-profiles/data/profiles/{plate}/{plate}_augmented.csv.gz"

    annotate_df = pd.read_csv(link)

    norm_file = pathlib.Path(
        f"{output_dir}/{plate}_wholeplate_normalized.csv.gz")
    feat_select_file = pathlib.Path(
        f"{output_dir}/{plate}_wholeplate_normalized_feature_selected.csv.gz")

    normalize(profiles=annotate_df,
              features="infer",
              meta_features=meta_features,
              samples="all",
              method="mad_robustize",
              output_file=norm_file,
              compression_options={
                  "method": "gzip",
                  "mtime": 1
              })
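`meta_features` is defined elsewhere in the original notebook; the list below only illustrates its likely shape, and the plate barcode in the commented call is a placeholder.

# Illustration only: the real list lives earlier in the source notebook.
meta_features = [
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_broad_sample",
]

# normalize_profile(plate="EXAMPLE_PLATE", output_dir="data/profiles")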
Example #3
normalize_singlecell_from_single_file = sc_config[
    "output_one_single_cell_file_only"]

normalize_args = config["options"]["profile"]["normalize"]
normalize_levels = normalize_args["levels"]
normalize_by_samples = normalize_args["by_samples"]
normalize_these_features = normalize_args["features"]
normalize_method = normalize_args["method"]

for data_level in normalize_levels:
    if data_level == "single_cell":
        if not normalize_singlecell_from_single_file:
            continue

    file_to_normalize = normalize_input_files[data_level]
    output_file = normalize_output_files[data_level]

    print(f"Now normalizing {data_level}...with operation: {normalize_method}")

    df = pd.read_csv(file_to_normalize)

    normalize(
        profiles=df,
        features=normalize_these_features,
        samples=normalize_by_samples,
        method=normalize_method,
        output_file=output_file,
        compression=compression,
        float_format=float_format,
    )
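For orientation, a configuration fragment consistent with the keys read above might look like the following; the exact schema (and the `normalize_input_files` / `normalize_output_files` maps, compression, and float format) is defined elsewhere in the full workflow, so treat this as an assumption.

# Assumed shape of the relevant configuration sections (illustration only).
config = {
    "options": {
        "profile": {
            "normalize": {
                "levels": ["whole_plate", "single_cell"],
                "by_samples": "all",
                "features": "infer",
                "method": "standardize",
            }
        }
    }
}
sc_config = {"output_one_single_cell_file_only": False}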
Example #4
def test_merge_single_cells():
    sc_merged_df = ap.merge_single_cells()

    # Assert that the image data was merged
    assert all(x in sc_merged_df.columns
               for x in ["Metadata_Plate", "Metadata_Well"])

    # Assert that metadata columns were renamed appropriately
    for x in ap.full_merge_suffix_rename:
        assert ap.full_merge_suffix_rename[x] == "Metadata_{x}".format(x=x)

    # Perform a manual merge
    manual_merge = cytoplasm_df.merge(
        cells_df,
        left_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Cells"],
        right_on=["TableNumber", "ImageNumber", "ObjectNumber"],
        suffixes=["_cytoplasm", "_cells"],
    ).merge(
        nuclei_df,
        left_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Nuclei"],
        right_on=["TableNumber", "ImageNumber", "ObjectNumber"],
        suffixes=["_cytoplasm", "_nuclei"],
    )

    manual_merge = image_df.merge(manual_merge, on=ap.merge_cols,
                                  how="right").rename(
                                      ap.full_merge_suffix_rename,
                                      axis="columns")

    # Confirm that the merge correctly reversed the object number (opposite from Parent)
    assert (sc_merged_df.Metadata_ObjectNumber_cytoplasm.tolist()[::-1] ==
            sc_merged_df.Metadata_ObjectNumber.tolist())
    assert (manual_merge.Metadata_ObjectNumber_cytoplasm.tolist()[::-1] ==
            sc_merged_df.Metadata_ObjectNumber.tolist())
    assert (manual_merge.Metadata_ObjectNumber_cells.tolist() ==
            sc_merged_df.Metadata_ObjectNumber.tolist())

    # Confirm the merge and adding merge options
    for method in ["standardize", "robustize"]:
        for samples in ["all", "Metadata_ImageNumber == 'x'"]:
            for features in ["infer", ["Cytoplasm_a", "Cells_a"]]:

                norm_method_df = ap.merge_single_cells(
                    single_cell_normalize=True,
                    normalize_args={
                        "method": method,
                        "samples": samples,
                        "features": features,
                    },
                )

                manual_merge_normalize = normalize(manual_merge,
                                                   method=method,
                                                   samples=samples,
                                                   features=features)

                pd.testing.assert_frame_equal(
                    norm_method_df.sort_index(axis=1),
                    manual_merge_normalize.sort_index(axis=1),
                )

    # Test non-canonical compartment merging
    new_sc_merge_df = ap_new.merge_single_cells()

    assert sum(new_sc_merge_df.columns.str.startswith("New")) == 4
    assert (new_compartment_df.ObjectNumber.tolist()[::-1] ==
            new_sc_merge_df.Metadata_ObjectNumber_new.tolist())

    norm_new_method_df = ap_new.merge_single_cells(
        single_cell_normalize=True,
        normalize_args={
            "method": "standardize",
            "samples": "all",
            "features": "infer",
        },
    )

    norm_new_method_no_feature_infer_df = ap_new.merge_single_cells(
        single_cell_normalize=True,
        normalize_args={
            "method": "standardize",
            "samples": "all",
        },
    )

    default_feature_infer_df = ap_new.merge_single_cells(
        single_cell_normalize=True)

    pd.testing.assert_frame_equal(norm_new_method_df, default_feature_infer_df)
    pd.testing.assert_frame_equal(norm_new_method_df,
                                  norm_new_method_no_feature_infer_df)

    new_compartment_cols = infer_cp_features(new_compartment_df,
                                             compartments=ap_new.compartments)
    traditional_norm_df = normalize(
        ap_new.image_df.merge(new_compartment_df, on=ap.merge_cols),
        features=new_compartment_cols,
        samples="all",
        method="standardize",
    )

    pd.testing.assert_frame_equal(
        norm_new_method_df.loc[:, new_compartment_cols].abs().describe(),
        traditional_norm_df.loc[:, new_compartment_cols].abs().describe(),
    )
def process_profile(sql_file, batch, plate, pipeline):
    """
    Given batch details and a pipeline, process morphology profiles
    """
    assert batch in sql_file, "batch {} not recognized in sql file {}".format(
        batch, sql_file)
    assert plate in sql_file, "plate {} not recognized in sql file {}".format(
        plate, sql_file)

    # Set output directory information
    pipeline_output = pipeline["output_dir"]
    output_dir = os.path.join(pipeline_output, batch, plate)
    os.makedirs(output_dir, exist_ok=True)

    # Set output file information
    aggregate_out_file = os.path.join(output_dir, "{}.csv.gz".format(plate))
    annotate_out_file = os.path.join(output_dir,
                                     "{}_augmented.csv.gz".format(plate))
    normalize_out_file = os.path.join(output_dir,
                                      "{}_normalized.csv.gz".format(plate))
    feature_out_file = os.path.join(
        output_dir, "{}_normalized_feature_selected.csv.gz".format(plate))

    # Load pipeline options
    compression = process_pipeline(pipeline["options"], option="compression")
    samples = process_pipeline(pipeline["options"], option="samples")

    # Load and setup platemap info
    workspace_dir = pipeline["workspace_dir"]
    batch_dir = os.path.join(workspace_dir, "backend", batch)
    metadata_dir = os.path.join(workspace_dir, "metadata", batch)

    barcode_plate_map_file = os.path.join(metadata_dir,
                                          sorted(os.listdir(metadata_dir))[0])
    barcode_plate_map_df = pd.read_csv(barcode_plate_map_file)
    plate_map_name = barcode_plate_map_df.query(
        "Assay_Plate_Barcode == @plate").Plate_Map_Name.values[0]
    plate_map_file = os.path.join(metadata_dir, "platemap",
                                  "{}.txt".format(plate_map_name))
    plate_map_df = pd.read_csv(plate_map_file, sep="\t")
    plate_map_df.columns = [
        "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
        for x in plate_map_df.columns
    ]
    platemap_well_column = pipeline["platemap_well_column"]

    # Step 1: Aggregate
    aggregate_steps = pipeline["aggregate"]
    if aggregate_steps["perform"]:
        aggregate_features = aggregate_steps["features"]
        aggregate_operation = aggregate_steps["method"]
        aggregate_plate_column = aggregate_steps["plate_column"]
        aggregate_well_column = aggregate_steps["well_column"]

        strata = [aggregate_plate_column, aggregate_well_column]

        if "site_column" in aggregate_steps:
            aggregate_site_column = aggregate_steps["site_column"]
            strata += [aggregate_site_column]

        ap = AggregateProfiles(
            sql_file,
            strata=strata,
            features=aggregate_features,
            operation=aggregate_operation,
        )

        ap.aggregate_profiles(output_file=aggregate_out_file,
                              compression=compression)

        if pipeline["count"]["perform"]:
            count_dir = pipeline["count"]["output_dir"]
            os.makedirs(count_dir, exist_ok=True)

            cell_count_file = os.path.join(
                count_dir, "{}_{}_cell_count.tsv".format(batch, plate))

            cell_count_df = ap.count_cells()

            cell_count_df = cell_count_df.merge(
                plate_map_df,
                left_on=aggregate_well_column,
                right_on=platemap_well_column,
            ).drop(platemap_well_column, axis="columns")

            cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

    # Annotate Profiles
    annotate_steps = pipeline["annotate"]
    if annotate_steps["perform"]:
        annotate_well_column = annotate_steps["well_column"]
        annotate(
            profiles=aggregate_out_file,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file=annotate_out_file,
            compression=compression,
        )

    # Normalize Profiles
    normalize_steps = pipeline["normalize"]
    if normalize_steps["perform"]:
        norm_features = normalize_steps["features"]
        norm_method = normalize_steps["method"]
        normalize(
            profiles=annotate_out_file,
            features=norm_features,
            samples=samples,
            method=norm_method,
            output_file=normalize_out_file,
            compression=compression,
        )

    # Apply feature selection
    feature_select_steps = pipeline["feature_select"]
    if feature_select_steps["perform"]:
        feature_select_operations = feature_select_steps["operations"]
        feature_select_features = feature_select_steps["features"]
        feature_select(
            profiles=normalize_out_file,
            features=feature_select_features,
            samples="none",
            operation=feature_select_operations,
            output_file=feature_out_file,
            compression=compression,
            corr_threshold=0.9,
            corr_method="pearson",
        )
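The `pipeline` argument is a nested configuration (typically loaded from a YAML file). A fragment consistent with the keys this function accesses is sketched below; the values are placeholders, not the original settings.

# Illustration of the pipeline configuration read by process_profile.
pipeline = {
    "output_dir": "profiles",
    "workspace_dir": "workspace",
    "platemap_well_column": "Metadata_well_position",
    "options": {"compression": "gzip", "samples": "all"},
    "count": {"perform": True, "output_dir": "cell_counts"},
    "aggregate": {
        "perform": True,
        "features": "infer",
        "method": "median",
        "plate_column": "Metadata_Plate",
        "well_column": "Metadata_Well",
    },
    "annotate": {"perform": True, "well_column": "Metadata_Well"},
    "normalize": {"perform": True, "features": "infer", "method": "mad_robustize"},
    "feature_select": {
        "perform": True,
        "features": "infer",
        "operations": ["variance_threshold", "drop_na_columns", "blacklist"],
    },
}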
print(feature_df.shape)
feature_df.head()

# In[6]:

# Perform spherize transform
for file in data_files:
    # Extract plate from file name
    plate = str(file).split("/")[-1].split("_")[0]
    print(f"Now processing {plate}...")

    # Load data and apply feature selection
    df = pd.read_csv(file).reindex(feature_df.index, axis="columns")

    # Get feature names
    metadata_cols = ["Image_Metadata_Well"] + infer_cp_features(df,
                                                                metadata=True)
    feature_cols = infer_cp_features(
        df, compartments=["Cells", "Cytoplasm", "Nuclei"])

    output_file = pathlib.Path(f"{data_dir}/{plate}_{output_file_suffix}")

    # Apply spherize transformation and output files
    normalize(profiles=df,
              features=feature_cols,
              meta_features=metadata_cols,
              method="spherize",
              spherize_method="ZCA-cor",
              spherize_center=True,
              output_file=output_file)
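A quick sanity check, not in the original notebook: after an all-sample ZCA-cor sphering, the covariance of the profiled feature columns should be close to the identity matrix. The snippet assumes `output_file` and `feature_cols` still refer to the last plate processed in the loop above.

import numpy as np

# Read back the last spherized plate and measure how far its feature
# covariance is from the identity matrix.
spherized_df = pd.read_csv(output_file)
feature_cov = np.cov(spherized_df.loc[:, feature_cols].transpose())
deviation = np.abs(feature_cov - np.eye(len(feature_cols))).max()
print(f"max |cov - I| after sphering: {deviation:.3f}")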
Example #7
def process_profile(sql_file, batch, plate, pipeline):
    """
    Given batch details and a pipeline, process morphology profiles
    """
    assert batch in sql_file, "batch {} not recognized in sql file {}".format(
        batch, sql_file)
    assert plate in sql_file, "plate {} not recognized in sql file {}".format(
        plate, sql_file)

    # Set output directory information
    pipeline_output = pipeline["output_dir"]
    output_dir = os.path.join(pipeline_output, batch, plate)
    os.makedirs(output_dir, exist_ok=True)

    # Set output file information
    aggregate_out_file = os.path.join(output_dir, "{}.csv.gz".format(plate))
    annotate_out_file = os.path.join(output_dir,
                                     "{}_augmented.csv.gz".format(plate))
    normalize_out_file = os.path.join(output_dir,
                                      "{}_normalized.csv.gz".format(plate))
    feature_out_file = os.path.join(
        output_dir, "{}_normalized_feature_selected.csv.gz".format(plate))

    # Load pipeline options
    compression = process_pipeline(pipeline["options"], option="compression")
    sc_float_format = process_pipeline(pipeline["options"],
                                       option="sc_float_format")
    samples = process_pipeline(pipeline["options"], option="samples")

    # Load and setup platemap info
    workspace_dir = pipeline["workspace_dir"]
    batch_dir = os.path.join(workspace_dir, "backend", batch)
    metadata_dir = os.path.join(workspace_dir, "metadata", batch)

    barcode_plate_map_file = os.path.join(metadata_dir,
                                          sorted(os.listdir(metadata_dir))[0])
    barcode_plate_map_df = pd.read_csv(barcode_plate_map_file)
    plate_map_name = barcode_plate_map_df.query(
        "Assay_Plate_Barcode == @plate").Plate_Map_Name.values[0]
    plate_map_file = os.path.join(metadata_dir, "platemap",
                                  "{}.txt".format(plate_map_name))
    plate_map_df = pd.read_csv(plate_map_file, sep="\t")
    plate_map_df.columns = [
        "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
        for x in plate_map_df.columns
    ]
    platemap_well_column = pipeline["platemap_well_column"]

    # Process Bulk profiles
    # Step 1: Aggregate
    aggregate_steps = pipeline["aggregate"]
    aggregate_features = aggregate_steps["features"]
    aggregate_operation = aggregate_steps["method"]
    aggregate_plate_column = aggregate_steps["plate_column"]
    aggregate_well_column = aggregate_steps["well_column"]

    strata = [aggregate_plate_column, aggregate_well_column]

    if "site_column" in aggregate_steps:
        aggregate_site_column = aggregate_steps["site_column"]
        strata += [aggregate_site_column]

    if aggregate_steps["perform"]:
        ap = AggregateProfiles(
            sql_file,
            strata=strata,
            features=aggregate_features,
            operation=aggregate_operation,
        )

        ap.aggregate_profiles(output_file=aggregate_out_file,
                              compression=compression)

    if pipeline["count"]["perform"]:
        if not aggregate_steps["perform"]:
            ap = AggregateProfiles(
                sql_file,
                strata=strata,
                features=aggregate_features,
                operation=aggregate_operation,
            )
        count_dir = pipeline["count"]["output_dir"]
        os.makedirs(count_dir, exist_ok=True)

        cell_count_file = os.path.join(
            count_dir, "{}_{}_cell_count.tsv".format(batch, plate))

        cell_count_df = ap.count_cells()

        cell_count_df = cell_count_df.merge(
            plate_map_df,
            left_on=aggregate_well_column,
            right_on=platemap_well_column,
        ).drop(platemap_well_column, axis="columns")

        cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

    # Annotate Profiles
    annotate_steps = pipeline["annotate"]
    annotate_well_column = annotate_steps["well_column"]
    if annotate_steps["perform"]:
        annotate(
            profiles=aggregate_out_file,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file=annotate_out_file,
            compression=compression,
        )

    # Normalize Profiles
    normalize_steps = pipeline["normalize"]
    norm_features = normalize_steps["features"]
    norm_method = normalize_steps["method"]
    if normalize_steps["perform"]:
        normalize(
            profiles=annotate_out_file,
            features=norm_features,
            samples=samples,
            method=norm_method,
            output_file=normalize_out_file,
            compression=compression,
        )

    # Apply feature selection
    feature_select_steps = pipeline["feature_select"]
    feature_select_operations = feature_select_steps["operations"]
    feature_select_features = feature_select_steps["features"]
    if feature_select_steps["perform"]:
        feature_select(
            profiles=normalize_out_file,
            features=feature_select_features,
            samples=samples,
            operation=feature_select_operations,
            output_file=feature_out_file,
            compression=compression,
            corr_threshold=0.9,
            corr_method="pearson",
        )

    sc_steps = pipeline["single_cell"]
    if sc_steps["perform"]:
        if not aggregate_steps["perform"]:
            ap = AggregateProfiles(
                sql_file,
                strata=strata,
                features=aggregate_features,
                operation=aggregate_operation,
            )

        # Load cells
        query = "select * from cells"
        cell_df = pd.read_sql(sql=query, con=ap.conn)

        # Load cytoplasm
        query = "select * from cytoplasm"
        cytoplasm_df = pd.read_sql(sql=query, con=ap.conn)

        # Load nuclei
        query = "select * from nuclei"
        nuclei_df = pd.read_sql(sql=query, con=ap.conn)

        # Merge single cells together
        sc_merged_df = (cell_df.merge(
            cytoplasm_df.drop("ObjectNumber", axis="columns"),
            left_on=["TableNumber", "ImageNumber", "ObjectNumber"],
            right_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Cells"],
            how="inner",
        ).drop("ObjectNumber", axis="columns").merge(
            nuclei_df,
            left_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Nuclei"],
            right_on=["TableNumber", "ImageNumber", "ObjectNumber"],
            how="inner",
        ))

        # Merge image data info
        sc_merged_df = ap.image_df.merge(sc_merged_df,
                                         how="right",
                                         on=ap.merge_cols)

        # Make sure column names are correctly prefixed
        prefix = ["Metadata", "Cells", "Cytoplasm", "Nuclei"]
        cols = []
        for col in sc_merged_df.columns:
            if any([col.startswith(x) for x in prefix]):
                cols.append(col)
            else:
                cols.append(f"Metadata_{col}")
        sc_merged_df.columns = cols

        sc_merged_df = annotate(
            profiles=sc_merged_df,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file="none",
        )

        if sc_steps["normalize"]:
            sc_merged_df = normalize(
                profiles=sc_merged_df,
                features=norm_features,
                samples=samples,
                method=norm_method,
                output_file="none",
            )

        if sc_steps["feature_select"]:
            sc_merged_df = feature_select(
                profiles=sc_merged_df,
                features=feature_select_features,
                samples=samples,
                operation=feature_select_operations,
                output_file="none",
                corr_threshold=0.9,
                corr_method="pearson",
            )

        sc_pipeline_output = pipeline["sc_output_dir"]
        sc_output_dir = os.path.join(sc_pipeline_output, batch, plate)
        os.makedirs(sc_output_dir, exist_ok=True)

        # Set output file information
        sc_out_file = os.path.join(sc_output_dir,
                                   "{}_single_cell.csv.gz".format(plate))
        output(
            df=sc_merged_df,
            output_filename=sc_out_file,
            compression="gzip",
            float_format=sc_float_format,
        )
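Relative to the earlier `process_profile`, this variant also reads a single-cell section of the pipeline configuration. The keys it touches suggest a fragment like the following (placeholders, not the original config):

# Assumed additions to the pipeline configuration for the single-cell branch.
pipeline_single_cell_extras = {
    "sc_output_dir": "single_cell_profiles",
    "single_cell": {
        "perform": True,
        "normalize": True,
        "feature_select": False,
    },
    # "options" would additionally carry "sc_float_format", e.g. "%.5g".
}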
Example #8
    def merge_single_cells(
        self,
        compute_subsample=False,
        sc_output_file="none",
        compression_options=None,
        float_format=None,
        single_cell_normalize=False,
        normalize_args=None,
    ):
        """Given the linking columns, merge single cell data. Normalization is also supported.

        Parameters
        ----------
        compute_subsample : bool, default False
            Whether or not to compute subsample.
        sc_output_file : str, optional
            The name of a file to output.
        compression_options : str, optional
            Compression arguments as input to pandas.to_csv() with pandas version >= 1.2.
        float_format : str, optional
            Decimal precision to use in writing output file.
        single_cell_normalize : bool, default False
            Whether or not to normalize the single cell data.
        normalize_args : dict, optional
            Additional arguments passed as input to pycytominer.normalize().

        Returns
        -------
        pandas.core.frame.DataFrame
            Either a dataframe (if sc_output_file="none") or will write to file.

        """

        # Load the single cell dataframe by merging on the specific linking columns
        sc_df = ""
        linking_check_cols = []
        merge_suffix_rename = []
        for left_compartment in self.compartment_linking_cols:
            for right_compartment in self.compartment_linking_cols[left_compartment]:
                # Make sure only one merge per combination occurs
                linking_check = "-".join(sorted([left_compartment, right_compartment]))
                if linking_check in linking_check_cols:
                    continue

                # Specify how to indicate merge suffixes
                merge_suffix = [
                    "_{comp_l}".format(comp_l=left_compartment),
                    "_{comp_r}".format(comp_r=right_compartment),
                ]
                merge_suffix_rename += merge_suffix
                left_link_col = self.compartment_linking_cols[left_compartment][
                    right_compartment
                ]
                right_link_col = self.compartment_linking_cols[right_compartment][
                    left_compartment
                ]

                if isinstance(sc_df, str):
                    initial_df = self.load_compartment(compartment=left_compartment)

                    if compute_subsample:
                        # Sample cells proportionally by self.strata
                        self.get_subsample(df=initial_df, rename_col=False)

                        subset_logic_df = self.subset_data_df.drop(
                            self.image_df.columns, axis="columns"
                        )

                        initial_df = subset_logic_df.merge(
                            initial_df, how="left", on=subset_logic_df.columns.tolist()
                        ).reindex(initial_df.columns, axis="columns")

                    sc_df = initial_df.merge(
                        self.load_compartment(compartment=right_compartment),
                        left_on=self.merge_cols + [left_link_col],
                        right_on=self.merge_cols + [right_link_col],
                        suffixes=merge_suffix,
                    )
                else:
                    sc_df = sc_df.merge(
                        self.load_compartment(compartment=right_compartment),
                        left_on=self.merge_cols + [left_link_col],
                        right_on=self.merge_cols + [right_link_col],
                        suffixes=merge_suffix,
                    )

                linking_check_cols.append(linking_check)

        # Add metadata prefix to merged suffixes
        full_merge_suffix_rename = []
        full_merge_suffix_original = []
        for col_name in self.merge_cols + list(self.linking_col_rename.keys()):
            full_merge_suffix_original.append(col_name)
            full_merge_suffix_rename.append("Metadata_{x}".format(x=col_name))

        for col_name in self.merge_cols + list(self.linking_col_rename.keys()):
            for suffix in set(merge_suffix_rename):
                full_merge_suffix_original.append("{x}{y}".format(x=col_name, y=suffix))
                full_merge_suffix_rename.append(
                    "Metadata_{x}{y}".format(x=col_name, y=suffix)
                )

        self.full_merge_suffix_rename = dict(
            zip(full_merge_suffix_original, full_merge_suffix_rename)
        )

        # Add image data to single cell dataframe
        if not self.load_image_data:
            self.load_image()
            self.load_image_data = True

        sc_df = (
            self.image_df.merge(sc_df, on=self.merge_cols, how="right")
            .rename(self.linking_col_rename, axis="columns")
            .rename(self.full_merge_suffix_rename, axis="columns")
        )
        if single_cell_normalize:
            # Inferring features is tricky with non-canonical data
            if normalize_args is None:
                normalize_args = {}
                features = infer_cp_features(sc_df, compartments=self.compartments)
            elif "features" not in normalize_args:
                features = infer_cp_features(sc_df, compartments=self.compartments)
            elif normalize_args["features"] == "infer":
                features = infer_cp_features(sc_df, compartments=self.compartments)
            else:
                features = normalize_args["features"]

            normalize_args["features"] = features

            sc_df = normalize(profiles=sc_df, **normalize_args)

        if sc_output_file != "none":
            output(
                df=sc_df,
                output_filename=sc_output_file,
                compression_options=compression_options,
                float_format=float_format,
            )
        else:
            return sc_df
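A hedged usage sketch of the method documented above, mirroring how `SingleCells` is constructed later in this listing; the SQLite path is a placeholder.

from pycytominer.cyto_utils import cells

# Build the single-cell object from a CellProfiler SQLite backend and merge
# compartments with single-cell normalization (arguments follow the docstring).
sc = cells.SingleCells(
    file_or_conn="sqlite:///backend/example_plate.sqlite",  # placeholder path
    strata=["Image_Metadata_Plate", "Image_Metadata_Well"],
)

sc_df = sc.merge_single_cells(
    single_cell_normalize=True,
    normalize_args={"method": "standardize", "samples": "all", "features": "infer"},
)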
Example #9
# Output annotated file
cyto_utils.output(
    df=anno_df,
    output_filename=anno_file,
    float_format=float_format,
    compression_options=compression,
)

# Normalize Profiles (DMSO Control) - Level 4A Data
norm_dmso_file = pathlib.PurePath(output_dir,
                                  f"{plate_name}_normalized_dmso.csv.gz")
normalize(
    profiles=anno_df,
    samples="Metadata_broad_sample == 'DMSO'",
    method=norm_method,
    output_file=norm_dmso_file,
    float_format=float_format,
    compression_options=compression,
)

# Normalize Profiles (Whole Plate) - Level 4A Data
norm_file = pathlib.PurePath(output_dir, f"{plate_name}_normalized.csv.gz")
normalize(
    profiles=anno_df,
    samples="all",
    method=norm_method,
    output_file=norm_file,
    float_format=float_format,
    compression_options=compression,
)
Example #10
# Merge with the image information
merged_df = image_df.merge(merged_df,
                           on=["TableNumber", "ImageNumber"],
                           how="right")

print(merged_df.shape)
merged_df.head()

# ## Apply normalization, feature select, and output data

# In[12]:

normalized_df = normalize(merged_df,
                          features="infer",
                          meta_features="infer",
                          samples="all",
                          method="standardize")

# In[13]:

feature_select_df = feature_select(
    normalized_df,
    features="infer",
    operation=feature_select_opts,
    output_file="none",
    na_cutoff=na_cutoff,
    corr_threshold=corr_threshold,
)

print(feature_select_df.shape)
Example #11
def get_profiles(plate, backend_dir, metadata_dir, barcode_platemap_df):
    """
    Apply all profiling steps for a given plate.

    Output:
    Will write a series of processed files to disk
    """
    print("Processing {}.....".format(plate))
    sqlite_file = "sqlite:///{}/{}.sqlite".format(backend_dir, plate)

    # Load specific platemap
    platemap = barcode_platemap_df.query(
        "Assay_Plate_Barcode == @plate").Plate_Map_Name.values[0]
    platemap_file = os.path.join(metadata_dir, "platemap",
                                 "{}.csv".format(platemap))
    platemap_df = pd.read_csv(platemap_file)

    # Prepare sql file for processing
    ap = AggregateProfiles(
        sqlite_file, strata=["Image_Metadata_Plate", "Image_Metadata_Well"])

    # Count cells and output
    cell_count_file = os.path.join("results",
                                   "{}_cell_count.tsv".format(plate))
    cell_count_df = ap.count_cells()
    cell_count_df = cell_count_df.merge(
        platemap_df, left_on="Image_Metadata_Well",
        right_on="well_position").drop(["WellRow", "WellCol", "well_position"],
                                       axis="columns")
    cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

    # Begin processing profiles
    output_dir = os.path.join("data", "profiles", plate)
    os.makedirs(output_dir, exist_ok=True)

    # Aggregate single cells into well profiles
    out_file = os.path.join(output_dir, "{}.csv.gz".format(plate))
    ap.aggregate_profiles(output_file=out_file, compression="gzip")

    # Annotate Profiles
    anno_file = os.path.join(output_dir, "{}_augmented.csv.gz".format(plate))
    annotate(
        profiles=out_file,
        platemap=platemap_df,
        join_on=["Metadata_well_position", "Image_Metadata_Well"],
        output_file=anno_file,
        compression="gzip",
    )

    # Define metadata features
    meta_features = [
        "Image_Metadata_Plate",
        "Image_Metadata_Well",
        "Metadata_WellRow",
        "Metadata_WellCol",
        "Metadata_gene_name",
        "Metadata_pert_name",
        "Metadata_broad_sample",
        "Metadata_cell_line",
    ]

    # Normalize Profiles
    norm_file = os.path.join(output_dir, "{}_normalized.csv.gz".format(plate))
    normalize(
        profiles=anno_file,
        features="infer",
        meta_features=meta_features,
        samples="Metadata_pert_name == 'EMPTY'",
        method="mad_robustize",
        output_file=norm_file,
        compression="gzip",
    )

    # Perform feature selection (just drop columns with high number of missingness)
    # Drop columns with high number of missingness, extreme values, and blacklist
    feat_file = os.path.join(
        output_dir, "{}_normalized_feature_select.csv.gz".format(plate))
    feature_select(
        profiles=norm_file,
        features="infer",
        samples="none",
        operation=[
            "drop_na_columns",
            "blacklist",
            "variance_threshold",
            "drop_outliers",
        ],
        output_file=feat_file,
        compression="gzip",
    )

    # Perform audits
    profile_df = pd.read_csv(feat_file).drop(
        ["Image_Metadata_Well", "Image_Metadata_Plate"], axis="columns")

    # Audit guide replicability
    audit_file = os.path.join("results", "{}_audit_guide.csv".format(plate))
    audit(
        profiles=profile_df,
        audit_groups=[
            "Metadata_pert_name", "Metadata_gene_name", "Metadata_cell_line"
        ],
        iterations=10,
        output_file=audit_file,
    )

    # Audit gene replicability
    audit_file = os.path.join("results", "{}_audit_gene.csv".format(plate))
    audit(
        profiles=profile_df,
        audit_groups=["Metadata_gene_name", "Metadata_cell_line"],
        iterations=10,
        output_file=audit_file,
    )
    
    # Set console output
    print(f"Now processing... {output_file}")

    # Initiate single cell class
    sc = cells.SingleCells(
        file_or_conn=sql_file,
        strata=["Image_Metadata_Plate", "Image_Metadata_Well"],
    )
    
    # Merge single cells
    sc_df = sc.merge_single_cells()
    
    # Normalize data
    sc_df = normalize(
        profiles=sc_df,
        method="standardize"
    )
    
    # Merge well and plate metadata
    sc_df = (
        sc.image_df.merge(
            metadata_df,
            left_on="Image_Metadata_Well",
            right_on="Metadata_well_position",
            how="left"
        ).merge(
            sc_df,
            left_on=["TableNumber", "ImageNumber"],
            right_on=["Metadata_TableNumber", "Metadata_ImageNumber"],
            how="right"
        )
    )

for batch in matched_plates:
    compound_matches = matched_plates[batch]
    for compound in compound_matches:
        matched_plates[batch][compound]["data"] = (pd.concat(
            matched_plates[batch][compound]["data"]).reset_index(drop=True))

# In[7]:

# Normalize profiles
for batch in matched_plates:
    compound_matches = matched_plates[batch]
    for compound in compound_matches:
        df = matched_plates[batch][compound]["data"]
        normalized_data = normalize(profiles=df,
                                    features="infer",
                                    meta_features="infer",
                                    samples="all",
                                    method="standardize")

        matched_plates[batch][compound]["normalized_data"] = normalized_data

# In[8]:

# Detect the impact of batch - is it necessary to adjust?
n_components = 20
pca_columns = [f"pca_{x}" for x in range(0, n_components)]
model_formula = "pca_value ~ Metadata_clone_number + Metadata_treatment + Metadata_Plate + Metadata_treatment * Metadata_Plate"

anova_results_full_new_normalized = []
for batch in matched_plates:
    compound_matches = matched_plates[batch]
                                        na_cutoff=na_cut)
        else:
            profile_df = feature_select(profiles=profile_df,
                                        operation=feature_select_ops,
                                        na_cutoff=na_cut,
                                        corr_threshold=corr_threshold,
                                        blocklist_file=full_blocklist_file)

        # Step 2: Spherize transform
        if batch == "2017_12_05_Batch2":
            spherize_df = (profile_df.groupby([
                "Metadata_cell_line", "Metadata_time_point"
            ]).apply(
                lambda x: normalize(profiles=x,
                                    features="infer",
                                    meta_features="infer",
                                    samples="Metadata_broad_sample == 'DMSO'",
                                    method="spherize")))
        else:
            spherize_df = normalize(profiles=profile_df,
                                    features="infer",
                                    meta_features="infer",
                                    samples="Metadata_broad_sample == 'DMSO'",
                                    method="spherize")

        print(spherize_df.shape)
        spherize_df.head()

        # Step 3: Output profiles
        output(df=spherize_df, output_filename=output_file)