Example #1
def feature_selection(dataset_link):
    """
    Perform feature selection by dropping columns with null or 
    only zeros values, and highly correlated values from the data.
    
    params: 
    dataset_link: string of github link to the consensus dataset

    Returns:
    data: returned consensus dataframe
    
    """
    data = pd.read_csv(dataset_link, compression='gzip', error_bad_lines=False)
    cols = data.columns.tolist()
    drop_cols = [
        x for x in cols
        if data[x].isnull().any() or all(y == 0.0 for y in data[x].values)
    ]
    data.drop(drop_cols, axis=1, inplace=True)
    data = feature_select(
        data,
        operation=["correlation_threshold", "variance_threshold", "blocklist"],
        blocklist_file=
        "https://raw.githubusercontent.com/broadinstitute/lincs-cell-painting/1769b32c7cef3385ccc4cea7057386e8a1bde39a/utils/consensus_blocklist.txt"
    )
    return data
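
A minimal usage sketch for the function above; the URL is a placeholder for any gzip-compressed consensus CSV hosted on GitHub, not a real dataset link.

# Hypothetical call (placeholder URL)
consensus_link = "https://raw.githubusercontent.com/example-org/example-repo/master/consensus_median.csv.gz"
consensus_df = feature_selection(consensus_link)
print(consensus_df.shape)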
Example #2
def feature_selection(df_lvl4): 
    """
    Perform feature selection by dropping columns with many null values
    (more than 384, i.e. equivalent to one plate's worth of cell profiles)
    and highly correlated features from the data.
    """
    metadata_columns = [x for x in df_lvl4.columns if (x.startswith("Metadata_"))]
    df_lvl4_metadata = df_lvl4[metadata_columns].copy()
    df_lvl4_features = df_lvl4.drop(metadata_columns, axis = 1)
    null_cols = [col for col in df_lvl4_features.columns if df_lvl4_features[col].isnull().sum() > 384]
    df_lvl4_features.drop(null_cols, axis = 1, inplace=True)
    df_lvl4_features = feature_select(df_lvl4_features, operation=["correlation_threshold", "variance_threshold"])
    
    for col in df_lvl4_features.columns:
        if df_lvl4_features[col].isnull().any():
            df_lvl4_features[col] = df_lvl4_features[col].fillna(df_lvl4_features[col].mean())
            
    df_meta_info = df_lvl4_metadata[['Metadata_broad_sample', 'Metadata_pert_id', 'Metadata_Plate', 'Metadata_Well',
                                     'Metadata_broad_id', 'Metadata_moa', 'Metadata_dose_recode']].copy()
    df_lvl4_new = pd.concat([df_meta_info, df_lvl4_features], axis=1)
    
    return df_lvl4_new
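
A sketch of how the function above might be called, assuming a Level 4 profile table with Metadata_-prefixed columns; the file name is hypothetical.

# Hypothetical Level 4 input file
df_lvl4 = pd.read_csv("cpd_replicate_level4_profiles.csv.gz", compression="gzip", low_memory=False)
df_lvl4_new = feature_selection(df_lvl4)
print(df_lvl4_new.shape)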
Example #3
df.head(2)


# In[3]:


# Perform feature selection
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "drop_na_columns",
    "blocklist",
    "drop_outliers",
]

df = feature_select(profiles=df, operation=feature_select_ops, na_cutoff=0)

features = infer_cp_features(df)
meta_features = infer_cp_features(df, metadata=True)

print(df.shape)
df.head(2)


# In[4]:


# Output feature selected file
output_file = pathlib.Path("data/cell_health_merged_feature_select.csv.gz")

output(df=df, output_filename=output_file, compression="gzip")
Example #4
def process_profile(sql_file, batch, plate, pipeline):
    """
    Given batch details and a pipeline, process morphology profiles
    """
    assert batch in sql_file, "batch {} not recognized in sql file {}".format(
        batch, sql_file)
    assert plate in sql_file, "plate {} not recognized in sql file {}".format(
        plate, sql_file)

    # Set output directory information
    pipeline_output = pipeline["output_dir"]
    output_dir = os.path.join(pipeline_output, batch, plate)
    os.makedirs(output_dir, exist_ok=True)

    # Set output file information
    aggregate_out_file = os.path.join(output_dir, "{}.csv.gz".format(plate))
    annotate_out_file = os.path.join(output_dir,
                                     "{}_augmented.csv.gz".format(plate))
    normalize_out_file = os.path.join(output_dir,
                                      "{}_normalized.csv.gz".format(plate))
    feature_out_file = os.path.join(
        output_dir, "{}_normalized_feature_selected.csv.gz".format(plate))

    # Load pipeline options
    compression = process_pipeline(pipeline["options"], option="compression")
    samples = process_pipeline(pipeline["options"], option="samples")

    # Load and setup platemap info
    workspace_dir = pipeline["workspace_dir"]
    batch_dir = os.path.join(workspace_dir, "backend", batch)
    metadata_dir = os.path.join(workspace_dir, "metadata", batch)

    barcode_plate_map_file = os.path.join(metadata_dir,
                                          sorted(os.listdir(metadata_dir))[0])
    barcode_plate_map_df = pd.read_csv(barcode_plate_map_file)
    plate_map_name = barcode_plate_map_df.query(
        "Assay_Plate_Barcode == @plate").Plate_Map_Name.values[0]
    plate_map_file = os.path.join(metadata_dir, "platemap",
                                  "{}.txt".format(plate_map_name))
    plate_map_df = pd.read_csv(plate_map_file, sep="\t")
    plate_map_df.columns = [
        "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
        for x in plate_map_df.columns
    ]
    platemap_well_column = pipeline["platemap_well_column"]

    # Step 1: Aggregate
    aggregate_steps = pipeline["aggregate"]
    if aggregate_steps["perform"]:
        aggregate_features = aggregate_steps["features"]
        aggregate_operation = aggregate_steps["method"]
        aggregate_plate_column = aggregate_steps["plate_column"]
        aggregate_well_column = aggregate_steps["well_column"]

        strata = [aggregate_plate_column, aggregate_well_column]

        if "site_column" in aggregate_steps:
            aggregate_site_column = aggregate_steps["site_column"]
            strata += [aggregate_site_column]

        ap = AggregateProfiles(
            sql_file,
            strata=strata,
            features=aggregate_features,
            operation=aggregate_operation,
        )

        ap.aggregate_profiles(output_file=aggregate_out_file,
                              compression=compression)

        if pipeline["count"]["perform"]:
            count_dir = pipeline["count"]["output_dir"]
            os.makedirs(count_dir, exist_ok=True)

            cell_count_file = os.path.join(
                count_dir, "{}_{}_cell_count.tsv".format(batch, plate))

            cell_count_df = ap.count_cells()

            cell_count_df = cell_count_df.merge(
                plate_map_df,
                left_on=aggregate_well_column,
                right_on=platemap_well_column,
            ).drop(platemap_well_column, axis="columns")

            cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

    # Annotate Profiles
    annotate_steps = pipeline["annotate"]
    if annotate_steps["perform"]:
        annotate_well_column = annotate_steps["well_column"]
        annotate(
            profiles=aggregate_out_file,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file=annotate_out_file,
            compression=compression,
        )

    # Normalize Profiles
    normalize_steps = pipeline["normalize"]
    if normalize_steps["perform"]:
        norm_features = normalize_steps["features"]
        norm_method = normalize_steps["method"]
        normalize(
            profiles=annotate_out_file,
            features=norm_features,
            samples=samples,
            method=norm_method,
            output_file=normalize_out_file,
            compression=compression,
        )

    # Apply feature selection
    feature_select_steps = pipeline["feature_select"]
    if feature_select_steps["perform"]:
        feature_select_operations = feature_select_steps["operations"]
        feature_select_features = feature_select_steps["features"]
        feature_select(
            profiles=normalize_out_file,
            features=feature_select_features,
            samples="none",
            operation=feature_select_operations,
            output_file=feature_out_file,
            compression=compression,
            corr_threshold=0.9,
            corr_method="pearson",
        )
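
The pipeline argument above is a nested dictionary, typically parsed from a profiling YAML configuration that is not shown here. A minimal sketch of the keys this function reads, with illustrative values only:

# Illustrative pipeline configuration mirroring the keys accessed by process_profile()
pipeline = {
    "output_dir": "profiles",
    "workspace_dir": "/path/to/workspace",
    "platemap_well_column": "Metadata_well_position",
    "options": {"compression": "gzip", "samples": "all"},
    "aggregate": {
        "perform": True,
        "features": "infer",
        "method": "median",
        "plate_column": "Metadata_Plate",
        "well_column": "Metadata_Well",
    },
    "count": {"perform": True, "output_dir": "cell_counts"},
    "annotate": {"perform": True, "well_column": "Metadata_Well"},
    "normalize": {"perform": True, "features": "infer", "method": "mad_robustize"},
    "feature_select": {
        "perform": True,
        "features": "infer",
        "operations": ["variance_threshold", "correlation_threshold", "drop_na_columns", "blocklist"],
    },
}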
Example #5
def process_profile(sql_file, batch, plate, pipeline):
    """
    Given batch details and a pipeline, process morphology profiles
    """
    assert batch in sql_file, "batch {} not recognized in sql file {}".format(
        batch, sql_file)
    assert plate in sql_file, "plate {} not recognized in sql file {}".format(
        plate, sql_file)

    # Set output directory information
    pipeline_output = pipeline["output_dir"]
    output_dir = os.path.join(pipeline_output, batch, plate)
    os.makedirs(output_dir, exist_ok=True)

    # Set output file information
    aggregate_out_file = os.path.join(output_dir, "{}.csv.gz".format(plate))
    annotate_out_file = os.path.join(output_dir,
                                     "{}_augmented.csv.gz".format(plate))
    normalize_out_file = os.path.join(output_dir,
                                      "{}_normalized.csv.gz".format(plate))
    feature_out_file = os.path.join(
        output_dir, "{}_normalized_feature_selected.csv.gz".format(plate))

    # Load pipeline options
    compression = process_pipeline(pipeline["options"], option="compression")
    sc_float_format = process_pipeline(pipeline["options"],
                                       option="sc_float_format")
    samples = process_pipeline(pipeline["options"], option="samples")

    # Load and setup platemap info
    workspace_dir = pipeline["workspace_dir"]
    batch_dir = os.path.join(workspace_dir, "backend", batch)
    metadata_dir = os.path.join(workspace_dir, "metadata", batch)

    barcode_plate_map_file = os.path.join(metadata_dir,
                                          sorted(os.listdir(metadata_dir))[0])
    barcode_plate_map_df = pd.read_csv(barcode_plate_map_file)
    plate_map_name = barcode_plate_map_df.query(
        "Assay_Plate_Barcode == @plate").Plate_Map_Name.values[0]
    plate_map_file = os.path.join(metadata_dir, "platemap",
                                  "{}.txt".format(plate_map_name))
    plate_map_df = pd.read_csv(plate_map_file, sep="\t")
    plate_map_df.columns = [
        "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
        for x in plate_map_df.columns
    ]
    platemap_well_column = pipeline["platemap_well_column"]

    # Process Bulk profiles
    # Step 1: Aggregate
    aggregate_steps = pipeline["aggregate"]
    aggregate_features = aggregate_steps["features"]
    aggregate_operation = aggregate_steps["method"]
    aggregate_plate_column = aggregate_steps["plate_column"]
    aggregate_well_column = aggregate_steps["well_column"]

    strata = [aggregate_plate_column, aggregate_well_column]

    if "site_column" in aggregate_steps:
        aggregate_site_column = aggregate_steps["site_column"]
        strata += [aggregate_site_column]

    if aggregate_steps["perform"]:
        ap = AggregateProfiles(
            sql_file,
            strata=strata,
            features=aggregate_features,
            operation=aggregate_operation,
        )

        ap.aggregate_profiles(output_file=aggregate_out_file,
                              compression=compression)

    if pipeline["count"]["perform"]:
        if not aggregate_steps["perform"]:
            ap = AggregateProfiles(
                sql_file,
                strata=strata,
                features=aggregate_features,
                operation=aggregate_operation,
            )
        count_dir = pipeline["count"]["output_dir"]
        os.makedirs(count_dir, exist_ok=True)

        cell_count_file = os.path.join(
            count_dir, "{}_{}_cell_count.tsv".format(batch, plate))

        cell_count_df = ap.count_cells()

        cell_count_df = cell_count_df.merge(
            plate_map_df,
            left_on=aggregate_well_column,
            right_on=platemap_well_column,
        ).drop(platemap_well_column, axis="columns")

        cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

    # Annotate Profiles
    annotate_steps = pipeline["annotate"]
    annotate_well_column = annotate_steps["well_column"]
    if annotate_steps["perform"]:
        annotate(
            profiles=aggregate_out_file,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file=annotate_out_file,
            compression=compression,
        )

    # Normalize Profiles
    normalize_steps = pipeline["normalize"]
    norm_features = normalize_steps["features"]
    norm_method = normalize_steps["method"]
    if normalize_steps["perform"]:
        normalize(
            profiles=annotate_out_file,
            features=norm_features,
            samples=samples,
            method=norm_method,
            output_file=normalize_out_file,
            compression=compression,
        )

    # Apply feature selection
    feature_select_steps = pipeline["feature_select"]
    feature_select_operations = feature_select_steps["operations"]
    feature_select_features = feature_select_steps["features"]
    if feature_select_steps["perform"]:
        feature_select(
            profiles=normalize_out_file,
            features=feature_select_features,
            samples=samples,
            operation=feature_select_operations,
            output_file=feature_out_file,
            compression=compression,
            corr_threshold=0.9,
            corr_method="pearson",
        )

    sc_steps = pipeline["single_cell"]
    if sc_steps["perform"]:
        if not aggregate_steps["perform"]:
            ap = AggregateProfiles(
                sql_file,
                strata=strata,
                features=aggregate_features,
                operation=aggregate_operation,
            )

        # Load cells
        query = "select * from cells"
        cell_df = pd.read_sql(sql=query, con=ap.conn)

        # Load cytoplasm
        query = "select * from cytoplasm"
        cytoplasm_df = pd.read_sql(sql=query, con=ap.conn)

        # Load nuclei
        query = "select * from nuclei"
        nuclei_df = pd.read_sql(sql=query, con=ap.conn)

        # Merge single cells together
        sc_merged_df = (cell_df.merge(
            cytoplasm_df.drop("ObjectNumber", axis="columns"),
            left_on=["TableNumber", "ImageNumber", "ObjectNumber"],
            right_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Cells"],
            how="inner",
        ).drop("ObjectNumber", axis="columns").merge(
            nuclei_df,
            left_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Nuclei"],
            right_on=["TableNumber", "ImageNumber", "ObjectNumber"],
            how="inner",
        ))

        # Merge image data info
        sc_merged_df = ap.image_df.merge(sc_merged_df,
                                         how="right",
                                         on=ap.merge_cols)

        # Make sure column names are correctly prefixed
        prefix = ["Metadata", "Cells", "Cytoplasm", "Nuclei"]
        cols = []
        for col in sc_merged_df.columns:
            if any([col.startswith(x) for x in prefix]):
                cols.append(col)
            else:
                cols.append(f"Metadata_{col}")
        sc_merged_df.columns = cols

        sc_merged_df = annotate(
            profiles=sc_merged_df,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file="none",
        )

        if sc_steps["normalize"]:
            sc_merged_df = normalize(
                profiles=sc_merged_df,
                features=norm_features,
                samples=samples,
                method=norm_method,
                output_file="none",
            )

        if sc_steps["feature_select"]:
            sc_merged_df = feature_select(
                profiles=sc_merged_df,
                features=feature_select_features,
                samples=samples,
                operation=feature_select_operations,
                output_file="none",
                corr_threshold=0.9,
                corr_method="pearson",
            )

        sc_pipeline_output = pipeline["sc_output_dir"]
        sc_output_dir = os.path.join(sc_pipeline_output, batch, plate)
        os.makedirs(sc_output_dir, exist_ok=True)

        # Set output file information
        sc_out_file = os.path.join(sc_output_dir,
                                   "{}_single_cell.csv.gz".format(plate))
        output(
            df=sc_merged_df,
            output_filename=sc_out_file,
            compression="gzip",
            float_format=sc_float_format,
        )
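
Example #5 additionally reads single-cell settings from the same configuration; a sketch of the extra keys it expects (values are illustrative):

# Extra configuration keys consumed by the single-cell branch of Example #5
single_cell_config = {
    "options": {"sc_float_format": "%.5g"},
    "sc_output_dir": "single_cell_profiles",
    "single_cell": {"perform": True, "normalize": True, "feature_select": True},
}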
Example #6
# In[4]:


for plate in plate_files:
    plate_file = plate_files[plate]
    output_file = pathlib.Path(f"{sc_dir}/{plate}_normalized_featureselected.csv.gz")

    # Set console output
    print(f"Now performing feature selection for... {plate_file}")
    sc_df = pd.read_csv(plate_file, low_memory=False)
    print("Before feature selection:")
    print(sc_df.shape)
    
    sc_df = feature_select(
        profiles=sc_df,
        operation=feature_select_operations,
        na_cutoff=na_cutoff,
    )
    
    print("After feature selection:")
    print(sc_df.shape)
    
    # Output file to disk
    output(
        df=sc_df,
        output_filename=output_file,
        sep=",",
        float_format="%.5f",
        compression_options=compression_options,
    )
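
The loop above relies on variables defined in earlier notebook cells that are not shown; one plausible setup (all names and values below are assumptions):

# Assumed definitions for the loop above
sc_dir = "data/single_cell_profiles"
plate_files = {
    "SQ00000001": f"{sc_dir}/SQ00000001_normalized.csv.gz",  # hypothetical plate barcode
}
feature_select_operations = ["variance_threshold", "correlation_threshold", "drop_na_columns", "blocklist"]
na_cutoff = 0
compression_options = {"method": "gzip", "mtime": 1}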
Example #7
complete_consensus_df = complete_consensus_df.assign(
    Metadata_unique_id=complete_consensus_df.Metadata_broad_sample + "_dose_" +
    complete_consensus_df.Metadata_dose_recode.astype(str))

print(complete_consensus_df.shape)
complete_consensus_df.head(2)

# In[16]:

# Perform feature selection
complete_consensus_df = feature_select(
    profiles=complete_consensus_df,
    features="infer",
    samples="none",
    operation=feature_select_opts,
    output_file="none",
    na_cutoff=0,
    corr_threshold=0.9,
    corr_method="pearson",
    freq_cut=0.05,
    unique_cut=0.1,
)

print(complete_consensus_df.shape)

# In[17]:

# Zero One Normalize Data
complete_consensus_df = transform(complete_consensus_df)
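
transform() is defined elsewhere in the source notebook; a minimal sketch of a zero-one (min-max) scaling step consistent with the comment above, assuming CellProfiler feature naming so infer_cp_features can separate features from metadata:

from pycytominer.cyto_utils import infer_cp_features

def transform(df):
    # Scale each morphology feature to the [0, 1] range, leaving metadata columns untouched
    # (assumes constant features were already removed by feature selection)
    features = infer_cp_features(df)
    feature_df = df.loc[:, features]
    df.loc[:, features] = (feature_df - feature_df.min()) / (feature_df.max() - feature_df.min())
    return df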

# In[18]:
Example #8
def get_profiles(plate, backend_dir, metadata_dir, barcode_platemap_df):
    """
    Apply all profiling steps for a given plate.

    Output:
    Will write a series of processed files to disk
    """
    print("Processing {}.....".format(plate))
    sqlite_file = "sqlite:///{}/{}.sqlite".format(backend_dir, plate)

    # Load specific platemap
    platemap = barcode_platemap_df.query(
        "Assay_Plate_Barcode == @plate").Plate_Map_Name.values[0]
    platemap_file = os.path.join(metadata_dir, "platemap",
                                 "{}.csv".format(platemap))
    platemap_df = pd.read_csv(platemap_file)

    # Prepare sql file for processing
    ap = AggregateProfiles(
        sqlite_file, strata=["Image_Metadata_Plate", "Image_Metadata_Well"])

    # Count cells and output
    cell_count_file = os.path.join("results",
                                   "{}_cell_count.tsv".format(plate))
    cell_count_df = ap.count_cells()
    cell_count_df = cell_count_df.merge(
        platemap_df, left_on="Image_Metadata_Well",
        right_on="well_position").drop(["WellRow", "WellCol", "well_position"],
                                       axis="columns")
    cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

    # Begin processing profiles
    output_dir = os.path.join("data", "profiles", plate)
    os.makedirs(output_dir, exist_ok=True)

    # Aggregate single cells into well profiles
    out_file = os.path.join(output_dir, "{}.csv.gz".format(plate))
    ap.aggregate_profiles(output_file=out_file, compression="gzip")

    # Annotate Profiles
    anno_file = os.path.join(output_dir, "{}_augmented.csv.gz".format(plate))
    annotate(
        profiles=out_file,
        platemap=platemap_df,
        join_on=["Metadata_well_position", "Image_Metadata_Well"],
        output_file=anno_file,
        compression="gzip",
    )

    # Define metadata features
    meta_features = [
        "Image_Metadata_Plate",
        "Image_Metadata_Well",
        "Metadata_WellRow",
        "Metadata_WellCol",
        "Metadata_gene_name",
        "Metadata_pert_name",
        "Metadata_broad_sample",
        "Metadata_cell_line",
    ]

    # Normalize Profiles
    norm_file = os.path.join(output_dir, "{}_normalized.csv.gz".format(plate))
    normalize(
        profiles=anno_file,
        features="infer",
        meta_features=meta_features,
        samples="Metadata_pert_name == 'EMPTY'",
        method="mad_robustize",
        output_file=norm_file,
        compression="gzip",
    )

    # Perform feature selection: drop columns with a high proportion of missing
    # values, extreme outlier features, and blacklist features
    feat_file = os.path.join(
        output_dir, "{}_normalized_feature_select.csv.gz".format(plate))
    feature_select(
        profiles=norm_file,
        features="infer",
        samples="none",
        operation=[
            "drop_na_columns",
            "blacklist",
            "variance_threshold",
            "drop_outliers",
        ],
        output_file=feat_file,
        compression="gzip",
    )

    # Perform audits
    profile_df = pd.read_csv(feat_file).drop(
        ["Image_Metadata_Well", "Image_Metadata_Plate"], axis="columns")

    # Audit guide replicability
    audit_file = os.path.join("results", "{}_audit_guide.csv".format(plate))
    audit(
        profiles=profile_df,
        audit_groups=[
            "Metadata_pert_name", "Metadata_gene_name", "Metadata_cell_line"
        ],
        iterations=10,
        output_file=audit_file,
    )

    # Audit gene replicability
    audit_file = os.path.join("results", "{}_audit_gene.csv".format(plate))
    audit(
        profiles=profile_df,
        audit_groups=["Metadata_gene_name", "Metadata_cell_line"],
        iterations=10,
        output_file=audit_file,
    )
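
A sketch of a driver loop for the function above, assuming pandas and os are imported as in the snippet and that a barcode platemap CSV with an Assay_Plate_Barcode column exists; paths are illustrative.

# Hypothetical directory layout and driver loop for get_profiles()
backend_dir = "backend/example_batch"
metadata_dir = "metadata/example_batch"

barcode_platemap_df = pd.read_csv(os.path.join(metadata_dir, "barcode_platemap.csv"))

for plate in barcode_platemap_df.Assay_Plate_Barcode.unique():
    get_profiles(plate, backend_dir, metadata_dir, barcode_platemap_df)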
Example #9
# ## Apply normalization, feature select, and output data

# In[12]:

normalized_df = normalize(merged_df,
                          features="infer",
                          meta_features="infer",
                          samples="all",
                          method="standardize")

# In[13]:

feature_select_df = feature_select(
    normalized_df,
    features="infer",
    operation=feature_select_opts,
    output_file="none",
    na_cutoff=na_cutoff,
    corr_threshold=corr_threshold,
)

print(feature_select_df.shape)
feature_select_df.head()

# In[14]:

output_filename = pathlib.Path(
    f"data/{batch}/{plate}_singlecell_normalized_feature_select.csv.gz")
output(feature_select_df, output_filename, compression="gzip", float_format="%.5g")
                                   "{}_feature_select.gct".format(batch))

    # Load the profile data and add cell counts
    df = load_data(batch=batch,
                   suffix=suffix,
                   profile_dir=profile_dir,
                   combine_dfs=True,
                   add_cell_count=True,
                   cell_count_dir=cell_count_dir)

    # Save normalized and non-feature selected data
    profile_batches[batch] = df

    # Apply feature selection again - this is particularly important for batches
    # with multiple plates
    df = feature_select(df, operation=feature_select_ops)

    # Write the dataframe as a gct file for input into Morpheus
    write_gct(profiles=df, output_file=output_gct_file)

# ## Merge Profiles Together and Output

# In[4]:

all_profiles_df = pd.concat(profile_batches.values(),
                            sort=True).reset_index(drop=True)

meta_features = infer_cp_features(all_profiles_df, metadata=True)
cp_cols = infer_cp_features(all_profiles_df, metadata=False)

all_profiles_df = all_profiles_df.reindex(meta_features + cp_cols,
                                          axis="columns")
Example #11
                operation=operation,
                features=cp_norm_features,
            )

            # How many DMSO profiles per well?
            print(
                f"  There are {consensus_profiles[operation]['no_feat_select'].shape[0]} {operation} consensus profiles for {norm_strat} normalization"
            )

            # Perform feature selection
            print(
                f"  Now feature selecting on {operation} consensus for {norm_strat} normalization"
            )
            consensus_profiles[operation]["feat_select"] = feature_select(
                profiles=consensus_profiles[operation]["no_feat_select"],
                features="infer",
                operation=feature_select_ops,
                blocklist_file=full_blocklist_file,
            )

            # How many features in feature selected profile?
            print(
                f"  There are {consensus_profiles[operation]['feat_select'].shape[1]} features in {operation} consensus profiles for {norm_strat} normalization"
            )

        all_consensus_dfs[batch][norm_strat] = consensus_profiles
    print("\n")


# ## Merge and output consensus signatures
# 
# Output with and without feature selection.
Example #12
    .assign(Metadata_Dataset="FourClone")
)

cloneAE_data_recode_df = (
    cloneAE_data_df.assign(Metadata_treatment="bortezomib")
    .assign(Metadata_Dataset="CloneAE")
)

cloneAE_data_recode_df.loc[cloneAE_data_recode_df.Metadata_Dosage == 0, "Metadata_treatment"] = "DMSO"


# In[17]:


combined_df = pd.concat([fourclone_data_recode_df, cloneAE_data_recode_df], sort=True).reset_index(drop=True)
combined_df = feature_select(combined_df, operation="drop_na_columns")

print(combined_df.shape)
combined_df.head()


# In[18]:


embedding_combined_df = process_umap(combined_df)
embedding_combined_df.head()
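
process_umap() is defined in an earlier cell of the source notebook; a minimal sketch of what such a helper could look like, assuming the umap-learn package and CellProfiler feature naming (names and defaults here are assumptions, not the original implementation):

import pandas as pd
import umap
from pycytominer.cyto_utils import infer_cp_features

def process_umap(df, n_components=2, random_state=123):
    # Separate morphology features from metadata columns
    features = infer_cp_features(df)
    meta_features = infer_cp_features(df, metadata=True)

    # Fit a UMAP embedding on the feature matrix only
    reducer = umap.UMAP(n_components=n_components, random_state=random_state)
    embedding = reducer.fit_transform(df.loc[:, features].fillna(0))

    # Return embedding coordinates alongside the original metadata
    embedding_df = pd.DataFrame(embedding, columns=[f"umap_{i}" for i in range(n_components)])
    return pd.concat([df.loc[:, meta_features].reset_index(drop=True), embedding_df], axis=1)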


# In[19]:

Example #13
dataset_a_df.head()

# In[6]:

pd.crosstab(dataset_a_df.Metadata_CellLine, dataset_a_df.Metadata_Dosage)

# In[7]:

dataset_a_name = "combined_cloneAcloneE_dataset"

# In[8]:

output_file = os.path.join(output_dir, "{}.csv.gz".format(dataset_a_name))
dataset_a_df.to_csv(output_file, index=False, compression="gzip")

dataset_a_featureselect_df = feature_select(dataset_a_df,
                                            operation=feature_select_ops)

output_file = os.path.join(output_dir,
                           "{}_feature_select.csv.gz".format(dataset_a_name))
dataset_a_featureselect_df.to_csv(output_file, index=False, compression="gzip")

output_gct_file = os.path.join(gct_dir,
                               "{}_feature_select.gct".format(dataset_a_name))
write_gct(profiles=dataset_a_featureselect_df, output_file=output_gct_file)

print(dataset_a_featureselect_df.shape)
dataset_a_featureselect_df.head()

# ## Process and Output Dataset B

# In[9]:
                                   "{}_feature_select.gct".format(batch))

    # Load the profile data and add cell counts
    df = load_data(batch=batch,
                   suffix=suffix,
                   profile_dir=profile_dir,
                   combine_dfs=True,
                   add_cell_count=True,
                   harmonize_cols=True,
                   cell_count_dir=cell_count_dir)

    # Save normalized and non-feature selected data
    profile_batches[batch] = df

    # Apply feature selection
    feature_select_df = feature_select(df, operation=feature_select_ops)

    # Write the dataframe as a gct file for input into Morpheus
    write_gct(profiles=feature_select_df, output_file=output_gct_file)

# ## Merge Profiles Together and Output

# In[4]:

all_profiles_df = pd.concat(profile_batches.values(),
                            sort=True).reset_index(drop=True)

all_profiles_df = all_profiles_df.assign(Metadata_clone_type="resistant")
all_profiles_df.loc[all_profiles_df.Metadata_clone_number.str.contains("WT"),
                    "Metadata_clone_type"] = "wildtype"
Example #15
    def pipeline_feature_select(self, steps, suffix=None):
        feature_select_steps = steps
        pipeline_output = self.pipeline["output_dir"]

        level = feature_select_steps["level"]
        gct = feature_select_steps["gct"]
        feature_select_operations = feature_select_steps["operations"]
        feature_select_features = feature_select_steps["features"]

        all_plates_df = pd.DataFrame()

        for batch in self.profile_config:
            batch_df = pd.DataFrame()
            for plate in self.profile_config[batch]:
                output_dir = pathlib.PurePath(".", pipeline_output, batch,
                                              plate)
                if suffix:
                    normalize_output_file = pathlib.PurePath(
                        output_dir, f"{plate}_normalized_{suffix}.csv.gz")
                    feature_select_output_file_plate = pathlib.PurePath(
                        output_dir,
                        f"{plate}_normalized_feature_select_{suffix}_plate.csv.gz",
                    )
                else:
                    normalize_output_file = pathlib.PurePath(
                        output_dir, f"{plate}_normalized.csv.gz")
                    feature_select_output_file_plate = pathlib.PurePath(
                        output_dir,
                        f"{plate}_normalized_feature_select_plate.csv.gz")
                if feature_select_features == "infer" and self.noncanonical:
                    feature_select_features = cyto_utils.infer_cp_features(
                        pd.read_csv(normalize_output_file),
                        compartments=self.compartments,
                    )

                df = pd.read_csv(normalize_output_file).assign(
                    Metadata_batch=batch)

                if level == "plate":
                    df = df.drop(columns=["Metadata_batch"])
                    feature_select(
                        profiles=df,
                        features=feature_select_features,
                        operation=feature_select_operations,
                        output_file=feature_select_output_file_plate,
                        compression_options=self.
                        pipeline_options["compression"],
                        float_format=self.pipeline_options["float_format"],
                    )
                elif level == "batch":
                    batch_df = concat_dataframes(batch_df, df)
                elif level == "all":
                    all_plates_df = concat_dataframes(all_plates_df, df)

            if level == "batch":
                fs_df = feature_select(
                    profiles=batch_df,
                    features=feature_select_features,
                    operation=feature_select_operations,
                )
                for plate in self.profile_config[batch]:
                    output_dir = pathlib.PurePath(".", pipeline_output, batch,
                                                  plate)
                    if suffix:
                        feature_select_output_file_batch = pathlib.PurePath(
                            output_dir,
                            f"{plate}_normalized_feature_select_{suffix}_batch.csv.gz",
                        )
                    else:
                        feature_select_output_file_batch = pathlib.PurePath(
                            output_dir,
                            f"{plate}_normalized_feature_select_batch.csv.gz",
                        )
                    if feature_select_features == "infer" and self.noncanonical:
                        feature_select_features = cyto_utils.infer_cp_features(
                            batch_df, compartments=self.compartments)

                    df = fs_df.query("Metadata_Plate==@plate").reset_index(
                        drop=True)
                    df = df.drop(columns=["Metadata_batch"])

                    cyto_utils.output(
                        output_filename=feature_select_output_file_batch,
                        df=df,
                        compression_options=self.
                        pipeline_options["compression"],
                        float_format=self.pipeline_options["float_format"],
                    )

                if gct:
                    create_gct_directories(batch)
                    if suffix:
                        stacked_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_{suffix}_batch.csv.gz",
                        )
                        gct_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_{suffix}_batch.gct",
                        )
                    else:
                        stacked_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_batch.csv.gz",
                        )
                        gct_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_batch.gct",
                        )
                    cyto_utils.output(
                        output_filename=stacked_file,
                        df=fs_df,
                        compression_options=self.
                        pipeline_options["compression"],
                        float_format=self.pipeline_options["float_format"],
                    )
                    write_gct(profiles=fs_df, output_file=gct_file)

        if level == "all":
            fs_df = feature_select(
                profiles=all_plates_df,
                features=feature_select_features,
                operation=feature_select_operations,
            )
            for batch in self.profile_config:
                fs_batch_df = fs_df.loc[fs_df.Metadata_batch ==
                                        batch].reset_index(drop=True)
                for plate in self.profile_config[batch]:
                    output_dir = pathlib.PurePath(".", pipeline_output, batch,
                                                  plate)
                    if suffix:
                        feature_select_output_file_all = pathlib.PurePath(
                            output_dir,
                            f"{plate}_normalized_feature_select_{suffix}_all.csv.gz",
                        )
                    else:
                        feature_select_output_file_all = pathlib.PurePath(
                            output_dir,
                            f"{plate}_normalized_feature_select_all.csv.gz")
                    if feature_select_features == "infer" and self.noncanonical:
                        feature_select_features = cyto_utils.infer_cp_features(
                            all_plates_df, compartments=self.compartments)

                    df = fs_batch_df.query(
                        "Metadata_Plate==@plate").reset_index(drop=True)

                    df = df.drop(columns=["Metadata_batch"])

                    cyto_utils.output(
                        output_filename=feature_select_output_file_all,
                        df=df,
                        compression_options=self.
                        pipeline_options["compression"],
                        float_format=self.pipeline_options["float_format"],
                    )

                if gct:
                    create_gct_directories(batch)
                    if suffix:
                        stacked_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_{suffix}_all.csv.gz",
                        )
                        gct_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_{suffix}_all.gct",
                        )
                    else:
                        stacked_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_all.csv.gz",
                        )
                        gct_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_all.gct",
                        )
                    cyto_utils.output(
                        output_filename=stacked_file,
                        df=fs_batch_df,
                        compression_options=self.
                        pipeline_options["compression"],
                        float_format=self.pipeline_options["float_format"],
                    )
                    write_gct(profiles=fs_batch_df, output_file=gct_file)
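
The steps argument of pipeline_feature_select corresponds to the feature_select block of the pipeline configuration; a sketch of its expected shape (values are illustrative):

# Illustrative "steps" dictionary for pipeline_feature_select()
feature_select_steps = {
    "perform": True,   # typically checked by the caller before invoking this method
    "level": "batch",  # one of "plate", "batch", or "all"
    "gct": True,       # also write stacked .gct files for Morpheus
    "operations": ["variance_threshold", "correlation_threshold", "drop_na_columns", "blocklist"],
    "features": "infer",
}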
Example #16
normalize(
    profiles=anno_df,
    samples="all",
    method=norm_method,
    output_file=norm_file,
    float_format=float_format,
    compression_options=compression,
)

# Feature Selection (DMSO Control) - Level 4B Data
feat_dmso_file = pathlib.PurePath(
    output_dir, f"{plate_name}_normalized_feature_select_dmso.csv.gz")
feature_select(
    profiles=norm_dmso_file,
    features="infer",
    operation=feature_select_ops,
    output_file=feat_dmso_file,
    float_format=float_format,
    compression_options=compression,
)

# Feature Selection (Whole Plate) - Level 4B Data
feat_file = pathlib.PurePath(output_dir,
                             f"{plate_name}_normalized_feature_select.csv.gz")
feature_select(
    profiles=norm_file,
    features="infer",
    operation=feature_select_ops,
    output_file=feat_file,
    float_format=float_format,
    compression_options=compression,
)
Example #17
feature_select_corr_threshold = feature_select_args["corr_threshold"]

for data_level in feature_select_levels:
    if data_level == "single_cell":
        if not singlecell_from_single_file:
            warnings.warn(
                "Feature select operation is not enabled for site-specific single cell files. Skipping."
            )
            continue

    input_file = feature_select_input_files[data_level]
    output_file = feature_select_output_files[data_level]

    print(
        f"Now performing feature selection for {data_level}...with operations: {feature_select_operations}"
    )

    df = pd.read_csv(input_file)

    feature_select(
        profiles=df,
        features=feature_select_features,
        samples=feature_select_drop_samples,
        operation=feature_select_operations,
        na_cutoff=feature_select_nacutoff,
        corr_threshold=feature_select_corr_threshold,
        output_file=output_file,
        compression=compression,
        float_format=float_format,
    )
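
The loop above depends on configuration-derived variables defined earlier in the source script; one plausible set of definitions (all names and values below are assumptions):

# Assumed definitions for the feature selection loop above
feature_select_levels = ["whole_plate", "single_cell"]
feature_select_input_files = {
    "whole_plate": "profiles/plate_normalized.csv.gz",
    "single_cell": "profiles/plate_normalized_single_cell.csv.gz",
}
feature_select_output_files = {
    "whole_plate": "profiles/plate_normalized_feature_select.csv.gz",
    "single_cell": "profiles/plate_normalized_feature_select_single_cell.csv.gz",
}
feature_select_drop_samples = "all"
singlecell_from_single_file = True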
Example #18
# In[10]:

# We see a very large difference in cell count across profiles
# Remember that profiles were generated by averaging feature values across all single cells in a well
full_df.Metadata_cell_count.hist()

# In[11]:

selected_features = []
for dataset in datasets:

    # Apply feature selection
    feature_select_df = feature_select(
        profiles=(full_df.query("Metadata_dataset == @dataset").query(
            "Metadata_model_split == 'training'")),
        operation=feature_select_opts,
        na_cutoff=na_cutoff,
        corr_threshold=corr_threshold)

    dataset_features = infer_cp_features(feature_select_df)

    selected_features.append(
        pd.DataFrame(dataset_features,
                     columns=["features"]).assign(dataset=dataset))

# Output results of feature selection
all_selected_features = pd.concat(selected_features).reset_index(drop=True)

output_file = pathlib.Path(f"{output_dir}/dataset_features_selected.tsv")
all_selected_features.to_csv(output_file, sep="\t", index=False)
Example #19
# ## Apply Feature Selection

# In[18]:


meta_features = infer_cp_features(train_df, metadata=True)
meta_features


# In[19]:


train_df = feature_select(
    train_df,
    operation=feature_select_opts,
    na_cutoff=na_cutoff,
    corr_threshold=corr_threshold
)

selected_features = infer_cp_features(train_df)
reindex_features = meta_features + selected_features

test_df = test_df.reindex(reindex_features, axis="columns")
train_df = train_df.reindex(reindex_features, axis="columns")
holdout_df = holdout_df.reindex(reindex_features, axis="columns")
other_df = other_df.reindex(reindex_features, axis="columns")


# In[20]:

            f"{output_dir}/{batch}_dmso_spherized_profiles_with_input_normalized_by_{suffix}.csv.gz"
        )
        print(f"Now processing {output_file}...")

        profile_df = pd.concat([pd.read_csv(x) for x in files[batch][suffix]
                                ]).reset_index(drop=True)
        print(profile_df.shape)

        # Step 1: Perform feature selection
        if batch == "2017_12_05_Batch2":
            profile_df = (profile_df.groupby([
                "Metadata_cell_line", "Metadata_time_point"
            ]).apply(
                lambda x: feature_select(profiles=x,
                                         operation=feature_select_ops,
                                         na_cutoff=na_cut,
                                         corr_threshold=corr_threshold,
                                         blocklist_file=full_blocklist_file)))

            # Drop features that weren't selected in the grouped splits
            profile_df = feature_select(profiles=profile_df,
                                        operation="drop_na_columns",
                                        na_cutoff=na_cut)
        else:
            profile_df = feature_select(profiles=profile_df,
                                        operation=feature_select_ops,
                                        na_cutoff=na_cut,
                                        corr_threshold=corr_threshold,
                                        blocklist_file=full_blocklist_file)

        # Step 2: Spherize transform