def test_coral(emb_df_clean,
               num_bootstrap=2,
               percent_norm=False,
               factor_analys=False):
    """Evaluate the CORAL baseline (Mike's CORAL) on the test set.

    The embeddings are transformed with
    `transform.coral_without_mean_shift_batch`, optionally post-processed, and
    then scored three ways: batch classification on the post-processed
    embeddings, MOA scores on the per-treatment means, and bootstrapped k-NN
    scores computed from the CORAL-transformed embeddings.

    Args:
      emb_df_clean (pandas dataframe): input dataframe
      num_bootstrap (int): number of bootstrap samples to use
      percent_norm (bool): whether to apply percentile normalization
      factor_analys (bool): whether to apply factor analysis

    Returns:
      dict: holds "batch_classification_scores" and "bootstrap_scores", plus
        every entry returned by `get_scores_from_means` for the treatment means.
    """
    coral_emb = transform.coral_without_mean_shift_batch(emb_df_clean)
    post_processed = apply_post_processing(coral_emb, percent_norm,
                                           factor_analys)

    # Average over everything that identifies a single treatment.
    group_levels = [
        metadata.MOA,
        metadata.COMPOUND,
        metadata.CONCENTRATION,
        metadata.BATCH,
        metadata.TREATMENT_GROUP,
    ]
    evaluated = transform.drop_unevaluated_comp(post_processed)
    treatment_means = evaluated.groupby(level=group_levels).mean()

    # Keep this order: the bootstrap reseeds the global numpy RNG, so it runs
    # after the other two scoring steps, exactly as before.
    batch_scores = get_batch_classification_scores(post_processed)
    moa_scores = get_scores_from_means(treatment_means)
    # The bootstrap starts from the CORAL output *before* post-processing,
    # because knn_bootstrap applies the post-processing to each resample.
    bootstrap_scores = knn_bootstrap(
        coral_emb,
        num_bootstrap=num_bootstrap,
        percent_norm=percent_norm,
        factor_analys=factor_analys)

    results = {
        "batch_classification_scores": batch_scores,
        "bootstrap_scores": bootstrap_scores,
    }
    results.update(moa_scores)
    return results
def knn_bootstrap(emb_df,
                  num_bootstrap=2,
                  seed=SEED,
                  percent_norm=False,
                  factor_analys=False):
    """Generate bootstrap statistics.

    Seeds numpy's global random state, then repeatedly draws a bootstrap
    sample, post-processes it, averages per treatment and scores the means.

    Args:
      emb_df (pandas dataframe): dataframe to use (includes controls)
      num_bootstrap (int): number of bootstrap reps
      seed (int): which seed value to use for bootstrapping
      percent_norm (bool): whether to apply percentile normalization
      factor_analys (bool): whether to apply factor analysis

    Returns:
      dict: "knn_scores" is the result of `elementwise_stats` applied to the
        per-repetition k-NN scores; "clustering_scores" is the plain list of
        per-repetition clustering scores.
    """
    group_levels = [
        metadata.MOA,
        metadata.COMPOUND,
        metadata.CONCENTRATION,
        metadata.BATCH,
        metadata.TREATMENT_GROUP,
    ]
    knn_per_rep = []
    clustering_per_rep = []

    # Seed once, up front, so the whole sequence of resamples is reproducible.
    np.random.seed(seed=seed)
    for _ in range(num_bootstrap):
        resampled = transform.get_bootstrap_sample(emb_df)
        processed = apply_post_processing(
            resampled, percent_norm=percent_norm, factor_analys=factor_analys)
        evaluated = transform.drop_unevaluated_comp(processed)
        rep_means = evaluated.groupby(level=group_levels).mean()
        rep_scores = get_scores_from_means(
            rep_means, report_confusion_matrix=False)
        knn_per_rep.append(rep_scores["knn"])
        clustering_per_rep.append(rep_scores["clustering_score"])

    return {
        "knn_scores": elementwise_stats(knn_per_rep),
        "clustering_scores": clustering_per_rep,
    }
def transform_and_means(contents,
                        emb_df,
                        step,
                        linear=True,
                        percent_norm=False,
                        factor_analys=False,
                        drop_controls=True):
    """Apply the Wasserstein transform and average per treatment.

    When the transformation is linear and no post-processing is requested, the
    means are taken first and the transform is applied to the (much smaller)
    means dataframe, which is significantly faster. Percentile normalization
    and factor analysis rule out this means-first shortcut, so in that case
    the full dataframe is transformed, post-processed and only then averaged.

    Args:
      contents (dict): Transformation file.
      emb_df (pandas dataframe): Dataframe to transform and take means of.
      step (int): step where to take Wasserstein transform.
      linear (bool): Whether or not transformation is linear.
      percent_norm (bool): whether to apply percentile normalization
      factor_analys (bool): whether to apply factor analysis
      drop_controls (bool): whether or not to drop controls

    Returns:
      transformed_means (pandas dataframe): dataframe after transform and
        taking the mean.

    Raises:
      ValueError: if "treatment_group" is not one of the index level names.
    """
    if "treatment_group" not in emb_df.index.names:
        raise ValueError(
            "Must have treatment_group in embeddings index names.")

    def _treatment_means(frame):
        # Collapse replicates that share MOA/compound/concentration/batch/group.
        return frame.groupby(level=[
            metadata.MOA,
            metadata.COMPOUND,
            metadata.CONCENTRATION,
            metadata.BATCH,
            metadata.TREATMENT_GROUP,
        ]).mean()

    needs_post_processing = percent_norm or factor_analys
    if linear and not needs_post_processing:
        # Means-first shortcut: valid only for a purely linear pipeline.
        transformed_means = wasserstein_transform(
            contents, _treatment_means(emb_df), step)
    else:
        transformed = wasserstein_transform(contents, emb_df, step)
        post_processed = apply_post_processing(transformed, percent_norm,
                                               factor_analys)
        transformed_means = _treatment_means(post_processed)

    return (transform.drop_unevaluated_comp(transformed_means)
            if drop_controls else transformed_means)
def main(argv):
    """Run leave-one-out cross validation and save all metrics as a pickle.

    Reads the embeddings from FLAGS.input_df and the transformation from
    FLAGS.transformation_file, picks a stopping step per left-out compound,
    evaluates the metrics and writes the pickled results to FLAGS.output_file.

    Args:
      argv: command-line arguments; unused.

    Raises:
      ValueError: if "treatment_group" is missing from the index names of the
        embeddings (before or after dropping unevaluated compounds).
    """
    del argv

    missing_group_msg = "Must have treatment_group in embeddings index names."

    emb_df_clean = io_utils.read_dataframe_from_hdf5(FLAGS.input_df)
    if "treatment_group" not in emb_df_clean.index.names:
        raise ValueError(missing_group_msg)
    contents = load_contents(FLAGS.transformation_file)

    ## Training steps: every key except "params", sorted, then truncated.
    step_keys = list(contents.keys())
    step_keys.remove("params")
    steps = np.sort(step_keys)[:FLAGS.num_steps]

    ## Embeddings without unevaluated compounds.
    emb_df_valid = transform.drop_unevaluated_comp(emb_df_clean)
    if "treatment_group" not in emb_df_valid.index.names:
        raise ValueError(missing_group_msg)

    ## Distinct compounds and how many there are.
    comp_list = emb_df_valid.index.get_level_values(
        level=metadata.COMPOUND).unique()
    n_comp = len(comp_list)

    ## Leave-one-out folds: "b" is the held-out compound, "a" all the others.
    list_of_comp_set = [
        {
            "b": held_out,
            "a": list(set(comp_list).difference([held_out])),
        }
        for held_out in comp_list
    ]

    ## Cross validation training with leave-one-out and variable stopping
    ## time.
    steps_max, cross_validated_scores = cross_val_train(
        emb_df_clean,
        contents,
        steps,
        list_of_comp_set,
        n_comp,
        percent_norm=FLAGS.percentile_normalize,
        factor_analys=FLAGS.factor_analysis)

    ## Only steps between the earliest and latest selected stopping step are
    ## used for the bootstraps.
    boot_steps = [
        step for step in steps
        if np.min(steps_max) <= step <= np.max(steps_max)
    ]

    metrics_dict = evaluate_metrics(
        contents,
        emb_df_clean,
        steps_max,
        boot_steps,
        list_of_comp_set,
        num_bootstrap=FLAGS.num_bootstrap,
        percent_norm=FLAGS.percentile_normalize,
        factor_analys=FLAGS.factor_analysis)

    ## Everything that gets saved.
    save_dict = {
        "metrics_dict": metrics_dict,
        ## time steps where max cross validation results were found
        "list_of_time_step_max": steps_max,
    }
    ## accuracy for not same compound or batch, obtained at time_step_max
    ## for each individual compound.
    save_dict["metrics_dict"]["wdn"]["cross_val_scores"] = (
        cross_validated_scores)

    with gfile.GFile(FLAGS.output_file, mode="w") as f:
        f.write(pickle.dumps(save_dict))
def test_wdn(emb_df_clean,
             contents,
             list_of_time_step_max,
             steps,
             list_of_comp_set,
             num_bootstrap=2,
             percent_norm=False,
             factor_analys=False):
    """Evaluate the Wasserstein transform (WDN) on the test set.

    Args:
      emb_df_clean (pandas dataframe): input dataframe
      contents (dict): Contents from Wasserstein training routine
      list_of_time_step_max (list): timesteps at which to evaluate WDN
        statistics. For example, could be the time step where average nsc and
        nscb for k=1...4 is maximized for a given compound in the
        cross-validation procedure. Duplicates are evaluated only once.
      steps (list): all timesteps from analysis, used for bootstrapping.
      list_of_comp_set (list): each element is a dict for cross-validation.
      num_bootstrap (int): number of bootstrap samples to use
      percent_norm (bool): whether to apply percentile normalization
      factor_analys (bool): whether to apply factor analysis

    Returns:
      dict: "batch_classification_scores" and "clustering_scores" are dicts
        keyed by time step; "knn_bootstrap_scores" is the result of
        `cross_val_knn_bootstrap`.
    """
    group_levels = [
        metadata.MOA,
        metadata.COMPOUND,
        metadata.CONCENTRATION,
        metadata.BATCH,
        metadata.TREATMENT_GROUP,
    ]
    batch_classification_scores = {}
    clustering_scores = {}

    ## Batch classification and Silhouette scores are not cross validated.
    ## For the BBBC021 dataset, batch classification only applies to controls,
    ## so leave-one-out cross validation gives the same result as taking the
    ## weighted average/standard deviation across left-out compounds. The
    ## Silhouette score could be cross validated, but then the same would have
    ## to be done for TVN and CORAL for each left-out compound as well.
    for step in set(list_of_time_step_max):
        ## Both the transformed embeddings and their means are needed, which
        ## is why transform_and_means is not used here.
        transformed = wasserstein_transform(contents, emb_df_clean, step)
        processed = apply_post_processing(transformed, percent_norm,
                                          factor_analys)
        grouped_means = processed.groupby(level=group_levels).mean()
        means = transform.drop_unevaluated_comp(grouped_means)

        batch_classification_scores[step] = get_batch_classification_scores(
            processed)
        moa_scores = get_scores_from_means(
            means, report_knn=False, report_confusion_matrix=False)
        clustering_scores[step] = moa_scores["clustering_score"]

    knn_bootstrap_scores = cross_val_knn_bootstrap(
        emb_df_clean,
        contents,
        steps,
        list_of_comp_set,
        num_bootstrap=num_bootstrap,
        percent_norm=percent_norm,
        factor_analys=factor_analys)

    return {
        "batch_classification_scores": batch_classification_scores,
        "knn_bootstrap_scores": knn_bootstrap_scores,
        "clustering_scores": clustering_scores,
    }
def cross_val_train(emb_df_clean,
                    contents,
                    steps,
                    list_of_comp_set,
                    n_comp,
                    report_confusion_matrix=True,
                    percent_norm=False,
                    factor_analys=False):
    """Cross validation to find stopping time with each left-one-out compound.

    For every fold the best stopping step is chosen on the embeddings that
    exclude the held-out compound; the k-NN statistics (k = 1..4) for that
    compound are then accumulated from the cosine distances at that step.

    Args:
      emb_df_clean (pandas dataframe): embeddings WITH unevaluated compounds.
      contents (dict): Contents from Wasserstein training routine
      steps (list): Steps for training
      list_of_comp_set (list): dictionaries for each compound for leave-one-out
      n_comp (int): number of compounds
      report_confusion_matrix (bool): whether or not to include confusion
        matrix.
      percent_norm (bool): whether to apply percentile normalization
      factor_analys (bool): whether to apply factor analysis

    Returns:
      list_of_time_step_max (list): best stopping time for each compound
      cross_validated_scores (dict): Contains cross-validated accuracy scores
        and confusion matrices.

    Raises:
      ValueError: if a training fold lacks "treatment_group" in its index
        names.
    """
    emb_df_valid = transform.drop_unevaluated_comp(emb_df_clean)
    match_metadata_values = sorted(
        emb_df_valid.index.get_level_values(level=metadata.MOA).unique())
    num_moa = len(match_metadata_values)

    # Accumulators for "not same compound" (nsc) and "not same compound or
    # batch" (nscb) matches.
    correct_nsc = collections.defaultdict(list)
    mismatch_nsc = collections.defaultdict(list)
    correct_nscb = collections.defaultdict(list)
    mismatch_nscb = collections.defaultdict(list)

    confusion_matrices_nsc = None
    confusion_matrices_nscb = None
    if report_confusion_matrix:
        confusion_matrices_nsc = collections.defaultdict(list)
        confusion_matrices_nscb = collections.defaultdict(list)
        for k in range(1, 5):
            confusion_matrices_nsc[k] = np.zeros((num_moa, num_moa))
            confusion_matrices_nscb[k] = np.zeros((num_moa, num_moa))

    # The two evaluation variants, always processed in this order for each k.
    variants = (
        (evaluate.not_same_compound_filter, correct_nsc, mismatch_nsc,
         confusion_matrices_nsc),
        (evaluate.not_same_compound_or_batch_filter, correct_nscb,
         mismatch_nscb, confusion_matrices_nscb),
    )

    list_of_time_step_max = []
    dist_by_step = {}  # distance matrices already computed, keyed by step
    valid_compounds = emb_df_valid.index.get_level_values(
        level=metadata.COMPOUND)

    for i in range(n_comp):
        print("cross-validation for compound %s" % i)
        comp_set = list_of_comp_set[i]

        ## dataframe excluding the left-out compound
        train_mask = valid_compounds.isin(comp_set["a"])
        emb_df_train = emb_df_valid[train_mask]
        if "treatment_group" not in emb_df_train.index.names:
            raise ValueError(
                "Must have treatment_group in embeddings index names.")

        ## Best time step for this left-out compound. This is a significant
        ## speed bottleneck, since it has to evaluate at all timesteps.
        best_step = find_time_step_max(emb_df_train, contents, steps)
        list_of_time_step_max.append(best_step)

        if best_step not in dist_by_step:
            ## cosine distances at best_step, computed once per distinct step
            means = transform_and_means(
                contents,
                emb_df_clean,
                best_step,
                percent_norm=percent_norm,
                factor_analys=factor_analys)
            means_valid = transform.drop_unevaluated_comp(means)
            dist_by_step[best_step] = distance_analysis.matrix(
                distance.cosine, means_valid)
        dist = dist_by_step[best_step]

        # k-NN up to k=4
        for k in range(1, 5):
            for neighbor_filter, correct, mismatch, confusion in variants:
                update_stats_new_compound(comp_set, dist, k, neighbor_filter,
                                          correct, mismatch,
                                          match_metadata_values, confusion)

    ## Turn correct/mismatch tallies into the cross validated accuracies.
    cross_validated_scores = {
        "acc_nsc": calculate_moa_accuracy(correct_nsc, mismatch_nsc),
        "acc_nscb": calculate_moa_accuracy(correct_nscb, mismatch_nscb),
    }
    if report_confusion_matrix:
        cross_validated_scores["confusion_matrices_nsc"] = (
            confusion_matrices_nsc)
        cross_validated_scores["confusion_matrices_nscb"] = (
            confusion_matrices_nscb)

    return (list_of_time_step_max, cross_validated_scores)
def testDropUnevaluatedComp(self):
    """drop_unevaluated_comp(data) equals pos_controls + experimental rows."""
    expected = pd.concat([self.pos_controls, self.experimental])
    actual = transform.drop_unevaluated_comp(self.data)
    pandas_testing.assert_frame_equal(expected, actual)