def find_time_step_max(emb_df_train, contents, steps):
    """Find the time step at which training accuracy is maximized.

    Notice we do NOT apply post-processing of embeddings (e.g. normalization
    or factor analysis) at this stage.

    Args:
        emb_df_train (pandas dataframe): Dataframe with left-one-out compound.
            Currently this should not include the controls.
        contents (dict): Contents from Wasserstein training routine.
        steps (list): Steps for training.

    Returns:
        time_step_max (int): Time step where average nsc and nscb accuracy for
            k=1...4 is maximized.

    Raises:
        ValueError: If "treatment_group" is missing from the embedding index.
    """
    if "treatment_group" not in emb_df_train.index.names:
        raise ValueError(
            "Must have treatment_group in embeddings index names.")
    # Each appended entry is a one-column dataframe, so after concatenation
    # along axis=1 there is exactly one column per time step.
    acc_nsc = []
    acc_nsc_nsb = []
    # Accumulate accuracies in sorted-step order and remember that order:
    # the argmax below must index into sorted_steps, NOT the caller-supplied
    # (possibly unsorted) steps list. The original code indexed `steps`,
    # which returned the wrong step whenever `steps` was not already sorted.
    sorted_steps = sorted(steps)
    for time_step in sorted_steps:
        ## Right now, we are NOT applying post-processing for cross-validation.
        ## This could be done by setting the values below to the same as the
        ## FLAG values. However, this would be slower, and also (for factor
        ## analysis) we would have to include the controls in emb_df_train,
        ## and drop them later. The reason for this is that they are used to
        ## determine the factor analysis transformation.
        means = transform_and_means(contents, emb_df_train, time_step,
                                    percent_norm=False, factor_analys=False)
        df_moa = evaluate.make_knn_moa_dataframe(means)
        acc_nsc.append(df_moa[["Accuracy NSC"]])
        acc_nsc_nsb.append(df_moa[["Accuracy NSC NSB"]])
    acc_nsc = np.concatenate(acc_nsc, axis=1)
    acc_nsc_nsb = np.concatenate(acc_nsc_nsb, axis=1)
    # Select the time step whose combined (nsc + nscb) column-mean accuracy
    # is largest; index into sorted_steps to match the accumulation order.
    time_step_max = sorted_steps[np.argmax(
        np.mean(acc_nsc, 0) + np.mean(acc_nsc_nsb, 0))]
    return time_step_max
def get_scores_from_means(means, report_knn=True, report_confusion_matrix=True):
    """Compute clustering, accuracy, and confusion-matrix scores from means.

    Args:
        means (pandas dataframe): Means for each treatment.
        report_knn (boolean): Whether or not to compute KNN scores.
        report_confusion_matrix (boolean): Whether or not to include confusion
            matrices.

    Returns:
        dict with the following entries:
            clustering_score (float): Silhouette score on the precomputed
                cosine distance matrix, labeled by MOA.
            knn (dict, optional): Accuracy scores for nsc and nscb, k=1...4.
            confusion_matrix (dict, optional): Confusion matrices for nsc and
                nscb, k=1...4.
    """
    moa_name_index = get_index_for_name(means, "moa")
    dist = distance_analysis.matrix(distance.cosine, means)
    # Silhouette on the precomputed pairwise-distance matrix, grouping
    # treatments by their mechanism of action.
    silhouette = metrics.silhouette_score(
        dist,
        labels=means.index.get_level_values(level=metadata.MOA),
        metric="precomputed")
    results = {"clustering_score": silhouette}
    if report_knn:
        results["knn"] = evaluate.make_knn_moa_dataframe(means).to_dict()
    if report_confusion_matrix:
        # Loop-invariant: the MOA label level is the same for every k.
        moa_labels = dist.index.levels[moa_name_index]
        cm = {"nsc": {}, "nscb": {}}
        for k in range(1, 5):
            cm["nsc"][k] = confusion_matrix_from_dist(
                dist, k, evaluate.not_same_compound_filter, moa_labels)
            cm["nscb"][k] = confusion_matrix_from_dist(
                dist, k, evaluate.not_same_compound_or_batch_filter,
                moa_labels)
        results["confusion_matrix"] = cm
    return results