Example #1
import numpy as np  # assumed module-level import; used by concatenate/mean/argmax below


def find_time_step_max(emb_df_train, contents, steps):
    """Find the time step where training accuracy is maximized.

    Note that we do NOT apply post-processing of embeddings (e.g. normalization
    or factor analysis) at this stage.

    Args:
      emb_df_train (pandas dataframe): Dataframe with one compound left out
        (leave-one-out). Currently this should not include the controls.
      contents (dict): Contents from the Wasserstein training routine.
      steps (list): Training steps to evaluate.

    Returns:
      time_step_max (int): Time step at which the average NSC and NSCB accuracy
        for k=1...4 is maximized.
    """
    if "treatment_group" not in emb_df_train.index.names:
        raise ValueError(
            "Must have treatment_group in embeddings index names.")
    acc_nsc = []
    acc_nsc_nsb = []
    sorted_steps = sorted(steps)
    for time_step in sorted_steps:
        ## Right now, we are NOT applying post-processing for cross-validation.
        ## This could be done by setting the values below to the same as the FLAG
        ## values. However, this would be slower, and also (for factor analysis)
        ## we would have to include the controls in emb_df_train, and drop them
        ## later. The reason for this is that they are used to determine the
        ## factor analysis transformation.
        means = transform_and_means(contents,
                                    emb_df_train,
                                    time_step,
                                    percent_norm=False,
                                    factor_analys=False)
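        # k-NN MOA classification accuracies (NSC and NSC NSB columns, k=1...4)
        # at this time step.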
        df_moa = evaluate.make_knn_moa_dataframe(means)
        acc_nsc.append(df_moa[["Accuracy NSC"]])
        acc_nsc_nsb.append(df_moa[["Accuracy NSC NSB"]])
    acc_nsc = np.concatenate(acc_nsc, axis=1)
    acc_nsc_nsb = np.concatenate(acc_nsc_nsb, axis=1)
    # select time step by average accuracy
    time_step_max = sorted_steps[np.argmax(
        np.mean(acc_nsc, 0) + np.mean(acc_nsc_nsb, 0))]
    return time_step_max
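A minimal usage sketch, assuming `embeddings_df` is a per-well embeddings dataframe indexed by "treatment_group" and `contents` comes from the Wasserstein training routine; the variable names, the "control" label, and the step values are assumptions for illustration, not part of the library:

# Hypothetical driver code: `embeddings_df`, `contents`, and the step list are
# placeholders for objects produced elsewhere in the pipeline.
candidate_steps = [1000, 2000, 5000, 10000]
emb_df_train = embeddings_df.drop("control", level="treatment_group",
                                  errors="ignore")  # drop controls, if present
best_step = find_time_step_max(emb_df_train, contents, candidate_steps)
print("Time step with highest mean NSC + NSCB accuracy:", best_step)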
Example #2
from scipy.spatial import distance  # assumed module-level import (distance.cosine)
from sklearn import metrics  # assumed module-level import (metrics.silhouette_score)


def get_scores_from_means(means,
                          report_knn=True,
                          report_confusion_matrix=True):
    """Get confusion matrices, accuracy scores, and clustering score.

    Args:
      means (pandas dataframe): Means for each treatment.
      report_knn (boolean): Whether or not to compute KNN scores.
      report_confusion_matrix (boolean): Whether or not to include the
        confusion matrices.

    Returns:
      dict containing the following:
        confusion_matrix (dict): Confusion matrices for NSC and NSCB and k=1...4.
        knn (dict): Accuracy scores for NSC and NSCB and k=1...4.
        clustering_score (float): Silhouette score computed on the precomputed
          cosine-distance matrix using MOA labels.
    """
    moa_name_index = get_index_for_name(means, "moa")
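    # Pairwise cosine-distance matrix between treatment means; its rows/columns
    # follow the `means` index, so MOA labels can be read off the index below.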
    dist = distance_analysis.matrix(distance.cosine, means)
    clustering_score = metrics.silhouette_score(
        dist,
        labels=means.index.get_level_values(level=metadata.MOA),
        metric="precomputed")
    output_dict = {"clustering_score": clustering_score}

    if report_knn:
        knn_df = evaluate.make_knn_moa_dataframe(means)
        output_dict.update({"knn": knn_df.to_dict()})

    if report_confusion_matrix:
        confusion_matrix = {"nsc": {}, "nscb": {}}
        for k in range(1, 5):
            confusion_matrix["nsc"][k] = confusion_matrix_from_dist(
                dist, k, evaluate.not_same_compound_filter,
                dist.index.levels[moa_name_index])
            confusion_matrix["nscb"][k] = confusion_matrix_from_dist(
                dist, k, evaluate.not_same_compound_or_batch_filter,
                dist.index.levels[moa_name_index])
        output_dict.update({"confusion_matrix": confusion_matrix})
    return output_dict
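A minimal, hedged usage sketch: the toy `means` dataframe below is an assumption for illustration (the index level names "moa", "compound", "batch" are guesses at the expected metadata), and it still relies on the project's distance_analysis, metadata, and evaluate modules being importable:

import numpy as np
import pandas as pd

# Toy per-treatment mean embeddings with hypothetical metadata index levels.
index = pd.MultiIndex.from_tuples(
    [("moa_a", "cmpd_1", "batch_1"),
     ("moa_a", "cmpd_2", "batch_2"),
     ("moa_b", "cmpd_3", "batch_1"),
     ("moa_b", "cmpd_4", "batch_2")],
    names=["moa", "compound", "batch"])
means = pd.DataFrame(np.random.RandomState(0).randn(4, 8), index=index)

scores = get_scores_from_means(means, report_knn=False,
                               report_confusion_matrix=False)
print("Silhouette (cosine, precomputed):", scores["clustering_score"])

With both report flags off, only the silhouette path runs, which keeps the toy example small; the k-NN and confusion-matrix paths need more treatments per MOA to be meaningful.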