Example #1
0
def plot_roc_and_precision_and_save_data(per_read_labels_only, per_read_probs_only, name, variants, save_fig_dir,
                                         label_ids=None, threshold=0.5):
    """Plot ROC, precision-recall, confusion matrix and probability histogram per variant.

    A single ClassificationMetrics object is built from the labels and
    probabilities and then used to draw the four plots for every entry of
    ``variants``. When ``save_fig_dir`` is truthy, each figure is given a path
    inside that directory and the metrics object itself is pickled there as
    ``<name>_classificationMetrics.pkl``; otherwise ``None`` is passed as the
    figure path and nothing is pickled.

    :param per_read_labels_only: label data handed to ClassificationMetrics
    :param per_read_probs_only: probability data handed to ClassificationMetrics
    :param name: prefix used in plot titles and output file names
    :param variants: iterable of class names to plot
    :param save_fig_dir: output directory, or a falsy value to skip saving
    :param label_ids: optional label ids forwarded to ClassificationMetrics
    :param threshold: threshold forwarded to plot_confusion_matrix
    :return: 0
    """
    roc_h = ClassificationMetrics(per_read_labels_only, per_read_probs_only, label_ids=label_ids)

    def _fig_path(kind, variant):
        # No output directory means the plotting methods receive None as the path.
        if not save_fig_dir:
            return None
        return os.path.join(save_fig_dir, "{}_{}_{}".format(name, kind, variant))

    for variant in variants:
        roc_h.plot_roc(variant,
                       title="{} ROC for {}".format(name, variant),
                       save_fig_path=_fig_path("roc", variant))
        # Title previously read "Precison Recall" (typo).
        roc_h.plot_precision_recall(variant,
                                    title="{} Precision Recall for {}".format(name, variant),
                                    save_fig_path=_fig_path("pr", variant))
        roc_h.plot_confusion_matrix(title="{} Confusion Matrix for {}".format(name, variant),
                                    save_fig_path=_fig_path("confusion", variant),
                                    threshold=threshold, class_n=variant)
        # One bin per 30 probabilities, but never fewer than 10 bins.
        bins = max(len(roc_h.class_probabilities[variant]) // 30, 10)
        roc_h.plot_probability_hist(variant, save_fig_path=_fig_path("prob_hist", variant),
                                    bins=bins, normalize=False)

    # save pickle of classification metrics class
    if save_fig_dir:
        pickle_path = os.path.join(save_fig_dir, "{}_classificationMetrics.pkl".format(name))
        with open(pickle_path, "wb") as pickle_file:
            pickle.dump(roc_h, pickle_file)
    return 0
    def test_plot_multiclass_roc2(self):
        """plot_multiclass_roc writes a figure file for three-class generated data."""
        class_names = ["A", "B", "C"]
        with captured_output() as (_, _):
            labels, probabilities = generate_perfect_label_prob_data(50, class_names)
            metrics = ClassificationMetrics(labels, probabilities)

            # The directory (and the figure in it) is removed when the block exits.
            with tempfile.TemporaryDirectory() as scratch_dir:
                figure_path = os.path.join(scratch_dir, "test.png")
                metrics.plot_multiclass_roc(save_fig_path=figure_path)
                figure_written = os.path.exists(figure_path)
                self.assertTrue(figure_written)
    def test_plot_calibration_curve(self):
        """plot_calibration_curve for class "A" writes a figure file."""
        class_names = ["A", "B"]
        with captured_output() as (_, _):
            labels, probabilities = generate_random_label_prob_data(50, class_names)
            metrics = ClassificationMetrics(labels, probabilities)

            # The directory (and the figure in it) is removed when the block exits.
            with tempfile.TemporaryDirectory() as scratch_dir:
                figure_path = os.path.join(scratch_dir, "test.png")
                metrics.plot_calibration_curve(class_n="A", save_fig_path=figure_path)
                figure_written = os.path.exists(figure_path)
                self.assertTrue(figure_written)
    def test_confusion_matrix(self):
        """The confusion matrix of four-class generated data is non-zero only on its diagonal."""
        n_classes = 4
        with captured_output() as (_, _):
            labels, probabilities = generate_perfect_label_prob_data(50, ["A", "B", "C", "D"])
            metrics = ClassificationMetrics(labels, probabilities)

            matrix = metrics.confusion_matrix()
            for row in range(n_classes):
                # Diagonal entry must be populated ...
                self.assertTrue(matrix[row][row] != 0)
                # ... and every other entry in the row must be empty.
                for col in range(n_classes):
                    if col != row:
                        self.assertTrue(matrix[row][col] == 0)
    @classmethod
    def setUpClass(cls):
        """Build the ClassificationMetrics fixtures shared by the tests in this class.

        unittest invokes this hook on the class itself, so it has to be a
        classmethod. Sets ``HOME`` (directory containing this file),
        ``label_names``, and three metrics objects: ``cm_h`` (four classes,
        random generator), ``perfect_binary`` (two classes, "perfect"
        generator) and ``fifty_fifty`` (two classes, 50/50 generator).
        """
        super(ClassificationTests, cls).setUpClass()
        # dirname works with any path separator; splitting on "/" returned an
        # empty string for backslash-separated paths.
        cls.HOME = os.path.dirname(os.path.abspath(__file__))
        cls.label_names = get_random_strings(50, 10)

        cls.label_data, cls.prob_data = generate_random_label_prob_data(
            50, list("ABCD"))
        cls.cm_h = ClassificationMetrics(cls.label_data,
                                         cls.prob_data,
                                         label_ids=cls.label_names)

        cls.label_data2, cls.prob_data2 = generate_perfect_label_prob_data(
            50, list("AB"))
        cls.perfect_binary = ClassificationMetrics(cls.label_data2,
                                                   cls.prob_data2,
                                                   label_ids=cls.label_names)

        cls.label_data3, cls.prob_data3 = generate_50_50_prob_data(
            50, list("AB"))
        cls.fifty_fifty = ClassificationMetrics(cls.label_data3,
                                                cls.prob_data3,
                                                label_ids=cls.label_names)
Example #6
0
def _select_contig_strand(frame, contig, strand):
    """Return the rows of ``frame`` whose "contig" and "strand" columns match."""
    return frame.loc[(frame["contig"] == contig) & (frame["strand"] == strand)]


def _report_per_site_metrics(site_data, output_dir, file_prefix, log_file):
    """Plot ROC and precision-recall for class "E" and log the confusion matrix.

    ``site_data`` must carry the label columns "C_label"/"E_label" and the
    prediction columns "C"/"E". Figures are written to
    ``<output_dir>/<file_prefix>_roc.png`` and
    ``<output_dir>/<file_prefix>_precision_recall.png``.
    """
    # Non-mutating rename: the label frame is a slice of site_data.
    label_data = site_data.loc[:, ["C_label", "E_label"]].rename(
        columns={"C_label": "C", "E_label": "E"})
    prediction_data = site_data.loc[:, ["C", "E"]]
    cmh = ClassificationMetrics(label_data, prediction_data)
    cmh.plot_roc("E", os.path.join(output_dir, "{}_roc.png".format(file_prefix)))
    cmh.plot_precision_recall("E", os.path.join(output_dir, "{}_precision_recall.png".format(file_prefix)))
    print("Per-genomic-site confusion matrix", file=log_file)
    print(cmh.confusion_matrix(), file=log_file)


def main():
    """Write per-call and per-genomic-site CpG classification reports.

    Loads aggregated calls for a modified and a canonical sample, restricts
    them to the positions listed in the CpG positions file, and for every
    contig/strand pair (and once for everything combined) writes confusion
    matrices to a log file and saves confusion-matrix, ROC and
    precision-recall figures into the output directory.
    """
    cpg_positions_file = "/Users/andrewbailey/data/references/ecoli/CG_ecoli_k12_mg1655_C_E.positions"
    # NOTE(review): the modified and canonical directories are the same path;
    # confirm this is intended, otherwise both data sets are identical.
    modified_deepmod_output_dir = "/Users/andrewbailey/CLionProjects/DeepMod/ecoli_pcr_MSssI_R9"
    canonical_deepmod_output_dir = "/Users/andrewbailey/CLionProjects/DeepMod/ecoli_pcr_MSssI_R9"

    output_dir = "/Users/andrewbailey/CLionProjects/modification_detection_pipeline/output_dir/plotting_output/"
    log_file_path = os.path.join(output_dir, "confusion_matrices_file.txt")
    cpg_positions = CustomAmbiguityPositions(cpg_positions_file)

    # Canonical sample: every site is labelled C (unmodified).
    canonical_data = aggregate_deepmod_data(canonical_deepmod_output_dir)
    canonical_data["E_label"] = 0
    canonical_data["C_label"] = 1

    # Modified sample: every site is labelled E (modified).
    modified_data = aggregate_deepmod_data(modified_deepmod_output_dir)
    modified_data["E_label"] = 1
    modified_data["C_label"] = 0

    tps = 0
    fps = 0
    tns = 0
    fns = 0
    all_data = []
    with open(log_file_path, "w") as log_file:
        chromosomes = set(modified_data["contig"]) | set(canonical_data["contig"])
        strands = set(modified_data["strand"]) | set(canonical_data["strand"])
        for chromosome in chromosomes:
            for strand in strands:
                # get positions for strand and contig
                sc_positions = _select_contig_strand(cpg_positions.ambig_df, chromosome, strand)
                # pare data sets for specific contig and strand to get positions that are cpgs
                mod_sc_data = _select_contig_strand(modified_data, chromosome, strand)
                mod_methylation_calls = mod_sc_data.loc[mod_sc_data["start_position"].isin(sc_positions["position"])]
                canon_sc_data = _select_contig_strand(canonical_data, chromosome, strand)
                canon_methylation_calls = canon_sc_data.loc[
                    canon_sc_data["start_position"].isin(sc_positions["position"])]

                # per call
                # Canonical sample is labelled unmodified, so each modified
                # call on it is a false positive and the rest are true negatives.
                n_canonical_calls = canon_methylation_calls["n_reads"].sum()
                n_false_positives = canon_methylation_calls["n_mod_calls"].sum()
                n_true_negatives = n_canonical_calls - n_false_positives

                # Modified sample is labelled modified, so each modified call
                # is a true positive and every missed call is a false negative.
                n_modified_calls = mod_methylation_calls["n_reads"].sum()
                n_true_positives = mod_methylation_calls["n_mod_calls"].sum()
                n_false_negatives = n_modified_calls - n_true_positives

                tps += n_true_positives
                fps += n_false_positives
                tns += n_true_negatives
                fns += n_false_negatives
                print("Chromosome {} strand {}:".format(chromosome, strand), file=log_file)
                print("Per-call confusion matrix", file=log_file)
                print(print_confusion_matrix(n_true_positives, n_false_positives, n_false_negatives, n_true_negatives),
                      file=log_file)
                plot_confusion_matrix(n_true_positives, n_false_positives, n_false_negatives, n_true_negatives,
                                      normalize=True,
                                      output_path=os.path.join(
                                          output_dir,
                                          "per_call_{}_{}_confusion_matrix.png".format(strand, chromosome)),
                                      title="Per call CpG Normalized Confusion Matrix {}{}".format(strand, chromosome))

                # per genomic position
                chr_strand_data = pd.concat([canon_methylation_calls, mod_methylation_calls])
                _report_per_site_metrics(chr_strand_data, output_dir,
                                         "per_genomic_site_{}_{}".format(chromosome, strand), log_file)

                all_data.append(chr_strand_data)

        print("All Chromosomes both strands:", file=log_file)
        print("Per-call confusion matrix", file=log_file)
        print(print_confusion_matrix(tps, fps, fns, tns),
              file=log_file)
        plot_confusion_matrix(tps, fps, fns, tns,
                              normalize=True,
                              output_path=os.path.join(output_dir,
                                                       "all_calls_confusion_matrix.png"),
                              title="All calls CpG "
                                    "Normalized Confusion Matrix")
        # The precision-recall file name previously lacked the underscore
        # between "all_chromosomes" and "precision_recall".
        _report_per_site_metrics(pd.concat(all_data), output_dir,
                                 "per_genomic_site_all_chromosomes", log_file)