Esempio n. 1
0
    def test_nn_call_bag(self, use_train_data=False):
        """
        Get a CallBag for the NN classifier on this plaster.run.

        Arguments:
            use_train_data: When True, build the bag from the train_* fields
                of the test_nn result instead of the test_* fields. Use this
                when you want to look at over-fitting.

        Returns:
            A CallBag constructed with classifier_name="nn".

        When use_train_data is True, check.affirm is used to verify that the
        train calls are present (they are None when the test_nn task was run
        without the training set).
        """
        if use_train_data:
            true_pep_iz = self.test_nn.train_true_pep_iz
            pred_pep_iz = self.test_nn.train_pred_pep_iz
            check.affirm(
                true_pep_iz is not None and pred_pep_iz is not None,
                "The test_nn task was not run with the training_set",
            )
            # The scores must come from the same data set as the calls;
            # pairing train calls with test scores would mis-score the bag.
            scores = self.test_nn.train_scores
            cached_pr = self.test_nn.train_peps_pr
        else:
            true_pep_iz = self.test_nn.test_true_pep_iz
            pred_pep_iz = self.test_nn.test_pred_pep_iz
            scores = self.test_nn.test_scores
            cached_pr = self.test_nn.test_peps_pr

        return CallBag(
            true_pep_iz=true_pep_iz,
            pred_pep_iz=pred_pep_iz,
            scores=scores,
            prep_result=self.prep,
            sim_result=self.sim,
            cached_pr=cached_pr,
            classifier_name="nn",
        )
Esempio n. 2
0
    def test_rf_call_bag(self, use_train_data=False):
        """
        Build the CallBag for the RF classifier of this plaster.run.

        Arguments:
            use_train_data: Pass True to read the train_* fields of the
                test_rf result (handy for spotting over-fitting); otherwise
                the test_* fields are used.

        Returns:
            A CallBag constructed with classifier_name="rf".
        """
        rf_result = self.test_rf
        prefix = "train_" if use_train_data else "test_"

        def field(name):
            # Look up the train_/test_ flavour of a test_rf result field
            return getattr(rf_result, prefix + name)

        true_calls = field("true_pep_iz")
        pred_calls = field("pred_pep_iz")
        call_scores = field("scores")
        class_scores = field("all_class_scores")
        peps_pr = field("peps_pr")
        peps_pr_abund = field("peps_pr_abund")

        if use_train_data:
            # The train calls are only present when the task included the training set
            check.affirm(
                not (true_calls is None or pred_calls is None),
                "The test_rf task was not run with the training_set",
            )

        return CallBag(
            true_pep_iz=true_calls,
            pred_pep_iz=pred_calls,
            scores=call_scores,
            all_class_scores=class_scores,
            prep_result=self.prep,
            sim_result=self.sim,
            cached_pr=peps_pr,
            cached_pr_abund=peps_pr_abund,
            classifier_name="rf",
        )
Esempio n. 3
0
 def classify_rf_call_bag(self):
     """
     Build the CallBag for the classify_rf result, i.e. the
     classification of real sigprocv2 data.

     Only predictions and scores are supplied to the bag; no true
     peptide calls and no sim result are passed.
     """
     rf_calls = self.classify_rf
     bag_kwargs = dict(
         pred_pep_iz=rf_calls.pred_pep_iz,
         scores=rf_calls.scores,
         prep_result=self.prep,
         classifier_name="classify_rf",
     )
     return CallBag(**bag_kwargs)
Esempio n. 4
0
def zest_pr_curve_no_tied_scores_mean_recall():
    """
    Fixtures for PR-curve tests where some calls are right, some are wrong,
    and no two scores are tied.

    Each peptide gets a different train_recalls factor (0.1, 0.2, 0.3) so
    that tests can check those factors are included in recall calculations.
    """

    # Index 0 of train_recalls belongs to the null peptide
    per_pep_recalls = np.array([-1.0, 0.1, 0.2, 0.3])
    stub_sim_result = Munch(train_recalls=per_pep_recalls)
    stub_prep_result = Munch(n_peps=4)

    true_pep_iz = np.array([1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3])
    pred_pep_iz = np.array([1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 1])
    scores = np.array(
        [0.8, 0.9, 0.7, 0.6, 0.85, 0.53, 0.54, 0.55, 0.75, 0.4, 0.3, 0.35])
    n_calls = len(pred_pep_iz)

    cb = CallBag(
        true_pep_iz=true_pep_iz,
        pred_pep_iz=pred_pep_iz,
        scores=scores,
        prep_result=stub_prep_result,
        sim_result=stub_sim_result,
    )

    # Running count of correct calls when the calls are visited from the
    # highest score to the lowest, and the running count of calls visited.
    cum_sum_correct = np.array(
        [0.0, 1.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 5.0, 5.0, 6.0])
    cum_sum_count = np.arange(1, n_calls + 1)

    # Precision at each score threshold, with a leading 1.0 entry
    prec_at_thresh = np.append(
        [1.0],
        utils.np_safe_divide(cum_sum_correct, cum_sum_count, default=0),
    )

    # Recall at each score threshold, with a leading 0.0 entry
    recall_at_thresh = np.append([0.0], cum_sum_correct / n_calls)

    zest()
Esempio n. 5
0
def test_nn(test_nn_params,
            prep_result,
            sim_result,
            progress=None,
            pipeline=None):
    """
    Classify the simulated test set with the NN classifier and pre-compute
    per-peptide PR curves for the calls.

    Arguments:
        test_nn_params: Passed to nn(); include_training_set is read here to
            decide whether the training set is classified as well.
        prep_result: Passed to CallBag; peps__no_decoys() selects the
            training rows that belong to non-decoy peptides.
        sim_result: Supplies test_radmat / test_dyemat and, for the optional
            train pass, train_radmat / train_dyemat / train_true_pep_iz.
        progress: Forwarded to nn() and to the PR-curve calls.
        pipeline: Optional. When given, set_phase(i, n_phases) is called as
            each stage starts (3 phases, or 6 with the training set).

    Returns:
        A TestNNResult holding the params plus every field of the test-set
        result under a "test_" prefix and every field of the train-set result
        under a "train_" prefix. The train_ fields are all None when
        include_training_set is false.
    """
    n_phases = 6 if test_nn_params.include_training_set else 3

    def _enter_phase(phase_i):
        # Phase reporting is optional: callers may run without a pipeline
        if pipeline is not None:
            pipeline.set_phase(phase_i, n_phases)

    _enter_phase(0)

    shape = sim_result.test_radmat.shape
    assert len(shape) == 4

    # Merge the two leading axes so every (shape[0], shape[1]) entry
    # becomes one row for the classifier.
    flat_shape = (shape[0] * shape[1], shape[2], shape[3])
    test_radmat = sim_result.test_radmat.reshape(flat_shape)
    test_dyemat = sim_result.test_dyemat.reshape(flat_shape)
    test_result = nn(
        test_nn_params,
        sim_result,
        radmat=test_radmat,
        true_dyemat=test_dyemat,
        progress=progress,
    )

    # The true index of each flattened row is its position on the original
    # leading axis, repeated shape[1] times.
    test_result.true_pep_iz = ArrayResult(
        filename="test_true_pep_iz",
        shape=(shape[0] * shape[1], ),
        dtype=IndexType,
        mode="w+",
    )
    test_result.true_pep_iz[:] = np.repeat(
        np.arange(shape[0]).astype(IndexType), shape[1])
    check.t(test_result.true_pep_iz, ArrayResult)
    check.t(test_result.pred_pep_iz, ArrayResult)

    call_bag = CallBag(
        true_pep_iz=test_result.true_pep_iz.arr(),
        pred_pep_iz=test_result.pred_pep_iz.arr(),
        scores=test_result.scores.arr(),
        prep_result=prep_result,
        sim_result=sim_result,
    )

    _enter_phase(1)
    test_result.peps_pr = call_bag.pr_curve_by_pep(progress=progress)

    # If there is abundance information, compute the abundance-adjusted PR
    # This call returns None if there is no abundance info avail.
    _enter_phase(2)
    test_result.peps_pr_abund = call_bag.pr_curve_by_pep_with_abundance(
        progress=progress)

    if test_nn_params.include_training_set:
        # Permit testing for over-fitting by classifying on the train data
        _enter_phase(3)

        real_pep_iz = prep_result.peps__no_decoys().pep_i.values
        keep_rows = np.isin(sim_result.train_true_pep_iz, real_pep_iz)
        train_radmat = sim_result.train_radmat[keep_rows]
        train_dyemat = sim_result.train_dyemat[keep_rows]

        # NOTE(review): this compares against the un-flattened 4-d test
        # shape, and the train matrices are not reshaped the way the test
        # matrices are above -- confirm this is what nn() expects.
        assert train_radmat.shape == shape

        # Pass the full params object, exactly as the test-set call does
        # (this call previously received only test_nn_params.use_gmm).
        train_result = nn(
            test_nn_params,
            sim_result,
            radmat=train_radmat,
            true_dyemat=train_dyemat,
            progress=progress,
        )

        # NOTE(review): the true indices are rebuilt from the test shape
        # rather than taken from sim_result.train_true_pep_iz[keep_rows] --
        # confirm the train rows really follow this layout.
        train_result.true_pep_iz = ArrayResult(
            filename="train_true_pep_iz",
            shape=(shape[0] * shape[1], ),
            dtype=IndexType,
            mode="w+",
        )
        train_result.true_pep_iz[:] = np.repeat(
            np.arange(shape[0]).astype(IndexType), shape[1])
        check.t(train_result.true_pep_iz, ArrayResult)
        check.t(train_result.pred_pep_iz, ArrayResult)

        call_bag = CallBag(
            true_pep_iz=train_result.true_pep_iz.arr(),
            pred_pep_iz=train_result.pred_pep_iz.arr(),
            scores=train_result.scores.arr(),
            prep_result=prep_result,
            sim_result=sim_result,
        )

        _enter_phase(4)
        train_result.peps_pr = call_bag.pr_curve_by_pep(progress=progress)

        _enter_phase(5)
        train_result.peps_pr_abund = call_bag.pr_curve_by_pep_with_abundance(
            progress=progress)

    else:
        # Keep the result schema stable: same keys as the test result, all None
        train_result = {k: None for k in test_result.keys()}

    def rename(d, prefix):
        return {f"{prefix}{k}": v for k, v in d.items()}

    return TestNNResult(
        params=test_nn_params,
        **rename(test_result, "test_"),
        **rename(train_result, "train_"),
    )
Esempio n. 6
0
def zest_pr_curve_no_tied_scores():
    """
    PR-curve tests for a CallBag that holds a mix of right and wrong calls
    with no tied scores.
    """

    # Index 0 of train_recalls belongs to the null peptide
    stub_sim_result = Munch(train_recalls=np.array([-1.0, 1.0, 1.0, 1.0, 1.0]))
    stub_prep_result = Munch(n_peps=5)

    # Peptide 1 is called correctly 1 time in 4, peptide 2 in 2 of 4,
    # peptide 3 in 3 of 4.
    true_pep_iz = np.array([1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3])
    pred_pep_iz = np.array([1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 1])
    scores = np.array(
        [0.8, 0.9, 0.7, 0.6, 0.85, 0.53, 0.54, 0.55, 0.75, 0.4, 0.3, 0.35])

    cb = CallBag(
        true_pep_iz=true_pep_iz,
        pred_pep_iz=pred_pep_iz,
        scores=scores,
        prep_result=stub_prep_result,
        sim_result=stub_sim_result,
    )

    # The calls ordered from highest to lowest score:
    #   true    [1.  2.   1.  3.   1.  1.  2.   2.   2.   3.  3.   3. ]
    #   pred    [2.  2.   1.  3.   2.  2.  3.   3.   2.   3.  1.   3. ]
    #   score   [0.9 0.85 0.8 0.75 0.7 0.6 0.55 0.54 0.53 0.4 0.35 0.3]
    #   correct [F   T    T   T    F   F   F    F    T    T   F    T  ]
    #   running [0   1    2   3    3   3   3    3    4    5   5    6  ]
    #
    # The tests below expect pr_curve(n_steps=10) to return 9 points whose
    # scores step linearly from 0.8 down to 0.0; the step at 0.9, where
    # nothing has been recalled yet (p=0/1, r=0/12), is not part of the
    # expected output.
    score_steps = np.linspace(0.8, 0.0, 9)

    # Per step (0.8, 0.7, ..., 0.0): correct calls so far / calls so far
    correct_at_step = np.array([2.0, 3.0, 3.0, 4.0, 5.0, 6.0, 6.0, 6.0, 6.0])
    calls_at_step = np.array([3.0, 5.0, 6.0, 9.0, 10.0, 12.0, 12.0, 12.0, 12.0])
    all_prec = correct_at_step / calls_at_step
    all_reca = correct_at_step / 12.0

    def it_computes_combined_pr():
        p, r, s, _ = cb.pr_curve(n_steps=10)
        assert np.array_equal(p, all_prec)
        assert np.allclose(s, score_steps)
        assert np.array_equal(r, all_reca)

    def it_computes_subset_pr():
        #
        # Subset = peptide 1. The calls where true or pred is 1:
        #   true  [1.  1.  1.  1.  3.  ]
        #   pred  [2.  1.  2.  2.  1.  ]
        #   score [0.9 0.8 0.7 0.6 0.35]
        #
        # Precision only looks at the calls predicted as 1: one correct
        # call from 0.8 down, joined by a wrong one at 0.35.
        # Recall stays at 1 of the 4 true peptide-1 rows throughout.
        #
        p, r, s, _ = cb.pr_curve(pep_iz_subset=[1], n_steps=10)

        pep1_prec = np.array([1.0] * 5 + [1.0 / 2.0] * 4)
        pep1_reca = np.array([1.0 / 4.0] * 9)

        assert np.array_equal(p, pep1_prec)
        assert np.array_equal(r, pep1_reca)
        assert np.allclose(s, score_steps)

        #
        # Subset = peptide 2. The calls where true or pred is 2:
        #   true  [1.  2.   1.  1.  2.   2.   2.  ]
        #   pred  [2.  2.   2.  2.  3.   3.   2.  ]
        #   score [0.9 0.85 0.7 0.6 0.55 0.54 0.53]
        #
        # Calls predicted as 2: 1 of 2 correct at step 0.8, then 1/3, 1/4,
        # and 2/5 once the 0.53 call is included.
        # Recall is 1 of 4 until that 0.53 call, then 2 of 4.
        #
        p, r, s, _ = cb.pr_curve(pep_iz_subset=[2], n_steps=10)

        pep2_prec = np.array(
            [1.0 / 2.0, 1.0 / 3.0, 1.0 / 4.0] + [2.0 / 5.0] * 6)
        pep2_reca = np.array([1.0 / 4.0] * 3 + [2.0 / 4.0] * 6)

        assert np.array_equal(p, pep2_prec)
        assert np.array_equal(r, pep2_reca)
        assert np.allclose(s, score_steps)

    zest()
Esempio n. 7
0
def zest_pr_curve_edge_cases():
    """
    Testing situations in which calls are all wrong or all right.

    Two CallBags share the same true calls (ten rows of peptide 1) and the
    same scores (all 0.5); one predicts peptide 1 everywhere, the other
    predicts peptide 2 everywhere.
    """

    # first entry is null peptide
    stub_sim_result = Munch(train_recalls=np.array([-1.0, 1.0, 1.0]))
    stub_prep_result = Munch(n_peps=3)

    # CallBag: all right / all wrong
    true_pep_iz = [1] * 10
    pred_pep_iz_1 = [1] * 10
    pred_pep_iz_2 = [2] * 10
    scores = [0.5] * 10
    cb_all_right = CallBag(
        sim_result=stub_sim_result,
        prep_result=stub_prep_result,
        true_pep_iz=true_pep_iz,
        pred_pep_iz=pred_pep_iz_1,
        scores=scores,
    )
    cb_all_wrong = CallBag(
        sim_result=stub_sim_result,
        prep_result=stub_prep_result,
        true_pep_iz=true_pep_iz,
        pred_pep_iz=pred_pep_iz_2,
        scores=scores,
    )

    # Expected values, in the order pr_curve() returns them (unpacked below
    # as p, r, s, a) for n_steps=2.
    zero_pr_result = [[0.0, 0.0], [0.0, 0.0], [0.5, 0.0], [0.0, 0.0]]
    one_pr_result = [[1.0, 1.0], [1.0, 1.0], [0.5, 0.0], [1.0, 1.0]]

    def it_computes_zero_pr():
        p, r, s, a = cb_all_wrong.pr_curve(n_steps=2)
        assert utils.np_array_same([p, r, s, a], zero_pr_result)

    def it_computes_zero_pr_for_subset():
        p, r, s, a = cb_all_wrong.pr_curve(pep_iz_subset=[1], n_steps=2)
        assert utils.np_array_same([p, r, s, a], zero_pr_result)

        p, r, s, a = cb_all_wrong.pr_curve(pep_iz_subset=[2], n_steps=2)
        assert utils.np_array_same([p, r, s, a], zero_pr_result)

        # peptide 2 does not show up in true/pred at all so should get zero pr curve
        p, r, s, a = cb_all_right.pr_curve(pep_iz_subset=[2], n_steps=2)
        assert utils.np_array_same([p, r, s, a], zero_pr_result)

    def it_computes_one_pr():
        p, r, s, a = cb_all_right.pr_curve(n_steps=2)
        # Compare with the same helper as the zero-PR tests. The previous
        # all() over a list of per-array comparison lists was always true,
        # because a non-empty list is truthy whatever it contains.
        assert utils.np_array_same([p, r, s, a], one_pr_result)

    def it_computes_one_pr_for_subset():
        p, r, s, a = cb_all_right.pr_curve(pep_iz_subset=[1], n_steps=2)
        # See it_computes_one_pr: a real element-wise comparison is needed
        assert utils.np_array_same([p, r, s, a], one_pr_result)

    def it_handles_all_rows_no_recall():
        p, r, s, a = cb_all_wrong.pr_curve()
        assert np.all(r == 0.0)

    zest()
Esempio n. 8
0
 def it_computes_prs_with_ties():
     # Three consecutive entries share precision 0.2 (and score 0.6); the
     # expected pick for a requested precision of 0.2 is the first of them.
     precisions = np.array([0.3, 0.2, 0.2, 0.2, 0.1])
     recalls = np.array([0.1, 0.2, 0.3, 0.4, 0.5])
     thresholds = np.array([0.9, 0.6, 0.6, 0.6, 0.5])
     expected = (0.2, 0.2, 0.6)
     found = CallBag._prs_at_prec(0.2, precisions, recalls, thresholds)
     assert found == expected
Esempio n. 9
0
def test_rf(
    test_rf_params,
    prep_result,
    sim_result,
    train_rf_result,
    progress=None,
    pipeline=None,
):
    """
    Classify the simulated test set with a trained RF classifier and
    pre-compute per-peptide PR curves for the calls.

    Arguments:
        test_rf_params: include_training_set decides whether the training
            set is classified as well; keep_all_class_scores is forwarded to
            classifier.classify().
        prep_result: Passed to CallBag; peps__no_decoys() selects the
            training rows that belong to non-decoy peptides.
        sim_result: Supplies flat_test_radmat() and test_true_pep_iz() and,
            for the optional train pass, train_radmat / train_true_pep_iz.
        train_rf_result: Its classifier attribute does the classification.
        progress: Forwarded to classify() and to the PR-curve calls.
        pipeline: Optional. When given, set_phase(i, n_phases) is called as
            each stage starts (3 phases, or 6 with the training set).

    Returns:
        A TestRFResult holding the params and the test_* fields. The train_*
        fields are all None when include_training_set is false.
    """
    n_phases = 6 if test_rf_params.include_training_set else 3
    classifier = train_rf_result.classifier

    def _enter_phase(phase_i):
        # Phase reporting is optional: callers may run without a pipeline
        if pipeline is not None:
            pipeline.set_phase(phase_i, n_phases)

    _enter_phase(0)

    test_pred_pep_iz, test_scores, test_all_class_scores = classifier.classify(
        sim_result.flat_test_radmat(), test_rf_params.keep_all_class_scores,
        progress)
    test_true_pep_iz = sim_result.test_true_pep_iz()

    # We do some PR calculation during the task so that this information is readily
    # available in results & notebooks don't need to recompute it (costly).
    # TODO: it is probably worth optimizing this by only doing PR for proteins of
    # interest if this has been specified for the run, since otherwise we'll be
    # computing full PR curves for every peptide in the background which is
    # probably not interesting.
    #
    call_bag = CallBag(
        true_pep_iz=test_true_pep_iz,
        pred_pep_iz=test_pred_pep_iz,
        scores=test_scores,
        all_class_scores=test_all_class_scores,
        prep_result=prep_result,
        sim_result=sim_result,
    )

    _enter_phase(1)
    test_peps_pr = call_bag.pr_curve_by_pep(progress=progress)

    # If there is abundance information, compute the abundance-adjusted PR
    # This call returns None if there is no abundance info avail.
    # Phase 2 starts here, mirroring phases 4/5 of the train pass below;
    # it used to be entered immediately after phase 1, before any PR work.
    _enter_phase(2)
    test_peps_pr_abund = call_bag.pr_curve_by_pep_with_abundance(
        progress=progress)

    if test_rf_params.include_training_set:
        # Permit testing for over-fitting by classifying on the train data
        _enter_phase(3)

        real_pep_iz = prep_result.peps__no_decoys().pep_i.values

        # Keep only the training rows whose true peptide is not a decoy
        keep_rows = np.isin(sim_result.train_true_pep_iz, real_pep_iz)

        train_true_pep_iz = sim_result.train_true_pep_iz[keep_rows]
        train_radmat = sim_result.train_radmat[keep_rows]

        train_pred_pep_iz, train_scores, train_all_class_scores = classifier.classify(
            train_radmat, test_rf_params.keep_all_class_scores, progress)

        call_bag = CallBag(
            true_pep_iz=train_true_pep_iz,
            pred_pep_iz=train_pred_pep_iz,
            scores=train_scores,
            all_class_scores=train_all_class_scores,
            prep_result=prep_result,
            sim_result=sim_result,
        )

        _enter_phase(4)
        train_peps_pr = call_bag.pr_curve_by_pep(progress=progress)

        _enter_phase(5)
        train_peps_pr_abund = call_bag.pr_curve_by_pep_with_abundance(
            progress=progress)

    else:
        # No train pass was requested: every train_* field is reported as None
        train_pred_pep_iz = train_scores = train_all_class_scores = None
        train_true_pep_iz = train_peps_pr = train_peps_pr_abund = None

    return TestRFResult(
        params=test_rf_params,
        test_true_pep_iz=test_true_pep_iz,
        test_pred_pep_iz=test_pred_pep_iz,
        test_scores=test_scores,
        test_all_class_scores=test_all_class_scores,
        test_peps_pr=test_peps_pr,
        test_peps_pr_abund=test_peps_pr_abund,
        train_true_pep_iz=train_true_pep_iz,
        train_pred_pep_iz=train_pred_pep_iz,
        train_scores=train_scores,
        train_all_class_scores=train_all_class_scores,
        train_peps_pr=train_peps_pr,
        train_peps_pr_abund=train_peps_pr_abund,
    )