def zest_survey_v2_pyx():
    # TODO: This
    prep_result = prep_fixtures.result_simple_fixture(True)
    sim_v2_result = SimV2Result.from_prep_fixture(prep_result, labels="A,B")

    # Expected dyetracks (5 cycles of ch0, then 5 cycles of ch1):
    # pep 0:
    # pep 1: 10000 11000
    # pep 2: 00000 21100
    # pep 3: 00000 21000

    pep_i_to_mic_pep_i, pep_i_to_isolation_metric = survey_v2_fast.survey(
        prep_result.n_peps,
        sim_v2_result.train_dyemat,
        sim_v2_result.train_dyepeps,
        n_threads=1,
        progress=None,
    )

    assert pep_i_to_mic_pep_i.tolist() == [0, 3, 3, 2]

    # In the current version they are all close.
    # The first peptide should be a long way away and the other two should collide.
    assert pep_i_to_isolation_metric[1] < 2
    assert pep_i_to_isolation_metric[2] < 2
    assert pep_i_to_isolation_metric[3] < 2

    # TODO: Do a better job here: compare some contrived peptides and make
    # sure that the outliers are outliers. Also need to do the sampling to
    # figure out the magic number for "nothing close".
    # TODO: Test for unlabelled peptides. I'm sure it is broken.

    zest()
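
# For orientation, a minimal sketch of the dyemat/dyepeps layout that the
# survey call above consumes. The column conventions are inferred from the
# assertions in this file (not from the survey_v2_fast source), so treat this
# as an assumption; _example_dyemat_dyepeps is a hypothetical helper for
# illustration only.
def _example_dyemat_dyepeps():
    # train_dyemat: one row per dyetrack; the 5 cycles of ch0 followed by the
    # 5 cycles of ch1, flattened into 10 uint8 counts. Row 0 is the nul
    # (all-dark) track.
    dyemat = np.array(
        [
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # dyt 0: nul track
            [1, 0, 0, 0, 0, 1, 1, 0, 0, 0],  # dyt 1: "10000 11000" in the notation above
        ],
        dtype=np.uint8,
    )
    # train_dyepeps: one row per (dyetrack, peptide) pair, assumed columns
    # [dyt_i, pep_i, n_reads].
    dyepeps = np.array([[0, 0, 0], [1, 1, 5000]], dtype=np.uint64)
    return dyemat, dyepeps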
def zest_call_bag_fdr():
    stub_sim_result = Munch(train_pep_recalls=np.array([-1.0, 0.1, 0.2, 0.3]))
    stub_prep_result = prep_fixtures.result_simple_fixture(has_decoy=True)

    true_pep_iz = np.array([1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3])
    pred_pep_iz = np.array([1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 1])
    scores = np.array(
        [0.8, 0.9, 0.7, 0.6, 0.85, 0.53, 0.54, 0.55, 0.75, 0.4, 0.3, 0.35]
    )

    cb = CallBag(
        sim_result=stub_sim_result,
        prep_result=stub_prep_result,
        true_pep_iz=true_pep_iz,
        pred_pep_iz=pred_pep_iz,
        scores=scores,
    )

    # Smoke test: exercises fdr_from_decoys_df but does not assert its values
    fdr_df = cb.fdr_from_decoys_df()
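
# For intuition about what fdr_from_decoys_df is estimating: a minimal sketch
# of the classic target-decoy FDR estimate. This is an assumption about the
# flavor of the computation, not the actual CallBag implementation, and
# _sketch_fdr_from_decoys is a hypothetical helper for illustration only.
def _sketch_fdr_from_decoys(scores, pred_is_decoy):
    # Walk the calls from highest to lowest score; at each prefix the FDR is
    # estimated as (# decoy hits) / (# target hits), since a decoy hit models
    # a random false match among the targets.
    order = np.argsort(scores)[::-1]  # descending by score
    is_decoy = np.asarray(pred_is_decoy, dtype=bool)[order]
    n_decoy = np.cumsum(is_decoy)
    n_target = np.cumsum(~is_decoy)
    return n_decoy / np.maximum(n_target, 1)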
def zest_sim_v2_worker():
    prep_result = prep_fixtures.result_simple_fixture()

    def _sim(priors=None, _prep_result=None, sim_kwargs=None):
        if _prep_result is None:
            _prep_result = prep_result
        priors = PriorsMLEFixtures.fixture_no_errors(**(priors or {}))
        if sim_kwargs is None:
            sim_kwargs = {}
        sim_kwargs["use_lognormal_model"] = True
        sim_v2_params = SimV2Params.from_aa_list_fixture(
            ["A", "B"], priors=priors, n_edmans=4, **sim_kwargs
        )
        return sim_v2_worker.sim_v2(sim_v2_params, _prep_result), sim_v2_params

    @zest.retry(2)
    def it_returns_train_dyemat():
        # Because there are no errors, each peptide produces only its perfect dyetrack
        sim_v2_result, sim_v2_params = _sim()
        assert sim_v2_result.train_dyemat.shape == (4, 5 * 2)  # 5 cycles, 2 channels
        assert utils.np_array_same(
            sim_v2_result.train_dyemat,
            np.array(
                [
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0, 2, 1, 0, 0, 0],
                    [0, 0, 0, 0, 0, 2, 1, 1, 0, 0],
                    [1, 0, 0, 0, 0, 1, 1, 0, 0, 0],
                ],
                dtype=np.uint8,
            ),
        )

    def it_returns_train_dyemat_with_a_zero_row():
        sim_v2_result, sim_v2_params = _sim()
        assert np.all(sim_v2_result.train_dyemat[0, :] == 0)

    @zest.retry(2)
    def it_returns_train_dyemat_for_cleaved_cterm_labels():
        prep_cterm = prep_fixtures.result_cterm_label_fixture()

        # dyemat when allow_edman_cterm is True
        sim_v2_result, sim_v2_params = _sim(
            _prep_result=prep_cterm, sim_kwargs=Munch(allow_edman_cterm=True)
        )
        assert sim_v2_result.train_dyemat.shape == (6, 5 * 2)  # 5 cycles, 2 channels
        assert utils.np_array_same(
            sim_v2_result.train_dyemat,
            np.array(
                [
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
                    [0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
                    [0, 0, 0, 0, 0, 1, 1, 1, 1, 0],
                    [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
                ],
                dtype=np.uint8,
            ),
        )

    @zest.retry(2)
    def it_returns_train_dyemat_for_uncleaved_cterm_labels():
        prep_cterm = prep_fixtures.result_cterm_label_fixture()

        # dyemat when allow_edman_cterm is False (the default)
        sim_v2_result, sim_v2_params = _sim(
            _prep_result=prep_cterm, sim_kwargs=Munch(allow_edman_cterm=False)
        )
        assert sim_v2_result.train_dyemat.shape == (3, 5 * 2)  # 5 cycles, 2 channels
        assert utils.np_array_same(
            sim_v2_result.train_dyemat,
            np.array(
                [
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
                ],
                dtype=np.uint8,
            ),
        )

    def it_returns_train_dyepeps():
        sim_v2_result, sim_v2_params = _sim()
        # The order of dyts is not guaranteed, so drop the middle column and
        # compare only cols 0 and 2
        assert utils.np_array_same(
            sim_v2_result.train_dyepeps[:, (0, 2)],
            np.array([[0, 0], [1, 5000], [2, 5000], [3, 5000]], dtype=np.uint64),
        )

    def it_handles_non_fluorescent():
        sim_v2_result, sim_v2_params = _sim(priors=dict(p_non_fluorescent=0.5))
        # Every dyepep other than the nul row should have n_reads (col 2)
        # far less than 5000
        assert np.all(sim_v2_result.train_dyepeps[1:, 2] < 2000)

    def it_returns_no_all_dark_samples():
        sim_v2_result, sim_v2_params = _sim(priors=dict(p_non_fluorescent=0.99))
        assert not np.any(sim_v2_result.train_dyepeps[1:, 0] == 0)

    def it_returns_recalls():
        sim_v2_result, sim_v2_params = _sim(priors=dict(p_non_fluorescent=0.50))
        assert sim_v2_result.train_pep_recalls.shape[0] == 4  # 4 peps

        # The nul record should have no recall
        assert sim_v2_result.train_pep_recalls[0] == 0.0

        # The exact number is hard to pin down, but it should be well below 1
        assert np.all(sim_v2_result.train_pep_recalls[1:] < 0.85)

    def it_emergency_escapes():
        sim_v2_result, sim_v2_params = _sim(priors=dict(p_non_fluorescent=0.99))
        # When almost nothing is fluorescent, everything should have zero recall
        assert np.all(sim_v2_result.train_pep_recalls == 0.0)

    def it_handles_empty_dyepeps():
        sim_v2_result, sim_v2_params = _sim(priors=dict(p_non_fluorescent=1.0))
        assert np.all(sim_v2_result.train_pep_recalls == 0.0)

    def decoys():
        prep_with_decoy = prep_fixtures.result_simple_fixture(has_decoy=True)
        sim_v2_result, sim_v2_params = _sim(
            priors=dict(), _prep_result=prep_with_decoy
        )

        # def it_maintains_decoys_for_train():
        #     assert sim_v2_result.train_dyemat.shape == (4, 10)
        #
        # def it_removes_decoys_for_test():
        #     # 1000 because the nul-dye track should be removed
        #     assert sim_v2_result.test_radmat.shape == (1000, 2, 5)

        zest()

    def it_skips_row_noise():
        sim_v2_result, sim_v2_params = _sim(priors=dict(row_k_sigma=0.0))
        spy(sim_v2_result.test_true_row_ks)
        assert np.all(sim_v2_result.test_true_row_ks == 1.0)

    def it_adds_row_noise():
        sim_v2_result, sim_v2_params = _sim(priors=dict(row_k_sigma=0.5))
        assert np.any(sim_v2_result.test_true_row_ks != 1.0)

    @zest.skip(reason="Not implemented")
    def it_raises_if_train_and_test_identical():
        raise NotImplementedError

    zest()
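
# For reference, a sketch of the recall bookkeeping that the assertions in
# it_returns_recalls rely on. The rule is inferred from the tests (a peptide's
# recall is the fraction of its simulated reads that land on a non-nul
# dyetrack), not from the sim_v2_worker source; _sketch_train_pep_recalls is
# a hypothetical helper for illustration only.
def _sketch_train_pep_recalls(dyepeps, n_peps, n_samples=5000):
    recalls = np.zeros(n_peps)
    for dyt_i, pep_i, n_reads in dyepeps:
        if dyt_i != 0:  # dyt 0 is the all-dark track and contributes no recall
            recalls[int(pep_i)] += n_reads / n_samples
    return recalls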
def zest_pr_pro_curve_edge_cases():
    """
    Test situations in which calls are all wrong or all right for
    pr_curve_pro (protein-based rather than peptide-based).
    """
    # The first entry is the null peptide
    stub_sim_result = Munch(train_pep_recalls=np.array([-1.0, 1.0, 1.0, 1.0]))
    stub_prep_result = prep_fixtures.result_simple_fixture()

    # CallBags: all right / all wrong
    true_pep_iz = [1] * 10
    pred_pep_iz_1 = [1] * 10
    pred_pep_iz_2 = [2] * 10
    scores = [0.5] * 10

    cb_all_right = CallBag(
        sim_result=stub_sim_result,
        prep_result=stub_prep_result,
        true_pep_iz=true_pep_iz,
        pred_pep_iz=pred_pep_iz_1,
        scores=scores,
    )

    cb_all_wrong = CallBag(
        sim_result=stub_sim_result,
        prep_result=stub_prep_result,
        true_pep_iz=true_pep_iz,
        pred_pep_iz=pred_pep_iz_2,
        scores=scores,
    )

    # Expected (p, r, s, a) arrays for n_steps=2
    zero_pr_result = [[0.0, 0.0], [0.0, 0.0], [0.5, 0.0], [0.0, 0.0]]
    one_pr_result = [[1.0, 1.0], [1.0, 1.0], [0.5, 0.0], [1.0, 1.0]]

    def it_computes_zero_pr():
        p, r, s, a = cb_all_wrong.pr_curve_pro(n_steps=2)
        assert utils.np_array_same([p, r, s, a], zero_pr_result)

    def it_computes_zero_pr_for_subset():
        p, r, s, a = cb_all_wrong.pr_curve_pro(pro_iz_subset=[1], n_steps=2)
        assert utils.np_array_same([p, r, s, a], zero_pr_result)

        p, r, s, a = cb_all_wrong.pr_curve_pro(pro_iz_subset=[2], n_steps=2)
        assert utils.np_array_same([p, r, s, a], zero_pr_result)

        # Protein 2 does not show up in true/pred at all for the all-right
        # CallBag, so it should get a zero PR curve
        p, r, s, a = cb_all_right.pr_curve_pro(pro_iz_subset=[2], n_steps=2)
        assert utils.np_array_same([p, r, s, a], zero_pr_result)

    def it_computes_one_pr():
        p, r, s, a = cb_all_right.pr_curve_pro(n_steps=2)
        # Collapse each element-wise ndarray comparison with np.all; the old
        # pattern of converting to lists and calling all() was vacuous because
        # non-empty lists are always truthy
        assert all(
            np.all(got == expected)
            for got, expected in zip([p, r, s, a], one_pr_result)
        )

    def it_computes_one_pr_for_subset():
        p, r, s, a = cb_all_right.pr_curve_pro(pro_iz_subset=[1], n_steps=2)
        assert all(
            np.all(got == expected)
            for got, expected in zip([p, r, s, a], one_pr_result)
        )

    def it_handles_all_rows_no_recall():
        p, r, s, a = cb_all_wrong.pr_curve_pro()
        assert np.all(r == 0.0)

    zest()
def zest_pr_curve_pro_no_tied_scores():
    """
    Test situations with some right and some wrong calls but no tied scores.
    """
    # The first entry is the null peptide
    stub_sim_result = Munch(train_pep_recalls=np.array([-1.0] + [1.0] * 3))
    stub_prep_result = prep_fixtures.result_simple_fixture()

    # Pep 1 is predicted correctly 1/4 of the time, pep 2 half the time, and
    # pep 3 3/4 of the time. Mispredictions of peps 2 and 3 always go to
    # pep 1 to avoid the unrealistic case of getting the peptide wrong but
    # the protein right anyway, which in reality is expected to be rare.
    true_pep_iz = np.array([1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3])
    # true pro iz will be [1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2]
    pred_pep_iz = np.array([1, 2, 2, 2, 2, 2, 1, 1, 3, 3, 3, 1])
    # pred pro iz will be [1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1]
    scores = np.array(
        [0.8, 0.9, 0.7, 0.6, 0.85, 0.53, 0.54, 0.55, 0.75, 0.4, 0.3, 0.35]
    )

    cb = CallBag(
        sim_result=stub_sim_result,
        prep_result=stub_prep_result,
        true_pep_iz=true_pep_iz,
        pred_pep_iz=pred_pep_iz,
        scores=scores,
    )

    # Hand-computed protein-level precision/recall at each score threshold
    prec = np.array(
        [
            2.0 / 3.0,  # 0.8
            3.0 / 5.0,  # 0.7
            3.0 / 6.0,  # 0.6
            4.0 / 9.0,  # 0.5
            5.0 / 10.0,  # 0.4
            6.0 / 12.0,  # 0.3
            6.0 / 12.0,  # 0.2
            6.0 / 12.0,  # 0.1
            6.0 / 12.0,  # 0.0
        ]
    )
    reca = np.array(
        [
            2.0 / 12.0,  # 0.8
            3.0 / 12.0,  # 0.7
            3.0 / 12.0,  # 0.6
            4.0 / 12.0,  # 0.5
            5.0 / 12.0,  # 0.4
            6.0 / 12.0,  # 0.3
            6.0 / 12.0,  # 0.2
            6.0 / 12.0,  # 0.1
            6.0 / 12.0,  # 0.0
        ]
    )

    def it_computes_combined_pr():
        p, r, s, a = cb.pr_curve_pro(n_steps=10)
        assert np.array_equal(p, prec)
        assert np.allclose(s, np.linspace(0.8, 0.0, 9))
        assert np.array_equal(r, reca)

    def it_computes_subset_pr():
        p, r, s, a = cb.pr_curve_pro(pro_iz_subset=[1], n_steps=10)
        prec = np.array(
            [
                1.0 / 1.0,  # 0.8
                1.0 / 1.0,  # 0.7
                1.0 / 1.0,  # 0.6
                1.0 / 3.0,  # 0.5
                1.0 / 3.0,  # 0.4
                1.0 / 4.0,  # 0.3
                1.0 / 4.0,  # 0.2
                1.0 / 4.0,  # 0.1
                1.0 / 4.0,  # 0.0
            ]
        )
        reca = np.array(
            [
                1.0 / 4.0,  # 0.8
                1.0 / 4.0,  # 0.7
                1.0 / 4.0,  # 0.6
                1.0 / 4.0,  # 0.5
                1.0 / 4.0,  # 0.4
                1.0 / 4.0,  # 0.3
                1.0 / 4.0,  # 0.2
                1.0 / 4.0,  # 0.1
                1.0 / 4.0,  # 0.0
            ]
        )
        assert np.array_equal(p, prec)
        assert np.array_equal(r, reca)
        assert np.allclose(s, np.linspace(0.8, 0.0, 9))

        # Test again for a different subset.
        p, r, s, a = cb.pr_curve_pro(pro_iz_subset=[2], n_steps=10)
        prec = np.array(
            [
                1.0 / 2.0,  # 0.8
                2.0 / 4.0,  # 0.7
                2.0 / 5.0,  # 0.6
                3.0 / 6.0,  # 0.5
                4.0 / 7.0,  # 0.4
                5.0 / 8.0,  # 0.3
                5.0 / 8.0,  # 0.2
                5.0 / 8.0,  # 0.1
                5.0 / 8.0,  # 0.0
            ]
        )
        reca = np.array(
            [
                1.0 / 8.0,  # 0.8
                2.0 / 8.0,  # 0.7
                2.0 / 8.0,  # 0.6
                3.0 / 8.0,  # 0.5
                4.0 / 8.0,  # 0.4
                5.0 / 8.0,  # 0.3
                5.0 / 8.0,  # 0.2
                5.0 / 8.0,  # 0.1
                5.0 / 8.0,  # 0.0
            ]
        )
        assert np.array_equal(p, prec)
        assert np.array_equal(r, reca)
        assert np.allclose(s, np.linspace(0.8, 0.0, 9))

    zest()
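
# The prec/reca arrays above were computed by hand. Below is a sketch of the
# threshold sweep they encode, assuming pr_curve_pro's semantics from the
# expected values (precision = correct calls with score >= t over all calls
# with score >= t; recall uses all rows as the denominator). The threshold
# grid mirrors the asserted s values; the real function's grid choice may
# differ. _sketch_pr_sweep is a hypothetical helper for illustration only.
def _sketch_pr_sweep(true_iz, pred_iz, scores, n_steps=10):
    thresholds = np.linspace(0.8, 0.0, n_steps - 1)
    correct = np.asarray(true_iz) == np.asarray(pred_iz)
    scores = np.asarray(scores)
    prec, reca = [], []
    for t in thresholds:
        # Tiny epsilon guards against float jitter in the linspace grid
        kept = scores >= t - 1e-9
        n_correct_kept = (correct & kept).sum()
        prec.append(n_correct_kept / max(kept.sum(), 1))
        reca.append(n_correct_kept / len(correct))
    return np.array(prec), np.array(reca), thresholds


# E.g., feeding the protein-level calls from the test above reproduces prec
# and reca:
#   _sketch_pr_sweep(
#       [1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2],  # true pro iz
#       [1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1],  # pred pro iz
#       [0.8, 0.9, 0.7, 0.6, 0.85, 0.53, 0.54, 0.55, 0.75, 0.4, 0.3, 0.35],
#   )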