def sim_v2(aa_list, err_set, **sim_kws):
    """
    Generate the "sim_v2" task block of a run description.

    Arguments:
        aa_list: labels forwarded to SimV2Params.from_aa_list_fixture.
        err_set: converted to a priors description by priors_desc_from_err_set.
        sim_kws: extra keywords forwarded to the fixture. "n_pres" and
            "n_mocks" (each read with a default of 0) must sum to at least 1.

    Returns:
        A Munch with one "sim_v2" entry holding version, inputs and parameters.
    """
    priors_desc = priors_desc_from_err_set(err_set)

    n_initial_cycles = sim_kws.get("n_pres", 0) + sim_kws.get("n_mocks", 0)
    assert (
        n_initial_cycles >= 1
    ), "You must include at least 1 pre or mock cycle to capture the initial image"

    parameters = Munch(
        **SimV2Params.from_aa_list_fixture(aa_list, priors_desc=priors_desc, **sim_kws)
    )

    # The "priors" entry must be removed before the block is saved
    del parameters["priors"]
    parameters = utils.strip_underscore_keys(parameters, munchify=True)

    return Munch(
        sim_v2=Munch(
            version="1.0",
            inputs=Munch(prep="../prep"),
            parameters=parameters,
        )
    )
def from_prep_fixture(cls, prep_result, labels: str, n_edmans=5, priors=None):
    """
    Run a (likely small) simulation to make a SimResult fixture for testing

    labels: a CSV list of aas. Eg: "DE,C"
        Common labels: "DE", "C", "Y", "K", "H"
    n_edmans: number of Edman cycles passed to the params fixture.
    priors: optional priors; PriorsMLEFixtures.val_defaults() is used when None.

    Returns whatever sim_v2() returns for the generated params and prep_result.
    """
    # Imports are kept local to the function, in their original order
    from plaster.run.sim_v2.sim_v2_worker import sim_v2
    from plaster.tools.schema import check
    from plaster.run.priors import PriorsMLEFixtures

    check.t(labels, str)
    aa_labels = labels.split(",")

    effective_priors = PriorsMLEFixtures.val_defaults() if priors is None else priors

    fixture_kwargs = dict(
        priors=effective_priors,
        n_pres=1,
        n_mocks=0,
        n_edmans=n_edmans,
        use_lognormal_model=False,
        n_samples_train=100,
        n_samples_test=100,
        train_includes_radmat=True,
    )
    params = SimV2Params.from_aa_list_fixture(aa_labels, **fixture_kwargs)

    return sim_v2(params, prep_result)
def it_makes_cycles_array():
    """
    cycles_array() for 1 pre, 2 mocks and 3 edmans is [0, 1, 1, 2, 2, 2].
    (Presumably 0/1/2 are the pre/mock/edman cycle codes -- confirm in SimV2Params.)
    """
    params = SimV2Params.from_aa_list_fixture(
        ["DE", "Y"],
        priors=PriorsMLEFixtures.nbt_defaults(),
        n_pres=1,
        n_mocks=2,
        n_edmans=3,
    )

    expected_cycles = [0, 1, 1, 2, 2, 2]
    assert params.cycles_array().tolist() == expected_cycles
def start(self):
    """
    Task entry point: build SimV2Params from the task config, load the prep
    result from the "prep" input folder, run sim_v2 and save its result.
    Debug output is also dumped when params.dump_debug is set.
    """
    sim_params = SimV2Params(**self.config.parameters)
    prep = PrepResult.load_from_folder(self.inputs.prep)

    sim_result = sim_v2(sim_params, prep, progress=self.progress, pipeline=self)
    sim_result.save()

    if not sim_params.dump_debug:
        return

    sim_result.dump_debug()
def _sim(priors=None, _prep_result=None, sim_kwargs=None):
    """
    Run a small two-label ("A", "B") simulation with 4 edmans.

    Arguments:
        priors: optional dict of keyword overrides passed to
            PriorsMLEFixtures.fixture_no_errors.
        _prep_result: prep result to simulate against; falls back to the
            enclosing scope's `prep_result` when None.
        sim_kwargs: optional dict of extra keywords for
            SimV2Params.from_aa_list_fixture. "use_lognormal_model" is
            always forced to True. The caller's dict is NOT modified.

    Returns:
        (sim_v2 result, sim_v2_params) tuple.
    """
    if _prep_result is None:
        _prep_result = prep_result

    priors = PriorsMLEFixtures.fixture_no_errors(**(priors or {}))

    # Build a private copy so that forcing the lognormal model does not
    # leak back into a dict the caller may reuse for another call.
    sim_kwargs = {**(sim_kwargs or {}), "use_lognormal_model": True}

    sim_v2_params = SimV2Params.from_aa_list_fixture(
        ["A", "B"], priors=priors, n_edmans=4, **sim_kwargs
    )

    return sim_v2_worker.sim_v2(sim_v2_params, _prep_result), sim_v2_params
def it_assigns_channels_if_not_exist():
    """
    When no `channels` mapping is given to SimV2Params, channel_names()
    is expected to come back as ["549", "647"].
    """
    dyes = [
        Munch(dye_name="atto_647", channel_name="647"),
        Munch(dye_name="foo_549", channel_name="549"),
    ]
    labels = [
        Munch(aa="C", dye_name="atto_647", label_name="cys0", ptm_only=False),
    ]

    params = SimV2Params(dyes=dyes, labels=labels, priors_desc={})

    expected_names = ["549", "647"]
    assert params.channel_names() == expected_names
def it_converts_priors():
    """
    A priors_desc dict handed to SimV2Params is exposed as a Priors
    instance on `params.priors`, with the MLE value retrievable by name.
    """
    dyes = [Munch(dye_name="foo_549", channel_name="549")]
    labels = [Munch(aa="D", dye_name="foo_549", label_name="DE0", ptm_only=False)]
    priors_desc = {
        "foo": dict(class_name="MLEPrior", params=dict(value=1.0)),
    }

    params = SimV2Params(
        is_survey=False, dyes=dyes, labels=labels, priors_desc=priors_desc
    )

    converted = params.priors
    assert isinstance(converted, Priors)
    assert converted.get_mle("foo") == 1.0
def it_makes_pcbs():
    """
    pcbs() returns one row per pep_seq row: (pep_i, ch_i, value).
    Unlabelled residues get nan for the last two columns; for labelled
    residues the expected last column is 1.0 - p_non_fluorescent of the dye.
    """
    # fmt: off
    dyes = [
        Munch(dye_name="dye1", channel_name="640"),
        Munch(dye_name="dye2", channel_name="549"),
    ]
    labels = [
        Munch(aa="A", dye_name="dye1", label_name="A", ptm_only=False),
        Munch(aa="B", dye_name="dye2", label_name="B", ptm_only=False),
    ]
    priors_desc = {
        "p_non_fluorescent.dye1": dict(class_name="MLEPrior", params=dict(value=0.5)),
        "p_non_fluorescent.dye2": dict(class_name="MLEPrior", params=dict(value=0.3)),
    }
    channels = {"640": 0, "549": 1}
    # fmt: on

    params = SimV2Params(
        is_survey=False,
        dyes=dyes,
        labels=labels,
        priors_desc=priors_desc,
        channels=channels,
    )

    pep_seq_columns = dict(
        pep_i=[1, 1, 1, 2, 2],
        aa=["X", "A", "X", "B", "X"],
        pep_offset_in_pro=[0, 1, 2, 0, 1],
    )
    pep_seq = pd.DataFrame(pep_seq_columns)

    expected_pcbs = [
        [1.0, np.nan, np.nan],
        [1.0, 0.0, 0.5],
        [1.0, np.nan, np.nan],
        [2.0, 1.0, 1.0 - 0.3],
        [2.0, np.nan, np.nan],
    ]

    actual_pcbs = params.pcbs(pep_seq)
    assert utils.np_array_same(actual_pcbs, expected_pcbs)
def zest_v2_stress_like_e2e():
    """
    Regression test: prep -> sim_v2 -> nn_v2 on a single short protein.

    This used to die with a "double free or corruption (!prev)" caused by a
    bug in n_dyetracks counting. The bug is fixed; the test stays as a guard.
    """
    protein = Munch(
        abundance=None,
        name="pep25",
        ptm_locs="",
        is_poi=0,
        sequence="GCAGCAGAG ",
    )
    prep_params = PrepParams(
        decoy_mode=None,
        n_peps_limit=None,
        n_ptms_limit=5,
        protease=None,
        proteins=[protein],
        proteins_of_interest=[],
    )
    prep_result = prep(prep_params, pd.DataFrame(prep_params.proteins))

    # MLE value for every prior, in the order they are placed in priors_desc
    mle_values = {
        "p_non_fluorescent": 0.07,
        "p_bleach": 0.05,
        "gain_mu": 7500.0,
        "gain_sigma": 0.16,
        "bg_mu": 300.0,
        "bg_sigma": 700.0,
        "p_detach": 0.05,
        "p_edman_failure": 0.06,
        "row_k_sigma": 0.15,
    }
    priors_desc = {
        prior_name: Munch(class_name="MLEPrior", params=Munch(value=mle_value))
        for prior_name, mle_value in mle_values.items()
    }

    sim_v2_param_block = Munch(
        allow_train_test_to_be_identical=False,
        enable_ptm_labels=False,
        dyes=[Munch(dye_name="dye_0", channel_name="ch0")],
        labels=[
            Munch(aa="C", dye_name="dye_0", label_name="label_0", ptm_only=False)
        ],
        priors_desc=priors_desc,
        is_survey=False,
        n_edmans=8,
        n_mocks=1,
        n_pres=0,
        n_samples_test=1000,
        n_samples_train=5000,
        random_seed=None,
        test_includes_dyemat=False,
        train_includes_radmat=False,
        use_lognormal_model=True,
    )

    sim_v2_params = SimV2Params(include_dfs=True, **sim_v2_param_block)
    sim_v2_result = sim_v2_worker.sim_v2(sim_v2_params, prep_result)
    sim_v2_result._generate_flu_info(prep_result)

    nn_v2_params = NNV2Params(
        source="zest_v2_stress_like_e2e",
        priors_desc=sim_v2_param_block.priors_desc,
    )
    nn_result = nn_v2(nn_v2_params, prep_result, sim_v2_result, None)

    # Every call must be assigned to peptide 1
    calls_df = nn_result.calls()
    assert np.all(calls_df.pep_i == 1)
def it_builds_dfs():
    """
    Checks the columns and sampled prior values of the DataFrames built by
    SimV2Params.channel__priors() and SimV2Params.dye__label__priors().
    """
    dyes = [
        Munch(dye_name="atto_647", channel_name="647"),
        Munch(dye_name="foo_549", channel_name="549"),
    ]
    labels = [
        Munch(aa="C", dye_name="atto_647", label_name="cys0", ptm_only=False),
        Munch(aa="D", dye_name="foo_549", label_name="DE0", ptm_only=False),
        Munch(aa="E", dye_name="foo_549", label_name="DE0", ptm_only=False),
    ]
    channels = Munch({"647": 0, "549": 1})
    priors_desc = {
        "p_non_fluorescent.atto_647": Munch(
            class_name="MLEPrior", params=Munch(value=1.0)
        ),
        "p_non_fluorescent.foo_549": Munch(
            class_name="MLEPrior", params=Munch(value=2.0)
        ),
        "p_bleach": Munch(class_name="MLEPrior", params=Munch(value=3.0)),
    }

    params = SimV2Params(
        dyes=dyes, labels=labels, channels=channels, priors_desc=priors_desc
    )

    def it_builds_channel__priors():
        expected_columns = [
            "bg_mu",
            "bg_sigma",
            "ch_i",
            "channel_name",
            "dye_name",
            "gain_mu",
            "gain_sigma",
            "index",
            "p_bleach",
            "row_k_sigma",
        ]
        actual_columns = params.channel__priors().columns
        assert sorted(actual_columns) == sorted(expected_columns)

        # Channel 1 must carry the p_bleach prior declared above
        by_channel = params.channel__priors().set_index("ch_i")
        assert by_channel.loc[1].p_bleach.sample() == 3.0

    def it_builds_dye__label__priors():
        expected_columns = [
            "aa",
            "bg_mu",
            "bg_sigma",
            "ch_i",
            "channel_name",
            "dye_name",
            "gain_mu",
            "gain_sigma",
            "label_name",
            "p_bleach",
            "p_non_fluorescent",
            "ptm_only",
            "row_k_sigma",
        ]
        actual_columns = params.dye__label__priors().columns
        assert sorted(actual_columns) == sorted(expected_columns)

        # "D" is labelled with foo_549, whose p_non_fluorescent is 2.0
        by_aa = params.dye__label__priors().set_index("aa")
        assert by_aa.loc["D"].p_non_fluorescent.sample() == 2.0

    zest()
def it_handles_bleaching():
    """
    With every error prior set to 0.0, simulating peptides derived from a
    triangle dyemat must give back that same triangle dyemat, with each
    non-zero peptide mapped 1:1 to its dyetrack and sampled n_samples times.
    """
    n_cycles = 5

    dyes_labels = Munch(
        dyes=[Munch(dye_name="dye0", channel_name="ch0")],
        labels=[Munch(aa="X", dye_name="dye0", label_name="label0", ptm_only=False)],
    )

    mle_values = {
        "gain_mu": 5000.0,
        "gain_sigma": 50.0,
        "bg_sigma": 50.0,
        "row_k_sigma": 0.0,
        "p_edman_failure": 0.0,
        "p_detach": 0.0,
        "p_bleach": 0.0,
        "p_non_fluorescent": 0.0,
    }
    priors_desc = {
        prior_name: dict(class_name="MLEPrior", params=dict(value=mle_value))
        for prior_name, mle_value in mle_values.items()
    }

    sim_v2_params = SimV2Params(
        n_pres=1, n_mocks=0, n_edmans=4, **dyes_labels, priors_desc=priors_desc
    )

    dyemat = triangle_dyemat(5, 1, include_multi_drop=False, include_nul_row=True)

    # One peptide per dyetrack row; each contributes n_cycles residues
    pep_i_col, aa_col, offset_col = [], [], []
    for row_i, dyt in enumerate(dyemat):
        pep_i_col.extend([row_i] * n_cycles)
        aa_col.extend(dyt_to_seq(dyt))
        offset_col.extend([0] * n_cycles)

    pepseqs_df = pd.DataFrame(
        dict(pep_i=pep_i_col, aa=aa_col, pep_offset_in_pro=offset_col)
    )

    # pcbs are an encoding of flus. See def pcbs()
    pcbs = sim_v2_params.pcbs(pepseqs_df)

    n_samples = 10
    sim_out = sim_v2_worker._dyemat_sim(
        sim_v2_params, pcbs, n_samples=n_samples, progress=None,
    )
    train_dyemat, train_dyepeps, train_pep_recalls = sim_out

    n_dyts = train_dyemat.shape[0]
    check.array_t(train_dyemat, shape=(n_dyts, n_cycles))

    # dyepeps are in (dyt_i, pep_i, count) order
    dyt_col = train_dyepeps[:, 0]
    pep_col = train_dyepeps[:, 1]

    # At least one non-zero pep_i must be present
    assert np.max(train_dyepeps[:, 1])

    # Assert that every peptide (except 0) got 10 samples
    for peptide in range(1, 6):
        rows_for_peptide = pep_col == peptide
        assert train_dyepeps[rows_for_peptide, 2].sum() == n_samples

    # Assert only those peptides are present
    assert np.min(pep_col) == 0
    assert np.max(pep_col) == 5

    # Mapping of dyt and pep should both be identity
    identity = np.arange(n_dyts)
    assert np.all(dyt_col == identity)
    assert np.all(pep_col == identity)
    assert np.all(train_dyepeps[1:, 2] == n_samples)

    expected_dyemat = [
        [0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1],
    ]
    assert train_dyemat.tolist() == expected_dyemat
def zest_radmat_sim():
    """
    Tests for sim_v2_worker._radmat_sim using a 3-dyetrack, 2-channel,
    3-cycle dyemat and a dyepeps table in which only peptides 1 and 2 map
    to non-zero dyetracks.
    """
    dyes_labels = Munch(
        dyes=[
            Munch(dye_name="dye0", channel_name="ch0"),
            Munch(dye_name="dye1", channel_name="ch1"),
        ],
        labels=[
            Munch(aa="A", dye_name="dye0", label_name="label0", ptm_only=False),
            Munch(aa="B", dye_name="dye1", label_name="label1", ptm_only=False),
        ],
    )

    noisy_priors_desc = {
        prior_name: dict(class_name="MLEPrior", params=dict(value=mle_value))
        for prior_name, mle_value in (
            ("gain_mu", 7500.0),
            ("gain_sigma", 0.16),
            ("bg_sigma", 200.0),
        )
    }
    params_with_noise = SimV2Params(**dyes_labels, priors_desc=noisy_priors_desc)
    ch_params_with_noise = params_with_noise.by_channel()

    noiseless_priors_desc = {
        prior_name: dict(class_name="MLEPrior", params=dict(value=mle_value))
        for prior_name, mle_value in (
            ("gain_mu", 1.0),
            ("gain_sigma", 0.0),
            ("bg_sigma", 0.0),
            ("row_k_sigma", 0.0),
        )
    }
    params_no_noise = SimV2Params(**dyes_labels, priors_desc=noiseless_priors_desc)
    ch_params_no_noise = params_no_noise.by_channel()

    # fmt: off
    dyemat = np.array([
        [[0, 0, 0], [0, 0, 0]],
        [[1, 1, 0], [1, 0, 0]],
        [[2, 2, 1], [2, 1, 0]],
    ])

    dyepeps = np.array([
        [0, 0, 0],
        [1, 1, 10],
        [1, 2, 5],
        [2, 2, 5],
        [0, 3, 10],
    ])
    # fmt: on

    n_samples_per_pep = 10
    n_channels = 2
    n_cycles = 3
    n_peps = 4

    def it_removes_all_zero_rows():
        sim_out = sim_v2_worker._radmat_sim(
            dyemat,
            dyepeps,
            ch_params_with_noise,
            n_samples_per_pep,
            n_channels,
            n_cycles,
            use_lognormal_model=True,
        )
        radiometry, true_pep_iz, true_dye_iz, true_row_ks = sim_out

        # 20 because we sample 10 per pep but only peps 1 and 2 have non-zero dyts
        expected_rows = 20
        assert radiometry.shape == (expected_rows, n_channels, n_cycles)

        assert true_pep_iz.tolist() == [1] * 10 + [2] * 10

        all_zero_rows = np.all(radiometry == 0.0, axis=(1, 2))
        assert not np.any(all_zero_rows)

    def it_returns_reasonable_radiometry():
        sim_out = sim_v2_worker._radmat_sim(
            dyemat,
            dyepeps,
            ch_params_with_noise,
            n_samples_per_pep,
            n_channels,
            n_cycles,
            use_lognormal_model=True,
        )
        radiometry, true_pep_iz, true_dye_iz, true_row_ks = sim_out

        # Only 2 of the peptide have dyetracks
        expected_shape = (n_samples_per_pep * 2, n_channels, n_cycles)
        assert radiometry.shape == expected_shape

        assert true_pep_iz.tolist() == [1] * 10 + [2] * 10

        # I'm not sure of a good test here
        # assert np.all(radiometry[radiometry > 0.0] > 1000.0)

    def it_returns_correct_radiometry_with_no_noise():
        # By using no noise, we can just compare that radiometry gave back the dyemat
        # but with each peptide repeated
        sim_out = sim_v2_worker._radmat_sim(
            dyemat,
            dyepeps,
            ch_params_no_noise,
            n_samples_per_pep,
            n_channels,
            n_cycles,
            use_lognormal_model=True,
        )
        radiometry, true_pep_iz, true_dye_iz, true_row_ks = sim_out

        assert np.all(true_row_ks == 1.0)

        # The first five rows must be dyetrack 1 exactly
        first_rows = radiometry[0:5]
        assert np.all(first_rows == dyemat[1, :].astype(RadType))

        # The next five rows may be dyetrack 1 or dyetrack 2, element-wise
        next_rows = radiometry[5:10]
        matches_dyt_1 = next_rows == dyemat[1, :].astype(RadType)
        matches_dyt_2 = next_rows == dyemat[2, :].astype(RadType)
        assert np.all(matches_dyt_1 | matches_dyt_2)

        assert true_dye_iz[0:5].tolist() == [1] * 5

        next_dye_iz = true_dye_iz[5:10]
        assert np.all((next_dye_iz == 1) | (next_dye_iz == 2))

        assert true_pep_iz.tolist() == [1] * 10 + [2] * 10

    zest()
def zest_radmat_from_sampled_pep_dyemat():
    """
    Tests for sim_v2_worker._radmat_from_sampled_pep_dyemat.

    Each test passes a 5-sample, 2-channel, 3-cycle sampled dyemat together
    with per-channel params and pep_i=1, then inspects the output buffers
    (output_radmat, output_true_row_k) that the worker fills in place.
    The buffers are re-created as zeros by _before().
    """
    n_channels = 2
    n_cycles = 3
    n_samples_per_pep = 5
    n_peps = 2

    # One dyetrack per sample; indexed [sample, channel, cycle]
    # fmt: off
    sampled_dyemat = np.array([
        [[1, 1, 0], [1, 0, 0]],
        [[2, 2, 1], [2, 1, 0]],
        [[2, 2, 1], [2, 1, 0]],
        [[1, 1, 0], [1, 0, 0]],
        [[1, 1, 0], [1, 0, 0]],
    ], dtype=np.uint8)
    # fmt: on

    dyes_labels = Munch(
        dyes=[
            Munch(dye_name="dye0", channel_name="ch0"),
            Munch(dye_name="dye1", channel_name="ch1"),
        ],
        labels=[
            Munch(aa="A", dye_name="dye0", label_name="label0", ptm_only=False),
            Munch(aa="B", dye_name="dye1", label_name="label1", ptm_only=False),
        ],
    )

    # Params used with use_lognormal_model=True (no row_k_sigma given)
    params_with_noise_lognormal = SimV2Params(
        **dyes_labels,
        priors_desc={
            "gain_mu": dict(class_name="MLEPrior", params=dict(value=10.0)),
            "gain_sigma": dict(class_name="MLEPrior", params=dict(value=0.1)),
            "bg_sigma": dict(class_name="MLEPrior", params=dict(value=200.0)),
        },
    )
    ch_params_with_noise_lognormal = params_with_noise_lognormal.by_channel()

    # Params used with use_lognormal_model=False; row_k_sigma is 0.0
    params_with_noise_normal_no_row_k = SimV2Params(
        **dyes_labels,
        priors_desc={
            "gain_mu": dict(class_name="MLEPrior", params=dict(value=5000.0)),
            "gain_sigma": dict(class_name="MLEPrior", params=dict(value=50.0)),
            "bg_sigma": dict(class_name="MLEPrior", params=dict(value=50.0)),
            "row_k_sigma": dict(class_name="MLEPrior", params=dict(value=0.0)),
        },
    )
    ch_params_with_noise_normal_no_row_k = (
        params_with_noise_normal_no_row_k.by_channel()
    )

    # Same as above but with row_k_sigma of 0.16.
    # NOTE(review): not referenced by any test visible in this function.
    params_with_noise_normal_with_row_k = SimV2Params(
        **dyes_labels,
        priors_desc={
            "gain_mu": dict(class_name="MLEPrior", params=dict(value=5000.0)),
            "gain_sigma": dict(class_name="MLEPrior", params=dict(value=50.0)),
            "bg_sigma": dict(class_name="MLEPrior", params=dict(value=50.0)),
            "row_k_sigma": dict(class_name="MLEPrior", params=dict(value=0.16)),
        },
    )
    ch_params_with_noise_normal_with_row_k = (
        params_with_noise_normal_with_row_k.by_channel()
    )

    # All sigmas are 0.0, so the output is expected to be gain_mu * dye count
    params_no_noise = SimV2Params(
        **dyes_labels,
        priors_desc={
            "gain_mu": dict(class_name="MLEPrior", params=dict(value=10.0)),
            "gain_sigma": dict(class_name="MLEPrior", params=dict(value=0.0)),
            "bg_sigma": dict(class_name="MLEPrior", params=dict(value=0.0)),
            "row_k_sigma": dict(class_name="MLEPrior", params=dict(value=0.0)),
        },
    )
    ch_params_no_noise = params_no_noise.by_channel()

    # sim_v2_params = SimV2Params.construct_from_aa_list(
    #     ["A", "B"], n_edmans=4)
    # )

    # ch_params_no_noise = [
    #     Munch(beta=10.0, sigma=0.0, zero_beta=0.0, zero_sigma=0.0),
    #     Munch(beta=10.0, sigma=0.0, zero_beta=0.0, zero_sigma=0.0),
    # ]
    #
    # ch_params_with_noise_lognormal = [
    #     Munch(beta=10.0, sigma=0.1, zero_beta=0.0, zero_sigma=200.0),
    #     Munch(beta=10.0, sigma=0.1, zero_beta=0.0, zero_sigma=200.0),
    # ]

    # Output buffers shared by the tests below; assigned in _before()
    output_radmat = None
    output_true_row_k = None

    def _before():
        """Re-create both output buffers as zero-filled float32 arrays."""
        nonlocal output_radmat
        output_radmat = np.zeros(
            (n_peps, n_samples_per_pep, n_channels, n_cycles), dtype=np.float32
        )
        nonlocal output_true_row_k
        output_true_row_k = np.zeros((n_peps, n_samples_per_pep), dtype=np.float32)

    def it_returns_noise_free_radmat():
        """With zero sigmas, peptide 1's radmat equals 10.0 * sampled_dyemat."""
        sim_v2_worker._radmat_from_sampled_pep_dyemat(
            sampled_dyemat,
            ch_params_no_noise,
            n_channels,
            output_radmat,
            output_true_row_k,
            use_lognormal_model=True,
            pep_i=1,
        )
        assert output_radmat.shape == (n_peps, n_samples_per_pep, n_channels, n_cycles)

        # Peptide 0 is all zero:
        assert np.all(output_radmat[0, :, :, :] == 0.0)

        # Peptide 1 is noise-free
        assert np.all(
            output_radmat[1, :, :, :] == 10.0 * sampled_dyemat.astype(np.float32)
        )

    @zest.retry(n_tries=2)  # Stochastic
    def it_returns_noisy_radmat_lognormal():
        """Lognormal noise keeps peptide 1's radmat near 10.0 * sampled_dyemat."""
        sim_v2_worker._radmat_from_sampled_pep_dyemat(
            sampled_dyemat,
            ch_params_with_noise_lognormal,
            n_channels,
            output_radmat,
            output_true_row_k,
            use_lognormal_model=True,
            pep_i=1,
        )
        assert output_radmat.shape == (n_peps, n_samples_per_pep, n_channels, n_cycles)

        # Peptide 0 was not written to, so it stays all zero
        assert np.all(output_radmat[0, :, :, :] == 0.0)

        expected = 10.0 * sampled_dyemat.astype(np.float32)
        diff = output_radmat[1, :, :, :] - expected
        # NOTE(review): diff is squared here AND again in the assert below, so
        # the effective bound is |relative error| < 0.5 rather than 0.25.
        # Confirm that is the intended tolerance.
        diff = utils.np_safe_divide(diff, expected) ** 2
        # nan entries are accepted (presumably where expected is 0 -- TODO
        # confirm what np_safe_divide returns for a zero divisor)
        assert np.all((diff ** 2 < 0.25 ** 2) | np.isnan(diff))

    def it_returns_noisy_radmat_normal_no_row_k():
        """Normal-model noise statistics are checked against loose tolerances."""
        sim_v2_worker._radmat_from_sampled_pep_dyemat(
            sampled_dyemat,
            ch_params_with_noise_normal_no_row_k,
            n_channels,
            output_radmat,
            output_true_row_k,
            use_lognormal_model=False,
            pep_i=1,
        )
        assert output_radmat.shape == (n_peps, n_samples_per_pep, n_channels, n_cycles)

        assert np.all(output_radmat[0, :, :, :] == 0.0)

        # Set the zero record to nan so it doesn't factor in
        output_radmat[0, :, :, :] = np.nan
        output_true_row_k[0, :] = np.nan

        # Each check below has the form (observed - target) ** 2 < tol ** 2,
        # i.e. |observed - target| < tol
        row_k_mean = np.nanmean(output_true_row_k)
        assert (row_k_mean - 1.0) ** 2 < 0.15 ** 2

        # Statistics over the entries where the dye count is 0
        bg_mean = np.nanmean(np.where(sampled_dyemat == 0, output_radmat, np.nan))
        assert (bg_mean - 0.0) ** 2 < 300.0 ** 2

        bg_std = np.nanstd(np.where(sampled_dyemat == 0, output_radmat, np.nan))
        assert (bg_std - 50.0) ** 2 < 100.0 ** 2

        # Statistics over the entries where the dye count is 1
        c1_mean = np.nanmean(np.where(sampled_dyemat == 1, output_radmat, np.nan))
        assert (c1_mean - 5000.0) ** 2 < 500.0 ** 2

        # Statistics over the entries where the dye count is 2
        c2_mean = np.nanmean(np.where(sampled_dyemat == 2, output_radmat, np.nan))
        assert (c2_mean - (2 * 5000.0)) ** 2 < 1000.0 ** 2

        c2_std = np.nanstd(np.where(sampled_dyemat == 2, output_radmat, np.nan))
        # presumably the variance of two dyes (2 * gain_sigma ** 2) plus the
        # background variance (bg_sigma ** 2) -- TODO confirm against the worker
        expect = np.sqrt((2 * 50.0 ** 2) + 50.0 ** 2)
        assert (c2_std - expect) ** 2 < 80.0 ** 2

    def it_handles_empty_dyemat():
        """A dyemat with zero samples leaves the whole output radmat at zero."""
        empty_dyemat = np.zeros((0, n_channels, n_cycles), dtype=np.uint8)
        sim_v2_worker._radmat_from_sampled_pep_dyemat(
            empty_dyemat,
            ch_params_no_noise,
            n_channels,
            output_radmat,
            output_true_row_k,
            use_lognormal_model=True,
            pep_i=1,
        )
        assert output_radmat.shape == (n_peps, n_samples_per_pep, n_channels, n_cycles)
        assert np.all(output_radmat[:, :, :, :] == 0.0)

    zest()
def it_handles_normal():
    """
    _dyemat_sim over three peptides (0 unlabelled, 1 = "AB.", 2 = "B..") on
    two channels must sample each non-zero peptide exactly n_samples times
    and report no peptide index outside 0..n_peps-1.
    """
    n_peps, n_channels, n_cycles = 3, 2, 5

    dyes_labels = Munch(
        dyes=[
            Munch(dye_name="dye0", channel_name="ch0"),
            Munch(dye_name="dye1", channel_name="ch1"),
        ],
        labels=[
            Munch(aa="A", dye_name="dye0", label_name="label0", ptm_only=False),
            Munch(aa="B", dye_name="dye1", label_name="label1", ptm_only=False),
        ],
    )

    mle_values = {
        "gain_mu": 5000.0,
        "gain_sigma": 50.0,
        "bg_sigma": 50.0,
        "row_k_sigma": 0.0,
    }
    priors_desc = {
        prior_name: dict(class_name="MLEPrior", params=dict(value=mle_value))
        for prior_name, mle_value in mle_values.items()
    }

    sim_v2_params = SimV2Params(
        n_pres=1, n_mocks=0, n_edmans=4, **dyes_labels, priors_desc=priors_desc
    )

    # pepseqs: DF(pep_i, aa, pep_off_in_pro)
    pepseq_columns = dict(
        pep_i=[0, 1, 1, 1, 2, 2, 2],
        aa=[".", "A", "B", ".", "B", ".", "."],
        pep_offset_in_pro=[0, 0, 1, 2, 3, 4, 5],
    )
    pepseqs_df = pd.DataFrame(pepseq_columns)

    # pcbs are an encoding of flus. See def pcbs()
    pcbs = sim_v2_params.pcbs(pepseqs_df)

    n_samples = 10
    sim_out = sim_v2_worker._dyemat_sim(
        sim_v2_params, pcbs, n_samples=n_samples, progress=None,
    )
    train_dyemat, train_dyepeps, train_pep_recalls = sim_out

    # The dyemat is flat: one column per (channel, cycle) pair
    n_dyts = train_dyemat.shape[0]
    check.array_t(train_dyemat, shape=(n_dyts, n_channels * n_cycles))

    # dyepeps are in (dyt_i, pep_i, count) order
    pep_col = train_dyepeps[:, 1]

    # At least one non-zero pep_i must be present
    assert np.max(train_dyepeps[:, 1])

    # Assert that every peptide (except 0) got 10 samples
    for peptide in range(1, n_peps):
        rows_for_peptide = pep_col == peptide
        assert train_dyepeps[rows_for_peptide, 2].sum() == n_samples

    # Assert only those peptides are present
    assert np.min(pep_col) == 0
    assert np.max(pep_col) == n_peps - 1