Ejemplo n.º 1
0
def sim_v2(aa_list, err_set, **sim_kws):
    """
    Build the "sim_v2" task block for a run description.

    aa_list: forwarded to SimV2Params.from_aa_list_fixture.
    err_set: converted to a priors description via priors_desc_from_err_set.
    sim_kws: extra simulation keywords, forwarded to the params fixture.
        "n_pres" + "n_mocks" (each defaulting to 0) must total at least 1.

    Returns a Munch with the single key "sim_v2" holding the version, the
    inputs and the (priors-free, underscore-stripped) parameters.
    """
    priors_desc = priors_desc_from_err_set(err_set)

    n_initial_cycles = sim_kws.get("n_pres", 0) + sim_kws.get("n_mocks", 0)
    assert (
        n_initial_cycles >= 1
    ), "You must include at least 1 pre or mock cycle to capture the initial image"

    parameters = Munch(
        **SimV2Params.from_aa_list_fixture(aa_list, priors_desc=priors_desc, **sim_kws)
    )

    # The priors entry must not be saved with the block; only the rest of
    # the parameters goes out.
    del parameters["priors"]
    parameters = utils.strip_underscore_keys(parameters, munchify=True)

    task = Munch(
        version="1.0",
        inputs=Munch(prep="../prep"),
        parameters=parameters,
    )
    return Munch(sim_v2=task)
Ejemplo n.º 2
0
    def from_prep_fixture(cls, prep_result, labels: str, n_edmans=5, priors=None):
        """
        Make a SimResult fixture for testing by running a (likely small)
        simulation against the given prep result.

        labels: a CSV list of aas. Eg: "DE,C"
            Common labels: "DE", "C", "Y", "K", "H"
        n_edmans: number of Edman cycles passed to the params fixture.
        priors: priors to simulate with; when None,
            PriorsMLEFixtures.val_defaults() is used.
        """
        from plaster.run.sim_v2.sim_v2_worker import sim_v2
        from plaster.tools.schema import check
        from plaster.run.priors import PriorsMLEFixtures

        check.t(labels, str)

        fixture_kwargs = dict(
            priors=PriorsMLEFixtures.val_defaults() if priors is None else priors,
            n_pres=1,
            n_mocks=0,
            n_edmans=n_edmans,
            use_lognormal_model=False,
            n_samples_train=100,
            n_samples_test=100,
            train_includes_radmat=True,
        )
        params = SimV2Params.from_aa_list_fixture(labels.split(","), **fixture_kwargs)

        return sim_v2(params, prep_result)
Ejemplo n.º 3
0
 def it_makes_cycles_array():
     """
     cycles_array() for 1 pre, 2 mock and 3 edman cycles must come back as
     [0, 1, 1, 2, 2, 2].
     """
     fixture_kwargs = dict(
         priors=PriorsMLEFixtures.nbt_defaults(),
         n_pres=1,
         n_mocks=2,
         n_edmans=3,
     )
     src_params = SimV2Params.from_aa_list_fixture(["DE", "Y"], **fixture_kwargs)

     # Presumably one code per cycle kind (0=pre, 1=mock, 2=edman), each
     # repeated by its count -- TODO confirm against cycles_array().
     expected_cycles = [0, 1, 1, 2, 2, 2]
     assert src_params.cycles_array().tolist() == expected_cycles
Ejemplo n.º 4
0
    def start(self):
        """
        Run the sim_v2 worker for this task and save its result.

        Parameters come from self.config.parameters and the prep result is
        loaded from the folder named by self.inputs.prep. When the params
        have dump_debug set, the result's debug dump is written as well.
        """
        sim_params = SimV2Params(**self.config.parameters)
        loaded_prep = PrepResult.load_from_folder(self.inputs.prep)

        sim_result = sim_v2(
            sim_params, loaded_prep, progress=self.progress, pipeline=self
        )
        sim_result.save()

        if not sim_params.dump_debug:
            return
        sim_result.dump_debug()
Ejemplo n.º 5
0
    def _sim(priors=None, _prep_result=None, sim_kwargs=None):
        """
        Run a small two-label ("A", "B") lognormal simulation with 4 edmans.

        priors: optional dict of keyword overrides passed to
            PriorsMLEFixtures.fixture_no_errors.
        _prep_result: prep result to simulate against; when None, the
            `prep_result` of the enclosing scope is used.
        sim_kwargs: optional extra keywords for
            SimV2Params.from_aa_list_fixture. "use_lognormal_model" is always
            forced to True. The caller's dict is never modified.

        Returns a tuple of (sim result, sim params).
        """
        if _prep_result is None:
            _prep_result = prep_result

        priors = PriorsMLEFixtures.fixture_no_errors(**(priors or {}))

        # Build a fresh dict rather than writing into the argument, so a dict
        # shared by the caller across several calls is left untouched.
        sim_kwargs = {**(sim_kwargs or {}), "use_lognormal_model": True}

        sim_v2_params = SimV2Params.from_aa_list_fixture(
            ["A", "B"], priors=priors, n_edmans=4, **sim_kwargs
        )

        return sim_v2_worker.sim_v2(sim_v2_params, _prep_result), sim_v2_params
Ejemplo n.º 6
0
 def it_assigns_channels_if_not_exist():
     """
     Params built without an explicit `channels` mapping must still report
     channel names; the expected result is ["549", "647"].
     """
     dye_specs = [("atto_647", "647"), ("foo_549", "549")]
     cys_label = Munch(
         aa="C", dye_name="atto_647", label_name="cys0", ptm_only=False
     )

     params = SimV2Params(
         dyes=[Munch(dye_name=dye, channel_name=ch) for dye, ch in dye_specs],
         labels=[cys_label],
         priors_desc={},
     )

     assert params.channel_names() == ["549", "647"]
Ejemplo n.º 7
0
 def it_converts_priors():
     """
     A `priors_desc` dict given to SimV2Params must surface as a Priors
     instance on `params.priors`, with the described MLE value retrievable.
     """
     dye = Munch(dye_name="foo_549", channel_name="549")
     label = Munch(aa="D", dye_name="foo_549", label_name="DE0", ptm_only=False)
     foo_prior = dict(class_name="MLEPrior", params=dict(value=1.0))

     params = SimV2Params(
         is_survey=False,
         dyes=[dye],
         labels=[label],
         priors_desc={"foo": foo_prior},
     )

     assert isinstance(params.priors, Priors)
     assert params.priors.get_mle("foo") == 1.0
Ejemplo n.º 8
0
    def it_makes_pcbs():
        """
        pcbs() must emit one row per residue of the pep_seq frame.

        The expected rows are (pep_i, channel index, 1 - p_non_fluorescent)
        for labelled residues, and (pep_i, nan, nan) for unlabelled ones.
        """
        p_nf_dye1 = 0.5
        p_nf_dye2 = 0.3

        params = SimV2Params(
            is_survey=False,
            dyes=[
                Munch(dye_name=dye, channel_name=ch)
                for dye, ch in (("dye1", "640"), ("dye2", "549"))
            ],
            labels=[
                Munch(aa=aa, dye_name=dye, label_name=aa, ptm_only=False)
                for aa, dye in (("A", "dye1"), ("B", "dye2"))
            ],
            priors_desc={
                "p_non_fluorescent." + dye: dict(
                    class_name="MLEPrior", params=dict(value=p_nf)
                )
                for dye, p_nf in (("dye1", p_nf_dye1), ("dye2", p_nf_dye2))
            },
            channels={"640": 0, "549": 1},
        )

        # Two peptides: pep 1 is X-A-X and pep 2 is B-X ("X" carries no label)
        pep_seq = pd.DataFrame(
            {
                "pep_i": [1, 1, 1, 2, 2],
                "aa": ["X", "A", "X", "B", "X"],
                "pep_offset_in_pro": [0, 1, 2, 0, 1],
            }
        )

        unlabelled = [np.nan, np.nan]
        expected = [
            [1.0] + unlabelled,
            [1.0, 0.0, 1.0 - p_nf_dye1],
            [1.0] + unlabelled,
            [2.0, 1.0, 1.0 - p_nf_dye2],
            [2.0] + unlabelled,
        ]

        pcbs = params.pcbs(pep_seq)
        assert utils.np_array_same(pcbs, expected)
Ejemplo n.º 9
0
def zest_v2_stress_like_e2e():
    """
    Regression test: prep -> sim_v2 -> nn_v2 on one short protein.

    This used to die with a "double free or corruption (!prev)" caused by a
    bug in n_dyetracks counting. The bug is fixed; the test stays in place
    to guard against a regression.
    """
    protein = Munch(
        abundance=None,
        name="pep25",
        ptm_locs="",
        is_poi=0,
        sequence="GCAGCAGAG ",
    )
    prep_params = PrepParams(
        decoy_mode=None,
        n_peps_limit=None,
        n_ptms_limit=5,
        protease=None,
        proteins=[protein],
        proteins_of_interest=[],
    )
    pro_spec_df = pd.DataFrame(prep_params.proteins)
    prep_result = prep(prep_params, pro_spec_df)

    # Every prior is an MLEPrior that differs only in its value
    mle_values = (
        ("p_non_fluorescent", 0.07),
        ("p_bleach", 0.05),
        ("gain_mu", 7500.0),
        ("gain_sigma", 0.16),
        ("bg_mu", 300.0),
        ("bg_sigma", 700.0),
        ("p_detach", 0.05),
        ("p_edman_failure", 0.06),
        ("row_k_sigma", 0.15),
    )
    priors_desc = {
        prior_name: Munch(class_name="MLEPrior", params=Munch(value=mle_value))
        for prior_name, mle_value in mle_values
    }

    cys_label = Munch(
        aa="C", dye_name="dye_0", label_name="label_0", ptm_only=False
    )
    sim_v2_param_block = Munch(
        allow_train_test_to_be_identical=False,
        enable_ptm_labels=False,
        dyes=[Munch(dye_name="dye_0", channel_name="ch0")],
        labels=[cys_label],
        priors_desc=priors_desc,
        is_survey=False,
        n_edmans=8,
        n_mocks=1,
        n_pres=0,
        n_samples_test=1000,
        n_samples_train=5000,
        random_seed=None,
        test_includes_dyemat=False,
        train_includes_radmat=False,
        use_lognormal_model=True,
    )

    sim_v2_params = SimV2Params(include_dfs=True, **sim_v2_param_block)
    sim_v2_result = sim_v2_worker.sim_v2(sim_v2_params, prep_result)
    sim_v2_result._generate_flu_info(prep_result)

    nn_v2_params = NNV2Params(
        source="zest_v2_stress_like_e2e",
        priors_desc=sim_v2_param_block.priors_desc,
    )
    nn_result = nn_v2(nn_v2_params, prep_result, sim_v2_result, None)

    calls_df = nn_result.calls()
    assert np.all(calls_df.pep_i == 1)
Ejemplo n.º 10
0
    def it_builds_dfs():
        """
        Check the prior dataframes of a two-dye, three-label SimV2Params:
        their column sets, plus one sampled prior value per frame.
        """
        dyes = [
            Munch(dye_name=dye, channel_name=ch)
            for dye, ch in (("atto_647", "647"), ("foo_549", "549"))
        ]
        # "D" and "E" share both the dye and the label name
        labels = [
            Munch(aa=aa, dye_name=dye, label_name=label, ptm_only=False)
            for aa, dye, label in (
                ("C", "atto_647", "cys0"),
                ("D", "foo_549", "DE0"),
                ("E", "foo_549", "DE0"),
            )
        ]
        priors_desc = {
            prior_name: Munch(class_name="MLEPrior", params=Munch(value=mle_value))
            for prior_name, mle_value in (
                ("p_non_fluorescent.atto_647", 1.0),
                ("p_non_fluorescent.foo_549", 2.0),
                ("p_bleach", 3.0),
            )
        }

        params = SimV2Params(
            dyes=dyes,
            labels=labels,
            channels=Munch({"647": 0, "549": 1}),
            priors_desc=priors_desc,
        )

        def it_builds_channel__priors():
            expected_columns = [
                "bg_mu",
                "bg_sigma",
                "ch_i",
                "channel_name",
                "dye_name",
                "gain_mu",
                "gain_sigma",
                "index",
                "p_bleach",
                "row_k_sigma",
            ]
            assert sorted(params.channel__priors().columns) == sorted(expected_columns)

            by_channel = params.channel__priors().set_index("ch_i")
            assert by_channel.loc[1].p_bleach.sample() == 3.0

        def it_builds_dye__label__priors():
            expected_columns = [
                "aa",
                "bg_mu",
                "bg_sigma",
                "ch_i",
                "channel_name",
                "dye_name",
                "gain_mu",
                "gain_sigma",
                "label_name",
                "p_bleach",
                "p_non_fluorescent",
                "ptm_only",
                "row_k_sigma",
            ]
            assert sorted(params.dye__label__priors().columns) == sorted(
                expected_columns
            )

            by_aa = params.dye__label__priors().set_index("aa")
            assert by_aa.loc["D"].p_non_fluorescent.sample() == 2.0

        zest()
Ejemplo n.º 11
0
    def it_handles_bleaching():
        """
        Simulate dyetracks for peptides derived from a triangle dyemat with
        every error prior (edman failure, detach, bleach, non-fluorescent)
        set to zero.

        Under those priors the test expects each peptide to land on exactly
        one dyetrack, and the resulting dyemat to be the triangle itself.
        """
        n_cycles = 5
        n_samples = 10

        priors_desc = {
            prior_name: dict(class_name="MLEPrior", params=dict(value=mle_value))
            for prior_name, mle_value in (
                ("gain_mu", 5000.0),
                ("gain_sigma", 50.0),
                ("bg_sigma", 50.0),
                ("row_k_sigma", 0.0),
                ("p_edman_failure", 0.0),
                ("p_detach", 0.0),
                ("p_bleach", 0.0),
                ("p_non_fluorescent", 0.0),
            )
        }
        dyes_labels = Munch(
            dyes=[Munch(dye_name="dye0", channel_name="ch0")],
            labels=[
                Munch(aa="X", dye_name="dye0", label_name="label0", ptm_only=False)
            ],
        )
        sim_v2_params = SimV2Params(
            n_pres=1, n_mocks=0, n_edmans=4, **dyes_labels, priors_desc=priors_desc
        )

        dyemat = triangle_dyemat(5, 1, include_multi_drop=False, include_nul_row=True)

        # One peptide per dyetrack row, n_cycles residues each
        pep_i_col, aa_col, offset_col = [], [], []
        for row_i, dyt in enumerate(dyemat):
            pep_i_col.extend([row_i] * n_cycles)
            aa_col.extend(dyt_to_seq(dyt))
            offset_col.extend([0] * n_cycles)

        pepseqs_df = pd.DataFrame(
            {"pep_i": pep_i_col, "aa": aa_col, "pep_offset_in_pro": offset_col}
        )

        # pcbs are an encoding of flus. See def pcbs()
        pcbs = sim_v2_params.pcbs(pepseqs_df)

        train_dyemat, train_dyepeps, train_pep_recalls = sim_v2_worker._dyemat_sim(
            sim_v2_params, pcbs, n_samples=n_samples, progress=None
        )
        n_dyts = train_dyemat.shape[0]
        check.array_t(train_dyemat, shape=(n_dyts, n_cycles))

        # dyepeps are in (dyt_i, pep_i, count) order
        dyt_col = train_dyepeps[:, 0]
        pep_col = train_dyepeps[:, 1]
        count_col = train_dyepeps[:, 2]

        # At least one non-zero peptide index must be present
        assert np.max(pep_col)

        # Every peptide (except 0) got n_samples samples
        for sampled_pep_i in range(1, 6):
            assert count_col[pep_col == sampled_pep_i].sum() == n_samples

        # Only peptides 0..5 are present
        assert np.min(pep_col) == 0
        assert np.max(pep_col) == 5

        # Mapping of dyt and pep should both be identity
        identity = np.arange(n_dyts)
        assert np.all(dyt_col == identity)
        assert np.all(pep_col == identity)
        assert np.all(count_col[1:] == n_samples)

        # Row k of the triangle has k leading ones
        expected_dyemat = [[1] * k + [0] * (5 - k) for k in range(6)]
        assert train_dyemat.tolist() == expected_dyemat
Ejemplo n.º 12
0
def zest_radmat_sim():
    """
    Tests for sim_v2_worker._radmat_sim on a fixed dyemat of 3 dyetracks
    (2 channels x 3 cycles) and a dyepeps table in (dyt_i, pep_i, count) order.

    Peptides 0 and 3 only map to the all-zero dyetrack 0, so the tests
    expect radiometry rows for peptides 1 and 2 only.
    """
    dyes_labels = Munch(
        dyes=[
            Munch(dye_name=dye, channel_name=ch)
            for dye, ch in (("dye0", "ch0"), ("dye1", "ch1"))
        ],
        labels=[
            Munch(aa=aa, dye_name=dye, label_name=label, ptm_only=False)
            for aa, dye, label in (("A", "dye0", "label0"), ("B", "dye1", "label1"))
        ],
    )

    noisy_priors = {
        prior_name: dict(class_name="MLEPrior", params=dict(value=mle_value))
        for prior_name, mle_value in (
            ("gain_mu", 7500.0),
            ("gain_sigma", 0.16),
            ("bg_sigma", 200.0),
        )
    }
    params_with_noise = SimV2Params(**dyes_labels, priors_desc=noisy_priors)
    ch_params_with_noise = params_with_noise.by_channel()

    noise_free_priors = {
        prior_name: dict(class_name="MLEPrior", params=dict(value=mle_value))
        for prior_name, mle_value in (
            ("gain_mu", 1.0),
            ("gain_sigma", 0.0),
            ("bg_sigma", 0.0),
            ("row_k_sigma", 0.0),
        )
    }
    params_no_noise = SimV2Params(**dyes_labels, priors_desc=noise_free_priors)
    ch_params_no_noise = params_no_noise.by_channel()

    # fmt: off
    dyemat = np.array([
        [[0, 0, 0], [0, 0, 0]],
        [[1, 1, 0], [1, 0, 0]],
        [[2, 2, 1], [2, 1, 0]],
    ])

    dyepeps = np.array([
        [0, 0, 0],
        [1, 1, 10],
        [1, 2, 5],
        [2, 2, 5],
        [0, 3, 10],
    ])
    # fmt: on

    n_samples_per_pep = 10
    n_channels = 2
    n_cycles = 3
    n_peps = 4

    # Peptide 1 then peptide 2, each repeated once per sample
    expected_pep_iz = [1] * n_samples_per_pep + [2] * n_samples_per_pep

    def it_removes_all_zero_rows():
        radmat, pep_iz, _dye_iz, _row_ks = sim_v2_worker._radmat_sim(
            dyemat,
            dyepeps,
            ch_params_with_noise,
            n_samples_per_pep,
            n_channels,
            n_cycles,
            use_lognormal_model=True,
        )

        # 20 because we sample 10 per pep but only peps 1 and 2 have non-zero dyts
        n_expected_rows = 20
        assert radmat.shape == (n_expected_rows, n_channels, n_cycles)
        assert pep_iz.tolist() == expected_pep_iz

        all_zero_rows = np.all(radmat == 0.0, axis=(1, 2))
        assert not np.any(all_zero_rows)

    def it_returns_reasonable_radiometry():
        radmat, pep_iz, _dye_iz, _row_ks = sim_v2_worker._radmat_sim(
            dyemat,
            dyepeps,
            ch_params_with_noise,
            n_samples_per_pep,
            n_channels,
            n_cycles,
            use_lognormal_model=True,
        )

        # Only 2 of the peptide have dyetracks
        n_peps_with_dyts = 2
        assert radmat.shape == (
            n_samples_per_pep * n_peps_with_dyts,
            n_channels,
            n_cycles,
        )
        assert pep_iz.tolist() == expected_pep_iz
        # I'm not sure of a good test here
        # assert np.all(radiometry[radiometry > 0.0] > 1000.0)

    def it_returns_correct_radiometry_with_no_noise():
        # By using no noise, we can just compare that radiometry gave back the dyemat
        # but with each peptide repeated
        radmat, pep_iz, dye_iz, row_ks = sim_v2_worker._radmat_sim(
            dyemat,
            dyepeps,
            ch_params_no_noise,
            n_samples_per_pep,
            n_channels,
            n_cycles,
            use_lognormal_model=True,
        )

        assert np.all(row_ks == 1.0)

        dyt1_rad = dyemat[1, :].astype(RadType)
        dyt2_rad = dyemat[2, :].astype(RadType)

        # The first five rows must be dyetrack 1
        assert np.all(radmat[0:5] == dyt1_rad)
        assert dye_iz[0:5].tolist() == [1] * 5

        # The next five rows may be either dyetrack 1 or dyetrack 2
        next_rads = radmat[5:10]
        assert np.all((next_rads == dyt1_rad) | (next_rads == dyt2_rad))

        next_dye_iz = dye_iz[5:10]
        assert np.all((next_dye_iz == 1) | (next_dye_iz == 2))

        assert pep_iz.tolist() == expected_pep_iz

    zest()
Ejemplo n.º 13
0
def zest_radmat_from_sampled_pep_dyemat():
    """
    Tests for sim_v2_worker._radmat_from_sampled_pep_dyemat.

    Each test calls the worker with pep_i=1 and then inspects the
    pre-allocated output buffers: output_radmat, shaped
    (n_peps, n_samples_per_pep, n_channels, n_cycles), and output_true_row_k,
    shaped (n_peps, n_samples_per_pep). The peptide 0 slot is expected to
    stay all-zero.
    """
    n_channels = 2
    n_cycles = 3
    n_samples_per_pep = 5
    n_peps = 2

    # Dye counts for the 5 samples of one peptide: (sample, channel, cycle)
    # fmt: off
    sampled_dyemat = np.array([
        [[1, 1, 0], [1, 0, 0]],
        [[2, 2, 1], [2, 1, 0]],
        [[2, 2, 1], [2, 1, 0]],
        [[1, 1, 0], [1, 0, 0]],
        [[1, 1, 0], [1, 0, 0]],
    ], dtype=np.uint8)
    # fmt: on

    # Two dyes, one per channel, shared by every parameter set below
    dyes_labels = Munch(
        dyes=[
            Munch(dye_name="dye0", channel_name="ch0"),
            Munch(dye_name="dye1", channel_name="ch1"),
        ],
        labels=[
            Munch(aa="A", dye_name="dye0", label_name="label0", ptm_only=False),
            Munch(aa="B", dye_name="dye1", label_name="label1", ptm_only=False),
        ],
    )

    # Used with use_lognormal_model=True; row_k_sigma is left unspecified here
    params_with_noise_lognormal = SimV2Params(
        **dyes_labels,
        priors_desc={
            "gain_mu": dict(class_name="MLEPrior", params=dict(value=10.0)),
            "gain_sigma": dict(class_name="MLEPrior", params=dict(value=0.1)),
            "bg_sigma": dict(class_name="MLEPrior", params=dict(value=200.0)),
        },
    )
    ch_params_with_noise_lognormal = params_with_noise_lognormal.by_channel()

    # Used with use_lognormal_model=False; row_k_sigma is explicitly zero
    params_with_noise_normal_no_row_k = SimV2Params(
        **dyes_labels,
        priors_desc={
            "gain_mu": dict(class_name="MLEPrior", params=dict(value=5000.0)),
            "gain_sigma": dict(class_name="MLEPrior", params=dict(value=50.0)),
            "bg_sigma": dict(class_name="MLEPrior", params=dict(value=50.0)),
            "row_k_sigma": dict(class_name="MLEPrior", params=dict(value=0.0)),
        },
    )
    ch_params_with_noise_normal_no_row_k = (
        params_with_noise_normal_no_row_k.by_channel()
    )

    # NOTE(review): this parameter set (row_k_sigma=0.16) is built but not
    # referenced by any of the tests in this function.
    params_with_noise_normal_with_row_k = SimV2Params(
        **dyes_labels,
        priors_desc={
            "gain_mu": dict(class_name="MLEPrior", params=dict(value=5000.0)),
            "gain_sigma": dict(class_name="MLEPrior", params=dict(value=50.0)),
            "bg_sigma": dict(class_name="MLEPrior", params=dict(value=50.0)),
            "row_k_sigma": dict(class_name="MLEPrior", params=dict(value=0.16)),
        },
    )
    ch_params_with_noise_normal_with_row_k = (
        params_with_noise_normal_with_row_k.by_channel()
    )

    # All sigmas zero: the radmat is expected to be exactly gain_mu * dye count
    params_no_noise = SimV2Params(
        **dyes_labels,
        priors_desc={
            "gain_mu": dict(class_name="MLEPrior", params=dict(value=10.0)),
            "gain_sigma": dict(class_name="MLEPrior", params=dict(value=0.0)),
            "bg_sigma": dict(class_name="MLEPrior", params=dict(value=0.0)),
            "row_k_sigma": dict(class_name="MLEPrior", params=dict(value=0.0)),
        },
    )
    ch_params_no_noise = params_no_noise.by_channel()

    # sim_v2_params = SimV2Params.construct_from_aa_list(
    #     ["A", "B"], n_edmans=4)
    # )

    # ch_params_no_noise = [
    #     Munch(beta=10.0, sigma=0.0, zero_beta=0.0, zero_sigma=0.0),
    #     Munch(beta=10.0, sigma=0.0, zero_beta=0.0, zero_sigma=0.0),
    # ]
    #
    # ch_params_with_noise_lognormal = [
    #     Munch(beta=10.0, sigma=0.1, zero_beta=0.0, zero_sigma=200.0),
    #     Munch(beta=10.0, sigma=0.1, zero_beta=0.0, zero_sigma=200.0),
    # ]

    # Output buffers shared with the tests via closure; (re)allocated in _before
    output_radmat = None
    output_true_row_k = None

    def _before():
        # Rebind both buffers to fresh zero arrays.
        # Presumably invoked by zest ahead of each `it_*` test -- TODO confirm.
        nonlocal output_radmat
        output_radmat = np.zeros(
            (n_peps, n_samples_per_pep, n_channels, n_cycles), dtype=np.float32
        )

        nonlocal output_true_row_k
        output_true_row_k = np.zeros((n_peps, n_samples_per_pep), dtype=np.float32)

    def it_returns_noise_free_radmat():
        sim_v2_worker._radmat_from_sampled_pep_dyemat(
            sampled_dyemat,
            ch_params_no_noise,
            n_channels,
            output_radmat,
            output_true_row_k,
            use_lognormal_model=True,
            pep_i=1,
        )

        assert output_radmat.shape == (n_peps, n_samples_per_pep, n_channels, n_cycles)

        # Peptide 0 is all zero:
        assert np.all(output_radmat[0, :, :, :] == 0.0)

        # Peptide 1 is noise-free: exactly gain_mu (10.0) times the dye count
        assert np.all(
            output_radmat[1, :, :, :] == 10.0 * sampled_dyemat.astype(np.float32)
        )

    @zest.retry(n_tries=2)  # Stochastic
    def it_returns_noisy_radmat_lognormal():
        sim_v2_worker._radmat_from_sampled_pep_dyemat(
            sampled_dyemat,
            ch_params_with_noise_lognormal,
            n_channels,
            output_radmat,
            output_true_row_k,
            use_lognormal_model=True,
            pep_i=1,
        )

        assert output_radmat.shape == (n_peps, n_samples_per_pep, n_channels, n_cycles)
        assert np.all(output_radmat[0, :, :, :] == 0.0)
        expected = 10.0 * sampled_dyemat.astype(np.float32)
        diff = output_radmat[1, :, :, :] - expected
        # NOTE(review): the relative error is squared here and squared again in
        # the assert below, so the effective bound is |relative error| < 0.5
        # rather than 0.25. Confirm whether that looseness is intended before
        # tightening it: this test is stochastic.
        diff = utils.np_safe_divide(diff, expected) ** 2
        # NaN entries are accepted; presumably they come from positions where
        # expected == 0 -- verify against utils.np_safe_divide.
        assert np.all((diff ** 2 < 0.25 ** 2) | np.isnan(diff))

    def it_returns_noisy_radmat_normal_no_row_k():
        sim_v2_worker._radmat_from_sampled_pep_dyemat(
            sampled_dyemat,
            ch_params_with_noise_normal_no_row_k,
            n_channels,
            output_radmat,
            output_true_row_k,
            use_lognormal_model=False,
            pep_i=1,
        )

        assert output_radmat.shape == (n_peps, n_samples_per_pep, n_channels, n_cycles)
        assert np.all(output_radmat[0, :, :, :] == 0.0)

        # Set the zero record to nan so it doesn't factor in
        output_radmat[0, :, :, :] = np.nan
        output_true_row_k[0, :] = np.nan

        # Each check below has the form (stat - target) ** 2 < tol ** 2,
        # i.e. |stat - target| < tol.
        row_k_mean = np.nanmean(output_true_row_k)
        assert (row_k_mean - 1.0) ** 2 < 0.15 ** 2

        # In the np.where calls, the (5, 2, 3) sampled_dyemat mask broadcasts
        # against the (2, 5, 2, 3) output_radmat; the peptide 0 slot is already
        # NaN, so only peptide 1 contributes to the nan-aware statistics.
        bg_mean = np.nanmean(np.where(sampled_dyemat == 0, output_radmat, np.nan))
        assert (bg_mean - 0.0) ** 2 < 300.0 ** 2

        bg_std = np.nanstd(np.where(sampled_dyemat == 0, output_radmat, np.nan))
        assert (bg_std - 50.0) ** 2 < 100.0 ** 2

        c1_mean = np.nanmean(np.where(sampled_dyemat == 1, output_radmat, np.nan))
        assert (c1_mean - 5000.0) ** 2 < 500.0 ** 2

        c2_mean = np.nanmean(np.where(sampled_dyemat == 2, output_radmat, np.nan))
        assert (c2_mean - (2 * 5000.0)) ** 2 < 1000.0 ** 2

        c2_std = np.nanstd(np.where(sampled_dyemat == 2, output_radmat, np.nan))
        # Presumably the variances of two dyes (gain_sigma=50 each) plus the
        # background (bg_sigma=50) added together -- TODO confirm against the
        # worker's noise model.
        expect = np.sqrt((2 * 50.0 ** 2) + 50.0 ** 2)
        assert (c2_std - expect) ** 2 < 80.0 ** 2

    def it_handles_empty_dyemat():
        # Zero samples: the worker must leave the output buffer untouched
        empty_dyemat = np.zeros((0, n_channels, n_cycles), dtype=np.uint8)

        sim_v2_worker._radmat_from_sampled_pep_dyemat(
            empty_dyemat,
            ch_params_no_noise,
            n_channels,
            output_radmat,
            output_true_row_k,
            use_lognormal_model=True,
            pep_i=1,
        )

        assert output_radmat.shape == (n_peps, n_samples_per_pep, n_channels, n_cycles)
        assert np.all(output_radmat[:, :, :, :] == 0.0)

    zest()
Ejemplo n.º 14
0
    def it_handles_normal():
        # Runs sim_v2_worker._dyemat_sim for three peptides on two channels
        # and checks the shape of the dyemat and the per-peptide sample counts.
        n_peps = 3
        n_channels = 2
        # Presumably n_pres + n_mocks + n_edmans (1 + 0 + 4) from the params
        # below -- TODO confirm.
        n_cycles = 5

        dyes_labels = Munch(
            dyes=[
                Munch(dye_name="dye0", channel_name="ch0"),
                Munch(dye_name="dye1", channel_name="ch1"),
            ],
            labels=[
                Munch(aa="A", dye_name="dye0", label_name="label0", ptm_only=False),
                Munch(aa="B", dye_name="dye1", label_name="label1", ptm_only=False),
            ],
        )

        sim_v2_params = SimV2Params(
            n_pres=1,
            n_mocks=0,
            n_edmans=4,
            **dyes_labels,
            priors_desc={
                "gain_mu": dict(class_name="MLEPrior", params=dict(value=5000.0)),
                "gain_sigma": dict(class_name="MLEPrior", params=dict(value=50.0)),
                "bg_sigma": dict(class_name="MLEPrior", params=dict(value=50.0)),
                "row_k_sigma": dict(class_name="MLEPrior", params=dict(value=0.0)),
            },
        )

        # pepseqs: DF(pep_i, aa, pep_off_in_pro)
        # pep 0 is a single "." residue, pep 1 is A-B-"." and pep 2 is B-"."-".".
        # "." matches neither label ("A", "B") declared above.
        pepseqs_df = pd.DataFrame(
            dict(
                pep_i=[0, 1, 1, 1, 2, 2, 2],
                aa=[".", "A", "B", ".", "B", ".", "."],
                pep_offset_in_pro=[0, 0, 1, 2, 3, 4, 5],
            )
        )

        # pcbs are an encoding of flus. See def pcbs()
        pcbs = sim_v2_params.pcbs(pepseqs_df)

        n_samples = 10
        train_dyemat, train_dyepeps, train_pep_recalls = sim_v2_worker._dyemat_sim(
            sim_v2_params, pcbs, n_samples=n_samples, progress=None,
        )
        n_dyts = train_dyemat.shape[0]

        # Each dyetrack row is flat: n_channels * n_cycles entries
        check.array_t(train_dyemat, shape=(n_dyts, n_channels * n_cycles))

        # dyepeps are in (dyt_i, pep_i, count) order
        # At least one non-zero peptide index must be present
        assert np.max(train_dyepeps[:, 1])

        # Assert that every peptide (except 0) got 10 samples
        for pep_i in range(1, n_peps):
            mask = train_dyepeps[:, 1] == pep_i
            assert train_dyepeps[mask, 2].sum() == n_samples

        # Assert only those peptides are present
        assert np.min(train_dyepeps[:, 1]) == 0
        assert np.max(train_dyepeps[:, 1]) == n_peps - 1