Ejemplo n.º 1
0
    def start(self):
        """
        Run the NN v2 classifier and save its result.

        Builds NNV2Params from the task configuration, loads whichever
        upstream results are available (each stays None when not loaded),
        hands them to nn_v2() and saves what it returns.
        """
        params = NNV2Params(source="NNV2Task", **self.config.parameters)

        # The sigproc and rad-filter results are only loaded on request.
        if params.include_sigproc:
            sigproc_v2_result = SigprocV2Result.load_from_folder(
                self.inputs.sigproc,
                prop_list=["n_cycles", "n_channels"],
            )
            rad_filter_result = RadFilterResult.load_from_folder(
                self.inputs.rad_filter)
        else:
            sigproc_v2_result = None
            rad_filter_result = None

        # "prep" and "sim_v2" are optional inputs.
        if self.inputs.get("prep") is None:
            prep_result = None
        else:
            prep_result = PrepResult.load_from_folder(self.inputs.prep)

        if self.inputs.get("sim_v2") is None:
            sim_v2_result = None
        else:
            sim_v2_result = SimV2Result.load_from_folder(self.inputs.sim_v2)

        nn_v2_result = nn_v2(
            params,
            prep_result,
            sim_v2_result,
            sigproc_v2_result,
            rad_filter_result,
            progress=self.progress,
            pipeline=self,
        )
        nn_v2_result.save()
Ejemplo n.º 2
0
    def it_generates_flu_info():
        # Runs sim_v1 on three peptides (plus the "." entry at index 0),
        # generates the flu info and checks it with the nested tests below.
        with tmp.tmp_folder(chdir=True):
            prep_result = PrepResult.prep_result_fixture(
                pros=[".", "XAXCD", "XAXCDXX", "XCCXX"],
                pro_is_decoys=[False, False, False, False],
                peps=[".", "XAXCD", "XAXCDXX", "XCCXX"],
                pep_pro_iz=[0, 1, 2, 3],
            )
            params = _stub_sim_params(some_error_model, n_samples)
            sim_result = sim_v1_worker.sim_v1(params, prep_result)
            sim_result._generate_flu_info(prep_result)

            def it_computes_head_and_tail():
                flus = sim_result._flus

                # Peptides 1 and 2 are expected to have identical flu stats.
                first_two = flus[flus.pep_i.isin([1, 2])]
                assert np.all(first_two.flu_count == 2)
                assert np.all(first_two.n_head_ch_0 == 1)
                assert np.all(first_two.n_head_ch_1 == 0)
                assert np.all(first_two.n_tail_ch_0 == 0)
                assert np.all(first_two.n_tail_ch_1 == 1)

                third = flus[flus.pep_i == 3]
                assert np.all(third.flu_count == 1)

            def it_peps__flus():
                joined = sim_result.peps__flus(prep_result)
                assert "flustr" in joined
                assert len(joined) == 4

            def it_peps__flus__unique_flus():
                uniques = sim_result.peps__flus__unique_flus(prep_result)
                assert np.all(uniques.pep_i.values == [0, 3])

            zest()
Ejemplo n.º 3
0
    def it_returns_the_fraction_of_all_dark_samples():
        # Simulates peptide 1 ("ABB") with the default 2-channel error model
        # and checks that the value written to recall[1] is strictly between
        # 0.9 and 1.0.
        with tmp.tmp_folder(chdir=True):
            n_samples = 5000
            error_model = ErrorModel.from_defaults(n_channels=2)
            sim_params = _stub_sim_params(error_model, n_samples)

            prep_result = PrepResult.prep_result_fixture(
                pros=[".", "ABCDEFGHI"],
                pro_is_decoys=[False, False],
                peps=[".", "ABB"],
                pep_pro_iz=[0, 1],
            )
            pep_seq_df = prep_result.pepseqs()

            # Output buffers that _do_pep_sim fills in place.
            dyemat, radmat, recall = _make_arrays(
                "test1", n_peps=2, n_samples=n_samples)

            pep_1_seq_df = pep_seq_df[pep_seq_df.pep_i == 1]
            sim_v1_worker._do_pep_sim(
                pep_1_seq_df,
                sim_params,
                n_samples=n_samples,
                output_dyemat=dyemat,
                output_radmat=radmat,
                output_recall=recall,
            )

            in_range = (0.9 < recall[1]) & (recall[1] < 1.0)
            assert np.all(in_range)
Ejemplo n.º 4
0
    def it_drop_all_darks():
        # Simulates peptides "DD" and "EE" with the no-error model and checks
        # that the test dyemat comes back with zero rows while the train
        # dyemat keeps a single row and every train recall is 0.0.
        with tmp.tmp_folder(chdir=True):
            prep_result = PrepResult.prep_result_fixture(
                pros=[".", "DD", "EE"],
                pro_is_decoys=[False, False, False],
                peps=[".", "DD", "EE"],
                pep_pro_iz=[0, 1, 2],
            )
            sim_params = _stub_sim_params(no_error_model, n_samples)
            sim_result = sim_v1_worker.sim_v1(sim_params, prep_result)

            assert sim_result.test_dyemat.shape == (0, n_channels, n_cycles)
            assert sim_result.test_dyemat.dtype == np.uint8
            assert np.all(sim_result.test_dyemat[:] == 0)  # All dark

            assert sim_result.train_dyemat.shape == (1, n_channels, n_cycles)
            assert sim_result.train_dyemat.dtype == np.uint8
            assert np.all(sim_result.train_pep_recalls[:] == 0.0)
Ejemplo n.º 5
0
    def start(self):
        """
        Run the simulation on the prep result and save it.

        SimParams is built from the task configuration with include_dfs
        forced on. The flu info is generated on the sim result before it is
        saved.
        """
        params = SimParams(include_dfs=True, **self.config.parameters)
        prep = PrepResult.load_from_folder(self.inputs.prep)

        result = sim(
            params,
            prep,
            progress=self.progress,
            pipeline=self,
        )
        result._generate_flu_info(prep)
        result.save()
Ejemplo n.º 6
0
    def start(self):
        """
        Train the random forest (v2) from the prep and sim_v2 results and
        save the training result.
        """
        params = RFTrainV2Params(**self.config.parameters)
        prep = PrepResult.load_from_folder(self.inputs.prep)
        sim_v2 = SimV2Result.load_from_folder(self.inputs.sim_v2)

        result = rf_train(params, prep, sim_v2, progress=self.progress)
        result.save()
Ejemplo n.º 7
0
    def start(self):
        """
        Run sim_v2 on the prep result and save it.

        When params.dump_debug is set, the result's debug dump is written
        after the save.
        """
        sim_v2_params = SimV2Params(**self.config.parameters)
        prep = PrepResult.load_from_folder(self.inputs.prep)

        sim_v2_result = sim_v2(
            sim_v2_params,
            prep,
            progress=self.progress,
            pipeline=self,
        )
        sim_v2_result.save()

        if not sim_v2_params.dump_debug:
            return
        sim_v2_result.dump_debug()
Ejemplo n.º 8
0
    def start(self):
        """
        Run test_nn against the loaded prep and sim results and save the
        outcome.
        """
        params = TestNNParams(**self.config.parameters)

        prep = PrepResult.load_from_folder(self.inputs.prep)
        sim = SimResult.load_from_folder(self.inputs.sim)

        result = test_nn(
            params, prep, sim, progress=self.progress, pipeline=self)
        result.save()
Ejemplo n.º 9
0
    def start(self):
        """
        Run survey_nn against the loaded prep and sim results and save the
        outcome.
        """
        params = SurveyNNParams(**self.config.parameters)

        prep = PrepResult.load_from_folder(self.inputs.prep)
        sim = SimResult.load_from_folder(self.inputs.sim)

        result = survey_nn(
            params, prep, sim, progress=self.progress, pipeline=self)
        result.save()
Ejemplo n.º 10
0
def zest_poi_vs_all():
    """
    Check poi_vs_all() on a fixture where only peptide 2 is flagged in
    is_pois: peptide 2 must keep its index and every other peptide index
    must come back as 0.
    """
    prep_result = PrepResult.prep_result_fixture(
        pros=[".", "A", "B", "C"],
        pro_is_decoys=[0, 0, 0, 0],
        peps=[".", "A", "B", "C"],
        pep_pro_iz=[0, 1, 2, 3],
        is_pois=[0, 0, 1, 0],
    )

    true_pep_iz = np.array([1, 1, 2, 2, 3, 3])
    expected = [0, 0, 2, 2, 0, 0]

    remapped = poi_vs_all(true_pep_iz, prep_result)
    assert remapped.tolist() == expected

    zest()
Ejemplo n.º 11
0
 def _before():
     nonlocal result, prep_result
     sim_params = _make_sim_params(["AB", "CD"], n_edmans=3)
     prep_result = PrepResult.prep_result_fixture(
         pros=[".", "ABCDAADD", "AXCABC"],
         pro_is_decoys=[False, False, False],
         peps=[".", "ABCD", "AADD", "AXCABC"],
         pep_pro_iz=[0, 1, 1, 2],
     )
     prep_result.params = PrepParams(proteins=[
         dict(name="id_0", sequence="."),
         dict(name="id_1", sequence="ABCDAADD"),
         dict(name="id_2", sequence="AXCABC"),
     ])
     result = SimV1Result(params=sim_params)
     result._generate_flu_info(prep_result)
Ejemplo n.º 12
0
    def start(self):
        """
        Run test_rf against the loaded prep, sim and train_rf results and
        save the outcome.
        """
        params = TestRFParams(**self.config.parameters)

        prep = PrepResult.load_from_folder(self.inputs.prep)
        sim = SimResult.load_from_folder(self.inputs.sim)
        train_rf = TrainRFResult.load_from_folder(self.inputs.train_rf)

        result = test_rf(
            params,
            prep,
            sim,
            train_rf,
            progress=self.progress,
            pipeline=self,
        )
        result.save()
Ejemplo n.º 13
0
    def it_returns_no_all_dark_samples_on_valid_peps():
        # Simulates peptide 1 ("AAA") and checks that none of the samples
        # written to dyemat[1] is zero everywhere.
        with tmp.tmp_folder(chdir=True):
            prep_result = PrepResult.prep_result_fixture(
                pros=[".", "ABCDEFGHI"],
                pro_is_decoys=[False, False],
                peps=[".", "AAA"],
                pep_pro_iz=[0, 1],
            )
            pep_seq_df = prep_result.pepseqs()

            n_samples = 1000
            # Output buffers that _do_pep_sim fills in place.
            dyemat, radmat, recall = _make_arrays(
                "test1", n_peps=2, n_samples=n_samples)

            pep_1_seq_df = pep_seq_df[pep_seq_df.pep_i == 1]
            sim_v1_worker._do_pep_sim(
                pep_1_seq_df,
                sim_params,
                n_samples=n_samples,
                output_dyemat=dyemat,
                output_radmat=radmat,
                output_recall=recall,
            )

            # True for each sample of peptide 1 that is zero on axes 1 and 2.
            sample_is_all_dark = np.all(dyemat[1] == 0, axis=(1, 2))
            assert not np.any(sample_is_all_dark)
Ejemplo n.º 14
0
    def it_gives_up_on_hard_peptides_and_returns_none():
        # Simulates peptide 1 ("DDD") and checks that every entry of the
        # recall buffer is 0.0 afterwards.
        with tmp.tmp_folder(chdir=True):
            prep_result = PrepResult.prep_result_fixture(
                pros=[".", "ABCDEFGHI"],
                pro_is_decoys=[False, False],
                peps=[".", "DDD"],
                pep_pro_iz=[0, 1],
            )
            pep_seq_df = prep_result.pepseqs()

            n_samples = 1000
            # Output buffers that _do_pep_sim fills in place.
            dyemat, radmat, recall = _make_arrays(
                "test1", n_peps=2, n_samples=n_samples)

            pep_1_seq_df = pep_seq_df[pep_seq_df.pep_i == 1]
            sim_v1_worker._do_pep_sim(
                pep_1_seq_df,
                sim_params,
                n_samples=n_samples,
                output_dyemat=dyemat,
                output_radmat=radmat,
                output_recall=recall,
            )

            assert np.all(recall[:] == 0.0)
Ejemplo n.º 15
0
    def _make_prep_result(pros,
                          is_decoys=None,
                          abundances=None,
                          ptm_locs=None,
                          in_report=None):
        """
        Builds a PrepResult from a list of proteins. Each protein can be a
        string or a list (or tuple) of strings. A plain string is a protein
        made of a single peptide; a list of strings yields one peptide per
        string, and the protein sequence is their concatenation.
        Eg. _make_prep_result([("AAXC", "XKP"), "AGGH"]) will yield 2 pros and 3 peps.

        Arguments:
            pros: list of proteins as described above.
            is_decoys, abundances, ptm_locs, in_report: optional per-protein
                sequences aligned by position with pros. Lists shorter than
                pros are padded with None (zip_longest); a missing or falsy
                entry becomes False / no "abundance" key / "" / 0
                respectively.

        Returns:
            A PrepResult built from the generated params and the pros,
            pro_seqs, peps and pep_seqs DataFrames.
        """
        # None defaults instead of shared mutable [] defaults.
        is_decoys = [] if is_decoys is None else is_decoys
        abundances = [] if abundances is None else abundances
        ptm_locs = [] if ptm_locs is None else ptm_locs
        in_report = [] if in_report is None else in_report

        n_pros = len(pros)
        names = [f"id_{i}" for i in range(n_pros)]
        # "".join works for both a plain string and a list of peptide strings
        seqstrs = ["".join(pro) for pro in pros]

        def _make_protein(name, seq, abundance, report):
            protein = dict(name=name, sequence=seq, report=report or 0)
            if abundance is not None:
                protein["abundance"] = abundance
            return protein

        proteins = [
            _make_protein(*protein_args)
            for protein_args in itertools.zip_longest(
                names, seqstrs, abundances, in_report)
        ]
        params = PrepParams(proteins=proteins)

        _pros = pd.DataFrame(
            [(name, is_decoy or False, pro_i, ptm_loc or "", report or 0)
             for pro_i, (name, is_decoy, ptm_loc, report) in enumerate(
                 itertools.zip_longest(names, is_decoys, ptm_locs, in_report))
             ],
            columns=PrepResult.pros_columns,
        )

        _pro_seqs = pd.DataFrame(
            [(pro_i, aa) for pro_i, seqstr in enumerate(seqstrs)
             for aa in seqstr],
            columns=PrepResult.pro_seqs_columns,
        )

        # Normalize pros as lists of peptide strings. A plain string must be
        # wrapped, not passed to list(), which would split it into
        # one-character peptides.
        pros = [[pro] if isinstance(pro, str) else list(pro) for pro in pros]

        # Extract peps from pros definitions as (pro_i, pep, start, stop),
        # where start/stop are offsets within the protein and stop is
        # inclusive (hence the - 1).
        peps_lens = [list(map(len, pro)) for pro in pros]
        peps = [(pro_i, pep, start, stop - 1)
                for pro_i, (pro, pep_lens) in enumerate(zip(pros, peps_lens))
                for pep, start, stop in zip(
                    pro,
                    itertools.accumulate([0] + pep_lens),
                    itertools.accumulate(pep_lens),
                )]

        _peps = pd.DataFrame(
            [(pep_i, start, stop, pro_i)
             for pep_i, (pro_i, _, start, stop) in enumerate(peps)],
            columns=PrepResult.peps_columns,
        )

        _pep_seqs = pd.DataFrame(
            [(pep_i, aa, start + offset)
             for pep_i, (_, pep, start, _) in enumerate(peps)
             for offset, aa in enumerate(pep)],
            columns=PrepResult.pep_seqs_columns,
        )

        return PrepResult(
            params=params,
            _pros=_pros,
            _pro_seqs=_pro_seqs,
            _peps=_peps,
            _pep_seqs=_pep_seqs,
        )
Ejemplo n.º 16
0
def prep(prep_params, pro_spec_df):
    """
    Given protease and decoy mode, create proteins and peptides.

    Arguments:
        prep_params: PrepParams. Fields read here: drop_duplicates,
            decoy_mode, protease, n_peps_limit, n_ptms_limit.
        pro_spec_df: DataFrame of protein specs. This function itself reads
            the "sequence" and "name" columns (for de-duplication); the
            frame is then handed to the step helpers.

    Steps:
        1. Real proteins are checked for uniqueness in seq and name
        2. The real proteins are first string-split "unwound" into seq_ dataframes
           (one row per amino acid).
        3. The decoys are added by reversing those real DFs.
        4. The proteolysis occurs by a map against proteins
        5. PTMs are added

    Returns:
        A PrepResult holding four DFs:
            * the pro data (one row per protein)
            * the pro_seq data (one row per aa) * n_pros
            * the pep data (one row per peptide)
            * the pep_seq data (one row per aa) * n_peps
    """

    if prep_params.drop_duplicates:
        pro_spec_df = pro_spec_df.drop_duplicates("sequence")
        pro_spec_df = pro_spec_df.drop_duplicates("name")

    _step_1_check_for_uniqueness(pro_spec_df)

    reals_df, real_seqs_df = _step_2_create_pros_and_pro_seqs_dfs(pro_spec_df)

    decoys_df, decoy_seqs_df = _step_3_generate_decoys(reals_df, real_seqs_df,
                                                       prep_params.decoy_mode)

    pros_df = pd.concat((reals_df, decoys_df),
                        sort=True).reset_index(drop=True)
    pros_df = pros_df.astype(dict(pro_i=int))

    pro_seqs_df = pd.concat(
        (real_seqs_df, decoy_seqs_df)).reset_index(drop=True)

    peps_df, pep_seqs_df = _step_4_proteolysis(pro_seqs_df,
                                               prep_params.protease)

    if prep_params.n_peps_limit is not None:
        # This is used for debugging to limit the number of peptides.
        # This draws randomly to hopefully pick up decoys too
        n_peps = peps_df.pep_i.nunique()
        # Sampling without replacement cannot draw more items than exist
        # (np.random.choice raises ValueError), so a limit above n_peps
        # simply keeps every peptide.
        n_keep = min(prep_params.n_peps_limit, n_peps)
        pep_iz = np.sort(np.random.choice(n_peps, n_keep, replace=False))
        pep_iz[0] = 0  # Ensure the reserved value is present
        # NOTE(review): .loc selects by index label, so this presumably
        # relies on peps_df being indexed by pep_i -- confirm.
        peps_df = peps_df.loc[pep_iz]
        pep_seqs_df = pep_seqs_df[pep_seqs_df.pep_i.isin(pep_iz)]

    if prep_params.n_ptms_limit != 0:
        # n_ptms_limit can be a non-zero value to limit the number of ptms
        # allowed per peptide, or set to 0 to skip ptm permutations even when
        # there are PTMs annotated for the proteins in protein_csv.
        ptm_peps_df, ptm_pep_seqs_df = _step_5_create_ptm_peptides(
            peps_df, pep_seqs_df, pros_df, prep_params.n_ptms_limit)
        if ptm_peps_df is not None and len(ptm_peps_df) > 0:
            peps_df = pd.concat([peps_df, ptm_peps_df])
            pep_seqs_df = pd.concat([pep_seqs_df, ptm_pep_seqs_df])
    # When n_ptms_limit is 0 the ptm permutations are skipped silently.

    return PrepResult(
        params=prep_params,
        _pros=pros_df,
        _pro_seqs=pro_seqs_df,
        _peps=peps_df,
        _pep_seqs=pep_seqs_df,
    )
Ejemplo n.º 17
0
def zest_sim():
    """
    Zest suite for sim_v1_worker.sim_v1.

    The shared fixture has the "." entry at index 0, one real protein
    split into three peptides (peps 1-3) and one decoy protein whose only
    peptide is pep 4. Each nested test runs inside its own temporary
    folder.
    """
    prep_result = PrepResult.prep_result_fixture(
        pros=[".", "ABCDEFGHI", "ABC"],
        pro_is_decoys=[False, False, True],
        peps=[".", "ABC", "DAF", "ACH", "ABC"],
        pep_pro_iz=[0, 1, 1, 1, 2],
    )

    n_samples = 8
    n_peptides = 5
    n_channels = 2
    n_cycles = 3  # mock + edman (See below)

    def it_maintains_decoys_for_train():
        # The decoy peptide (pep 4) must appear among the train labels.
        with tmp.tmp_folder(chdir=True):
            sim_params = _stub_sim_params(some_error_model, n_samples)
            sim_result = sim_v1_worker.sim_v1(sim_params, prep_result)
            assert np.any(sim_result.train_true_pep_iz == 4)

    def it_removes_decoys_for_test():
        # The decoy peptide (pep 4) must not appear among the test labels.
        with tmp.tmp_folder(chdir=True):
            sim_params = _stub_sim_params(some_error_model, n_samples)
            sim_result = sim_v1_worker.sim_v1(sim_params, prep_result)
            assert not np.any(sim_result.test_true_pep_iz == 4)

    def it_raises_if_train_and_test_identical():
        with tmp.tmp_folder(chdir=True):
            with zest.raises(in_message="are identical"):
                sim_params = _stub_sim_params(no_error_model, n_samples)
                sim_v1_worker.sim_v1(sim_params, prep_result)

    def it_drop_all_darks():
        # Uses its own fixture ("DD", "EE"); expects zero test rows, a
        # single train row and every train recall equal to 0.0.
        with tmp.tmp_folder(chdir=True):
            prep_result = PrepResult.prep_result_fixture(
                pros=[".", "DD", "EE"],
                pro_is_decoys=[False, False, False],
                peps=[".", "DD", "EE"],
                pep_pro_iz=[0, 1, 2],
            )
            sim_params = _stub_sim_params(no_error_model, n_samples)
            sim_result = sim_v1_worker.sim_v1(sim_params, prep_result)

            assert sim_result.test_dyemat.shape == (0, n_channels, n_cycles)
            assert sim_result.test_dyemat.dtype == np.uint8
            assert np.all(sim_result.test_dyemat[:] == 0)  # All dark

            assert sim_result.train_dyemat.shape == (1, n_channels, n_cycles)
            assert sim_result.train_dyemat.dtype == np.uint8
            assert np.all(sim_result.train_pep_recalls[:] == 0.0)

    def it_generates_flu_info():
        # Uses its own three-peptide fixture and checks the flu info that
        # _generate_flu_info() attaches to the sim result.
        with tmp.tmp_folder(chdir=True):
            prep_result = PrepResult.prep_result_fixture(
                pros=[".", "XAXCD", "XAXCDXX", "XCCXX"],
                pro_is_decoys=[False, False, False, False],
                peps=[".", "XAXCD", "XAXCDXX", "XCCXX"],
                pep_pro_iz=[0, 1, 2, 3],
            )
            sim_params = _stub_sim_params(some_error_model, n_samples)
            sim_result = sim_v1_worker.sim_v1(sim_params, prep_result)
            sim_result._generate_flu_info(prep_result)

            def it_computes_head_and_tail():
                flus = sim_result._flus

                # Peptides 1 and 2 are expected to have identical flu stats.
                first_two = flus[flus.pep_i.isin([1, 2])]
                assert np.all(first_two.flu_count == 2)
                assert np.all(first_two.n_head_ch_0 == 1)
                assert np.all(first_two.n_head_ch_1 == 0)
                assert np.all(first_two.n_tail_ch_0 == 0)
                assert np.all(first_two.n_tail_ch_1 == 1)

                third = flus[flus.pep_i == 3]
                assert np.all(third.flu_count == 1)

            def it_peps__flus():
                df = sim_result.peps__flus(prep_result)
                assert "flustr" in df
                assert len(df) == 4

            def it_peps__flus__unique_flus():
                df = sim_result.peps__flus__unique_flus(prep_result)
                assert np.all(df.pep_i.values == [0, 3])

            zest()

    def it_surveys():
        # Survey mode: one train sample per peptide and no test set.
        with tmp.tmp_folder(chdir=True):
            n_samples = 1
            sim_params = _stub_sim_params(some_error_model, n_samples)
            sim_params.is_survey = True
            sim_params.n_samples_train = n_samples
            sim_params.n_samples_test = None
            sim_result = sim_v1_worker.sim_v1(sim_params, prep_result)

            expected_shape = (n_peptides * n_samples, n_channels, n_cycles)
            assert sim_result.train_dyemat.shape == expected_shape
            assert sim_result.train_dyemat.dtype == np.uint8
            assert sim_result.test_dyemat is None

    zest()