def start(self):
    """
    Run the NN v2 task and save its result.

    NNV2Params is built from ``self.config.parameters`` with the source
    tagged as "NNV2Task". The sigproc and rad-filter results are loaded only
    when ``params.include_sigproc`` is truthy; the prep and sim_v2 results
    are loaded only when the matching ``self.inputs`` entry is not None.
    Anything that is not loaded is passed to ``nn_v2`` as None.
    """
    params = NNV2Params(source="NNV2Task", **self.config.parameters)

    if params.include_sigproc:
        sigproc_v2_result = SigprocV2Result.load_from_folder(
            self.inputs.sigproc, prop_list=["n_cycles", "n_channels"]
        )
        rad_filter_result = RadFilterResult.load_from_folder(
            self.inputs.rad_filter
        )
    else:
        sigproc_v2_result = None
        rad_filter_result = None

    has_prep = self.inputs.get("prep") is not None
    prep_result = (
        PrepResult.load_from_folder(self.inputs.prep) if has_prep else None
    )

    has_sim_v2 = self.inputs.get("sim_v2") is not None
    sim_v2_result = (
        SimV2Result.load_from_folder(self.inputs.sim_v2) if has_sim_v2 else None
    )

    nn_v2_result = nn_v2(
        params,
        prep_result,
        sim_v2_result,
        sigproc_v2_result,
        rad_filter_result,
        progress=self.progress,
        pipeline=self,
    )
    nn_v2_result.save()
def it_generates_flu_info():
    """
    Run sim_v1 on a four-peptide fixture, generate flu info on the result,
    then check the flu tables in nested zest tests.
    """
    with tmp.tmp_folder(chdir=True):
        prep_result = PrepResult.prep_result_fixture(
            pros=[".", "XAXCD", "XAXCDXX", "XCCXX"],
            pro_is_decoys=[False, False, False, False],
            peps=[".", "XAXCD", "XAXCDXX", "XCCXX"],
            pep_pro_iz=[0, 1, 2, 3],
        )

        sim_params = _stub_sim_params(some_error_model, n_samples)
        sim_result = sim_v1_worker.sim_v1(sim_params, prep_result)
        sim_result._generate_flu_info(prep_result)

        def it_computes_head_and_tail():
            flus_df = sim_result._flus

            # Peps 1 and 2 are expected to share the same flu statistics
            rows_pep_1_2 = flus_df[flus_df.pep_i.isin([1, 2])]
            assert np.all(rows_pep_1_2.flu_count == 2)
            assert np.all(rows_pep_1_2.n_head_ch_0 == 1)
            assert np.all(rows_pep_1_2.n_head_ch_1 == 0)
            assert np.all(rows_pep_1_2.n_tail_ch_0 == 0)
            assert np.all(rows_pep_1_2.n_tail_ch_1 == 1)

            rows_pep_3 = flus_df[flus_df.pep_i == 3]
            assert np.all(rows_pep_3.flu_count == 1)

        def it_peps__flus():
            df = sim_result.peps__flus(prep_result)
            assert "flustr" in df
            assert len(df) == 4

        def it_peps__flus__unique_flus():
            df = sim_result.peps__flus__unique_flus(prep_result)
            # Only pep 0 and pep 3 are expected in the unique-flu table
            assert np.all(df.pep_i.values == [0, 3])

        zest()
def it_returns_the_fraction_of_all_dark_samples():
    """
    Simulate pep 1 ("ABB") with the default two-channel error model and
    expect every recall value written for it to lie strictly between 0.9
    and 1.0.
    """
    with tmp.tmp_folder(chdir=True):
        n_samples = 5000
        error_model = ErrorModel.from_defaults(n_channels=2)
        sim_params = _stub_sim_params(error_model, n_samples)

        prep_result = PrepResult.prep_result_fixture(
            pros=[".", "ABCDEFGHI"],
            pro_is_decoys=[False, False],
            peps=[".", "ABB"],
            pep_pro_iz=[0, 1],
        )
        pep_seq_df = prep_result.pepseqs()
        pep_1_seq_df = pep_seq_df[pep_seq_df.pep_i == 1]

        dyemat, radmat, recall = _make_arrays(
            "test1", n_peps=2, n_samples=n_samples
        )
        sim_v1_worker._do_pep_sim(
            pep_1_seq_df,
            sim_params,
            n_samples=n_samples,
            output_dyemat=dyemat,
            output_radmat=radmat,
            output_recall=recall,
        )

        in_range = (0.9 < recall[1]) & (recall[1] < 1.0)
        assert np.all(in_range)
def it_drop_all_darks():
    """
    Run sim_v1 with the no-error model on a fixture whose peptides are
    "DD" and "EE", and check the shapes and dtypes of what comes back.

    Expected: an empty test dyemat, a single-row train dyemat, and train
    recalls that are all 0.0.

    n_samples, n_channels and n_cycles come from the enclosing scope.
    """
    with tmp.tmp_folder(chdir=True):
        prep_result = PrepResult.prep_result_fixture(
            pros=[".", "DD", "EE"],
            pro_is_decoys=[False, False, False],
            peps=[".", "DD", "EE"],
            pep_pro_iz=[0, 1, 2],
        )

        sim_params = _stub_sim_params(no_error_model, n_samples)
        sim_result = sim_v1_worker.sim_v1(sim_params, prep_result)

        assert sim_result.test_dyemat.shape == (0, n_channels, n_cycles)
        assert sim_result.test_dyemat.dtype == np.uint8
        assert np.all(sim_result.test_dyemat[:] == 0)  # All dark

        assert sim_result.train_dyemat.shape == (1, n_channels, n_cycles)
        assert sim_result.train_dyemat.dtype == np.uint8
        assert np.all(sim_result.train_pep_recalls[:] == 0.0)
def start(self):
    """
    Run the sim task: build SimParams (with include_dfs forced on) from the
    task config, load the prep result, simulate, attach flu info derived
    from the prep result, and save.
    """
    params = SimParams(include_dfs=True, **self.config.parameters)
    prep = PrepResult.load_from_folder(self.inputs.prep)

    result = sim(params, prep, progress=self.progress, pipeline=self)

    # Flu info has to be generated before the result is saved
    result._generate_flu_info(prep)
    result.save()
def start(self):
    """
    Run the RF-train v2 task: build RFTrainV2Params from the task config,
    load the prep and sim_v2 results, train via ``rf_train`` and save
    what it returns.
    """
    params = RFTrainV2Params(**self.config.parameters)

    prep = PrepResult.load_from_folder(self.inputs.prep)
    sim_v2 = SimV2Result.load_from_folder(self.inputs.sim_v2)

    result = rf_train(params, prep, sim_v2, progress=self.progress)
    result.save()
def start(self):
    """
    Run the sim_v2 task: build SimV2Params from the task config, load the
    prep result, simulate and save. When ``params.dump_debug`` is truthy
    the result's debug dump is written after saving.
    """
    sim_v2_params = SimV2Params(**self.config.parameters)
    prep = PrepResult.load_from_folder(self.inputs.prep)

    sim_v2_result = sim_v2(
        sim_v2_params, prep, progress=self.progress, pipeline=self
    )
    sim_v2_result.save()

    if not sim_v2_params.dump_debug:
        return
    sim_v2_result.dump_debug()
def start(self):
    """
    Run the test_nn task: build TestNNParams from the task config, load the
    prep and sim results, call ``test_nn`` and save what it returns.
    """
    params = TestNNParams(**self.config.parameters)

    prep = PrepResult.load_from_folder(self.inputs.prep)
    sim = SimResult.load_from_folder(self.inputs.sim)

    result = test_nn(
        params, prep, sim, progress=self.progress, pipeline=self
    )
    result.save()
def start(self):
    """
    Run the survey_nn task: build SurveyNNParams from the task config, load
    the prep and sim results, call ``survey_nn`` and save what it returns.
    """
    params = SurveyNNParams(**self.config.parameters)

    prep = PrepResult.load_from_folder(self.inputs.prep)
    sim = SimResult.load_from_folder(self.inputs.sim)

    result = survey_nn(
        params, prep, sim, progress=self.progress, pipeline=self
    )
    result.save()
def zest_poi_vs_all():
    """
    Check poi_vs_all() on a fixture where only pep 2 is flagged in is_pois:
    labels for pep 2 are expected to survive and all others to become 0.
    """
    prep_result = PrepResult.prep_result_fixture(
        pros=[".", "A", "B", "C"],
        pro_is_decoys=[0, 0, 0, 0],
        peps=[".", "A", "B", "C"],
        pep_pro_iz=[0, 1, 2, 3],
        is_pois=[0, 0, 1, 0],
    )

    true_pep_iz = np.array([1, 1, 2, 2, 3, 3])
    expected = [0, 0, 2, 2, 0, 0]

    remapped = poi_vs_all(true_pep_iz, prep_result)
    assert remapped.tolist() == expected

    zest()
def _before(): nonlocal result, prep_result sim_params = _make_sim_params(["AB", "CD"], n_edmans=3) prep_result = PrepResult.prep_result_fixture( pros=[".", "ABCDAADD", "AXCABC"], pro_is_decoys=[False, False, False], peps=[".", "ABCD", "AADD", "AXCABC"], pep_pro_iz=[0, 1, 1, 2], ) prep_result.params = PrepParams(proteins=[ dict(name="id_0", sequence="."), dict(name="id_1", sequence="ABCDAADD"), dict(name="id_2", sequence="AXCABC"), ]) result = SimV1Result(params=sim_params) result._generate_flu_info(prep_result)
def start(self):
    """
    Run the test_rf task: build TestRFParams from the task config, load the
    prep, sim and train_rf results, call ``test_rf`` and save what it
    returns.
    """
    params = TestRFParams(**self.config.parameters)

    prep = PrepResult.load_from_folder(self.inputs.prep)
    sim = SimResult.load_from_folder(self.inputs.sim)
    train_rf = TrainRFResult.load_from_folder(self.inputs.train_rf)

    result = test_rf(
        params, prep, sim, train_rf, progress=self.progress, pipeline=self
    )
    result.save()
def it_returns_no_all_dark_samples_on_valid_peps():
    """
    Simulate pep 1 ("AAA") and expect that none of its dyemat rows is zero
    in every position. ``sim_params`` comes from the enclosing scope.
    """
    with tmp.tmp_folder(chdir=True):
        prep_result = PrepResult.prep_result_fixture(
            pros=[".", "ABCDEFGHI"],
            pro_is_decoys=[False, False],
            peps=[".", "AAA"],
            pep_pro_iz=[0, 1],
        )
        pep_seq_df = prep_result.pepseqs()
        pep_1_seq_df = pep_seq_df[pep_seq_df.pep_i == 1]

        n_samples = 1000
        dyemat, radmat, recall = _make_arrays(
            "test1", n_peps=2, n_samples=n_samples
        )

        sim_v1_worker._do_pep_sim(
            pep_1_seq_df,
            sim_params,
            n_samples=n_samples,
            output_dyemat=dyemat,
            output_radmat=radmat,
            output_recall=recall,
        )

        # One flag per row of dyemat[1]: True when the whole row is zero
        row_is_all_dark = np.all(dyemat[1] == 0, axis=(1, 2))
        assert not np.any(row_is_all_dark)
def it_gives_up_on_hard_peptides_and_returns_none():
    """
    Simulate pep 1 ("DDD") and expect every entry of the recall output to
    remain 0.0. ``sim_params`` comes from the enclosing scope.
    """
    with tmp.tmp_folder(chdir=True):
        prep_result = PrepResult.prep_result_fixture(
            pros=[".", "ABCDEFGHI"],
            pro_is_decoys=[False, False],
            peps=[".", "DDD"],
            pep_pro_iz=[0, 1],
        )
        pep_seq_df = prep_result.pepseqs()
        pep_1_seq_df = pep_seq_df[pep_seq_df.pep_i == 1]

        n_samples = 1000
        dyemat, radmat, recall = _make_arrays(
            "test1", n_peps=2, n_samples=n_samples
        )

        sim_v1_worker._do_pep_sim(
            pep_1_seq_df,
            sim_params,
            n_samples=n_samples,
            output_dyemat=dyemat,
            output_radmat=radmat,
            output_recall=recall,
        )

        recall_is_zero = recall[:] == 0.0
        assert np.all(recall_is_zero)
def _make_prep_result(pros, is_decoys=None, abundances=None, ptm_locs=None, in_report=None):
    """
    Builds a PrepResult from a list of proteins.

    Each protein can be a string or a list (or tuple) of strings. A plain
    string is one protein with a single peptide spanning all of it. A
    list/tuple of strings is one protein whose sequence is the strings
    joined together, with one peptide per string.

    Eg. _make_prep_result([("AAXC", "XKP"), "AGGH"]) will yield 2 pros and 3 peps.

    Arguments:
        pros: list of proteins, as described above.
        is_decoys: optional per-protein decoy flags (missing -> False).
        abundances: optional per-protein abundances; only written into the
            protein params when not None.
        ptm_locs: optional per-protein ptm_locs strings (missing -> "").
        in_report: optional per-protein report flags (missing -> 0).

    The optional lists are matched to pros by position; lists shorter than
    pros are padded with None, which then takes the default shown above.

    Returns:
        PrepResult with params, _pros, _pro_seqs, _peps and _pep_seqs set.
        Peptide start/stop are offsets into the protein, stop inclusive.
    """
    # None (rather than a shared mutable []) marks "not supplied"
    is_decoys = () if is_decoys is None else is_decoys
    abundances = () if abundances is None else abundances
    ptm_locs = () if ptm_locs is None else ptm_locs
    in_report = () if in_report is None else in_report

    n_pros = len(pros)
    names = [f"id_{i}" for i in range(n_pros)]
    seqstrs = ["".join(pro) for pro in pros]

    def _make_protein(name, seq, abundance, report):
        protein = dict(name=name, sequence=seq, report=report or 0)
        if abundance is not None:
            protein["abundance"] = abundance
        return protein

    proteins = [
        _make_protein(*protein_fields)
        for protein_fields in itertools.zip_longest(
            names, seqstrs, abundances, in_report
        )
    ]
    params = PrepParams(proteins=proteins)

    _pros = pd.DataFrame(
        [
            (name, is_decoy or False, pro_i, pro_ptm_locs or "", pro_in_report or 0)
            for pro_i, (name, is_decoy, pro_ptm_locs, pro_in_report) in enumerate(
                itertools.zip_longest(names, is_decoys, ptm_locs, in_report)
            )
        ],
        columns=PrepResult.pros_columns,
    )

    _pro_seqs = pd.DataFrame(
        [(pro_i, aa) for pro_i, seqstr in enumerate(seqstrs) for aa in seqstr],
        columns=PrepResult.pro_seqs_columns,
    )

    # Normalize pros as lists of peptide strings. A bare string must be
    # wrapped, not passed to list(), which would split it into one
    # peptide per character.
    pros = [[pro] if isinstance(pro, str) else list(pro) for pro in pros]

    # Extract peps from pros definitions: each pep's start is the running
    # sum of the preceding pep lengths, and stop is stored inclusive.
    peps_lens = [[len(pep) for pep in pro] for pro in pros]
    peps = [
        (pro_i, pep, start, stop - 1)
        for pro_i, (pro, pep_lens) in enumerate(zip(pros, peps_lens))
        for pep, start, stop in zip(
            pro,
            itertools.accumulate([0] + pep_lens),
            itertools.accumulate(pep_lens),
        )
    ]

    _peps = pd.DataFrame(
        [
            (pep_i, start, stop, pro_i)
            for pep_i, (pro_i, _, start, stop) in enumerate(peps)
        ],
        columns=PrepResult.peps_columns,
    )

    _pep_seqs = pd.DataFrame(
        [
            (pep_i, aa, start + offset)
            for pep_i, (_, pep, start, _) in enumerate(peps)
            for offset, aa in enumerate(pep)
        ],
        columns=PrepResult.pep_seqs_columns,
    )

    return PrepResult(
        params=params,
        _pros=_pros,
        _pro_seqs=_pro_seqs,
        _peps=_peps,
        _pep_seqs=_pep_seqs,
    )
def prep(prep_params, pro_spec_df):
    """
    Given protease and decoy mode, create proteins and peptides.

    Arguments:
        prep_params: PrepParams
        pro_spec_df: Columns: sequence (str), name (str), ptm_locs (str)
            "sequence" and "name" are the columns de-duplicated on when
            prep_params.drop_duplicates is set.
            NOTE(review): this docstring used to call the identifier
            column "id"; confirm against the step helpers.

    Steps:
        1. Real proteins are checked for uniqueness in seq and name
        2. The real proteins are first string-split "unwound" into seq_
           dataframes (one row per amino acid).
        3. The decoys are added by reversing those real DFs.
        4. The proteolysis occurs by a map against proteins
        5. PTMs are added

    Returns:
        PrepResult holding prep_params and four DFs:
            * the pro data (one row per protein)
            * the pro_seq data (one row per aa)
            * the pep data (one row per peptide)
            * the pep_seq data (one row per aa)
    """
    if prep_params.drop_duplicates:
        pro_spec_df = pro_spec_df.drop_duplicates("sequence")
        pro_spec_df = pro_spec_df.drop_duplicates("name")

    _step_1_check_for_uniqueness(pro_spec_df)

    reals_df, real_seqs_df = _step_2_create_pros_and_pro_seqs_dfs(pro_spec_df)

    decoys_df, decoy_seqs_df = _step_3_generate_decoys(
        reals_df, real_seqs_df, prep_params.decoy_mode
    )

    pros_df = pd.concat((reals_df, decoys_df), sort=True).reset_index(drop=True)
    pros_df = pros_df.astype(dict(pro_i=int))

    pro_seqs_df = pd.concat((real_seqs_df, decoy_seqs_df)).reset_index(drop=True)

    peps_df, pep_seqs_df = _step_4_proteolysis(pro_seqs_df, prep_params.protease)

    if prep_params.n_peps_limit is not None:
        # This is used for debugging to limit the number of peptides.
        # This draws randomly to hopefully pick up decoys too
        n_peps = peps_df.pep_i.nunique()

        # A limit above the number of available peptides keeps them all;
        # without the clamp np.random.choice(replace=False) raises ValueError.
        n_keep = min(prep_params.n_peps_limit, n_peps)
        pep_iz = np.sort(np.random.choice(n_peps, n_keep, replace=False))
        if len(pep_iz) > 0:
            pep_iz[0] = 0  # Ensure the reserved value is present
        peps_df = peps_df.loc[pep_iz]
        pep_seqs_df = pep_seqs_df[pep_seqs_df.pep_i.isin(pep_iz)]

    if prep_params.n_ptms_limit != 0:
        # n_ptms_limit can be a non-zero value to limit the number of ptms
        # allowed per peptide, or set to 0 to skip ptm permutations even when
        # there are PTMs annotated for the proteins in protein_csv.
        ptm_peps_df, ptm_pep_seqs_df = _step_5_create_ptm_peptides(
            peps_df, pep_seqs_df, pros_df, prep_params.n_ptms_limit
        )
        if ptm_peps_df is not None and len(ptm_peps_df) > 0:
            peps_df = pd.concat([peps_df, ptm_peps_df])
            pep_seqs_df = pd.concat([pep_seqs_df, ptm_pep_seqs_df])

    return PrepResult(
        params=prep_params,
        _pros=pros_df,
        _pro_seqs=pro_seqs_df,
        _peps=peps_df,
        _pep_seqs=pep_seqs_df,
    )
def zest_sim():
    """
    Zest suite for sim_v1_worker.sim_v1().

    The shared fixture has five peptides; pep 4 belongs to pro 2, which is
    the only protein flagged as a decoy. Each nested test runs in its own
    temporary folder.
    """
    prep_result = PrepResult.prep_result_fixture(
        pros=[".", "ABCDEFGHI", "ABC"],
        pro_is_decoys=[False, False, True],
        peps=[".", "ABC", "DAF", "ACH", "ABC"],
        pep_pro_iz=[0, 1, 1, 1, 2],
    )

    n_samples = 8
    n_peptides = 5
    n_channels = 2
    n_cycles = 3  # mock + edman (See below)

    def it_maintains_decoys_for_train():
        with tmp.tmp_folder(chdir=True):
            sim_params = _stub_sim_params(some_error_model, n_samples)
            sim_result = sim_v1_worker.sim_v1(sim_params, prep_result)
            # pep 4 is the decoy peptide
            assert np.any(sim_result.train_true_pep_iz == 4)

    def it_removes_decoys_for_test():
        with tmp.tmp_folder(chdir=True):
            sim_params = _stub_sim_params(some_error_model, n_samples)
            sim_result = sim_v1_worker.sim_v1(sim_params, prep_result)
            assert not np.any(sim_result.test_true_pep_iz == 4)

    def it_raises_if_train_and_test_identical():
        with tmp.tmp_folder(chdir=True):
            with zest.raises(in_message="are identical"):
                sim_params = _stub_sim_params(no_error_model, n_samples)
                sim_v1_worker.sim_v1(sim_params, prep_result)

    def it_drop_all_darks():
        with tmp.tmp_folder(chdir=True):
            # Local fixture; deliberately shadows the suite-level prep_result
            prep_result = PrepResult.prep_result_fixture(
                pros=[".", "DD", "EE"],
                pro_is_decoys=[False, False, False],
                peps=[".", "DD", "EE"],
                pep_pro_iz=[0, 1, 2],
            )

            sim_params = _stub_sim_params(no_error_model, n_samples)
            sim_result = sim_v1_worker.sim_v1(sim_params, prep_result)

            assert sim_result.test_dyemat.shape == (0, n_channels, n_cycles)
            assert sim_result.test_dyemat.dtype == np.uint8
            assert np.all(sim_result.test_dyemat[:] == 0)  # All dark

            assert sim_result.train_dyemat.shape == (1, n_channels, n_cycles)
            assert sim_result.train_dyemat.dtype == np.uint8
            assert np.all(sim_result.train_pep_recalls[:] == 0.0)

    def it_generates_flu_info():
        with tmp.tmp_folder(chdir=True):
            # Local fixture; deliberately shadows the suite-level prep_result
            prep_result = PrepResult.prep_result_fixture(
                pros=[".", "XAXCD", "XAXCDXX", "XCCXX"],
                pro_is_decoys=[False, False, False, False],
                peps=[".", "XAXCD", "XAXCDXX", "XCCXX"],
                pep_pro_iz=[0, 1, 2, 3],
            )

            sim_params = _stub_sim_params(some_error_model, n_samples)
            sim_result = sim_v1_worker.sim_v1(sim_params, prep_result)
            sim_result._generate_flu_info(prep_result)

            def it_computes_head_and_tail():
                _flus = sim_result._flus
                assert np.all(_flus[_flus.pep_i.isin([1, 2])].flu_count == 2)
                assert np.all(_flus[_flus.pep_i.isin([1, 2])].n_head_ch_0 == 1)
                assert np.all(_flus[_flus.pep_i.isin([1, 2])].n_head_ch_1 == 0)
                assert np.all(_flus[_flus.pep_i.isin([1, 2])].n_tail_ch_0 == 0)
                assert np.all(_flus[_flus.pep_i.isin([1, 2])].n_tail_ch_1 == 1)
                assert np.all(_flus[_flus.pep_i == 3].flu_count == 1)

            def it_peps__flus():
                df = sim_result.peps__flus(prep_result)
                assert "flustr" in df
                assert len(df) == 4

            def it_peps__flus__unique_flus():
                df = sim_result.peps__flus__unique_flus(prep_result)
                assert np.all(df.pep_i.values == [0, 3])

            zest()

    def it_surveys():
        with tmp.tmp_folder(chdir=True):
            n_samples = 1
            sim_params = _stub_sim_params(some_error_model, n_samples)
            sim_params.is_survey = True
            sim_params.n_samples_train = n_samples
            sim_params.n_samples_test = None
            sim_result = sim_v1_worker.sim_v1(sim_params, prep_result)
            assert sim_result.train_dyemat.shape == (
                n_peptides * n_samples,
                n_channels,
                n_cycles,
            )
            assert sim_result.train_dyemat.dtype == np.uint8
            assert sim_result.test_dyemat is None

    zest()