def it_groups():
    df = pd.DataFrame(dict(a=[1, 1, 2, 2, 2], b=[1, 2, 3, 4, 5]))
    res = zap.df_groups(test9, df.groupby("a"))
    a = listi(res, 0)
    ap1 = listi(res, 1)
    assert a == [1, 2]
    assert ap1 == [2, 3]
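# A serial, pure-Python sketch of the contract the test above relies on. This is a
# hypothetical illustration, not the real zap implementation: the assumption is that
# zap.df_groups calls the worker once per groupby group, passing the group's DataFrame
# plus any keyword arguments, and collects the per-group return values so that
# listi(res, i) picks out the i-th element of each returned tuple.
def _df_groups_serial_sketch(worker, grouped, **kwargs):
    return [worker(group_df, **kwargs) for _, group_df in grouped]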
def peps_above_thresholds(self, precision=0.0, recall=0.0):
    with zap.Context(mode="thread"):
        df = zap.df_groups(
            _do_peps_above_thresholds,
            self.pr_curve_by_pep().groupby("pep_i"),
            precision=precision,
            recall=recall,
        )
    df = df.reset_index().sort_index().rename(columns={0: "passes"})
    return np.argwhere(df.passes.values).flatten()
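# A hypothetical sketch of the worker shape peps_above_thresholds() assumes: each call
# receives one pep's slice of the PR curve plus the precision/recall keyword arguments
# and returns a single truthy/falsey value, which becomes the "passes" column after the
# rename above. The PR-curve column names ("prec", "recall") used here are assumptions
# for illustration only; the real _do_peps_above_thresholds lives elsewhere.
def _do_peps_above_thresholds_sketch(pep_pr_df, precision=0.0, recall=0.0):
    # True if any point on this pep's PR curve meets both thresholds
    return bool(((pep_pr_df.prec >= precision) & (pep_pr_df.recall >= recall)).any())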
def _run_sim(sim_params, pep_seqs_df, name, n_peps, n_samples, progress):
    if sim_params.get("random_seed") is not None:
        # Increment so that train and test will be different
        sim_params.random_seed += 1

    np.random.seed(sim_params.random_seed)

    dyemat = ArrayResult(
        f"{name}_dyemat",
        shape=(n_peps, n_samples, sim_params.n_channels, sim_params.n_cycles),
        dtype=DyeType,
        mode="w+",
    )
    radmat = ArrayResult(
        f"{name}_radmat",
        shape=(n_peps, n_samples, sim_params.n_channels, sim_params.n_cycles),
        dtype=RadType,
        mode="w+",
    )
    recall = ArrayResult(
        f"{name}_recall",
        shape=(n_peps,),
        dtype=RecallType,
        mode="w+",
    )

    flus__remainders = zap.df_groups(
        _do_pep_sim,
        pep_seqs_df.groupby("pep_i"),
        sim_params=sim_params,
        n_samples=n_samples,
        output_dyemat=dyemat,
        output_radmat=radmat,
        output_recall=recall,
        _progress=progress,
        _trap_exceptions=False,
        _process_mode=True,
    )

    flus = np.array(utils.listi(flus__remainders, 0))
    flu_remainders = np.array(utils.listi(flus__remainders, 1))

    return dyemat, radmat, recall, flus, flu_remainders
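# A hypothetical sketch of the worker interface _run_sim() relies on: each call gets one
# peptide's pep_seqs group plus the shared keyword arguments, writes that peptide's
# samples into the pre-allocated ArrayResult outputs at its own pep_i row, and returns a
# (flu, flu_remainder) pair that _run_sim collects with utils.listi(). The body below is
# illustrative only; names, indexing, and the simulation itself are assumptions.
def _do_pep_sim_sketch(pep_seqs_group, sim_params, n_samples,
                       output_dyemat, output_radmat, output_recall):
    pep_i = pep_seqs_group.pep_i.iloc[0]
    # ... simulate n_samples dye tracks and radiometry rows for this peptide,
    #     then write them into output_dyemat[pep_i], output_radmat[pep_i],
    #     and the per-pep survival fraction into output_recall[pep_i] ...
    flu, flu_remainder = None, None  # placeholders for the per-pep flu summary
    return flu, flu_remainder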
def _step_5_create_ptm_peptides(peps_df, pep_seqs_df, pros_df, n_ptms_limit):
    """
    Create new peps and pep_seqs by applying PTMs based on the pro_ptm_locs
    information in pros_df.
    """
    # 1. Get the subset of proteins + peps that have PTMs by filtering the proteins
    #    with ptms and joining to peps and pep_seqs.
    #
    #    This None vs "" is messy.
    pros_with_ptms = pros_df[pros_df.pro_ptm_locs != ""]
    df = (
        pros_with_ptms.set_index("pro_i")
        .join(peps_df.set_index("pro_i"))
        .reset_index()
    )
    df = df.set_index("pep_i").join(pep_seqs_df.set_index("pep_i")).reset_index()
    if len(df) == 0:
        return None, None

    # 2. For each peptide, apply _do_ptm_permutations, which results in a list of new
    #    dataframes of the form joined above; new_pep_infos is a list of these lists.
    #
    # new_pep_infos = parallel_groupby_apply(
    #     df.groupby("pep_i"),
    #     _do_ptm_permutations,
    #     n_ptms_limit=n_ptms_limit,
    #     _trap_exceptions=False,
    #     _process_mode=True,
    # )
    new_pep_infos = zap.df_groups(
        _do_ptm_permutations,
        df.groupby("pep_i"),
        n_ptms_limit=n_ptms_limit,
        _trap_exceptions=False,
        _process_mode=True,
    )

    # 3. Create new peps and pep_seqs from the list of dfs returned in (2).
    #
    #    peps_columns = ["pep_i", "pep_start", "pep_stop", "pro_i"]
    #    pep_seqs_columns = ["pep_i", "aa", "pep_offset_in_pro"]
    new_peps = []
    new_pep_seqs = []
    pep_iz = peps_df.pep_i.unique()
    next_pep_i = peps_df.pep_i.max() + 1
    for new_peps_info in new_pep_infos:
        for pep_info in new_peps_info:
            # Note: we only want one pep entry, and pep_info contains enough rows to hold
            # the whole sequence for the peptide in the aa column, so drop_duplicates().
            pep = pep_info[PrepResult.peps_columns].drop_duplicates()
            pep_seq = pep_info[PrepResult.pep_seqs_columns].copy()  # avoid SettingWithCopyWarning
            pep.pep_i = next_pep_i
            pep_seq.pep_i = next_pep_i
            next_pep_i += 1
            new_peps += [pep]
            new_pep_seqs += [pep_seq]

    new_peps_df = pd.concat(new_peps)
    new_pep_seqs_df = pd.concat(new_pep_seqs)

    return new_peps_df, new_pep_seqs_df
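# A usage sketch (hypothetical) of how the frames returned by _step_5_create_ptm_peptides()
# would typically be folded back into the prep results: the new peps/pep_seqs, whose pep_i
# values start after the existing maximum, get appended to the originals. The surrounding
# variable names and the n_ptms_limit value are assumptions for illustration.
new_peps_df, new_pep_seqs_df = _step_5_create_ptm_peptides(
    peps_df, pep_seqs_df, pros_df, n_ptms_limit=5
)
if new_peps_df is not None:
    peps_df = pd.concat([peps_df, new_peps_df], ignore_index=True)
    pep_seqs_df = pd.concat([pep_seqs_df, new_pep_seqs_df], ignore_index=True)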