def it_shows_help():
    """Check that ``help()`` emits one entry per schema node, outermost first."""
    nested_schema = s(
        s.is_kws(
            a=s.is_dict(
                help="Help for a",
                elems=dict(
                    b=s.is_int(help="Help for b"),
                    c=s.is_kws(d=s.is_int(help="Help for d")),
                ),
            )
        )
    )
    nested_schema.help()

    # Reduce every recorded call to a single {key: help} pair so that the
    # comparison below only looks at what was reported and in which order.
    reported = []
    for recorded_call in m_print_help.normalized_calls():
        reported.append({recorded_call["key"]: recorded_call["help"]})

    expected = [
        {"root": None},
        {"a": "Help for a"},
        {"b": "Help for b"},
        {"c": None},
        {"d": "Help for d"},
    ]
    assert reported == expected
class PrepParams(Params):
    """
    Defaults and validation schema for prep parameters.

    Every key in ``defaults`` has a matching entry in ``schema``. ``proteins``
    appears only in the schema, so no default is supplied for it here.
    """

    # Values used when the caller does not supply the key.
    defaults = Munch(
        protease=None,
        decoy_mode=None,
        include_misses=0,
        n_peps_limit=None,
        drop_duplicates=False,
        n_ptms_limit=None,
    )

    # NOTE(review): is_kws_r presumably marks every listed key as required
    # unless overridden -- confirm against the schema module.
    schema = s(
        s.is_kws_r(
            protease=s.is_list(noneable=True, elems=s.is_str()),
            decoy_mode=s.is_str(noneable=True),
            include_misses=s.is_int(),
            n_peps_limit=s.is_int(noneable=True),
            drop_duplicates=s.is_bool(),
            n_ptms_limit=s.is_int(noneable=True),
            # Each protein entry is a keyword dict; name and sequence are
            # explicitly required, the remaining fields may be None.
            proteins=s.is_list(
                s.is_kws(
                    name=s.is_str(required=True),
                    sequence=s.is_str(required=True),
                    ptm_locs=s.is_str(noneable=True),
                    report=s.is_int(noneable=True),
                    abundance=s.is_number(noneable=True),
                )),
        ))
def it_fetches_user_data():
    """Check that ``top_level_fields()`` exposes the ``userdata`` of a field."""
    dict_schema = s(
        s.is_dict(
            help="Help for a",
            elems=dict(
                b=s.is_int(help="Help for b", userdata="userdata_1"),
                c=s.is_kws(d=s.is_int(help="Help for d")),
            ),
        )
    )

    first_field = dict_schema.top_level_fields()[0]

    # Position 0 of a field record is compared with the field name and
    # position 3 with the userdata attached to that field.
    assert first_field[0] == "b"
    assert first_field[3] == "userdata_1"
class Gen(BaseGenerator):
    # Minimal generator: its schema declares a single keyword, ``label_set``,
    # which must be a list of strings.
    schema = s(s.is_kws(label_set=s.is_list(s.is_str())))
class SimParams(Params):
    """
    Simulation parameters: an ErrorModel plus the parameters for the sim.
    """

    defaults = Munch(
        n_pres=1,
        n_mocks=0,
        n_edmans=1,
        n_samples_train=5_000,
        n_samples_test=1_000,
        dyes=[],
        labels=[],
        random_seed=None,
        train_n_sample_multiplier=None,  # This does not appear to be used anywhere. tfb
        allow_train_test_to_be_identical=False,
        enable_ptm_labels=False,
        is_survey=False,
    )

    schema = s(
        s.is_kws_r(
            is_survey=s.is_bool(),
            error_model=s.is_kws(**ErrorModel.schema.schema()),
            n_pres=s.is_int(bounds=(0, None)),
            n_mocks=s.is_int(bounds=(0, None)),
            n_edmans=s.is_int(bounds=(0, None)),
            n_samples_train=s.is_int(bounds=(1, None)),
            n_samples_test=s.is_int(bounds=(1, None)),
            dyes=s.is_list(
                elems=s.is_kws_r(dye_name=s.is_str(), channel_name=s.is_str())
            ),
            labels=s.is_list(
                elems=s.is_kws_r(
                    amino_acid=s.is_str(),
                    dye_name=s.is_str(),
                    label_name=s.is_str(),
                    ptm_only=s.is_bool(required=False, noneable=True),
                )
            ),
            random_seed=s.is_int(required=False, noneable=True),
            allow_train_test_to_be_identical=s.is_bool(required=False, noneable=True),
            enable_ptm_labels=s.is_bool(required=False, noneable=True),
        )
    )

    def copy(self):
        """
        Return a deep copy of these params with error_model rebuilt as an ErrorModel.

        NOTE(review): the attributes created by _build_join_dfs are removed
        from *self* before copying, so the source object loses them too and
        the copy does not carry them either -- confirm callers expect this.
        """
        # REMOVE everything that _build_join_dfs put in
        for derived_attr in (
            "df",
            "by_channel",
            "ch_by_aa",
            "channel_i_to_gain",
            "channel_i_to_vpd",
        ):
            utils.safe_del(self, derived_attr)

        dst = utils.munch_deep_copy(self, klass_set={SimParams})
        dst.error_model = ErrorModel(**dst.error_model)
        assert isinstance(dst, SimParams)
        return dst

    def __init__(self, include_dfs=True, **kwargs):
        """
        Build the params, defaulting error_model to a fresh ErrorModel().

        Arguments:
            include_dfs: when true, also build the joined dataframe and the
                per-channel lookups (see _build_join_dfs).
            kwargs: forwarded to the base class constructor.
        """
        kwargs["error_model"] = kwargs.pop("error_model", ErrorModel())
        super().__init__(**kwargs)
        if include_dfs:
            self._build_join_dfs()

    def validate(self):
        """
        Run the base validation, then check dyes and labels for consistency:
        no duplicate dye names, no duplicate label amino acids, and every
        label must reference a dye name that exists in the dye list.
        """
        super().validate()

        # A set gives both the distinct count and O(1) membership tests below.
        unique_dye_names = {dye.dye_name for dye in self.dyes}

        # No duplicate dye names
        self._validate(
            len(unique_dye_names) == len(self.dyes),
            "The dye list contains a duplicate",
        )

        # No duplicate labels
        unique_amino_acids = set(utils.listi(self.labels, "amino_acid"))
        self._validate(
            len(unique_amino_acids) == len(self.labels),
            "There is a duplicate label",
        )

        # All labels have a legit dye name
        for label in self.labels:
            self._validate(
                label.dye_name in unique_dye_names,
                f"Label {label.label_name} does not have a valid matching dye_name",
            )

    @property
    def n_cycles(self):
        """Total cycle count: pres + mocks + edmans."""
        return self.n_pres + self.n_mocks + self.n_edmans

    def channels(self):
        """Return the distinct channel names of the dyes, sorted."""
        return sorted(set(utils.listi(self.dyes, "channel_name")))

    def channel_i_by_name(self):
        """Return a dict mapping each channel name to its index in channels()."""
        return {
            channel_name: channel_i
            for channel_i, channel_name in enumerate(self.channels())
        }

    @property
    def n_channels(self):
        """Number of distinct channels."""
        return len(self.channel_i_by_name())

    @property
    def n_channels_and_cycles(self):
        """Tuple of (n_channels, n_cycles)."""
        return self.n_channels, self.n_cycles

    def _build_join_dfs(self):
        """
        The error model contains information about the dyes and labels and other terms.
        Those error model parameters are wired together by names which are useful
        for reconciling calibrations.

        But here, these "by name" parameters are all put into a dataframe so that
        they can be indexed by integers.

        Sets self.df, self.by_channel, self.ch_by_aa, self.channel_i_to_gain
        and self.channel_i_to_vpd.
        """
        sim_dyes_df = pd.DataFrame(self.dyes)
        assert len(sim_dyes_df) > 0

        sim_labels_df = pd.DataFrame(self.labels)
        assert len(sim_labels_df) > 0

        error_model_dyes_df = pd.DataFrame(self.error_model.dyes)
        assert len(error_model_dyes_df) > 0

        error_model_labels_df = pd.DataFrame(self.error_model.labels)
        assert len(error_model_labels_df) > 0

        # The else branch is only reachable when asserts are disabled.
        if len(sim_dyes_df) > 0:
            # ch_i is the position of each channel name in order of first
            # appearance in the dye list.
            # NOTE(review): channels() sorts the names instead; the two
            # orderings agree only if the dyes are already listed in sorted
            # channel order -- confirm.
            channel_df = (
                sim_dyes_df[["channel_name"]]
                .drop_duplicates()
                .reset_index(drop=True)
                .rename_axis("ch_i")
                .reset_index()
            )

            label_df = pd.merge(
                left=sim_labels_df, right=error_model_labels_df, on="label_name"
            )

            dye_df = pd.merge(
                left=sim_dyes_df, right=error_model_dyes_df, on="dye_name"
            )
            dye_df = pd.merge(left=dye_df, right=channel_df, on="channel_name")

            self.df = (
                pd.merge(left=label_df, right=dye_df, on="dye_name")
                .drop_duplicates()
                .reset_index(drop=True)
            )
        else:
            self.df = pd.DataFrame()

        # Every row of a channel must agree on these per-channel parameters.
        assert np.all(self.df.groupby("ch_i").p_bleach_per_cycle.nunique() == 1)
        assert np.all(self.df.groupby("ch_i").beta.nunique() == 1)
        assert np.all(self.df.groupby("ch_i").sigma.nunique() == 1)

        # Look up the first row of each channel once instead of re-filtering
        # the dataframe for every field.
        by_channel = []
        for ch in range(self.n_channels):
            first_row = self.df[self.df.ch_i == ch].iloc[0]
            by_channel.append(
                Munch(
                    p_bleach_per_cycle=first_row.p_bleach_per_cycle,
                    beta=first_row.beta,
                    sigma=first_row.sigma,
                    gain=first_row.gain,
                    vpd=first_row.vpd,
                )
            )
        self.by_channel = by_channel

        self.ch_by_aa = {row.amino_acid: row.ch_i for row in self.df.itertuples()}

        # These two need to be lists (not ndarray) because they have to be duplicated
        self.channel_i_to_gain = [channel.gain for channel in self.by_channel]
        self.channel_i_to_vpd = [channel.vpd for channel in self.by_channel]

    def to_label_list(self):
        """Summarize labels like: ["DE", "C"]"""
        return [
            "".join(
                label.amino_acid
                for label in self.labels
                if label.dye_name == dye.dye_name
            )
            for dye in self.dyes
        ]

    def to_label_str(self):
        """Summarize labels like: DE,C"""
        return ",".join(self.to_label_list())

    @classmethod
    def construct_from_aa_list(cls, aa_list, **kwargs):
        """
        This is a helper to generate channel when you have a list of aas.
        For example, two channels where ch0 is D&E and ch1 is Y.
        ["DE", "Y"].

        If you pass in an error model, it needs to match channels and labels.
        """
        check.list_or_tuple_t(aa_list, str)

        # Only letters plus the bracket characters are accepted.
        allowed_aa_mods = ["[", "]"]
        assert all(
            (aa.isalpha() or aa in allowed_aa_mods) for aas in aa_list for aa in aas
        )

        dyes = [
            Munch(dye_name=f"dye_{ch}", channel_name=f"ch_{ch}")
            for ch, _ in enumerate(aa_list)
        ]

        # Note the extra for loop because "DE" needs to be split into "D" & "E"
        # which is done by aa_str_to_list() - which also handles PTMs like S[p]
        labels = [
            Munch(
                amino_acid=aa,
                dye_name=f"dye_{ch}",
                label_name=f"label_{ch}",
                ptm_only=False,
            )
            for ch, aas in enumerate(aa_list)
            for aa in aa_str_to_list(aas)
        ]

        return cls(dyes=dyes, labels=labels, **kwargs)
class PrepParams(Params):
    """
    Defaults, validation schema and abundance checks for prep parameters.
    """

    PHOTOBLEACHING_PSEUDO_AA = "X"

    # When False, validate() rejects a None or NaN abundance on any protein
    # as soon as at least one protein carries a real abundance value.
    ALLOW_NONES_AND_NANS_IN_ABUNDANCE = False

    NORMALIZE_ABUNDANCE = False  # Abundance is normalized in gen

    defaults = Munch(
        protease=None,
        decoy_mode=None,
        include_misses=0,
        n_peps_limit=None,
        drop_duplicates=False,
        n_ptms_limit=None,
        is_photobleaching_run=False,
        photobleaching_n_cycles=None,
        photobleaching_run_n_dye_count=None,
    )

    schema = s(
        s.is_kws_r(
            protease=s.is_list(noneable=True, elems=s.is_str()),
            decoy_mode=s.is_str(noneable=True),
            include_misses=s.is_int(),
            n_peps_limit=s.is_int(noneable=True),
            drop_duplicates=s.is_bool(),
            n_ptms_limit=s.is_int(noneable=True),
            proteins=s.is_list(
                s.is_kws(
                    name=s.is_str(required=True),
                    sequence=s.is_str(required=True),
                    ptm_locs=s.is_str(noneable=True),
                    is_poi=s.is_int(noneable=True),
                    abundance=s.is_number(noneable=True),
                )
            ),
            is_photobleaching_run=s.is_bool(),
            photobleaching_n_cycles=s.is_int(noneable=True),
            photobleaching_run_n_dye_count=s.is_int(noneable=True),
        )
    )

    def validate(self):
        """
        Run the schema validation, then validate (and optionally normalize)
        the protein abundances.

        If no protein carries a usable abundance, every abundance is set to 1.
        Otherwise each protein is checked against the abundance criteria and,
        when NORMALIZE_ABUNDANCE is set, abundances are divided by the
        smallest strictly positive abundance.

        Raises:
            SchemaValidationFailed: if a protein fails an abundance criterion.
        """
        super().validate()

        def is_missing(abundance):
            # None and NaN both mean "no value was supplied".
            return abundance is None or math.isnan(abundance)

        # Try to normalize abundance values if provided. If abundance values are provided, do basic validation.
        # If no abundance values are provided, do nothing.
        # When a protein csv with no abundance columns is provided, it will come through as all nans
        # Note that self.proteins is likely a list of Munches, but could be a list of dicts,
        # so don't assume we can access items as attrs
        abundance_info_present = any(
            "abundance" in protein and not is_missing(protein["abundance"])
            for protein in self.proteins
        )

        if not abundance_info_present:
            # Abundance information is missing from all proteins
            # Set abundance to 1
            for protein in self.proteins:
                protein["abundance"] = 1
            return

        abundance_criteria = [
            (lambda protein: "abundance" in protein, "Abundance missing"),
            (
                # Missing values are skipped here so that NaN is reported by
                # its own criterion below, or accepted when Nones/NaNs are
                # allowed; NaN >= 0 is always False and used to trip this one.
                lambda protein: is_missing(protein["abundance"])
                or protein["abundance"] >= 0,
                "Abundance must be greater than or equal to zero",
            ),
        ]

        if not self.ALLOW_NONES_AND_NANS_IN_ABUNDANCE:
            abundance_criteria += [
                (
                    lambda protein: protein["abundance"] is not None,
                    "Abundance must not be None",
                ),
                (
                    lambda protein: not math.isnan(protein["abundance"]),
                    "Abundance must not be NaN",
                ),
            ]

        # Find the smallest strictly positive abundance while checking every
        # protein against the criteria above.
        min_abundance = None
        for protein in self.proteins:
            # Check to make sure abundance passes criteria
            for criteria_fn, msg in abundance_criteria:
                if not criteria_fn(protein):
                    abundance_value = protein.get("abundance")
                    raise SchemaValidationFailed(
                        f"Protein {protein.get('name')} has invalid abundance: {abundance_value} - {msg}"
                    )

            # Find min abundance value; None/NaN (possible only when they are
            # allowed) and zeros do not take part in the minimum.
            abundance = protein["abundance"]
            if is_missing(abundance) or abundance <= 0:
                continue
            if min_abundance is None or abundance < min_abundance:
                min_abundance = abundance

        # min_abundance stays None when every present abundance is zero; there
        # is nothing to normalize by in that case.
        if (
            self.NORMALIZE_ABUNDANCE
            and min_abundance is not None
            and min_abundance != 1
        ):
            log.info("abundance data is not normalized, normalizing.")
            # normalize abundance by min value
            for protein in self.proteins:
                if protein["abundance"] is not None:
                    protein["abundance"] /= min_abundance