class PrepParams(Params):
    """Defaults and validation schema for the prep task parameters."""

    # Values used for any field the caller does not supply.
    # Note there is no default for "proteins".
    defaults = Munch(
        protease=None,
        decoy_mode=None,
        include_misses=0,
        n_peps_limit=None,
        drop_duplicates=False,
        n_ptms_limit=None,
    )

    schema = s(
        s.is_kws_r(
            protease=s.is_list(elems=s.is_str(), noneable=True),
            decoy_mode=s.is_str(noneable=True),
            include_misses=s.is_int(),
            n_peps_limit=s.is_int(noneable=True),
            drop_duplicates=s.is_bool(),
            n_ptms_limit=s.is_int(noneable=True),
            # Each protein record must carry a name and a sequence;
            # the remaining per-protein fields may be None.
            proteins=s.is_list(
                elems=s.is_kws(
                    name=s.is_str(required=True),
                    sequence=s.is_str(required=True),
                    ptm_locs=s.is_str(noneable=True),
                    report=s.is_int(noneable=True),
                    abundance=s.is_number(noneable=True),
                ),
            ),
        )
    )
class TestGen(BaseGenerator):
    """Minimal generator used to exercise BaseGenerator's schema handling."""

    schema = s(
        s.is_kws_r(
            n_edmans=s.is_int(help="Number of Edman cycles"),
            n_pres=s.is_int(),
            protease=s.is_list(elems=s.is_str()),
            label_set=s.is_list(elems=s.is_str()),
        )
    )

    # Only n_pres has a default; every other field must be supplied.
    defaults = Munch(n_pres=0)
class CalibNNParams(Params):
    """Validation schema for the calib_nn task parameters (no defaults)."""

    # Nothing is defaulted: every schema field must be supplied by the caller.
    defaults = Munch()

    schema = s(
        s.is_kws_r(
            mode=s.is_str(),
            n_pres=s.is_int(),
            n_mocks=s.is_int(),
            n_edmans=s.is_int(),
            dye_names=s.is_list(elems=s.is_str()),
            scope_name=s.is_str(),
            channels=s.is_list(elems=s.is_int()),
        )
    )
def it_returns_required_elems():
    """requirements() yields one (name, type, help, userdata) tuple per top-level elem."""
    userdata = {"some_key": 1}

    schema = s(
        s.is_dict(
            all_required=True,
            elems={
                "a": s.is_int(),
                "b": s.is_float(help="A float"),
                "c": s.is_number(),
                "d": s.is_str(userdata=userdata),
                "e": s.is_list(),
                "f": s.is_dict(
                    all_required=True,
                    elems={"d": s.is_int(), "e": s.is_int()},
                ),
            },
        )
    )

    # Only the top level is reported: the nested dict "f" shows up as a
    # single dict entry, and is_number is reported as float.
    expected = [
        ("a", int, None, None),
        ("b", float, "A float", None),
        ("c", float, None, None),
        ("d", str, None, userdata),
        ("e", list, None, None),
        ("f", dict, None, None),
    ]
    assert schema.requirements() == expected
class ImsImportParams(Params):
    """Defaults and validation schema for the ims_import task parameters."""

    defaults = Munch(
        is_movie=False,
        start_field=0,
        n_fields_limit=None,
        start_cycle=0,
        n_cycles_limit=None,
        dst_ch_i_to_src_ch_i=None,
        is_z_stack_single_file=False,
        z_stack_n_slices_per_field=None,
    )

    # Note that in movie mode what is called "field" is really the "frame",
    # since the stage does not move between shots. The single .nd2 file in
    # movie mode then treats the "fields" as if they are "cycles" of a
    # single field.
    schema = s(
        s.is_kws_r(
            is_movie=s.is_bool(noneable=True),
            start_field=s.is_int(),
            n_fields_limit=s.is_int(noneable=True),
            start_cycle=s.is_int(noneable=True),
            n_cycles_limit=s.is_int(noneable=True),
            dst_ch_i_to_src_ch_i=s.is_list(elems=s.is_int(), noneable=True),
            is_z_stack_single_file=s.is_bool(),
            z_stack_n_slices_per_field=s.is_int(noneable=True),
        )
    )
def it_bounds_min():
    """A list schema with min_len=2 accepts two elements and rejects fewer."""
    schema = s(s.is_list(min_len=2))

    # Exactly at the bound is valid.
    schema.validate([1, 2])

    # Anything shorter than the bound must fail, including the empty list.
    for too_short in ([1], []):
        with zest.raises(SchemaValidationFailed):
            schema.validate(too_short)
def it_validates_default_list_elems_int():
    """A list-of-int schema rejects non-lists and lists holding a non-int."""
    schema = s(s.is_list(elems=s.is_int()))

    schema.validate([1, 2, 3])

    # First a bare scalar (not a list at all), then a list with a bad element.
    for invalid in (1, [1, "str"]):
        with zest.raises(SchemaValidationFailed):
            schema.validate(invalid)
class TestNNParams(Params):
    """Defaults and validation schema for the test_nn task parameters."""

    defaults = Munch(
        include_training_set=False,
        n_neighbors=8,
        dt_score_mode="gmm_normalized_wpdf_dist_sigma",
        dt_score_metric="",
        dt_score_bias=0.1,
        dt_filter_threshold=0,
        rare_penalty=0.8,
        penalty_coefs=None,
        radius=15.0,
        random_seed=None,
    )

    schema = s(
        s.is_kws_r(
            include_training_set=s.is_bool(),
            n_neighbors=s.is_int(),
            dt_score_bias=s.is_float(),
            # Closed set of accepted scoring modes.
            dt_score_mode=s.is_str(
                options=[
                    "gmm_normalized_wpdf",
                    "gmm_normalized_wpdf_dist_sigma",
                    "gmm_normalized_wpdf_no_inv_var",
                    "one",
                    "dt_freq_log_weight",
                    "cdist_normalized",
                    "cdist_weighted_sqrt",
                    "cdist_weighted_log",
                    "cdist_weighted_normalized",
                    "cdist_weighted_normalized_sqrt",
                    "cdist_weighted_normalized_log",
                ],
            ),
            # Closed set of accepted metric names; the empty string (the
            # default) is also allowed.
            dt_score_metric=s.is_str(
                options=[
                    "",
                    "braycurtis",
                    "canberra",
                    "chebyshev",
                    "cityblock",
                    "correlation",
                    "cosine",
                    "euclidean",
                    "jensenshannon",
                    "minkowski",
                    "seuclidean",
                    "sqeuclidean",
                ],
            ),
            dt_filter_threshold=s.is_int(),
            # When given, must be exactly two floats.
            penalty_coefs=s.is_list(
                elems=s.is_float(),
                min_len=2,
                max_len=2,
                noneable=True,
            ),
            rare_penalty=s.is_float(noneable=True),
            radius=s.is_float(),
            random_seed=s.is_int(noneable=True),
        )
    )
def it_validates_recursively():
    """Validation descends into nested list and dict elements."""
    schema = s(
        s.is_dict(
            elems={
                "a": s.is_int(),
                "b": s.is_list(required=True, elems=s.is_str()),
                "c": s.is_dict(required=True),
            }
        )
    )

    schema.validate({"a": 1, "b": ["a", "b"], "c": {}})

    invalid_cases = (
        # "b" holds an int where a str element is required.
        {"a": 1, "b": [1], "c": {}},
        # "c" is an int where a dict is required.
        {"a": 1, "b": ["a"], "c": 1},
    )
    for invalid in invalid_cases:
        with zest.raises(SchemaValidationFailed):
            schema.validate(invalid)
class BaseGenerator(report_builder.ReportBuilder, Munch):
    """
    Base of all generators.

    Sub-classes are expected to overload the class members "schema" (used to
    validate the kwargs passed to __init__()) and "defaults" (applied to the
    kwargs before validation).

    Inherits from ReportBuilder for backwards compatibility with generators
    which expect to find report methods on the generator class.
    """

    schema = None  # Should be overloaded in any sub-class
    defaults = {}  # Should be overloaded in any sub-class

    # The *_schema members below are reusable schema fragments that
    # sub-classes can combine into their own "schema".

    job_setup_schema = s(
        s.is_kws_r(
            job=s.is_str(help="See Main Help"),
            sample=s.is_str(allow_empty_string=False, help="See Main Help"),
        )
    )

    protein_schema = s(
        s.is_kws_r(
            protein=s.is_list(
                elems=s.is_kws_r(
                    id=s.is_str(),
                    seqstr=s.is_str(),
                )
            ),
            protein_of_interest=s.is_list(
                s.is_str(allow_empty_string=False),
                noneable=True,
                help="The id of the protein(s) of interest, used in survey and reporting",
            ),
        )
    )

    label_set_schema = s(
        s.is_kws_r(label_set=s.is_list(elems=s.is_str(), help="See Main Help"))
    )

    lnfit_schema = s(
        s.is_kws_r(
            lnfit_name=s.is_list(s.is_str(), noneable=True, help="See Main Help"),
            lnfit_params=s.is_list(s.is_str(), noneable=True, help="See Main Help"),
            lnfit_dye_on_threshold=s.is_list(
                s.is_int(), noneable=True, help="See Main Help"
            ),
            lnfit_photometry_only=s.is_list(
                s.is_str(), noneable=True, help="See Main Help"
            ),
        )
    )

    scope_run_schema = s(
        s.is_kws_r(
            n_edmans=s.is_int(help="See Main Help"),
            n_pres=s.is_int(help="See Main Help"),
            n_mocks=s.is_int(help="See Main Help"),
        )
    )

    peptide_setup_schema = s(
        s.is_kws_r(
            protease=s.is_list(elems=s.is_str(), help="See Main Help"),
            decoys=s.is_str(help="See Main Help"),
            random_seed=s.is_int(noneable=True, help="See Main Help"),
            n_ptms_limit=s.is_int(
                bounds=(0, 12),
                help="Max number of PTMs per peptide to allow. "
                "Peptides with more PTM sites than this will not consider any PTM permutations.",
            ),
        )
    )

    sim_schema = s(
        s.is_kws_r(
            n_samples_train=s.is_int(bounds=(1, None), help="See Main Help"),
            n_samples_test=s.is_int(bounds=(1, None), help="See Main Help"),
            allow_edman_cterm=s.is_bool(
                noneable=True,
                help="Edman cycles can remove final C-terminal AA from peptides at plate boundary.",
            ),
            use_lognormal_model=s.is_bool(
                help="Use older lognormal radiometry model",
            ),
            is_photobleaching_run=s.is_bool(),
            photobleaching_run_n_dye_count=s.is_int(noneable=True),
        )
    )

    sigproc_source_schema = s(
        s.is_kws_r(
            movie=s.is_bool(noneable=True, help="See Main Help"),
            n_cycles_limit=s.is_int(noneable=True, help="See Main Help"),
            start_cycle=s.is_int(noneable=True, help="See Main Help"),
            dst_ch_i_to_src_ch_i=s.is_str(noneable=True, help="Comma separated"),
        )
    )

    sigproc_v1_schema = s(
        s.is_kws_r(
            sigproc_source=s.is_str(noneable=True, help="See Main Help"),
            radial_filter=s.is_float(
                noneable=True, bounds=(0.01, 1.0), help="See Main Help"
            ),
            peak_find_n_cycles=s.is_int(bounds=(1, 10000), help="See Main Help"),
            peak_find_start=s.is_int(bounds=(0, 10000), help="See Main Help"),
            anomaly_iqr_cutoff=s.is_int(bounds=(1, 100), help="See Main Help"),
        )
    )

    sigproc_v2_schema = s(
        s.is_kws_r(
            calibration_job=s.is_str(noneable=True),
            sigproc_source=s.is_str(noneable=True, help="See Main Help"),
            self_calib=s.is_bool(noneable=True),
            ch_aln=s.is_str(
                noneable=True, help="comma delimited in x0,y0,x1,y1,..."
            ),
            ch_for_alignment=s.is_int(noneable=True),
            calib_dst_ch_i_to_src_ch_i=s.is_str(
                noneable=True, help="Comma separated"
            ),
        )
    )

    sigproc_v2_calib_schema = s(
        s.is_kws_r(
            sigproc_source=s.is_str(noneable=True, help="See Main Help"),
            movie=s.is_bool(noneable=True),
            # mode will eventually have a second option "dye calib"
            mode=s.is_str(options=["illum"]),
        )
    )

    # TODO: Remove all error_model_schema
    error_model_schema = s(
        s.is_kws_r(
            err_p_edman_failure=s.is_list(elems=s.is_str(help="See Main Help")),
            err_p_detach=s.is_list(elems=s.is_str(help="See Main Help")),
            err_p_bleach=s.is_list(elems=s.is_str(help="See Main Help")),
            err_p_non_fluorescent=s.is_list(elems=s.is_str(help="See Main Help")),
            err_row_k_sigma=s.is_list(elems=s.is_str(help="See Main Help")),
            # For lognormal: to be deprecated
            err_dye_beta=s.is_list(elems=s.is_str(help="See Main Help")),
            err_dye_sigma=s.is_list(elems=s.is_str(help="See Main Help")),
            err_dye_zero_beta=s.is_list(elems=s.is_str(help="See Main Help")),
            err_dye_zero_sigma=s.is_list(elems=s.is_str(help="See Main Help")),
            # For normal
            err_gain_mu=s.is_list(elems=s.is_str(help="See Main Help")),
            err_gain_sigma=s.is_list(elems=s.is_str(help="See Main Help")),
            err_bg_mu=s.is_list(elems=s.is_str(help="See Main Help")),
            err_bg_sigma=s.is_list(elems=s.is_str(help="See Main Help")),
        )
    )

    # Scheme is a flag that allows passing a pair of (protease, label_set) in
    # directly, rather than passing them separately and getting permutations
    scheme_schema = s(
        s.is_kws_r(scheme=s.is_list(elems=s.is_str(), help="See Main Help"))
    )

    classifier_choice_schema = s(s.is_kws_r(classifier=s.is_str()))

    error_model_defaults_chemistry = Munch(
        err_p_edman_failure=0.06,
        err_p_detach=0.05,
        err_p_bleach=0.05,
        err_p_non_fluorescent=0.07,
    )

    error_model_defaults_lognormal = Munch(
        err_row_k_sigma=0.16,
        err_dye_beta=7500.0,
        err_dye_sigma=0.16,
        err_dye_zero_beta=0.0,
        err_dye_zero_sigma=400.0,
    )

    error_model_defaults_normal = Munch(
        # Based on eye-balling val18_2t
        err_row_k_sigma=0.16,
        err_gain_mu=15_000.0,
        err_gain_sigma=1_200.0,
        err_bg_mu=0.0,
        err_bg_sigma=400.0,
    )

    has_report = True

    def __init__(self, **kwargs):
        """Store kwargs as members, apply defaults, parse the error model, then validate."""
        # APPLY defaults and then ask user for any elements that are not declared
        super().__init__(**kwargs)
        self.apply_defaults()
        self.setup_err_model()
        self.validate()

        self.reports = Munch()
        self.add_report("report", self)

        # static reports are ipynb files that are placed in the _reports
        # folder under a job and are executed by the indexer.
        # self.static_reports is a list of file names (without paths)
        self.static_reports = []

        self._validate_protein_of_interest()

    def add_report(self, report_name, builder):
        """Register builder under report_name; each name may be registered only once."""
        assert report_name not in self.reports
        self.reports[report_name] = builder

    def _validate_protein_of_interest(self):
        """
        Check that every protein_of_interest names an id in the protein list.

        Does nothing when no protein list was given.

        Raises:
            ValueError: if a protein of interest is not among the protein ids.
        """
        proteins = self.get("protein")
        if proteins is None:
            return

        seq_ids = {seq["id"] for seq in proteins}
        # protein_of_interest is noneable in protein_schema (and may be absent
        # entirely), so treat None/missing as "no proteins of interest".
        for poi in getattr(self, "protein_of_interest", None) or []:
            if poi not in seq_ids:
                raise ValueError(
                    f"protein_of_interest '{poi}' is not in the protein id list. "
                    f"Confirm you specified a Name and not a UniprotAC"
                )

    def setup_err_model(self):
        """
        Parse the err_* arguments into self.err_param_dict.

        Each value has the form "[dye|]low[:high[:steps]]". The parsed range is
        expanded with np.linspace(low, high, steps) and stored, de-duplicated,
        under the key "<name>:<dye>" (dye is 0 when no dye-index is given).

        Raises:
            SchemaValidationFailed: if a dye-index is given for a term that
                does not take one, or omitted for a term that requires one.
        """
        # These terms are not per-dye and so must not carry a dye-index
        terms_without_dye = (
            "err_p_edman_failure",
            "err_p_detach",
            "err_row_k_beta",
            "err_row_k_sigma",
        )

        err_param_dict = defaultdict(list)
        for name, _type, _, _user_data in self.error_model_schema.requirements():
            # "or []" also covers a term that is present but set to None
            for value in self.get(name) or []:
                parts = value.split("|")
                if len(parts) == 2:
                    dye_part, prob_str = parts
                else:
                    dye_part, prob_str = None, parts[0]
                prob_parts = prob_str.split(":")

                if name in terms_without_dye:
                    if dye_part:
                        raise SchemaValidationFailed(
                            f"error model term '{name}' is not allowed to have a dye-index."
                        )
                elif dye_part is None:
                    raise SchemaValidationFailed(
                        f"error model term '{name}' expected a dye-index."
                    )

                low_prob = float(prob_parts[0])
                # A lone value is a degenerate range: high == low, one step
                high_prob = float(prob_parts[1]) if len(prob_parts) > 1 else low_prob
                step_prob = int(prob_parts[2]) if len(prob_parts) > 2 else 1

                key = f"{name}:{dye_part if dye_part is not None else 0}"
                err_param_dict[key] += np.linspace(
                    low_prob, high_prob, step_prob
                ).tolist()
                err_param_dict[key] = list(set(err_param_dict[key]))

        self.err_param_dict = err_param_dict

    def apply_defaults(self):
        """Overloadable by sub-classes."""
        self.schema.apply_defaults(self.defaults, self, override_nones=True)

    def validate(self):
        """Overloadable by sub-classes for extra validation"""
        self.schema.validate(self, context=self.__class__.__name__)

    def sigprocs_v1(self):
        """
        Return a list holding one combined ims_import + sigproc_v1 task block,
        or an empty list when no sigproc_source was given.
        """
        tasks = []
        if self.sigproc_source:
            ims_import = task_templates.ims_import(
                self.sigproc_source,
                is_movie=self.movie,
                n_cycles_limit=self.n_cycles_limit,
                start_cycle=self.start_cycle,
                dst_ch_i_to_src_ch_i=self.dst_ch_i_to_src_ch_i,
            )

            sigproc = task_templates.sigproc_v1()
            parameters = sigproc.sigproc_v1.parameters
            parameters.radial_filter = self.radial_filter
            parameters.peak_find_n_cycles = self.peak_find_n_cycles
            parameters.peak_find_start = self.peak_find_start
            parameters.anomaly_iqr_cutoff = self.anomaly_iqr_cutoff

            tasks += [Munch(**ims_import, **sigproc)]
        return tasks

    def tasks_for_sigproc_v2(self):
        """
        Return a Munch combining the ims_import and sigproc_v2 analyze tasks,
        or an empty dict when no sigproc_source was given.

        When calibration_job is given its calib_priors are loaded, and
        optionally channel-remapped via calib_dst_ch_i_to_src_ch_i.
        """
        tasks = {}
        if self.sigproc_source:
            ims_import_task = task_templates.ims_import(
                self.sigproc_source,
                is_movie=self.movie,
                n_cycles_limit=self.n_cycles_limit,
                start_cycle=self.start_cycle,
                dst_ch_i_to_src_ch_i=self.dst_ch_i_to_src_ch_i,
            )

            calib_priors = None
            if self.calibration_job is not None:
                calib_src_path = (
                    local.path(self.calibration_job)
                    / "sigproc_v2_calib/plaster_output/sigproc_v2"
                )
                calib_result = SigprocV2Result.load_from_folder(
                    calib_src_path, prop_list=["calib_priors"]
                )
                calib_priors = calib_result.calib_priors

                if self.calib_dst_ch_i_to_src_ch_i is not None:
                    # Convert a string like 2,1,0 and remap
                    check.t(self.calib_dst_ch_i_to_src_ch_i, str)
                    calib_dst_ch_i_to_src_ch_i = [
                        int(ch_i)
                        for ch_i in self.calib_dst_ch_i_to_src_ch_i.split(",")
                    ]

                    ch_remapped_priors = Priors.copy(calib_priors)
                    ch_remapped_priors.delete_ch_specific_records()

                    # NOTE(review): the remapped ch_aln prior is computed but
                    # never stored back into ch_remapped_priors -- confirm
                    # whether that is intended.
                    ch_aln_prior = ch_remapped_priors.get_exact("ch_aln")
                    if ch_aln_prior is not None:
                        ch_aln_prior = ChannelAlignPrior.ch_remap(
                            ch_aln_prior.prior, calib_dst_ch_i_to_src_ch_i
                        )

                    # Copy each per-channel prior from its source channel key
                    # to the destination channel key.
                    for dst_ch_i, src_ch_i in enumerate(calib_dst_ch_i_to_src_ch_i):
                        for prefix in ("reg_illum", "reg_psf"):
                            prior = calib_priors.get_exact(f"{prefix}.ch_{src_ch_i}")
                            if prior is not None:
                                ch_remapped_priors.add(
                                    f"{prefix}.ch_{dst_ch_i}",
                                    prior.prior,
                                    "remapped channel in gen",
                                )

                    calib_priors = ch_remapped_priors

            ch_aln = None
            if self.ch_aln is not None:
                # "x0,y0,x1,y1,..." => array of (x, y) rows
                ch_aln = np.array([float(i) for i in self.ch_aln.split(",")])
                assert ch_aln.shape[0] % 2 == 0
                ch_aln = ch_aln.reshape((-1, 2))

            sigproc_v2_task = task_templates.sigproc_v2_analyze(
                calib_priors=calib_priors,
                self_calib=self.self_calib,
                ch_aln=ch_aln,
                ch_for_alignment=self.ch_for_alignment,
            )

            tasks = Munch(**ims_import_task, **sigproc_v2_task)

        return tasks

    def lnfits(self, sigproc_version):
        """
        Return a dict of lnfit tasks keyed by task name (empty when no
        lnfit_params were given).

        It is common to have multiple lnfit tasks for a single run, so this fn
        returns a block with potentially multiple lnfit tasks using unique
        task names when more than one is present.

        Raises:
            ValueError: if lnfit_params is given without lnfit_dye_on_threshold.
        """
        lnfit_tasks = {}
        if self.lnfit_params:
            if not self.lnfit_dye_on_threshold:
                raise ValueError(
                    f"You must specify a --lnfit_dye_on_threshold when --lnfit_params is given"
                )

            n_fits = len(self.lnfit_params)

            # Work on a copy so that broadcasting a single threshold below
            # does not grow self.lnfit_dye_on_threshold on every call.
            dye_thresholds = list(self.lnfit_dye_on_threshold)
            lnfit_names = self.lnfit_name or ([None] * n_fits)
            photometries_only = self.lnfit_photometry_only or ([True] * n_fits)

            if n_fits > 1 and len(dye_thresholds) == 1:
                # A single threshold applies to every lnfit
                dye_thresholds = dye_thresholds * n_fits

            assert n_fits == len(dye_thresholds)
            assert n_fits == len(lnfit_names)

            for i, (params, thresh, name, photometry_only) in enumerate(
                zip(self.lnfit_params, dye_thresholds, lnfit_names, photometries_only)
            ):
                task = task_templates.lnfit(sigproc_version=sigproc_version)
                task.lnfit.parameters["lognormal_fitter_v2_params"] = params
                task.lnfit.parameters["dye_on_threshold"] = thresh
                # User values are strings but the fallback default is the
                # bool True, so normalize through str() before comparing.
                task.lnfit.parameters["photometry_only"] = str(
                    photometry_only
                ).lower() in ("true", "1")

                task_name = "lnfit"
                if n_fits > 1 or name:
                    task_name = name or f"lnfit_{i}"
                    helpers.task_rename(task, task_name)
                lnfit_tasks[task_name] = task[task_name]

        return lnfit_tasks

    def run_name(self, aa_list, protease=None, err_set=None):
        """
        A helper for run folder names based on aa_list and protease.
        Note, not all generators will use this convention.

        Compose a run_name from protease and aa_list in normalized form,
        followed by "_" and (when err_set is given) the first 4 hex digits
        of the md5 of its JSON form.
        Eg: protease="trypsin", aa_list=("DE", "K") => "trypsin_de_k_"

        When aa_list is None, "bleach" is used in place of the label names.
        """
        if protease is None:
            protease = ""

        if aa_list is not None:
            # Strip the square brackets from each label before joining
            aas = "_".join(a.replace("[", "").replace("]", "") for a in aa_list)
        else:
            aas = "bleach"

        if err_set is not None:
            err_str = hashlib.md5(json.dumps(err_set).encode()).hexdigest()[0:4]
        else:
            err_str = ""

        prefix = protease + ("_" if protease != "" else "")
        return re.sub(
            "[^0-9a-z_]+",
            "_",
            (prefix + aas).lower() + "_" + err_str,
        )

    def _label_str_permutate(self, label_str):
        """
        Return list of permutations of a label_str such as:
            "A,B,C:2" => ("A", "B"), ("A", "C"), ("B", "C")

        A suffix label set may be added to each permutation with +:
            "A,B,C:2+S" => ("A", "B", "S"), ("A", "C", "S"), ("B", "C", "S")
            "A,B,C:2+S,T" => ("A", "B", "S", "T"), ("A", "C", "S", "T"), ("B", "C", "S", "T")

        Without a ":count" all labels are used as a single set.

        Raises:
            ValueError: on more than one colon or plus, or when the count is
                not strictly between 0 and the number of labels.
        """
        check.t(label_str, str)

        semi_split = label_str.split(":")
        if len(semi_split) > 2:
            raise ValueError(f"Label-set '{label_str}' has >1 colon.")

        suffix_labels = []
        if len(semi_split) == 2:
            suffix_split = semi_split[1].split("+")
            if len(suffix_split) > 2:
                raise ValueError(f"Label-set '{label_str}' has >1 plus.")
            if len(suffix_split) == 2:
                # Separate the count from the suffix labels
                semi_split = [semi_split[0], suffix_split[0]]
                suffix_labels = [
                    slabel.strip() for slabel in suffix_split[1].split(",")
                ]

        labels = [label.strip() for label in semi_split[0].split(",")]

        if len(semi_split) == 1:
            perm_count = len(labels)
        else:
            perm_count = int(semi_split[1])
            if not 0 < perm_count < len(labels):
                raise ValueError(
                    f"Label-set '{label_str}' has a permutation count "
                    f"of {perm_count}; needs to be between 0 and {len(labels) - 1}"
                )

        perms = list(itertools.combinations(labels, perm_count))
        if suffix_labels:
            perms = [p + tuple(suffix_labels) for p in perms]

        return perms

    def label_set_permutate(self) -> List[Tuple[str, ...]]:
        """
        Returns a list of label sets, where each label set is a tuple of strings
        """
        check.list_t(self.label_set, str)
        return utils.flatten(
            [self._label_str_permutate(label_str) for label_str in self.label_set],
            1,
        )

    def error_set_permutate(self):
        """
        Return one list per err_param_dict key, each a list of (key, value)
        pairs covering every value recorded for that key.
        """
        return [
            [(key, val) for val in vals]
            for key, vals in self.err_param_dict.items()
        ]

    def scheme_set_permutate(self) -> List[Scheme]:
        """
        Unparsed schemes are of form: protease/label_set, where protease is a str,
        and label_set is a str parseable by self._label_str_permutate

        Returns an empty list when no scheme was given.

        Raises:
            ValueError: if a scheme is not of the form protease/label_set.
        """
        parsed_schemes = []
        # A generator that takes no scheme argument simply has no schemes
        for scheme in getattr(self, "scheme", None) or []:
            split = scheme.split("/")
            if len(split) != 2 or not all(split):
                raise ValueError(
                    f"Scheme {scheme} must be of form: protease/label_set"
                )
            parsed_schemes += [
                Scheme(split[0], label_set)
                for label_set in self._label_str_permutate(split[1])
            ]
        return parsed_schemes

    def _assemble_err_set(self, n_channels, use_lognormal_model, chemistry):
        """Build an err_set from the given chemistry terms plus the radiometry defaults of the chosen model."""
        if use_lognormal_model:
            # TODO: No longer correct
            radiometry = self.error_model_defaults_lognormal
            gain_mu = radiometry.err_dye_beta
            gain_sigma = radiometry.err_dye_sigma
            bg_mu = radiometry.err_dye_zero_beta
            bg_sigma = radiometry.err_dye_zero_sigma
        else:
            radiometry = self.error_model_defaults_normal
            gain_mu = radiometry.err_gain_mu
            gain_sigma = radiometry.err_gain_sigma
            bg_mu = radiometry.err_bg_mu
            bg_sigma = radiometry.err_bg_sigma

        # Key order is kept stable because run_name() hashes the JSON form
        # of an err_set.
        return Munch(
            p_edman_failure=[chemistry.err_p_edman_failure] * 1,
            p_detach=[chemistry.err_p_detach] * 1,
            p_bleach=[chemistry.err_p_bleach] * n_channels,
            p_non_fluorescent=[chemistry.err_p_non_fluorescent] * n_channels,
            row_k_sigma=[radiometry.err_row_k_sigma] * 1,
            gain_mu=[gain_mu] * n_channels,
            gain_sigma=[gain_sigma] * n_channels,
            bg_mu=[bg_mu] * n_channels,
            bg_sigma=[bg_sigma] * n_channels,
        )

    def default_err_set(self, n_channels, use_lognormal_model):
        """
        Return the default err_set: a Munch of lists, of length 1 for the
        p_edman_failure/p_detach/row_k_sigma terms and of length n_channels
        for the others, filled from the class-level error model defaults.
        """
        return self._assemble_err_set(
            n_channels, use_lognormal_model, self.error_model_defaults_chemistry
        )

    def photobleaching_err_set(self, n_channels, use_lognormal_model):
        """
        Same shape as default_err_set() but with the four chemistry
        probabilities (edman failure, detach, bleach, non-fluorescent)
        all set to 0.0.
        """
        no_chemistry_errors = Munch(
            err_p_edman_failure=0.0,
            err_p_detach=0.0,
            err_p_bleach=0.0,
            err_p_non_fluorescent=0.0,
        )
        return self._assemble_err_set(
            n_channels, use_lognormal_model, no_chemistry_errors
        )

    def run_parameter_permutator(self, use_lognormal_model=True):
        """
        Generate permutations of all the variable parameters
        Defaults all arguments to self.*
        Gracefully handles lack of protease.

        Yields:
            (protease, label_set, err_set) for each permutation, followed by
            one entry per directly specified scheme.
        """
        proteases = utils.non_none(self.get("protease"), [None])
        proteases = [("protease", protease) for protease in proteases]
        if len(proteases) == 0:
            proteases = [("protease", None)]

        label_sets = [
            ("label_set", label_set) for label_set in self.label_set_permutate()
        ]

        err_sets = self.error_set_permutate()

        combined = [proteases, label_sets] + err_sets

        # Schemes is a list of schemes, where each scheme is a tuple containing:
        #   - A Label set, in the form of Tuple['label_set', Tuple[str, ...]]
        #   - A protease, in the form of Tuple['protease', str]

        # Build scheme set from protease and label set args
        schemes = list(itertools.product(*combined))

        # Add in directly specified schemes
        schemes += [
            (("protease", scheme.protease), ("label_set", scheme.label_set))
            for scheme in self.scheme_set_permutate()
        ]

        for params in schemes:
            protease = utils.filt_first(params, lambda i: i[0] == "protease")[1]
            label_set = utils.filt_first(params, lambda i: i[0] == "label_set")[1]

            # Given that the label_set is now known, the error model can be setup
            n_channels = len(label_set)
            err_set = self.default_err_set(n_channels, use_lognormal_model)

            for param_key, param_val in params:
                if param_key.startswith("err_"):
                    # Keys look like "err_<term>:<index>"
                    term, index = param_key.split(":")
                    # The 4: removes the "err_"
                    err_set[term[4:]][int(index)] = param_val

            yield protease, label_set, err_set

    def erisyon_block(self, aa_list, protease=None, err_set=None):
        """Return the erisyon task block for the run named by run_name()."""
        return task_templates.erisyon(
            run_name=self.run_name(aa_list, protease, err_set),
            sample=self.sample,
            generator_name=self.__class__.__name__,
        )

    def report_section_user_config(self, report=None):
        """
        Emit report configuration parameters specified by the user via gen so that they
        can be further edited if desired, and used by reporting functions in the templates.

        The sections are added to report, which defaults to self.
        """
        if report is None:
            report = self

        config = []
        if self.protein_of_interest:
            config += [f"PGEN_protein_of_interest = {self.protein_of_interest}\n"]
        if self.report_prec:
            config += [f"PGEN_report_precisions = {self.report_prec}\n"]

        if config:
            # Header and code cell both go to the same report so that they
            # stay together when a report other than self is passed.
            report.report_section_markdown("# PGEN-controlled report config")
            config = [
                f"# These values were or can be specified by the user at gen time:\n"
            ] + config
            report.add_report_section("code", config)

    def report_assemble(self):
        """
        Overrides report_assemble in ReportBuilder to implement the
        self.has_report behavior: returns None when has_report is falsy.
        """
        if not self.has_report:
            return None
        return super().report_assemble()

    def generate(self):
        """
        Abstract method to be overloaded.
        Expected to return a list of runs.
        """
        pass
class SigprocV1Params(Params):
    """Defaults, validation schema and channel-mapping helpers for sigproc_v1."""

    defaults = dict(
        hat_rad=2,
        iqr_rng=96,
        threshold_abs=1.0,
        channel_indices_for_alignment=None,
        channel_indices_for_peak_finding=None,
        radiometry_channels=None,
        save_debug=False,
        peak_find_n_cycles=4,
        peak_find_start=0,
        radial_filter=None,
        anomaly_iqr_cutoff=95,
        n_fields_limit=None,
        save_full_signal_radmat_npy=False,
    )

    schema = s(
        s.is_kws_r(
            anomaly_iqr_cutoff=s.is_number(noneable=True, bounds=(0, 100)),
            radial_filter=s.is_float(noneable=True, bounds=(0, 1)),
            peak_find_n_cycles=s.is_int(bounds=(1, None), noneable=True),
            peak_find_start=s.is_int(bounds=(0, None), noneable=True),
            save_debug=s.is_bool(),
            hat_rad=s.is_int(bounds=(1, 3)),
            iqr_rng=s.is_number(noneable=True, bounds=(0, 100)),
            # Not sure of a reasonable bound
            threshold_abs=s.is_number(bounds=(0, 100)),
            channel_indices_for_alignment=s.is_list(
                elems=s.is_int(), noneable=True
            ),
            channel_indices_for_peak_finding=s.is_list(
                elems=s.is_int(), noneable=True
            ),
            radiometry_channels=s.is_dict(noneable=True),
            n_fields_limit=s.is_int(noneable=True),
            save_full_signal_radmat_npy=s.is_bool(),
        )
    )

    def validate(self):
        """Apply defaults, validate against the schema, then check radiometry_channels entries."""
        # Note: does not call super because the override_nones is set to false here
        self.schema.apply_defaults(
            self.defaults, apply_to=self, override_nones=False
        )
        self.schema.validate(self, context=self.__class__.__name__)

        radiometry_channels = self.radiometry_channels
        if radiometry_channels is None:
            return

        for ch_name, ch_index in radiometry_channels.items():
            self._validate(
                re.fullmatch(r"[0-9a-z_]+", ch_name),
                "radiometry_channels name must be lower-case alphanumeric (including underscore)",
            )
            self._validate(
                isinstance(ch_index, int), "channel_i must be an integer"
            )

    def set_radiometry_channels_from_input_channels_if_needed(self, n_channels):
        """When radiometry_channels is unset, map "ch_<i>" to i for each of the n_channels."""
        if self.radiometry_channels is not None:
            return
        # Assume channels from nd2 manifest
        self.radiometry_channels = {
            f"ch_{ch_i}": ch_i for ch_i in range(n_channels)
        }

    @property
    def n_output_channels(self):
        """Number of entries in radiometry_channels."""
        return len(self.radiometry_channels)

    @property
    def n_input_channels(self):
        """Number of entries in radiometry_channels (same value as n_output_channels)."""
        return len(self.radiometry_channels)

    @property
    def channels_cycles_dim(self):
        """Return the cached _outchannels_inchannels_cycles_dim value."""
        # This is a cache set in sigproc_v1.
        # It is a helper for the repetitive call:
        # n_outchannels, n_inchannels, n_cycles, dim =
        return self._outchannels_inchannels_cycles_dim

    def _input_channels(self):
        """
        Return a list that converts channel number of the output to the
        channel of the input: the radiometry_channels values ordered by
        their sorted names.

        Example: radiometry_channels of {"bar": 1} returns [1], i.e. the 0th
        output channel is mapped to the "1" input channel.
        """
        by_name = self.radiometry_channels
        return [by_name[ch_name] for ch_name in sorted(by_name)]

    # def input_names(self):
    #     return sorted(self.radiometry_channels.keys())

    def output_channel_to_input_channel(self, out_ch):
        """Return the input channel that output channel out_ch is mapped to."""
        input_channels = self._input_channels()
        return input_channels[out_ch]

    def input_channel_to_output_channel(self, in_ch):
        """Not every input channel necessarily has an output; can return None"""
        return utils.filt_first_arg(
            self._input_channels(), lambda candidate: candidate == in_ch
        )
class PrepParams(Params):
    """Defaults, validation schema and abundance validation for the prep task."""

    PHOTOBLEACHING_PSEUDO_AA = "X"

    # When False, a None or NaN abundance is a validation error once any
    # protein carries a real abundance value.
    ALLOW_NONES_AND_NANS_IN_ABUNDANCE = False

    NORMALIZE_ABUNDANCE = False  # Abundance is normalized in gen

    defaults = Munch(
        protease=None,
        decoy_mode=None,
        include_misses=0,
        n_peps_limit=None,
        drop_duplicates=False,
        n_ptms_limit=None,
        is_photobleaching_run=False,
        photobleaching_n_cycles=None,
        photobleaching_run_n_dye_count=None,
    )

    schema = s(
        s.is_kws_r(
            protease=s.is_list(noneable=True, elems=s.is_str()),
            decoy_mode=s.is_str(noneable=True),
            include_misses=s.is_int(),
            n_peps_limit=s.is_int(noneable=True),
            drop_duplicates=s.is_bool(),
            n_ptms_limit=s.is_int(noneable=True),
            proteins=s.is_list(
                s.is_kws(
                    name=s.is_str(required=True),
                    sequence=s.is_str(required=True),
                    ptm_locs=s.is_str(noneable=True),
                    is_poi=s.is_int(noneable=True),
                    abundance=s.is_number(noneable=True),
                )
            ),
            is_photobleaching_run=s.is_bool(),
            photobleaching_n_cycles=s.is_int(noneable=True),
            photobleaching_run_n_dye_count=s.is_int(noneable=True),
        )
    )

    def validate(self):
        """
        Validate against the schema, then validate (and optionally normalize)
        the protein abundances.

        If no protein has a usable abundance, every abundance is set to 1.
        Otherwise each protein's abundance is checked and, when
        NORMALIZE_ABUNDANCE is set, divided by the smallest positive abundance.

        Raises:
            SchemaValidationFailed: if a protein's abundance is missing,
                negative, or (unless ALLOW_NONES_AND_NANS_IN_ABUNDANCE)
                None or NaN.
        """
        super().validate()

        def _is_blank(value):
            # None and NaN both mean "no abundance value"
            return value is None or math.isnan(value)

        # Try to normalize abundance values if provided. If abundance values
        # are provided, do basic validation. If no abundance values are
        # provided, do nothing.
        # When a protein csv with no abundance columns is provided, it will
        # come through as all nans.
        # Note that self.proteins is likely a list of Munches, but could be a
        # list of dicts, so don't assume we can access items as attrs.
        abundance_info_present = any(
            "abundance" in protein and not _is_blank(protein["abundance"])
            for protein in self.proteins
        )

        if not abundance_info_present:
            # Abundance information is missing from all proteins
            # Set abundance to 1
            for protein in self.proteins:
                protein["abundance"] = 1
            return

        abundance_criteria = [
            (lambda protein: "abundance" in protein, "Abundance missing"),
            (
                # Blank values are judged by the None/NaN criteria below (if
                # enabled), not by the sign check: NaN >= 0 is always False
                # and would otherwise be rejected even when NaNs are allowed.
                lambda protein: _is_blank(protein["abundance"])
                or protein["abundance"] >= 0,
                "Abundance must be greater than or equal to zero",
            ),
        ]

        if not self.ALLOW_NONES_AND_NANS_IN_ABUNDANCE:
            abundance_criteria += [
                (
                    lambda protein: protein["abundance"] is not None,
                    "Abundance must not be None",
                ),
                (
                    lambda protein: not math.isnan(protein["abundance"]),
                    "Abundance must not be NaN",
                ),
            ]

        # Check every protein and find the smallest positive abundance
        min_abundance = None
        for protein in self.proteins:
            # Check to make sure abundance passes criteria
            for criteria_fn, msg in abundance_criteria:
                if not criteria_fn(protein):
                    abundance_value = protein.get("abundance")
                    raise SchemaValidationFailed(
                        f"Protein {protein.get('name')} has invalid abundance: {abundance_value} - {msg}"
                    )

            abundance = protein["abundance"]
            # Blank values only get this far when they are allowed; they must
            # not take part in the comparison (None > 0 raises TypeError).
            if _is_blank(abundance):
                continue
            if abundance > 0 and (min_abundance is None or abundance < min_abundance):
                min_abundance = abundance

        # min_abundance stays None when every abundance is zero; there is
        # then nothing to normalize by.
        if (
            self.NORMALIZE_ABUNDANCE
            and min_abundance is not None
            and min_abundance != 1
        ):
            log.info("abundance data is not normalized, normalizing.")
            # normalize abundance by min value
            for protein in self.proteins:
                if protein["abundance"] is not None:
                    protein["abundance"] /= min_abundance
class SurveyV2Result(BaseResult):
    """
    Result of a survey_v2 run: a per-peptide DataFrame (see survey_columns)
    plus helpers that filter it and summarize nearest-neighbor statistics.
    """

    name = "survey_v2"
    filename = "survey_v2.pkl"

    required_props = dict(params=SurveyV2Params, _survey=(pd.DataFrame))

    survey_columns = [
        "pro_i",
        "pep_i",
        "pep_start",
        "pep_stop",
        "pep_len",
        "seqstr",
        "P2",
        "flustr",
        "n_dyes_max_any_ch",
        "flu_count",
        "nn_pep_i",
        "nn_dist",
    ]

    survey_filter_schema = s(
        s.is_kws_r(
            allow_proline_at_2=s.is_bool(),
            run_exclude=s.is_list(s.is_str()),
            run_include=s.is_list(s.is_str()),
            max_dyes_per_ch=s.is_int(noneable=True),
            max_pep_len=s.is_int(noneable=True),
            max_ptms_per_pep=s.is_int(noneable=True),
            multi_peptide_metric=s.is_str(
                noneable=True, options=["dist_avg", "dist_min"]
            ),
            n_best_schemes=s.is_int(),
            n_peps_per_scheme=s.is_int(),
            objective=s.is_str(options=["protein_id", "coverage", "ptms"]),
            poi_only=s.is_bool(),
            pro_subset=s.is_list(s.is_str()),
            ptm_subset=s.is_list(s.is_int()),
        )
    )

    defaults = Munch(
        allow_proline_at_2=False,
        run_exclude=[],
        run_include=[],
        max_dyes_per_ch=None,
        max_pep_len=None,
        max_ptms_per_pep=None,
        # The schema (and max_nn_dist_peps) use the key "multi_peptide_metric".
        multi_peptide_metric="dist_min",
        # Legacy key that matches nothing in the schema; retained only in case
        # external code reads it. NOTE(review): confirm and remove.
        multi_pro_rank="dist_min",
        n_best_schemes=50,
        n_peps_per_scheme=1,
        objective="protein_id",
        poi_only=False,
        pro_subset=[],
        ptm_subset=[],
    )

    @classmethod
    def validate_filters(cls, filters):
        """
        Validates filters against schema, and fills in defaults where missing.

        This is a class method so that higher level objects like JobResult can
        make use of filtering that seems more logical to group with our filters,
        but are applied at higher level (e.g. objective).
        """
        check.t(filters, Munch)
        cls.survey_filter_schema.apply_defaults(cls.defaults, filters)
        cls.survey_filter_schema.validate(filters)

    def survey(self):
        """Return the underlying survey DataFrame."""
        return self._survey

    def _domain_loss(self, df, filters, msg):
        """
        Debug aid: when filters.verbose is set, print which requested PTMs
        (objective "ptms") or proteins (any other objective) are no longer
        present in df. msg labels the filtering stage in the output.
        """
        if not filters.get("verbose", False):
            return

        if filters.objective == "ptms":
            domain_loss = set(filters.requested_ptms) - set(df.ptm.astype(int))
        else:
            domain_loss = set(filters.requested_proteins) - set(df.pro_i)

        if domain_loss:
            print(f" {filters.objective} {msg} domain_loss: {sorted(domain_loss)}")

    def _apply_filters(self, filters, prep=None):
        """
        Return a copy of the survey DataFrame reduced according to filters.

        filters: a Munch of filter settings (see survey_filter_schema), or
            None to get an unfiltered copy. It is validated and completed
            with defaults in place, and requested_proteins / requested_ptms
            are recorded on it for later domain-loss accounting.
        prep: a PrepResult; required for objective "ptms", pro_subset and
            poi_only filtering.

        Raises ValueError when prep is needed but not supplied.

        # TODO: can we just require prep to avoid all the exception logic below?
        """
        df = self.survey()

        # No filters means no filtering. This check must precede any access
        # to filters' attributes.
        if filters is None:
            return df.copy()

        # Validate first so that defaults are in place before any field of
        # filters is read.
        self.validate_filters(filters)
        verbose = filters.get("verbose", False)

        if verbose:
            print(f"\n{(self._folder / '..' / '..').name}")

        # If the caller is optimizing for PTMs, we need to add PTM information for
        # the peptides. This is done here so that protein PTMs can be changed
        # after a run is complete. The PrepResult is required. Note the inner
        # join which causes the resulting df to only contain entries which have
        # PTM locations specified.
        if filters.objective == "ptms":
            if not prep:
                raise ValueError("Must supply PrepResult to optimize for PTMs")

            peps__ptms = prep.peps__ptms(ptm_peps_only=True, ptms_to_rows=True)[
                ["pep_i", "n_pep_ptms", "ptm"]
            ]
            if len(peps__ptms) > 0:
                df = (
                    df.set_index("pep_i")
                    .join(peps__ptms.set_index("pep_i"), how="inner")
                    .reset_index()
                )

                # Write down which ptms were explicitly or implicitly requested by the caller,
                # so we can know later which ones were removed by filtering. Note that looking
                # at the unique values in df.ptm is not quite right if more than one protein has a
                # PTM at the same location. In theory this is OK, but it means our PTM accounting
                # for "domain_loss" for PTMs is not quite right, so assert here and deal with that
                # if necessary.
                assert len(df.ptm) == len(
                    df.ptm.unique()
                ), "More than one protein has the same PTM location?"
                filters.requested_ptms = filters.ptm_subset or sorted(
                    list(df.ptm.unique().astype(int))
                )
                if verbose:
                    print(f" ptms domain: {filters.requested_ptms}")

        # Do protein subset or POI which substantially reduces df
        if len(filters.pro_subset) > 0:
            if prep is None:
                raise ValueError("Must supply PrepResult to filter by pro_subset")
            pros = prep.pros()
            pro_iz = pros[pros.pro_id.isin(filters.pro_subset)].pro_i.values
            df = df[df.pro_i.isin(pro_iz)]

        if filters.poi_only:
            if prep is None:
                raise ValueError(
                    "Must supply PrepResult to filter by proteins-of-interest"
                )
            poi_iz = prep.pros__pois().pro_i.values
            if len(poi_iz) > 0:
                # If there are no entries, then all are considered "of interest",
                # so only filter here if there are some specifically marked.
                df = df[df.pro_i.isin(poi_iz)]

        # Write down requested proteins so we can tell the user which ones got
        # removed by filtering.
        filters.requested_proteins = sorted(list(df.pro_i.unique()))
        if verbose:
            print(f" proteins domain: {filters.requested_proteins}")

        self._domain_loss(df, filters, "post-protein-filtering")

        # remove rows per filtering
        if filters.max_pep_len is not None:
            df = df[df.pep_len <= filters.max_pep_len]
            self._domain_loss(df, filters, "max_pep_len")

        if filters.max_dyes_per_ch is not None:
            df = df[df.n_dyes_max_any_ch <= filters.max_dyes_per_ch]
            self._domain_loss(df, filters, "max_dyes_per_ch")

        if filters.max_ptms_per_pep is not None:
            df = df[df.n_pep_ptms <= filters.max_ptms_per_pep]
            self._domain_loss(df, filters, "max_ptms_per_pep")

        if len(filters.ptm_subset) > 0:
            # WARNING: this affects PTMs for ALL proteins that have them.
            # This is typically OK, since you're looking for PTMs on a single
            # protein of interest, but if you had a PTM at location 100 on two
            # different proteins, this filter would apply to both of them.
            df = df[df.ptm.astype(int).isin(filters.ptm_subset)]
            self._domain_loss(df, filters, "ptm_subset")

        if not filters.allow_proline_at_2:
            df = df[df.P2 == False]  # noqa: E712 (elementwise comparison)
            self._domain_loss(df, filters, "allow_proline_at_2")

        return df.copy()

    def n_uniques(self, filters=None, df=None):
        """
        Returns number of peptides with unique flus. This is probably
        actually not very interesting if you are comparing different
        proteases, but it may be if you are comparing different labeling
        schemes for a single protease, or for pre-specified peptide sets
        like MHC.
        """
        df = self._apply_filters(filters) if df is None else df
        return len(df[df.flu_count == 1])

    def protein_coverage(self, prep_result, filters=None, df=None):
        """
        Returns the fraction of amino acids, summed over the proteins in the
        domain, that are covered by peptides with unique flus.

        If any proteins are marked "of interest" in prep_result, the coverage
        is computed only over those proteins, else all proteins are used.
        Returns 0.0 when the domain has no amino acids at all.
        """
        df = self._apply_filters(filters, prep=prep_result) if df is None else df
        df = df[df.flu_count == 1]  # only use peptides that have unique flus

        if prep_result.n_pois > 0:
            poi_iz = prep_result.pros__pois().pro_i.values
        else:
            poi_iz = prep_result.pros().pro_i.values

        # Total coverage over the whole domain (an earlier implementation
        # returned the mean of the per-protein coverages instead).
        proseq_groups = prep_result.proseqs().groupby("pro_i")
        pep_coverage_groups = df.groupby("pro_i")
        total_aa_covered = 0
        total_proteins_len = 0
        for poi_i in poi_iz:
            try:
                total_aa_covered += pep_coverage_groups.get_group(poi_i).pep_len.sum()
            except KeyError:
                pass  # protein not covered at all, no length added to total_aa_covered
            total_proteins_len += proseq_groups.get_group(poi_i).aa.count()

        if total_proteins_len == 0:
            # Empty domain: avoid dividing by zero.
            return 0.0
        return total_aa_covered / total_proteins_len

    def max_nn_dist(self, unique_flus_only=True, filters=None, df=None):
        """
        Returns the maximum nearest-neighbor distance over all perfect
        dyetracks from the set of peptides.

        We will probably want more nuanced information here, or via some
        other fn -- something that gets at more than just the max, perhaps
        including information for the top N, and some measure of
        'separated-ness' across that set. A single non-normalized max value
        feels kind of fragile.
        """
        df = self._apply_filters(filters) if df is None else df
        if unique_flus_only:
            df = df[df.flu_count == 1]
        return df.nn_dist.max()

    def max_nn_dist_peps(self, prep=None, unique_flus_only=True, filters=None, df=None):
        """
        Like max_nn_dist(), but instead of returning just the max dist,
        returns information about the peptide(s) as well, in a DataFrame.

        filters.n_peps_per_scheme controls how many rows are returned for
        non-ptm filtering. For ptm-filtering, the number of peps is determined
        by how many peptides in this scheme contain ptms -- each will be returned.

        prep : a PrepResult. If provided, we'll include the protein_coverage
               in the results.
        filters : required. When df is passed in, filters must be the object
               that was given to _apply_filters to produce it, because the
               requested_proteins / requested_ptms recorded there are read here.

        Raises ValueError if filters is None.
        """
        if filters is None:
            raise ValueError("max_nn_dist_peps requires filters")
        verbose = filters.get("verbose", False)

        df = self._apply_filters(filters, prep=prep) if df is None else df
        if unique_flus_only:
            df = df[df.flu_count == 1]
            self._domain_loss(df, filters, "unique_flus_only")

        if prep is not None:
            cols = list(df.columns)
            cols.insert(1, "pro_id")
            df = (
                df.set_index("pro_i")
                .join(prep.pros().set_index("pro_i"), how="left")
                .reset_index()[cols]
            )
            df["nn_coverage"] = self.protein_coverage(prep, df=df)
            df["nn_unique"] = self.n_uniques(df=df)

        df = df.sort_values(
            by=["nn_dist", "pep_len", "n_dyes_max_any_ch"],
            ascending=[False, True, True],
        )

        if filters.objective != "ptms":
            # If we only need to know a single max dist across all peps/proteins, we're done.
            # This is the case if multi_peptide_metric is None - we're not trying to take into
            # account the performance on multiple protein distances. Return the n best peps.
            if filters.multi_peptide_metric is None:
                return df[: filters.n_peps_per_scheme].reset_index(drop=True)

            # Otherwise we need a composite metric that considers the nn_dist
            # of peptides from multiple proteins. To start, we rank
            # the peptides from each protein based on the sort order already
            # established above, and take the top n based on filters.n_peps_per_scheme,
            # leaving us with the top n peptides, ranked, from each protein
            df["nn_rank"] = (
                df.groupby("pro_i")
                .nn_dist.rank(method="first", ascending=False)
                .astype("int")
            )
            # .copy() so the column assignments below act on an independent
            # frame rather than on a slice of the ranked one.
            df = df[df.nn_rank.isin(range(filters.n_peps_per_scheme + 1))].copy()

            # Then we compute a couple of composite metrics which are fns
            # of the nn_dist from the peptides of each rank. The caller
            # can sort on these across multiple runs.
            df["nn_dist_avg"] = df.groupby("nn_rank").nn_dist.transform("mean")
            df["nn_dist_min"] = df.groupby("nn_rank").nn_dist.transform("min")
        else:
            # For PTMs, we have already filtered out the peptides that
            # don't contain PTMs, so we just need these composite metrics
            # computed for the single set of all of the peptides in the df.
            df["nn_rank"] = 1
            df["nn_dist_avg"] = df.nn_dist.mean()
            df["nn_dist_min"] = df.nn_dist.min()

        # It can be that the caller is interested in N proteins or PTMs, but some
        # of those have been lost due to filtering etc. This will cause the
        # mean and stats above to be w.r.t. too few set members, so adjust
        # these in a way that makes sense. In the initial application of
        # filtering, the number of proteins/ptms the caller is interested in
        # has been saved. (If a protein or PTM was "lost" due to filtering,
        # it effectively merged with the background, is not observable,
        # and its nn_dist is therefore 0 -- indistinguishable from some
        # neighbor)
        filter_pass = 1.0
        domain_loss = ""  # either lost proteins, or lost ptms
        if filters.objective == "ptms":
            n_requested = len(filters.requested_ptms)
            filter_pass = len(df) / n_requested if n_requested else 0
            domain_loss = set(filters.requested_ptms) - set(df.ptm.astype(int))
            if verbose and domain_loss:
                print(f" ** final domain_loss: {sorted(domain_loss)}")
        elif len(filters.requested_proteins) == 0:
            filter_pass = 0
        else:
            filter_pass = len(df.pro_i.unique()) / len(filters.requested_proteins)
            domain_loss = set(filters.requested_proteins) - set(df.pro_i)

        df["domain_loss"] = str(sorted(domain_loss)) if domain_loss else ""

        assert filter_pass <= 1.0
        if filter_pass != 1.0:
            # Lost members contribute a distance of 0, so the mean over the
            # full requested set is the observed mean scaled by the fraction
            # that survived, and the min over the full set is 0.
            df["nn_dist_avg"] *= filter_pass
            df["nn_dist_min"] = 0

        if verbose:
            print(f" filter_pass is {filter_pass}")

        return df.sort_values(
            by=["nn_rank", "pro_i"], ascending=[True, True]
        ).reset_index(drop=True)

    def nn_stats(self, prep_result, filters=None):
        """
        Returns a tuple that gives the main stats for this survey run that
        can be used to pick from a list of such survey runs:

        nn_uniques - the number of unique peptides
        nn_coverage - percent coverage of protein(s) by unique peptides
        nn_dist - distance to neighbor for most isolated dyetrack
        """
        df = self._apply_filters(filters, prep=prep_result)
        return (
            self.n_uniques(df=df),
            self.protein_coverage(prep_result, df=df),
            self.max_nn_dist(df=df),
        )
class ErrorModel(Params):
    """
    Per-dye and per-label error probabilities plus the global Edman-failure
    and detach probabilities, with alternate constructors for common setups.
    """

    schema = s(
        s.is_kws_r(
            p_dud=s.is_deprecated(),
            p_edman_failure=s.is_float(bounds=(0, 1)),
            p_detach=s.is_float(bounds=(0, 1)),
            dyes=s.is_list(
                elems=s.is_kws_r(
                    dye_name=s.is_str(),
                    p_bleach_per_cycle=s.is_float(bounds=(0, 1)),
                    p_non_fluorescent=s.is_float(bounds=(0, 1)),
                    # gain and vpd are the new parameters and beta, sigma are the legacy
                    gain=s.is_float(required=False, bounds=(0, None)),
                    vpd=s.is_float(required=False, bounds=(0, None)),
                    beta=s.is_float(required=False, bounds=(0, None)),
                    sigma=s.is_float(required=False, bounds=(0, None)),
                )
            ),
            labels=s.is_list(
                elems=s.is_kws_r(
                    label_name=s.is_str(),
                    p_failure_to_bind_amino_acid=s.is_float(bounds=(0, 1)),
                    p_failure_to_attach_to_dye=s.is_float(bounds=(0, 1)),
                )
            ),
        )
    )

    defaults = Munch(p_edman_failure=0.06, p_detach=0.05, dyes=[], labels=[])

    def __init__(self, **kwargs):
        """
        Build the model from keyword arguments.

        For backward compatibility, p_bleach_per_cycle / p_non_fluorescent
        (dyes) and p_failure_to_bind_amino_acid / p_failure_to_attach_to_dye
        (labels) may be passed at the top level; they are used as the
        fallback for every dye / label that does not set its own value.
        """
        # Pop the legacy top-level values exactly once, before looping, so the
        # same fallback reaches every dye/label (not just the first) and the
        # keys never leak through to super().__init__().
        dye_fallbacks = dict(
            p_bleach_per_cycle=kwargs.pop("p_bleach_per_cycle", 0.05),
            p_non_fluorescent=kwargs.pop("p_non_fluorescent", 0.07),
        )
        label_fallbacks = dict(
            p_failure_to_bind_amino_acid=kwargs.pop(
                "p_failure_to_bind_amino_acid", 0.0
            ),
            p_failure_to_attach_to_dye=kwargs.pop("p_failure_to_attach_to_dye", 0.0),
        )

        dyes = kwargs["dyes"] = kwargs.pop("dyes", [])
        for dye in dyes:
            for key, fallback in dye_fallbacks.items():
                dye[key] = dye.get(key, fallback)

        labels = kwargs["labels"] = kwargs.pop("labels", [])
        for label in labels:
            for key, fallback in label_fallbacks.items():
                label[key] = label.get(key, fallback)

        super().__init__(**kwargs)

    @classmethod
    def no_errors(cls, n_channels, **kwargs):
        """
        Return a model with n_channels dyes/labels and all error
        probabilities zero. beta, sigma, gain and vpd may be overridden via
        kwargs; remaining kwargs are forwarded to the constructor.
        """
        beta = kwargs.pop("beta", 7500.0)
        sigma = kwargs.pop("sigma", 0.0)
        gain = kwargs.pop("gain", 10.0)
        vpd = kwargs.pop("vpd", 0.1)
        return cls(
            p_edman_failure=0.0,
            p_detach=0.0,
            dyes=[
                Munch(
                    dye_name=f"dye_{ch}",
                    p_bleach_per_cycle=0.0,
                    p_non_fluorescent=0.0,
                    sigma=sigma,
                    beta=beta,
                    gain=gain,
                    vpd=vpd,
                )
                for ch in range(n_channels)
            ],
            labels=[
                Munch(
                    label_name=f"label_{ch}",
                    p_failure_to_bind_amino_acid=0.0,
                    p_failure_to_attach_to_dye=0.0,
                )
                for ch in range(n_channels)
            ],
            **kwargs,
        )

    @classmethod
    def from_err_set(cls, err_set, **kwargs):
        """err_set is a construct used by the error iterators in pgen"""
        # The channel count is taken from the length of p_non_fluorescent;
        # the other per-channel sequences are zipped against it.
        n_channels = len(err_set.p_non_fluorescent)
        return cls(
            p_edman_failure=err_set.p_edman_failure[0],
            p_detach=err_set.p_detach[0],
            dyes=[
                Munch(
                    dye_name=f"dye_{ch}",
                    p_bleach_per_cycle=p_bleach_per_cycle,
                    p_non_fluorescent=p_non_fluorescent,
                    sigma=dye_sigma,
                    beta=dye_beta,
                    gain=dye_gain,
                    vpd=dye_vpd,
                )
                for ch, dye_beta, dye_sigma, dye_gain, dye_vpd, p_bleach_per_cycle, p_non_fluorescent in zip(
                    range(n_channels),
                    err_set.dye_beta,
                    err_set.dye_sigma,
                    err_set.dye_gain,
                    err_set.dye_vpd,
                    err_set.p_bleach_per_cycle,
                    err_set.p_non_fluorescent,
                )
            ],
            labels=[
                Munch(
                    label_name=f"label_{ch}",
                    p_failure_to_bind_amino_acid=0.0,
                    p_failure_to_attach_to_dye=0.0,
                )
                for ch in range(n_channels)
            ],
            **kwargs,
        )

    @classmethod
    def from_defaults(cls, n_channels):
        """Return a model with n_channels dyes/labels using the stock values."""
        return cls(
            p_edman_failure=cls.defaults.p_edman_failure,
            p_detach=cls.defaults.p_detach,
            dyes=[
                Munch(
                    dye_name=f"dye_{ch}",
                    p_bleach_per_cycle=0.05,
                    p_non_fluorescent=0.07,
                    sigma=0.16,
                    beta=7500.0,
                    gain=7500.0,
                    vpd=0.10,
                )
                for ch in range(n_channels)
            ],
            labels=[
                Munch(
                    label_name=f"label_{ch}",
                    p_failure_to_bind_amino_acid=0.0,
                    p_failure_to_attach_to_dye=0.0,
                )
                for ch in range(n_channels)
            ],
        )

    def scale_dyes(self, key, scalar):
        """Multiply dye[key] by scalar, in place, for every dye."""
        for dye in self.dyes:
            dye[key] *= scalar

    def set_dye_param(self, key, val):
        """Set dye[key] = val for every dye."""
        for dye in self.dyes:
            dye[key] = val
class SimV1Params(ParamsAndPriors):
    """
    Simulation parameters (dyes, labels, cycle counts, sample counts)
    together with the priors they are resolved against.
    """

    defaults = Munch(
        n_pres=1,
        n_mocks=0,
        n_edmans=1,
        n_samples_train=5_000,
        n_samples_test=1_000,
        dyes=[],
        labels=[],
        random_seed=None,
        train_n_sample_multiplier=None,  # This does not appear to be used anywhere. tfb
        allow_train_test_to_be_identical=False,
        enable_ptm_labels=False,
        is_survey=False,
    )

    schema = s(
        s.is_kws_r(
            is_survey=s.is_bool(),
            priors_desc=Priors.priors_desc_schema,
            n_pres=s.is_int(bounds=(0, None)),
            n_mocks=s.is_int(bounds=(0, None)),
            n_edmans=s.is_int(bounds=(0, None)),
            n_samples_train=s.is_int(bounds=(1, None)),
            n_samples_test=s.is_int(bounds=(1, None)),
            dyes=s.is_list(
                elems=s.is_kws_r(dye_name=s.is_str(), channel_name=s.is_str())
            ),
            labels=s.is_list(
                elems=s.is_kws_r(
                    aa=s.is_str(),
                    dye_name=s.is_str(),
                    label_name=s.is_str(),
                    ptm_only=s.is_bool(required=False, noneable=True),
                )
            ),
            channels=s.is_dict(required=False),
            random_seed=s.is_int(required=False, noneable=True),
            allow_train_test_to_be_identical=s.is_bool(required=False, noneable=True),
            enable_ptm_labels=s.is_bool(required=False, noneable=True),
        )
    )

    def __init__(self, **kwargs):
        """Construct the params, then build the derived lookup DataFrames."""
        super().__init__(source="SimV1Params", **kwargs)
        self._setup_dfs()

    def validate(self):
        """
        Schema validation plus cross-checks: unique dye names, one label per
        aa, every label referring to a known dye, and a complete channel
        mapping (which is generated when "channels" is absent).
        """
        super().validate()

        known_dye_names = {dye.dye_name for dye in self.dyes}

        # Dye names must be unique
        self._validate(
            len(known_dye_names) == len(self.dyes),
            "The dye list contains a duplicate",
        )

        # Each aa may be labelled only once
        n_distinct_aas = len(set(utils.listi(self.labels, "aa")))
        self._validate(
            n_distinct_aas == len(self.labels),
            "There is a duplicate label",
        )

        # Every label must point at one of the declared dyes
        for label in self.labels:
            self._validate(
                label.dye_name in known_dye_names,
                f"Label {label.label_name} does not have a valid matching dye_name",
            )

        # Channel mappings: track which dye channels the mapping mentions
        seen_by_channel = {dye.channel_name: False for dye in self.dyes}

        if "channels" not in self:
            # No channel mapping: assign indices in sorted-name order
            self["channels"] = {
                name: index for index, name in enumerate(sorted(seen_by_channel))
            }
            return

        # Validate that channel mapping is complete
        for channel_name in self.channels.keys():
            self._validate(
                channel_name in seen_by_channel,
                f"Channel name '{channel_name}' was not found in dyes",
            )
            seen_by_channel[channel_name] = True

        self._validate(
            all(seen_by_channel.values()),
            "Not all channels in dyes were enumerated in channels",
        )

    @property
    def n_cycles(self):
        """Total cycle count: pres + mocks + edmans."""
        return self.n_pres + self.n_mocks + self.n_edmans

    def channel_names(self):
        """Sorted, de-duplicated channel names taken from the dyes."""
        return sorted(set(utils.listi(self.dyes, "channel_name")))

    def channel_i_by_name(self):
        """Map each channel name to its position in channel_names()."""
        return {name: index for index, name in enumerate(self.channel_names())}

    @property
    def n_channels(self):
        """Number of distinct channel names among the dyes."""
        return len(self.channel_i_by_name())

    @property
    def n_channels_and_cycles(self):
        """(n_channels, n_cycles) as a tuple."""
        return self.n_channels, self.n_cycles

    def _setup_dfs(self):
        """
        Join the name-keyed dyes, labels and priors into DataFrames so they
        can be looked up by integer channel index. Populates
        self._channel__priors, self._dye__label__priors and self._ch_by_aa.
        """
        dyes_df = pd.DataFrame(self.dyes)
        assert len(dyes_df) > 0

        labels_df = pd.DataFrame(self.labels)
        assert len(labels_df) > 0

        # Dye priors: prefer a prior registered under the dye's own name and
        # fall back to the one registered for its channel.
        dye_prior_rows = []
        for dye in self.dyes:
            prior = self.priors.get_exact(f"p_non_fluorescent.{dye.dye_name}")
            if prior is None:
                prior = self.priors.get(f"p_non_fluorescent.ch_{dye.channel_name}")
            dye_prior_rows.append(
                Munch(dye_name=dye.dye_name, p_non_fluorescent=prior.prior)
            )

        # (dye_name, p_non_fluorescent)
        dye_priors_df = pd.DataFrame(dye_prior_rows)

        # (dye_name, channel_name, p_non_fluorescent)
        dyes_df = utils.easy_join(dyes_df, dye_priors_df, "dye_name")

        # TODO: LOOKUP label priors
        #  (p_failure_to_bind_aa, p_failure_to_attach_to_dye)

        # Channel priors, one row per entry of the channel mapping
        channel_prior_rows = []
        for channel_name, ch_i in self.channels.items():
            channel_prior_rows.append(
                dict(
                    channel_name=channel_name,
                    ch_i=ch_i,
                    bg_mu=self.priors.get(f"bg_mu.ch_{ch_i}").prior,
                    bg_sigma=self.priors.get(f"bg_sigma.ch_{ch_i}").prior,
                    gain_mu=self.priors.get(f"gain_mu.ch_{ch_i}").prior,
                    gain_sigma=self.priors.get(f"gain_sigma.ch_{ch_i}").prior,
                    row_k_sigma=self.priors.get(f"row_k_sigma.ch_{ch_i}").prior,
                    p_bleach=self.priors.get(f"p_bleach.ch_{ch_i}").prior,
                )
            )
        # (channel_name, ch_i, ...)
        ch_priors = pd.DataFrame(channel_prior_rows)

        dyes_with_channels = utils.easy_join(dyes_df, ch_priors, "channel_name")
        dyes_with_channels = dyes_with_channels.drop(columns=["p_non_fluorescent"])
        # reset_index() without drop=True also adds an 'index' column
        self._channel__priors = dyes_with_channels.drop_duplicates().reset_index()

        # SANITY check: within a channel these priors must agree
        per_channel = self._channel__priors.groupby("ch_i")
        for field in ("bg_mu", "bg_sigma", "gain_mu", "gain_sigma", "row_k_sigma"):
            assert np.all(per_channel[field].nunique() == 1)

        assert "p_non_fluorescent" not in self._channel__priors.columns

        labels_dyes_df = utils.easy_join(labels_df, dyes_df, "dye_name")
        self._dye__label__priors = utils.easy_join(
            labels_dyes_df, ch_priors, "channel_name"
        ).reset_index(drop=True)

        # aa -> channel index
        ch_by_aa = {}
        for row in self._dye__label__priors.itertuples():
            ch_by_aa[row.aa] = row.ch_i
        self._ch_by_aa = ch_by_aa

    def dye__label__priors(self):
        """
        DataFrame(
            'channel_name', 'dye_name', 'aa', 'label_name',
            'ptm_only', 'p_non_fluorescent', 'ch_i', 'bg_mu', 'bg_sigma',
            'gain_mu', 'gain_sigma', 'row_k_sigma', 'p_bleach'
        )
        """
        return self._dye__label__priors

    def channel__priors(self):
        """
        DataFrame(
            'ch_i', 'channel_name', 'bg_mu', 'bg_sigma', 'dye_name',
            'gain_mu', 'gain_sigma', 'index', 'p_bleach', 'row_k_sigma',
        )
        """
        return self._channel__priors

    def by_channel(self):
        """The channel priors DataFrame indexed by ch_i."""
        return self._channel__priors.set_index("ch_i")

    def to_label_list(self):
        """Summarize labels like: ["DE", "C"]"""
        summary = []
        for dye in self.dyes:
            aas_for_dye = [
                label.aa for label in self.labels if label.dye_name == dye.dye_name
            ]
            summary.append("".join(aas_for_dye))
        return summary

    def to_label_str(self):
        """Summarize labels like: DE,C"""
        return ",".join(self.to_label_list())

    @classmethod
    def construct_from_aa_list(cls, aa_list, **kwargs):
        """
        Build params from a list of amino-acid strings, one string per
        channel. For example ["DE", "Y"] gives two channels, ch0 labelling
        D and E and ch1 labelling Y. Extra kwargs go to the constructor.
        """
        check.list_or_tuple_t(aa_list, str)

        bracket_chars = ("[", "]")
        assert all(
            ch.isalpha() or ch in bracket_chars for aas in aa_list for ch in aas
        )

        dyes = []
        labels = []
        for ch_i, aas in enumerate(aa_list):
            dyes.append(Munch(dye_name=f"dye_{ch_i}", channel_name=f"ch_{ch_i}"))
            # "DE" needs to be split into "D" & "E", which aa_str_to_list()
            # does - it also handles PTMs like S[p]
            for aa in aa_str_to_list(aas):
                labels.append(
                    Munch(
                        aa=aa,
                        dye_name=f"dye_{ch_i}",
                        label_name=f"label_{ch_i}",
                        ptm_only=False,
                    )
                )

        return cls(dyes=dyes, labels=labels, **kwargs)
class SimParams(Params):
    """
    Simulation parameters: an ErrorModel plus the dyes, labels, cycle counts
    and sample counts for a sim.
    """

    defaults = Munch(
        n_pres=1,
        n_mocks=0,
        n_edmans=1,
        n_samples_train=5_000,
        n_samples_test=1_000,
        dyes=[],
        labels=[],
        random_seed=None,
        train_n_sample_multiplier=None,  # This does not appear to be used anywhere. tfb
        allow_train_test_to_be_identical=False,
        enable_ptm_labels=False,
        is_survey=False,
    )

    schema = s(
        s.is_kws_r(
            is_survey=s.is_bool(),
            error_model=s.is_kws(**ErrorModel.schema.schema()),
            n_pres=s.is_int(bounds=(0, None)),
            n_mocks=s.is_int(bounds=(0, None)),
            n_edmans=s.is_int(bounds=(0, None)),
            n_samples_train=s.is_int(bounds=(1, None)),
            n_samples_test=s.is_int(bounds=(1, None)),
            dyes=s.is_list(
                elems=s.is_kws_r(dye_name=s.is_str(), channel_name=s.is_str())
            ),
            labels=s.is_list(
                elems=s.is_kws_r(
                    amino_acid=s.is_str(),
                    dye_name=s.is_str(),
                    label_name=s.is_str(),
                    ptm_only=s.is_bool(required=False, noneable=True),
                )
            ),
            random_seed=s.is_int(required=False, noneable=True),
            allow_train_test_to_be_identical=s.is_bool(required=False, noneable=True),
            enable_ptm_labels=s.is_bool(required=False, noneable=True),
        )
    )

    def copy(self):
        """
        Deep-copy these params. The fields derived by _build_join_dfs are
        deleted from self first, so neither self nor the copy carries them
        afterwards.
        """
        # REMOVE everything that _build_join_dfs put in
        for derived_field in (
            "df",
            "by_channel",
            "ch_by_aa",
            "channel_i_to_gain",
            "channel_i_to_vpd",
        ):
            utils.safe_del(self, derived_field)

        duplicate = utils.munch_deep_copy(self, klass_set={SimParams})
        duplicate.error_model = ErrorModel(**duplicate.error_model)
        assert isinstance(duplicate, SimParams)
        return duplicate

    def __init__(self, include_dfs=True, **kwargs):
        """
        Construct the params; a default ErrorModel is used when none is
        given. With include_dfs the derived lookup tables are built too.
        """
        fallback_model = ErrorModel()
        kwargs["error_model"] = kwargs.pop("error_model", fallback_model)
        super().__init__(**kwargs)
        if include_dfs:
            self._build_join_dfs()

    def validate(self):
        """
        Schema validation plus cross-checks: unique dye names, one label per
        amino acid, and every label referring to a known dye.
        """
        super().validate()

        known_dye_names = {dye.dye_name for dye in self.dyes}

        # Dye names must be unique
        self._validate(
            len(known_dye_names) == len(self.dyes),
            "The dye list contains a duplicate",
        )

        # Each amino acid may be labelled only once
        n_distinct_aas = len(set(utils.listi(self.labels, "amino_acid")))
        self._validate(
            n_distinct_aas == len(self.labels),
            "There is a duplicate label",
        )

        # Every label must point at one of the declared dyes
        for label in self.labels:
            self._validate(
                label.dye_name in known_dye_names,
                f"Label {label.label_name} does not have a valid matching dye_name",
            )

    @property
    def n_cycles(self):
        """Total cycle count: pres + mocks + edmans."""
        return self.n_pres + self.n_mocks + self.n_edmans

    def channels(self):
        """Sorted, de-duplicated channel names taken from the dyes."""
        return sorted(set(utils.listi(self.dyes, "channel_name")))

    def channel_i_by_name(self):
        """Map each channel name to its position in channels()."""
        return {name: index for index, name in enumerate(self.channels())}

    @property
    def n_channels(self):
        """Number of distinct channel names among the dyes."""
        return len(self.channel_i_by_name())

    @property
    def n_channels_and_cycles(self):
        """(n_channels, n_cycles) as a tuple."""
        return self.n_channels, self.n_cycles

    def _build_join_dfs(self):
        """
        Merge the name-keyed dyes and labels of the sim with those of the
        error model into one DataFrame (self.df), and derive from it the
        integer-indexed lookups self.by_channel, self.ch_by_aa,
        self.channel_i_to_gain and self.channel_i_to_vpd.
        """
        sim_dyes_df = pd.DataFrame(self.dyes)
        assert len(sim_dyes_df) > 0

        sim_labels_df = pd.DataFrame(self.labels)
        assert len(sim_labels_df) > 0

        error_model_dyes_df = pd.DataFrame(self.error_model.dyes)
        assert len(error_model_dyes_df) > 0

        error_model_labels_df = pd.DataFrame(self.error_model.labels)
        assert len(error_model_labels_df) > 0

        if len(sim_dyes_df) == 0:
            # Only reachable when asserts are disabled
            self.df = pd.DataFrame()
        else:
            # Number the distinct channel names in order of first appearance
            unique_channels = sim_dyes_df[["channel_name"]].drop_duplicates()
            channel_df = (
                unique_channels.reset_index(drop=True)
                .rename_axis("ch_i")
                .reset_index()
            )

            label_df = sim_labels_df.merge(error_model_labels_df, on="label_name")
            dye_df = sim_dyes_df.merge(error_model_dyes_df, on="dye_name")
            dye_df = dye_df.merge(channel_df, on="channel_name")

            merged = label_df.merge(dye_df, on="dye_name")
            self.df = merged.drop_duplicates().reset_index(drop=True)

        # Within a channel these error-model values must agree
        assert np.all(self.df.groupby("ch_i").p_bleach_per_cycle.nunique() == 1)
        assert np.all(self.df.groupby("ch_i").beta.nunique() == 1)
        assert np.all(self.df.groupby("ch_i").sigma.nunique() == 1)

        # One record per channel, taken from the channel's first row
        per_channel = []
        for ch in range(self.n_channels):
            first_row = self.df[self.df.ch_i == ch].iloc[0]
            per_channel.append(
                Munch(
                    p_bleach_per_cycle=first_row.p_bleach_per_cycle,
                    beta=first_row.beta,
                    sigma=first_row.sigma,
                    gain=first_row.gain,
                    vpd=first_row.vpd,
                )
            )
        self.by_channel = per_channel

        ch_by_aa = {}
        for row in self.df.itertuples():
            ch_by_aa[row.amino_acid] = row.ch_i
        self.ch_by_aa = ch_by_aa

        # These two needs to be lists (not ndarray) because they have to be duplicated
        self.channel_i_to_gain = [channel.gain for channel in self.by_channel]
        self.channel_i_to_vpd = [channel.vpd for channel in self.by_channel]

    def to_label_list(self):
        """Summarize labels like: ["DE", "C"]"""
        summary = []
        for dye in self.dyes:
            aas_for_dye = [
                label.amino_acid
                for label in self.labels
                if label.dye_name == dye.dye_name
            ]
            summary.append("".join(aas_for_dye))
        return summary

    def to_label_str(self):
        """Summarize labels like: DE,C"""
        return ",".join(self.to_label_list())

    @classmethod
    def construct_from_aa_list(cls, aa_list, **kwargs):
        """
        Build params from a list of amino-acid strings, one string per
        channel. For example ["DE", "Y"] gives two channels, ch0 labelling
        D and E and ch1 labelling Y.

        If you pass in an error model, it needs to match channels and labels.
        """
        check.list_or_tuple_t(aa_list, str)

        bracket_chars = ("[", "]")
        assert all(
            ch.isalpha() or ch in bracket_chars for aas in aa_list for ch in aas
        )

        dyes = []
        labels = []
        for ch_i, aas in enumerate(aa_list):
            dyes.append(Munch(dye_name=f"dye_{ch_i}", channel_name=f"ch_{ch_i}"))
            # "DE" needs to be split into "D" & "E", which aa_str_to_list()
            # does - it also handles PTMs like S[p]
            for aa in aa_str_to_list(aas):
                labels.append(
                    Munch(
                        amino_acid=aa,
                        dye_name=f"dye_{ch_i}",
                        label_name=f"label_{ch_i}",
                        ptm_only=False,
                    )
                )

        return cls(dyes=dyes, labels=labels, **kwargs)
def it_validates_default_list():
    """An unconstrained is_list schema accepts any list and rejects a non-list."""
    list_schema = s(s.is_list())

    # Both an empty list and a list of mixed element types are accepted
    for acceptable in ([], [1, 2, 3, "str", dict(), []]):
        list_schema.validate(acceptable)

    # A bare int is not a list
    with zest.raises(SchemaValidationFailed):
        list_schema.validate(1)
class PTMGenerator(BaseGenerator):
    """
    Use one set of labels to identify peptides and another label
    to measure quantities of PTM forms.

    Assumptions:
      * Only one label_set channel has PTMs in it.

    Generator-specific arguments:
    @--ptm-peptide="P10000"                       # Peptide to examine; Required and Repeatable
    """

    schema = s(
        s.is_kws_r(
            **BaseGenerator.job_setup_schema.schema(),
            **BaseGenerator.protein_schema.schema(),
            **BaseGenerator.label_set_schema.schema(),
            **BaseGenerator.scope_run_schema.schema(),
            **BaseGenerator.peptide_setup_schema.schema(),
            **BaseGenerator.error_model_schema.schema(),
            ptm_protein_of_interest=s.is_list(
                s.is_str(allow_empty_string=False),
                help="The name of the protein to look for PTMs",
            ),
            ptm_label=s.is_str(allow_empty_string=False, help="The PTM label"),
            n_peptides_limit=s.is_int(
                noneable=True, help="Useful for debugging to limit peptide counts"
            ),
        )
    )

    defaults = Munch(
        n_edmans=10,
        n_pres=1,
        n_mocks=0,
        decoys="none",
        random_seed=None,
        ptm_label="S[p]T[p]",
        dye_beta=[7500.0],
        dye_sigma=[0.16],
    )

    def apply_defaults(self):
        """Apply the defaults, then patch up list switches left empty."""
        super().apply_defaults()

        # Plumbum creates empty lists on list switches. This means
        # that the apply defaults doesn't quite work right.
        # TASK: Find a cleaner solution. For now hard-code
        if len(self.dye_beta) == 0:
            self.dye_beta = self.defaults.dye_beta
        if len(self.dye_sigma) == 0:
            self.dye_sigma = self.defaults.dye_sigma

    def generate(self):
        """
        Build one run per (protease, aa_list, err_set) permutation, each
        holding the prep, sim, train and PTM train/classify tasks, register
        the report sections, and return the list of runs.
        """
        # The PTM labels depend only on self.ptm_label, so parse them once
        # rather than once per permutation. Each match is an amino-acid
        # letter followed by a one-character bracketed tag, e.g. "S[p]".
        # (This parsing feels a bit hacky.)
        ptm_labels = re.compile(r"[A-Z]\[.\]", re.IGNORECASE).findall(self.ptm_label)
        ptm_aas = "".join([ptm_label[0] for ptm_label in ptm_labels])

        runs = []
        for protease, aa_list, err_set in self.run_parameter_permutator():
            # GENERATE e-block
            e_block = self.erisyon_block(aa_list, protease, err_set)

            # Make sure the PTM amino acids get a channel of their own
            if ptm_aas not in aa_list:
                aa_list = tuple(list(aa_list) + [ptm_aas])

            # GENERATE the usual non-ptm prep, sim, train
            prep_task = task_templates.prep(
                self.protein,
                protease,
                self.decoys,
                n_peptides_limit=self.n_peptides_limit,
                proteins_of_interest=self.proteins_of_interest,
            )

            sim_task = task_templates.sim(
                list(aa_list),
                n_pres=self.n_pres,
                n_mocks=self.n_mocks,
                n_edmans=self.n_edmans,
                dye_beta=self.dye_beta,
                dye_sigma=self.dye_sigma,
                ptm_labels=ptm_labels,
            )

            train_task = task_templates.train_rf()

            # GENERATE the ptm tasks
            ptm_train_rf_task = task_templates.ptm_train_rf(
                ptm_labels, self.ptm_protein_of_interest
            )

            ptm_classify_test_rf_task = task_templates.ptm_classify_test_rf()

            # CREATE the run
            run = Munch(
                run_name=self.run_name(aa_list, protease, err_set),
                **e_block,
                **prep_task,
                **sim_task,
                **train_task,
                **ptm_train_rf_task,
                **ptm_classify_test_rf_task,
            )
            runs.append(run)

        self.report_section_run_array(runs, to_load=["plaster", "sim", "prep", "ptm"])
        self.report_section_from_template("ptm_template.ipynb")

        n_runs = len(runs)
        self.report_preamble(
            utils.smart_wrap(
                f"""
                # PTM Report
                ## {n_runs} run(s) processed.
            """
            )
        )

        return runs
class ClassifyV1Generator(BaseGenerator):
    """
    General-purpose generator for classifying peptides/proteins.
    May be used to search for one or more "needle" peptides.

    Assumptions:

    Generator-specific arguments:
    @--protein_of_interest="P10636-8"           # Only affects reporting downstream
    """

    # These schema are in general subsets of the "params" for different plaster tasks,
    # and for convenience in sharing among generators they are defined in BaseGenerator.
    # Its a bit arbitrary where some parameters end up, because they might be shared
    # by two different tasks that both get run as part of a classify run.  For example,
    # this classify generator supports runs that classify either just simulations, or
    # additionally actual data from a scope.  Both sims and scope runs need n_edmans,
    # n_mocks, n_pres.  But the schema for each cannot both contain these else we'll
    # pass duplicate key names into the schema below.
    schema = s(
        s.is_kws_r(
            **BaseGenerator.job_setup_schema.schema(),
            **BaseGenerator.protein_schema.schema(),
            **BaseGenerator.label_set_schema.schema(),
            **BaseGenerator.lnfit_schema.schema(),
            **BaseGenerator.scope_run_schema.schema(),
            **BaseGenerator.peptide_setup_schema.schema(),
            **BaseGenerator.sigproc_source_schema.schema(),
            **BaseGenerator.sigproc_v1_schema.schema(),
            **BaseGenerator.error_model_schema.schema(),
            **BaseGenerator.sim_schema.schema(),
            **BaseGenerator.scheme_schema.schema(),
            rf=s.is_bool(help="Include rf classifier", noneable=True),
            report_prec=s.is_list(
                elems=s.is_float(bounds=(0.001, 0.999)),
                help="The precision for classifier reporting",
            ),
        )
    )

    defaults = Munch(
        n_edmans=10,
        n_pres=0,
        n_mocks=1,
        n_samples_train=5_000,
        n_samples_test=1_000,
        decoys="none",
        random_seed=None,
        rf=True,
        sigproc_source=None,
        protein_of_interest=None,
        lnfit_name=None,
        lnfit_params=None,
        lnfit_dye_on_threshold=None,
        movie=False,
        radial_filter=None,
        peak_find_n_cycles=4,
        peak_find_start=0,
        anomaly_iqr_cutoff=95,
        # dye_beta=[7500.0],
        # dye_sigma=[0.16],
        n_ptms_limit=5,
        report_prec=[0.95, 0.9, 0.8],
    )

    def apply_defaults(self):
        """Apply schema defaults, then back-fill `report_prec` if it arrived empty."""
        super().apply_defaults()

        # Plumbum creates empty lists on list switches. This means
        # that the apply defaults doesn't quite work right.
        # TASK: Find a cleaner solution. For now hard-code
        # if len(self.err_dye_beta) == 0:
        #     self.err_dye_beta = self.defaults.dye_beta
        # if len(self.dye_sigma) == 0:
        #     self.dye_sigma = self.defaults.dye_sigma
        if len(self.report_prec) == 0:
            self.report_prec = self.defaults.report_prec

    def validate(self):
        """Run the schema validation and require the rf classifier to be enabled."""
        super().validate()
        assert self.rf

    def generate(self):
        """
        Build one run description per (protease, label-set, error-set, sigproc
        source) combination and queue the report sections for the job.

        Returns:
            The list of run-description Munches.
        """
        self.report_section_user_config()

        sigproc_tasks = self.sigprocs_v1() or [{}]  # guarantee traverse loop once

        # TODO: 'default' reporting needs to be rethought.  Maybe we just employ
        # gen switch that says which report type.  The pattern that has developed
        # is that each project of any substance wants a special type of report.  These
        # projects are different enough that you always want to include custom stuff.
        # Presumably as we do more collabs/projects, they tend to group into a
        # handful of basic types.
        #
        # Bear in mind that we're in the classify generator, so all of these
        # refer to jobs that involve classification.  (jobs like photobleaching
        # or other sigprocv2-only tasks don't -- those have their own hacky
        # report logic similar to what you'll see below).
        #
        # Currently those types are: 'standard' sigprocv2 with classify,
        # spike-in sigprocv2 with classify.
        #
        # VFS-only types: 'standard classify', PTM classify,
        # MHC classify (perhaps this is really standard classify, but is big, and
        # does not use a protease, and has all small uniform-length peptides)
        #
        # See all the hacky logic after these loops that patch together
        # a report by trying to deduce which of the above we're looking
        # at.
        #
        # Maybe we just need different generators instead of including
        # complex reporting logic?
        #
        # Etc.
        #
        # PTM, MHC, and PRO are the three classes of highest-level specialized reports
        # that report on all of the runs in a job taken together.  Whereas the default
        # report that comes out of classify will emit a long report with one section per
        # run, this became totally unwieldy when a job has 50+ (or hundreds!) of runs.
        # In that case you really only want a high-level report with a way to explore
        # the runs, and that's exactly what the specialized PTM, MHC, and PRO templates
        # are created for.  Here we try to cleverly deduce what kind of report we should
        # do based on whether there are PTMs present, Proteins-of-interest present, or
        # in the hackiest case, whether the sample or job name contains a given string.
        #
        # A PTM report is done if PTMs have been specified for any of the proteins
        ptm_report = any(pro.get("ptm_locs") for pro in self.protein)

        # A MHC-style report (which is special in that we know ahead of time that
        # the peptides are identical for all runs -- because we started with a list
        # of peptides -- so we can do lots of interesting comparisons that you can't
        # do when the peptides differ from run-to-run) is created for jobs which have
        # the string 'mhc' in their job-name or sample-name.  This needs to change,
        # but our Broad MHC project is the only one of this class for a year now.
        # This report is useful for any job that contains runs whose peptides are
        # identical -- this means either peptides were provided in the first place
        # and no protease was given to the "prep" task, or that only one protease,
        # and potentially lots of label schemes, is used.
        mhc_report = not ptm_report and (
            "mhc" in self.job.lower() or "mhc" in self.sample.lower()
        )

        # A protein-identification report is done if there are proteins of interest
        pro_report = (
            not ptm_report
            and not mhc_report
            and (
                bool(self.protein_of_interest)
                or any(pro.get("is_poi") for pro in self.protein)
            )
        )

        # The per-run report sections are only emitted for the "default" report.
        per_run_sections = not (ptm_report or mhc_report or pro_report)
        multiple_sigprocs = len(sigproc_tasks) > 1

        run_descs = []
        for protease, aa_list, err_set in self.run_parameter_permutator():
            for sigproc_i, sigproc_v1_task in enumerate(sigproc_tasks):
                prep_task = task_templates.prep(
                    self.protein,
                    protease,
                    self.decoys,
                    pois=self.protein_of_interest,
                    n_ptms_limit=self.n_ptms_limit,
                )

                # No sim_v2 task is produced by this generator; the empty
                # mapping keeps the run description's layout stable.
                sim_v2_task = {}

                train_rf_task = task_templates.train_rf()
                test_rf_task = task_templates.rf_v2()

                # Classification of real data only happens when a sigproc
                # source was given (otherwise the placeholder {} is falsy).
                classify_rf_task = {}
                if sigproc_v1_task:
                    classify_rf_task = task_templates.classify_rf_v1(
                        prep_relative_path="../prep",
                        sim_relative_path="../sim_v1",
                        train_relative_path="../train_rf",
                        sigproc_relative_path="../sigproc_v1",
                    )

                sim_v1_task = task_templates.sim_v1(
                    list(aa_list),
                    err_set,
                    n_pres=self.n_pres,
                    n_mocks=self.n_mocks,
                    n_edmans=self.n_edmans,
                    n_samples_train=self.n_samples_train,
                    n_samples_test=self.n_samples_test,
                )
                sim_v1_task.sim_v1.parameters.random_seed = self.random_seed

                # NOTE(review): lnfits is called with a positional "v2";
                # confirm the inherited lnfits() signature accepts it.
                lnfit_task = self.lnfits("v2")

                e_block = self.erisyon_block(aa_list, protease, err_set)

                if self.force_run_name is not None:
                    run_name = self.force_run_name
                else:
                    sigproc_suffix = (
                        f"_sigproc_{sigproc_i}" if multiple_sigprocs else ""
                    )
                    run_name = f"{e_block._erisyon.run_name}{sigproc_suffix}"

                run_desc = Munch(
                    run_name=run_name,
                    **e_block,
                    **prep_task,
                    **sim_v1_task,
                    **sim_v2_task,
                    **train_rf_task,
                    **test_rf_task,
                    **sigproc_v1_task,
                    **lnfit_task,
                    **classify_rf_task,
                )
                run_descs.append(run_desc)

                # for classify jobs that involve PTMs or MHC, we'll do run reporting
                # differently rather than emitting a section for each run.
                if per_run_sections:
                    self.report_section_markdown(f"# RUN {run_desc.run_name}")
                    self.report_section_run_object(run_desc)
                    if test_rf_task:
                        self.report_section_from_template(
                            "train_and_test_template.ipynb"
                        )

        self.report_section_markdown(f"# JOB {self.job}")
        self.report_section_job_object()

        # Exactly one job-level template is chosen, in priority order.
        if ptm_report:
            job_template = "train_and_test_template_ptm.ipynb"
        elif mhc_report:
            job_template = "train_and_test_template_mhc.ipynb"
        elif pro_report:
            job_template = "train_and_test_template_pro.ipynb"
        else:
            job_template = "train_and_test_epilog_template.ipynb"
        self.report_section_from_template(job_template)

        n_runs = len(run_descs)
        if n_runs > 1 and sigproc_tasks[0]:
            # TASK: better logic for when to include spike_template.  --spike?
            self.report_section_from_template("spike_template.ipynb")

        sigproc_imports_desc = ""
        if sigproc_tasks[0]:
            # NOTE(review): the three report sections below are taken to be
            # part of this branch (sigproc-only reporting) -- confirm.
            src_dir_bullets = [
                f"\t* {task.ims_import.inputs.src_dir}" for task in sigproc_tasks
            ]
            sigproc_imports_desc = "## Sigproc imports:\n" + "\n".join(src_dir_bullets)

            self.report_section_first_run_object()
            self.report_section_from_template("sigproc_v1_template.ipynb")
            self.report_section_from_template("classify_template.ipynb")

        self.report_preamble(
            utils.smart_wrap(
                f"""
                # Classify Overview
                ## {n_runs} run_desc(s) processed.
                ## Sample: {self.sample}
                ## Job: {self.job}
                {sigproc_imports_desc}
            """,
                width=None,
            )
        )

        return run_descs
def it_checks_bound_type():
    """A non-numeric length bound on a list schema is rejected at schema-build time."""
    # min_len is checked first, then max_len -- same order as before.
    for bound_name in ("min_len", "max_len"):
        with zest.raises(SchemaInvalid):
            s(s.is_list(**{bound_name: "str"}))
def it_has_elems_as_first_arg():
    """The first positional argument of is_list is the element schema."""
    int_list_schema = s(s.is_list(s.is_int()))

    # A list of ints passes ...
    int_list_schema.validate([1])

    # ... and a list holding a string is refused.
    with zest.raises(SchemaValidationFailed):
        int_list_schema.validate(["str"])
class CalibNNV1Generator(SigprocV1Generator):
    """
    Import calibration runs

    Produces a single run named "calib_<mode>" that combines the sigproc task
    of the one allowed sigproc_source with a calib_nn_v1 task.
    """

    schema = s(
        s.is_kws_r(
            **SigprocV1Generator.schema.schema(),
            **BaseGenerator.scope_run_schema.schema(),
            mode=s.is_str(
                help=f"Current modes are: [{', '.join(modes)}]",
                userdata=dict(cli=True),
            ),
            channel=s.is_list(s.is_int(), help="Channel list to include"),
            dye_names=s.is_str(
                help="Dye names of each channel; will be saved with this scope.",
                userdata=dict(cli=True),
            ),
            scope_name=s.is_str(
                help="Scope name, will be saved with this scope.",
                userdata=dict(cli=True),
            ),
        )
    )

    def generate(self):
        """
        Build the single calibration run and queue its report sections.

        Returns:
            A one-element list holding the calibration run Munch.

        Raises:
            ValueError: if there is not exactly one sigproc_source (including
                when it is None or empty), or if `mode` is not a known mode.
        """
        # Validate before building any task. A None sigproc_source is treated
        # like an empty list so the caller gets the ValueError below rather
        # than a TypeError from len(None).
        n_sources = len(self.sigproc_source or [])
        if n_sources != 1:
            raise ValueError("Calibrations can have only one sigproc_source")

        if self.mode not in modes:
            raise ValueError(f"Unknown calib mode {self.mode}")

        # Exactly one source was given, so exactly one sigproc task is built.
        sigproc_task = self.sigprocs_v1()[0]

        calib_task = task_templates.calib_nn_v1(
            mode=self.mode,
            n_pres=self.n_pres,
            n_mocks=self.n_mocks,
            n_edmans=self.n_edmans,
            dye_names=self.dye_names,
            scope_name=self.scope_name,
            channels=self.channel,
        )

        run = Munch(
            run_name=f"calib_{self.mode}",
            **sigproc_task,
            **calib_task,
        )

        self.report_section_run_object(run)
        calib_template = "calib_nn_template.ipynb"
        self.report_section_from_template(calib_template)

        runs = [run]
        n_runs = len(runs)
        self.report_preamble(
            utils.smart_wrap(
                f"""
                # Calib Overview
                ## {n_runs} run(s) processed.
            """,
                width=None,
            )
        )

        return runs
def it_fetches_list_elem_type():
    """top_level_fields reports the element type of a list field at index 4."""
    dict_schema = s(s.is_dict(elems=dict(a=s.is_list(s.is_int()))))
    first_field = dict_schema.top_level_fields()[0]

    # Index 0 is the field name, index 4 the list's element type.
    assert first_field[0] == "a"
    assert first_field[4] is int
class SigprocV2Params(ParamsAndPriors):
    """
    About Calibration:
        The long term goal of the calibration files is to dissociate
        the name of the file from the records (subjects) in the file.
        For now, we're going to load all records from the calibration file
    """

    defaults = dict(
        divs=5,
        peak_mea=11,
        n_fields_limit=None,
        run_regional_balance=True,
        run_analysis_gauss2_fitter=False,
        run_aligner=True,
        run_per_cycle_peakfinder=False,
        # TODO: Derive the following during calibration by spectral analysis (ie, 2 std of the power spectrum)
        # ALSO: This needs to be moved into the calibration because it can not allowed to be
        # different from the calibration results because the calibration bakes in the PSF
        # as a function of these parameters.
        low_inflection=0.03,
        low_sharpness=50.0,
        high_inflection=0.50,
        high_sharpness=50.0,
        self_calib=False,
        no_calib=False,
        instrument_identity=None,
        save_full_signal_radmat_npy=True,
        calibration_file=None,
        channel_align_bounds=None,
        n_cycles_limit=None,
        ch_aln_override=None,
        ch_for_alignment=None,
        run_fast_peak_finder=False,
        run_minimal_analysis_gauss2_fitter=True,
    )

    schema = s(
        s.is_kws_r(
            calibration_file=s.is_str(noneable=True, required=False),
            instrument_identity=s.is_str(noneable=True),
            mode=s.is_str(options=common.SIGPROC_V2_MODES),
            divs=s.is_int(),
            peak_mea=s.is_int(),
            n_fields_limit=s.is_int(noneable=True),
            run_regional_balance=s.is_bool(),
            run_analysis_gauss2_fitter=s.is_bool(),
            run_aligner=s.is_bool(),
            run_per_cycle_peakfinder=s.is_bool(),
            low_inflection=s.is_float(),
            low_sharpness=s.is_float(),
            high_inflection=s.is_float(),
            high_sharpness=s.is_float(),
            self_calib=s.is_bool(noneable=True),
            no_calib=s.is_bool(noneable=True),
            save_full_signal_radmat_npy=s.is_bool(),
            channel_align_bounds=s.is_int(noneable=True),
            n_cycles_limit=s.is_int(noneable=True),
            # ch_aln_override allows for a temporarily needed hack to bypass the calibration system
            ch_aln_override=s.is_list(
                elems=s.is_list(elems=s.is_float()), noneable=True
            ),
            ch_for_alignment=s.is_int(noneable=True),
            run_fast_peak_finder=s.is_bool(),
            run_minimal_analysis_gauss2_fitter=s.is_bool(),
        )
    )

    def validate(self):
        """
        Apply defaults, validate against the schema, then check the
        mode-specific calibration options.

        In any mode other than SIGPROC_V2_ILLUM_CALIB:
          * self_calib excludes calibration_file, instrument_identity and
            no_calib;
          * no_calib requires no_calib_psf_sigma to be set.

        Returns:
            True when all checks pass.

        Raises:
            AssertionError: when one of the mode-specific checks fails.
        """
        # Note: does not call super because the override_nones is set to false here
        self.schema.apply_defaults(self.defaults, apply_to=self, override_nones=False)
        self.schema.validate(self, context=self.__class__.__name__)

        if self.mode == common.SIGPROC_V2_ILLUM_CALIB:
            pass
            # ZBS: At the moment these checks are more trouble than they are worth
            # if local.path(self.calibration_file).exists():
            #     if not log.confirm_yn(
            #         f"\nCalibration file '{self.calibration_file}' already exists "
            #         "when creating a SIGPROC_V2_PSF_CALIB. Overwrite?",
            #         "y",
            #     ):
            #         raise SchemaValidationFailed(
            #             f"Not overwriting calibration file '{self.calibration_file}'"
            #         )
        else:
            # Analyzing
            if self.self_calib:
                assert (
                    self.calibration_file is None
                ), "In self-calibration mode you may not specify a calibration file"
                assert (
                    self.instrument_identity is None
                ), "In self-calibration mode you may not specify an instrument identity"
                assert (
                    self.no_calib is not True
                ), "In self-calibration mode you may not specify the no_calib option"

            # elif (
            #     not self.no_calib
            #     and self.calibration_file != ""
            #     and self.calibration_file is not None
            # ):
            #     self.calibration = Calib.load_file(
            #         self.calibration_file, self.instrument_identity
            #     )

            elif self.no_calib:
                # no_calib_psf_sigma has no schema entry or default above, so
                # it may be entirely absent. Read it defensively so that the
                # caller sees the explanatory assertion message rather than
                # an attribute-lookup failure.
                no_calib_psf_sigma = getattr(self, "no_calib_psf_sigma", None)
                assert (
                    no_calib_psf_sigma is not None
                ), "In no_calib mode you must specify an estimated no_calib_psf_sigma"

        return True
class BaseGenerator(Munch):
    """
    Base of all generators.

    Expects sub-classes to provide a class member "schema" which is used for
    validating the kwargs passed to __init__(), and optionally "defaults".

    The *_schema class members below are reusable schema fragments that
    sub-classes splat into their own schema.
    """

    schema = None  # Should be overloaded in any sub-class
    defaults = {}  # Should be overloaded in any sub-class

    job_setup_schema = s(
        s.is_kws_r(
            job=s.is_str(help="See Main Help"),
            sample=s.is_str(allow_empty_string=False, help="See Main Help"),
        )
    )

    protein_schema = s(
        s.is_kws_r(
            protein=s.is_list(
                elems=s.is_kws_r(
                    id=s.is_str(),
                    seqstr=s.is_str(),
                )
            ),
            protein_of_interest=s.is_list(
                s.is_str(allow_empty_string=False),
                noneable=True,
                help="The id of the protein(s) of interest, used in survey and reporting",
            ),
        )
    )

    label_set_schema = s(
        s.is_kws_r(label_set=s.is_list(elems=s.is_str(), help="See Main Help"))
    )

    lnfit_schema = s(
        s.is_kws_r(
            lnfit_name=s.is_list(s.is_str(), noneable=True, help="See Main Help"),
            lnfit_params=s.is_list(s.is_str(), noneable=True, help="See Main Help"),
            lnfit_dye_on_threshold=s.is_list(
                s.is_int(), noneable=True, help="See Main Help"
            ),
            lnfit_photometry_only=s.is_list(
                s.is_str(), noneable=True, help="See Main Help"
            ),
        )
    )

    scope_run_schema = s(
        s.is_kws_r(
            n_edmans=s.is_int(help="See Main Help"),
            n_pres=s.is_int(help="See Main Help"),
            n_mocks=s.is_int(help="See Main Help"),
        )
    )

    peptide_setup_schema = s(
        s.is_kws_r(
            protease=s.is_list(elems=s.is_str(), help="See Main Help"),
            decoys=s.is_str(help="See Main Help"),
            random_seed=s.is_int(noneable=True, help="See Main Help"),
            n_ptms_limit=s.is_int(
                bounds=(0, 12),
                help="Max number of PTMs per peptide to allow. Peptides with more PTM sites than this will not consider any PTM permutations.",
            ),
        )
    )

    sim_schema = s(
        s.is_kws_r(
            n_samples_train=s.is_int(bounds=(1, None), help="See Main Help"),
            n_samples_test=s.is_int(bounds=(1, None), help="See Main Help"),
        )
    )

    classify_schema = s(
        s.is_kws_r(
            classify_skip_nn=s.is_bool(help="Skips Nearest Neighbor classifier if set"),
            classify_skip_rf=s.is_bool(help="Skips Random Forest classifier if set"),
            report_prec=s.is_list(
                elems=s.is_float(bounds=(0.001, 0.999)),
                help="The precision for classifier reporting",
            ),
        )
    )

    sigproc_source_schema = s(
        s.is_kws_r(
            sigproc_source=s.is_list(s.is_str(), noneable=True, help="See Main Help"),
            movie=s.is_bool(help="See Main Help"),
            n_frames_limit=s.is_int(
                bounds=(1, 500), noneable=True, help="See Main Help"
            ),
        )
    )

    sigproc_v1_schema = s(
        s.is_kws_r(
            radial_filter=s.is_float(
                noneable=True, bounds=(0.01, 1.0), help="See Main Help"
            ),
            peak_find_n_cycles=s.is_int(bounds=(1, 10000), help="See Main Help"),
            peak_find_start=s.is_int(bounds=(0, 10000), help="See Main Help"),
            anomaly_iqr_cutoff=s.is_int(bounds=(1, 100), help="See Main Help"),
        )
    )

    sigproc_v2_schema = s(
        s.is_kws_r(
            calibration_file=s.is_str(),
            instrument_subject_id=s.is_str(),
        )
    )

    # Notebook-level metadata copied into every assembled report
    report_metadata = Munch(
        metadata=Munch(
            kernelspec=Munch(
                display_name="Python 3", language="python", name="python3"
            ),
            language_info=Munch(
                codemirror_mode=Munch(name="ipython", version=3),
                file_extension=".py",
                mimetype="text/x-python",
                name="python",
                nbconvert_exporter="python",
                pygments_lexer="ipython3",
                version="3.6.7",
            ),
        ),
        nbformat=4,
        nbformat_minor=2,
    )

    error_model_schema = s(
        s.is_kws_r(
            err_p_edman_failure=s.is_list(elems=s.is_str(help="See Main Help")),
            err_p_detach=s.is_list(elems=s.is_str(help="See Main Help")),
            err_dye_beta=s.is_list(elems=s.is_str(help="See Main Help")),
            err_dye_sigma=s.is_list(elems=s.is_str(help="See Main Help")),
            err_p_bleach_per_cycle=s.is_list(elems=s.is_str(help="See Main Help")),
            err_p_non_fluorescent=s.is_list(elems=s.is_str(help="See Main Help")),
        )
    )

    error_model_defaults = Munch(
        err_p_edman_failure=0.06,
        err_p_detach=0.05,
        err_dye_beta=7500.0,
        err_dye_sigma=0.16,
        err_dye_gain=7500.0,
        err_dye_vpd=0.1,
        err_p_bleach_per_cycle=0.05,
        err_p_non_fluorescent=0.07,
    )

    # Prototype notebook cells; shallow-copied for every emitted cell
    code_block = Munch(
        cell_type="code", execution_count=None, metadata=Munch(), outputs=[], source=[]
    )

    markdown_block = Munch(cell_type="markdown", metadata=Munch(), source=[])

    def __init__(self, **kwargs):
        """
        Store kwargs as generator fields, apply defaults, parse the error
        model, and validate against the schema.
        """
        # APPLY defaults and then ask user for any elements that are not declared
        super().__init__(**kwargs)
        self.apply_defaults()
        debug(self)
        self.setup_err_model()
        self.validate()
        self._report_sections = []
        self._report_preamble = None
        self._validate_protein_of_interest()

    def _validate_protein_of_interest(self):
        """
        Check that every protein_of_interest names an id in the protein list.

        Does nothing when the generator has no "protein" field or when
        protein_of_interest is missing, None or empty.

        Raises:
            ValueError: if a protein of interest is not among the protein ids.
        """
        if "protein" not in self:
            return

        seq_ids = {seq["id"] for seq in self.protein}
        # protein_of_interest is noneable in the schema, so treat None (or a
        # missing key) as "no proteins of interest" instead of iterating it.
        for poi in self.get("protein_of_interest") or []:
            if poi not in seq_ids:
                raise ValueError(
                    f"protein_of_interest '{poi}' is not in the protein id list. "
                    f"Confirm you specified a Name and not a UniprotAC"
                )

    def setup_err_model(self):
        """
        Parse the err_* string switches into self.err_param_dict.

        Each value has the form "[dye_index|]low[:high[:steps]]". The two
        per-run terms (err_p_edman_failure, err_p_detach) must not carry a
        dye index; every other term must. The result maps
        "<term>:<dye_index or 0>" to the de-duplicated list of values from
        np.linspace(low, high, steps).

        Raises:
            SchemaValidationFailed: on a missing or forbidden dye index.
        """
        err_param_dict = defaultdict(list)
        for name, _type, _, _user_data in self.error_model_schema.requirements():
            # A missing or None switch simply contributes no sweep values
            values = self.get(name) or []
            for value in values:
                high_prob, step_prob = None, 1

                parts = value.split("|")
                if len(parts) == 2:
                    dye_part = parts[0]
                    prob_parts = parts[1]
                else:
                    dye_part = None
                    prob_parts = parts[0]

                prob_parts = prob_parts.split(":")

                if name in ("err_p_edman_failure", "err_p_detach"):
                    if dye_part:
                        raise SchemaValidationFailed(
                            f"error model term '{name}' is not allowed to have a dye-index."
                        )
                else:
                    if dye_part is None:
                        raise SchemaValidationFailed(
                            f"error model term '{name}' expected a dye-index."
                        )

                low_prob = float(prob_parts[0])
                if len(prob_parts) > 1:
                    high_prob = float(prob_parts[1])
                if len(prob_parts) > 2:
                    step_prob = int(prob_parts[2])
                if high_prob is None:
                    high_prob = low_prob

                key = f"{name}:{dye_part if dye_part is not None else 0}"
                err_param_dict[key] += np.linspace(
                    low_prob, high_prob, step_prob
                ).tolist()
                err_param_dict[key] = list(set(err_param_dict[key]))

        self.err_param_dict = err_param_dict

    def apply_defaults(self):
        """Overloadable by sub-classes."""
        self.schema.apply_defaults(self.defaults, self, override_nones=True)

    def validate(self):
        """Overloadable by sub-classes for extra validation"""
        self.schema.validate(self, context=self.__class__.__name__)

    def ims_imports(self, sigproc_source):
        """Return the ims_import task for one source; movie mode limits cycles by n_frames_limit."""
        if self.movie:
            ims_import = task_templates.ims_import(
                sigproc_source, is_movie=True, n_cycles_limit=self.n_frames_limit
            )
        else:
            ims_import = task_templates.ims_import(sigproc_source, is_movie=False)
        return ims_import

    def sigprocs_v1(self):
        """Return one (ims_import + sigproc_v1) task Munch per sigproc_source; [] when there is none."""
        sigproc_tasks = []
        if self.sigproc_source:
            for ss in self.sigproc_source:
                ims_import = self.ims_imports(ss)
                sigproc = task_templates.sigproc_v1()
                # task_templates returns a generic sigprocv2 task, and we can fill in some
                # parameters that any sigprocv2 task might have based on the CliSwitches for
                # BaseVFSCommand.  So any subclass will automatically get these params set.
                # Where should the schema check for them?
                sigproc.sigproc_v1.parameters.radial_filter = self.radial_filter
                sigproc.sigproc_v1.parameters.peak_find_n_cycles = (
                    self.peak_find_n_cycles
                )
                sigproc.sigproc_v1.parameters.peak_find_start = self.peak_find_start
                sigproc.sigproc_v1.parameters.anomaly_iqr_cutoff = (
                    self.anomaly_iqr_cutoff
                )

                sigproc_task = Munch(**ims_import, **sigproc)
                sigproc_tasks += [sigproc_task]
        return sigproc_tasks

    def sigprocs_v2(self, **kwargs):
        """Return one (ims_import + sigproc_v2) task Munch per sigproc_source; kwargs go to the template."""
        sigproc_tasks = []
        if self.sigproc_source:
            for ss in self.sigproc_source:
                ims_import = self.ims_imports(ss)
                sigproc = task_templates.sigproc_v2(**kwargs)
                # task_templates returns a generic sigprocv2 task, and we can fill in some
                # parameters that any sigprocv2 task might have based on the CliSwitches for
                # BaseVFSCommand.  So any subclass will automatically get these params set.
                # Where should the schema check for them?
                sigproc_task = Munch(**ims_import, **sigproc)
                sigproc_tasks += [sigproc_task]
        return sigproc_tasks

    def lnfits(self):
        """
        Build the lnfit task block, one task per entry in lnfit_params.

        It is common to have multiple lnfit tasks for a single run, so this
        returns a block with potentially multiple lnfit tasks using unique
        task names when more than one is present.

        Returns:
            A dict of task_name -> task; empty when lnfit_params is unset.

        Raises:
            ValueError: if lnfit_params is given without lnfit_dye_on_threshold.
            AssertionError: if the per-fit lists differ in length.
        """
        lnfit_tasks = {}
        if self.lnfit_params:
            if not self.lnfit_dye_on_threshold:
                raise ValueError(
                    "You must specify a --lnfit_dye_on_threshold when --lnfit_params is given"
                )

            n_fits = len(self.lnfit_params)
            dye_thresholds = self.lnfit_dye_on_threshold
            lnfit_names = self.lnfit_name or ([None] * n_fits)
            photometries_only = self.get("lnfit_photometry_only") or ([True] * n_fits)

            if n_fits > 1 and len(dye_thresholds) == 1:
                # Broadcast the single threshold to every fit. Build a new
                # list: an in-place "*=" would also grow the list stored on
                # self.lnfit_dye_on_threshold.
                dye_thresholds = dye_thresholds * n_fits

            assert n_fits == len(dye_thresholds)
            assert n_fits == len(lnfit_names)
            # Without this check zip() below would silently drop fits
            assert n_fits == len(photometries_only)

            for i, (params, thresh, name, photometry_only) in enumerate(
                zip(self.lnfit_params, dye_thresholds, lnfit_names, photometries_only)
            ):
                task = task_templates.lnfit()
                task.lnfit.parameters["lognormal_fitter_v2_params"] = params
                task.lnfit.parameters["dye_on_threshold"] = thresh
                # Values are strings when given on the command line but the
                # fallback above is a bool; str() lets both go through the
                # same parse (str(True).lower() == "true").
                task.lnfit.parameters["photometry_only"] = str(
                    photometry_only
                ).lower() in ("true", "1")

                task_name = "lnfit"
                if n_fits > 1 or name:
                    task_name = name or f"lnfit_{i}"
                    helpers.task_rename(task, task_name)
                lnfit_tasks[task_name] = task[task_name]

        return lnfit_tasks

    def run_name(self, aa_list, protease=None, err_set=None):
        """
        A helper for run folder names based on aa_list and protease.
        Note, not all generators will use this convention.

        Compose a run_name from protease and aa_list in normalized form,
        followed by "_" and the first four hex digits of the md5 of the
        JSON-encoded err_set (nothing after the "_" when err_set is None):
        Eg: protease="trypsin", aa_list=("DE", "K") => "trypsin_de_k_"
        """
        if protease is None:
            protease = ""

        # Square brackets (as in PTM labels like "S[p]") are dropped
        aa_list = [a.replace("[", "").replace("]", "") for a in aa_list]
        aa = "_".join(aa_list)

        if err_set is not None:
            err_str = hashlib.md5(json.dumps(err_set).encode()).hexdigest()[0:4]
        else:
            err_str = ""

        # Any run of characters outside [0-9a-z_] collapses to one underscore
        return re.sub(
            "[^0-9a-z_]+",
            "_",
            (protease + ("_" if protease != "" else "") + aa).lower() + "_" + err_str,
        )

    def _label_str_permutate(self, label_str):
        """
        Return list of permutations of a label_str such as:
            "A,B,C:2" => ("A", "B"), ("A", "C"), ("B", "C")

        A suffix label set may be added to each permutation with +:
            "A,B,C:2+S" => ("A", "B", "S"), ("A", "C", "S"), ("B", "C", "S")
            "A,B,C:2+S,T" => ("A", "B", "S", "T"), ("A", "C", "S", "T"), ("B", "C", "S", "T")

        Without a ":<count>" part, all labels are returned as one tuple.

        Raises:
            ValueError: on more than one colon or plus, or when the count is
                not strictly between 0 and the number of labels.
        """
        check.t(label_str, str)
        semi_split = label_str.split(":")

        if len(semi_split) > 2:
            raise ValueError(f"Label-set '{label_str}' has >1 colon.")

        suffix_labels = ""
        if len(semi_split) == 2:
            suffix_split = semi_split[1].split("+")
            if len(suffix_split) > 2:
                raise ValueError(f"Label-set '{label_str}' has >1 plus.")
            if len(suffix_split) == 2:
                # Separate "<count>+<suffix labels>" into its two halves
                semi_split = [semi_split[0], suffix_split[0]]
                suffix_labels = suffix_split[1].split(",")
                suffix_labels = [slabel.strip() for slabel in suffix_labels]

        labels = semi_split[0].split(",")
        labels = [label.strip() for label in labels]

        if len(semi_split) == 1:
            perm_count = len(labels)
        else:
            perm_count = int(semi_split[1])
            if not 0 < perm_count < len(labels):
                raise ValueError(
                    f"Label-set '{label_str}' has a permutation count "
                    f"of {perm_count}; needs to be between 0 and {len(labels) - 1}"
                )

        perms = list(itertools.combinations(labels, perm_count))

        if suffix_labels:
            perms = [p + tuple(suffix_labels) for p in perms]

        return perms

    def label_set_permutate(self):
        """Expand every entry of self.label_set and return the flattened list of label tuples."""
        check.list_t(self.label_set, str)
        return utils.flatten(
            [self._label_str_permutate(label_str) for label_str in self.label_set], 1
        )

    def error_set_permutate(self):
        """Return, per error-model key, the list of (key, value) pairs to sweep over."""
        tuples = [
            [(key, val) for val in vals] for key, vals in self.err_param_dict.items()
        ]
        return tuples

    def run_parameter_permutator(self):
        """
        Generate permutations of all the variable parameters
        Defaults all arguments to self.*
        Gracefully handles lack of protease.

        Yields:
            (protease, label_set, err_set) for every element of the product
            of proteases, label sets, and each swept error-model term.
        """
        proteases = utils.non_none(self.get("protease"), [None])
        if len(proteases) == 0:
            proteases = [None]

        proteases = [("protease", p) for p in proteases]

        label_sets = self.label_set_permutate()
        label_sets = [("label_set", s) for s in label_sets]

        err_sets = self.error_set_permutate()

        combined = [proteases, label_sets] + err_sets

        for params in itertools.product(*combined):
            protease = utils.filt_first(params, lambda i: i[0] == "protease")
            protease = protease[1]
            label_set = utils.filt_first(params, lambda i: i[0] == "label_set")
            label_set = label_set[1]

            # Given that the label_set is now known, the error model can be setup
            n_channels = len(label_set)
            err_set = Munch(
                p_edman_failure=[self.error_model_defaults.err_p_edman_failure] * 1,
                p_detach=[self.error_model_defaults.err_p_detach] * 1,
                dye_beta=[self.error_model_defaults.err_dye_beta] * n_channels,
                dye_sigma=[self.error_model_defaults.err_dye_sigma] * n_channels,
                dye_gain=[self.error_model_defaults.err_dye_gain] * n_channels,
                dye_vpd=[self.error_model_defaults.err_dye_vpd] * n_channels,
                p_bleach_per_cycle=[self.error_model_defaults.err_p_bleach_per_cycle]
                * n_channels,
                p_non_fluorescent=[self.error_model_defaults.err_p_non_fluorescent]
                * n_channels,
            )

            for param in params:
                if param[0].startswith("err_"):
                    parts = param[0].split(":")
                    err_set[parts[0][4:]][int(parts[1])] = param[1]
                    # The 4: removes the "err_"

            yield protease, label_set, err_set

    def erisyon_block(self, aa_list, protease=None, err_set=None):
        """Return the erisyon task block carrying the run name, sample and generator class name."""
        return task_templates.erisyon(
            run_name=self.run_name(aa_list, protease, err_set),
            sample=self.sample,
            generator_name=self.__class__.__name__,
        )

    def _markdown_to_markdown_block(self, markdown):
        """Wrap a markdown string into a notebook markdown cell, one source entry per line."""
        lines = [f"{line}\n" for line in markdown.split("\n")]
        block = Munch(**self.markdown_block)
        block.source = lines
        return block

    def report_preamble(self, markdown):
        """Set the report preamble, in markdown format"""
        self._report_preamble = markdown

    def report_section_markdown(self, markdown):
        """Queue a markdown section."""
        self._report_sections += [("markdown", markdown)]

    def report_section_run_object(self, run):
        """Queue a code section that loads the given run as `run`."""
        self._report_sections += [
            (
                "code",
                [f'run = RunResult("./{run.run_name}")'],
            ),
        ]

    def report_section_job_object(self):
        """Queue a code section that loads this job as `job`."""
        self._report_sections += [
            (
                "code",
                [f'job = JobResult("//jobs_folder/{self.job}")'],
            ),
        ]

    def report_section_user_config(self):
        """
        Emit report configuration parameters specified by the user via gen so that they
        can be further edited if desired, and used by reporting functions in the templates.
        """
        config = []
        if self.protein_of_interest:
            config += [f"PGEN_protein_of_interest = {self.protein_of_interest}\n"]
        if self.report_prec:
            config += [f"PGEN_report_precisions = {self.report_prec}\n"]
        if config:
            self.report_section_markdown("# PGEN-controlled report config")
            config = [
                "# These values were or can be specified by the user at gen time:\n"
            ] + config
            self._report_sections += [("code", config)]

    def report_section_run_array(self, runs, to_load=None):
        """Queue a code section that loads all given runs as `runs`, optionally limited to `to_load`."""
        to_load_string = "" if to_load is None else f", to_load={to_load}"
        run_names = [run.run_name for run in runs]
        self._report_sections += [
            (
                "code",
                [
                    f"run_names = {run_names}\n"
                    f'runs = [RunLoader(f"./{{name}}"{to_load_string}) for name in run_names]'
                ],
            )
        ]

    def report_section_from_template(self, template_name):
        """Queue the cells of a notebook template as a report section."""
        self._report_sections += [("template", template_name)]

    def report_assemble(self):
        """
        Assemble the report from its pieces. A giant Munch is returned.

        Cell order: the preamble (when one was set), a merged import cell
        collected from the templates' "# @IMPORT-MERGE" cells, then every
        queued section in the order it was queued.
        """
        report = Munch(**self.report_metadata)
        report.cells = []

        # A generator that never called report_preamble() has no preamble;
        # skip the cell rather than fail on None.
        if self._report_preamble is not None:
            preamble_block = self._markdown_to_markdown_block(self._report_preamble)
            report.cells += [preamble_block]

        # LOAD all templates
        templates_by_name = {}
        for section_type, section_data in self._report_sections:
            if section_type == "template":
                file_path = section_data
                templates_by_name[file_path] = utils.json_load_munch(
                    f"./plaster/gen/nb_templates/{file_path}"
                )

        # FIND all of the @IMPORT-MERGE blocks
        import_merge = []
        for template in templates_by_name.values():
            for cell in template.cells:
                if cell.cell_type == "code":
                    first_line = utils.safe_list_get(cell.source, 0, "")
                    if "# @IMPORT-MERGE" in first_line:
                        for line in cell.source:
                            if "import" in line:
                                import_merge += [line]

        import_merge += ["from plaster.tools.zplots import zplots\n"]
        # De-duplicate and sort the imports; the zplots setup line must be last
        import_merge = sorted(set(import_merge)) + ["z=zplots.setup()"]
        import_block = Munch(**self.code_block)
        import_block.source = import_merge
        report.cells += [import_block]

        for section_type, section_data in self._report_sections:
            if section_type == "code":
                lines = section_data
                block = Munch(**self.code_block)
                block.source = lines
                report.cells += [block]

            elif section_type == "markdown":
                block = self._markdown_to_markdown_block(section_data)
                report.cells += [block]

            elif section_type == "template":
                file_path = section_data
                template = templates_by_name[file_path]
                for cell in template.cells:
                    if cell.cell_type == "code":
                        first_line = utils.safe_list_get(cell.source, 0, "")

                        # Import cells were merged above; @REMOVE-FROM-TEMPLATE
                        # cells are dropped from the assembled report
                        if (
                            "@IMPORT-MERGE" not in first_line
                            and "@REMOVE-FROM-TEMPLATE" not in first_line
                        ):
                            block = Munch(**self.code_block)
                            block.source = cell.source
                            report.cells += [block]

                    if cell.cell_type == "markdown":
                        block = Munch(**self.markdown_block)
                        block.source = cell.source
                        report.cells += [block]

        return report

    def report_task(self):
        """Placeholder; does nothing in the base class."""
        pass

    def generate(self):
        """
        Abstract method to be overloaded.
        Expected to return a list of runs.
        """
        pass
class Gen(BaseGenerator):
    """Minimal generator whose schema declares a single list-of-strings `label_set`."""

    schema = s(
        s.is_kws(
            label_set=s.is_list(elems=s.is_str()),
        )
    )
class SimV2Params(ParamsAndPriors):
    """
    Parameters for the v2 simulation.

    Holds the cycle counts (pres / mocks / edmans), the dye and label
    definitions, the channel-name -> channel-index mapping and, after
    construction, two DataFrames of priors built by _setup_dfs():

        * channel__priors(): one row per (dye, channel) with channel priors
        * dye__label__priors(): one row per label with dye + channel priors
    """

    # The following constants are repeated in sim_v2.h because it
    # is hard to get constants like this to be shared between
    # the two languages. This shouldn't be a problem as they are stable.
    # TODO: Move these to an import from the pyx
    CycleKindType = np.uint8
    CYCLE_TYPE_PRE = 0
    CYCLE_TYPE_MOCK = 1
    CYCLE_TYPE_EDMAN = 2

    # Column names of the DataFrame returned by channel__priors()
    channel__priors__columns = (
        "ch_i",
        "channel_name",
        "bg_mu",
        "bg_sigma",
        "dye_name",
        "gain_mu",
        "gain_sigma",
        "index",
        "p_bleach",
        "row_k_sigma",
    )

    # Column names of the DataFrame returned by dye__label__priors()
    dye__label__priors__columns = (
        "channel_name",
        "dye_name",
        "aa",
        "label_name",
        "ptm_only",
        "p_non_fluorescent",
        "ch_i",
        "bg_mu",
        "bg_sigma",
        "gain_mu",
        "gain_sigma",
        "row_k_sigma",
        "p_bleach",
    )

    defaults = Munch(
        n_pres=1,
        n_mocks=0,
        n_edmans=1,
        n_samples_train=5_000,
        n_samples_test=1_000,
        dyes=[],
        labels=[],
        random_seed=None,
        allow_train_test_to_be_identical=False,
        allow_edman_cterm=False,
        enable_ptm_labels=False,
        is_survey=False,
        train_includes_radmat=False,
        test_includes_dyemat=False,
        dump_debug=False,
        generate_flus=True,
        use_lognormal_model=False,
    )

    schema = s(
        s.is_kws_r(
            priors_desc=Priors.priors_desc_schema,
            is_survey=s.is_bool(),
            n_pres=s.is_int(bounds=(0, None)),
            n_mocks=s.is_int(bounds=(0, None)),
            n_edmans=s.is_int(bounds=(0, None)),
            n_samples_train=s.is_int(bounds=(1, None)),
            n_samples_test=s.is_int(bounds=(1, None)),
            dyes=s.is_list(
                elems=s.is_kws_r(
                    dye_name=s.is_str(),
                    channel_name=s.is_str(),
                )
            ),
            labels=s.is_list(
                elems=s.is_kws_r(
                    aa=s.is_str(),
                    dye_name=s.is_str(),
                    label_name=s.is_str(),
                    ptm_only=s.is_bool(required=False, noneable=True),
                )
            ),
            channels=s.is_dict(required=False),
            random_seed=s.is_int(required=False, noneable=True),
            allow_train_test_to_be_identical=s.is_bool(required=False, noneable=True),
            allow_edman_cterm=s.is_bool(required=False, noneable=True),
            enable_ptm_labels=s.is_bool(required=False, noneable=True),
            train_includes_radmat=s.is_bool(required=False, noneable=True),
            test_includes_dyemat=s.is_bool(required=False, noneable=True),
            dump_debug=s.is_bool(),
            generate_flus=s.is_bool(),
            use_lognormal_model=s.is_bool(),
        )
    )

    # def copy(self):
    #     dst = utils.munch_deep_copy(self, klass_set={SimV2Params})
    #     assert isinstance(dst, SimV2Params)
    #     return dst

    def __init__(self, **kwargs):
        """
        Construct from keyword parameters and build the priors DataFrames.

        All kwargs are forwarded to the parent constructor along with
        source="SimV2Params"; _setup_dfs() is then always called.
        """
        # NOTE(review): an earlier comment here referred to a
        # `_skip_setup_dfs` flag for fixture mode; nothing in this class
        # consults such a flag, so _setup_dfs() runs unconditionally.
        super().__init__(source="SimV2Params", **kwargs)
        self._setup_dfs()

    def validate(self):
        """
        Validate dyes, labels and the channel mapping.

        Runs the parent validation and then checks, via self._validate, that:
            * no dye name is repeated,
            * no label "aa" is repeated,
            * every label refers to a declared dye,
            * if "channels" is present, it names exactly the channels
              that the dyes mention.

        Side effect: when "channels" is absent it is assigned here, numbering
        the channel names mentioned by the dyes in sorted order from 0.
        """
        super().validate()

        dye_names = {dye.dye_name for dye in self.dyes}

        # No duplicate dye names
        self._validate(
            len(dye_names) == len(self.dyes), "The dye list contains a duplicate"
        )

        # No duplicate labels
        self._validate(
            len(set(utils.listi(self.labels, "aa"))) == len(self.labels),
            "There is a duplicate label in the label_set",
        )

        # All labels have a legit dye name
        for label in self.labels:
            self._validate(
                label.dye_name in dye_names,
                f"Label {label.label_name} does not have a valid matching dye_name",
            )

        # Channel mappings
        mentioned_channels = {dye.channel_name: False for dye in self.dyes}
        if "channels" in self:
            # Validate that channel mapping is complete
            for channel_name in self.channels:
                self._validate(
                    channel_name in mentioned_channels,
                    f"Channel name '{channel_name}' was not found in dyes",
                )
                mentioned_channels[channel_name] = True

            self._validate(
                all(mentioned_channels.values()),
                "Not all channels in dyes were enumerated in channels",
            )
        else:
            # No channel mapping: assign them
            self["channels"] = {
                ch_name: i
                for i, ch_name in enumerate(sorted(mentioned_channels.keys()))
            }

    @property
    def n_cycles(self):
        """Total number of cycles: n_pres + n_mocks + n_edmans."""
        return self.n_pres + self.n_mocks + self.n_edmans

    def channel_names(self):
        """Return the channel names ordered by their channel index."""
        return [
            ch_name
            for ch_name, _ in sorted(self.channels.items(), key=lambda item: item[1])
        ]

    def ch_i_by_name(self):
        """Return the channel-name -> channel-index mapping (self.channels)."""
        return self.channels

    @property
    def n_channels(self):
        """Number of entries in the channel mapping."""
        # if self.is_photobleaching_run:
        #     return 1
        return len(self.channels)

    @property
    def n_channels_and_cycles(self):
        """Tuple of (n_channels, n_cycles)."""
        return self.n_channels, self.n_cycles

    def _setup_dfs(self):
        """
        Assemble all of the priors into several dataframes indexed differently.
        (Call after validate)

        * self._channel__priors:
            ch_i, ch_name, bg_mu, bg_sigma, gain_mu, gain_sigma, row_k_sigma, p_bleach
            --> Note, does NOT have p_non_fluorescent because this is a dye property
        * self._dye__label__priors:
            aa, label_name, dye_name, ch_i, ch_name, bg_mu, bg_sigma,
            gain_mu, gain_sigma, row_k_sigma, p_bleach, p_non_fluorescent

        Also builds self._ch_by_aa, a dict of aa -> ch_i.
        """

        # if self.is_photobleaching_run:
        #     # Not sure what these should be yet
        #     # self._ch_by_aa = {}
        #     # self._channel__priors = pd.DataFrame(columns=self.channel__priors__columns)
        #     # self._dye__label__priors = pd.DataFrame(columns=self.dye__label__priors__columns)
        #     self.dyes = [Munch(dye_name="zero", channel_name="zero")]
        #     self.channels = Munch(zero=0)
        #     self.labels = [
        #         dict(aa=".", dye_name="zero", label_name="zero", ptm_only=False)
        #     ]

        labels_df = pd.DataFrame(self.labels)
        # labels_df: (aa, dye_name, label_name, ptm_only)
        # assert len(labels_df) > 0

        dyes_df = pd.DataFrame(self.dyes)
        # dyes_df: (dye_name, channel_name)
        # assert len(dyes_df) > 0

        # LOOKUP dye priors
        dye_priors = []
        for dye in self.dyes:
            # SEARCH priors by dye name and if not found by channel
            # NOTE(review): the fallback key below is built from the channel
            # *name*, whereas the channel priors further down are keyed by
            # the channel *index* (ch_i) -- confirm this is intended.
            p_non_fluorescent = self.priors.get_exact(
                f"p_non_fluorescent.{dye.dye_name}"
            )
            if p_non_fluorescent is None:
                p_non_fluorescent = self.priors.get(
                    f"p_non_fluorescent.ch_{dye.channel_name}"
                )

            dye_priors.append(
                Munch(
                    dye_name=dye.dye_name,
                    p_non_fluorescent=p_non_fluorescent.prior,
                )
            )

        dye_priors_df = pd.DataFrame(dye_priors)
        # dye_priors_df: (dye_name, p_non_fluorescent)

        dyes_df = utils.easy_join(dyes_df, dye_priors_df, "dye_name")
        # dyes_df: (dye_name, channel_name, p_non_fluorescent)

        # TODO: LOOKUP label priors
        #       (p_failure_to_bind_aa, p_failure_to_attach_to_dye)

        # LOOKUP channel priors
        ch_priors = pd.DataFrame(
            [
                dict(
                    channel_name=channel_name,
                    ch_i=ch_i,
                    bg_mu=self.priors.get(f"bg_mu.ch_{ch_i}").prior,
                    bg_sigma=self.priors.get(f"bg_sigma.ch_{ch_i}").prior,
                    gain_mu=self.priors.get(f"gain_mu.ch_{ch_i}").prior,
                    gain_sigma=self.priors.get(f"gain_sigma.ch_{ch_i}").prior,
                    row_k_sigma=self.priors.get(f"row_k_sigma.ch_{ch_i}").prior,
                    p_bleach=self.priors.get(f"p_bleach.ch_{ch_i}").prior,
                )
                for channel_name, ch_i in self.channels.items()
            ]
        )
        # ch_priors: (channel_name, ch_i, ...)

        # reset_index() without drop=True is what produces the "index"
        # column listed in channel__priors__columns.
        self._channel__priors = (
            utils.easy_join(dyes_df, ch_priors, "channel_name")
            .drop(columns=["p_non_fluorescent"])
            .drop_duplicates()
            .reset_index()
        )
        # self._channel__priors: (
        #    'ch_i', 'channel_name', 'bg_mu', 'bg_sigma', 'dye_name',
        #    'gain_mu', 'gain_sigma', 'index', 'p_bleach', 'row_k_sigma',
        # )

        # SANITY check channel__priors: each of these fields must have a
        # single value within any one channel.
        group_by_ch = self._channel__priors.groupby("ch_i")
        for field in (
            "bg_mu",
            "bg_sigma",
            "gain_mu",
            "gain_sigma",
            "row_k_sigma",
        ):
            assert np.all(group_by_ch[field].nunique() == 1)

        assert "p_non_fluorescent" not in self._channel__priors.columns

        labels_dyes_df = utils.easy_join(labels_df, dyes_df, "dye_name")
        self._dye__label__priors = utils.easy_join(
            labels_dyes_df, ch_priors, "channel_name"
        ).reset_index(drop=True)
        # self._dye__label__priors: (
        #     'channel_name', 'dye_name', 'aa', 'label_name',
        #     'ptm_only', 'p_non_fluorescent', 'ch_i', 'bg_mu', 'bg_sigma',
        #     'gain_mu', 'gain_sigma', 'row_k_sigma', 'p_bleach'
        # )

        self._ch_by_aa = {
            row.aa: row.ch_i for row in self._dye__label__priors.itertuples()
        }

    def ch_by_aa(self):
        """Return the dict of aa -> ch_i built by _setup_dfs()."""
        return self._ch_by_aa

    def dye__label__priors(self):
        """
        DataFrame(
            'channel_name', 'dye_name', 'aa', 'label_name',
            'ptm_only', 'p_non_fluorescent', 'ch_i', 'bg_mu', 'bg_sigma',
            'gain_mu', 'gain_sigma', 'row_k_sigma', 'p_bleach'
        )
        """
        return self._dye__label__priors

    def channel__priors(self):
        """
        DataFrame(
            'ch_i', 'channel_name', 'bg_mu', 'bg_sigma', 'dye_name',
            'gain_mu', 'gain_sigma', 'index', 'p_bleach', 'row_k_sigma',
        )
        """
        return self._channel__priors

    def by_channel(self):
        """Return the channel priors DataFrame indexed by "ch_i"."""
        return self._channel__priors.set_index("ch_i")

    def to_label_list(self):
        """Summarize labels like: ["DE", "C"]"""
        return [
            "".join(
                [label.aa for label in self.labels if label.dye_name == dye.dye_name]
            )
            for dye in self.dyes
        ]

    def to_label_str(self):
        """Summarize labels like: DE,C"""
        return ",".join(self.to_label_list())

    def cycles_array(self) -> np.ndarray:
        """
        Return a 1-D array of length n_cycles (dtype CycleKindType) holding
        the cycle kind of each cycle: all pres first, then mocks, then edmans.
        """
        kinds = np.array(
            [self.CYCLE_TYPE_PRE, self.CYCLE_TYPE_MOCK, self.CYCLE_TYPE_EDMAN],
            dtype=self.CycleKindType,
        )
        # Repeat each kind by its configured count, preserving the order above.
        return np.repeat(kinds, [self.n_pres, self.n_mocks, self.n_edmans])

    def pcbs(self, pep_seq_df):
        """
        pcb stands for (p)ep_i, (c)hannel_i, (b)right_probability

        This is a structure that is like a "flu" but with an extra bright probability.

        Each peptide has a row for each amino acid
            That row has columns (pep_i, ch_i, p_bright)
            And it will have np.nan for ch_i and p_bright **IF THERE IS NO LABEL**

        bright_probability is the inverse of all the ways a dye can fail to be visible
        ie the probability that a dye is active.

        pep_seq_df: Any DataFrame with an "aa" column.
            The body also reads "pep_i" and sorts by "pep_offset_in_pro",
            so those columns must be present as well.

        Returns:
            contiguous ndarray(:, 3) where the 3 columns are:
                pep_i, ch_i, p_bright
        """
        # Left join so that unlabelled amino acids keep a row (with NaNs).
        labelled_pep_df = pep_seq_df.join(
            self.dye__label__priors().set_index("aa"), on="aa", how="left"
        )

        # p_bright = is the product of (1.0 - ) all the ways the dye can fail to be visible.
        labelled_pep_df["p_bright"] = (
            # TODO: Sim needs to be converted to use priors sampling
            #       at which point this function needs to be refactored
            #       so that the parameters of the priors can be sampled in C.
            1.0
            - np.array(
                [
                    # Rows without a label carry no Prior object -> NaN
                    i.sample() if isinstance(i, Prior) else np.nan
                    for i in labelled_pep_df.p_non_fluorescent
                ]
            )
            # TODO: Add label priors
            # * (1.0 - labelled_pep_df.p_failure_to_attach_to_dye)
            # * (1.0 - labelled_pep_df.p_failure_to_bind_aa)
        )

        labelled_pep_df.sort_values(by=["pep_i", "pep_offset_in_pro"], inplace=True)
        return np.ascontiguousarray(
            labelled_pep_df[["pep_i", "ch_i", "p_bright"]].values
        )

    @classmethod
    def from_aa_list_fixture(cls, aa_list, priors=None, **kwargs):
        """
        This is a helper to generate channels when you have a list of aas.
        For example, two channels where ch0 is D&E and ch1 is Y:
        ["DE", "Y"].

        aa_list: list or tuple of str, one string per channel. Each character
            must be alphabetic or one of "[" / "]" (the PTM brackets).
        priors: forwarded to the constructor as priors=.
        kwargs: forwarded to the constructor.

        Dyes are named "dye_<i>" on channels named "ch_<i>", and every amino
        acid of the i-th string gets a label named "label_<i>".
        """
        check.list_or_tuple_t(aa_list, str)

        allowed_aa_mods = ["[", "]"]
        assert all(
            (aa.isalpha() or aa in allowed_aa_mods) for aas in aa_list for aa in aas
        )

        dyes = [
            Munch(dye_name=f"dye_{ch}", channel_name=f"ch_{ch}")
            for ch, _ in enumerate(aa_list)
        ]

        # Note the extra for loop because "DE" needs to be split into "D" & "E"
        # which is done by aa_str_to_list() - which also handles PTMs like S[p]
        labels = [
            Munch(
                aa=aa,
                dye_name=f"dye_{ch}",
                label_name=f"label_{ch}",
                ptm_only=False,
            )
            for ch, aas in enumerate(aa_list)
            for aa in aa_str_to_list(aas)
        ]

        return cls(dyes=dyes, labels=labels, priors=priors, **kwargs)