class NNV2Params(ParamsAndPriors):
    """Default values and validation schema for the NNV2 parameters."""

    # Values used for any parameter the caller does not supply.
    # NOTE(review): `cycle_balance` has a default here but no entry in
    # `schema` below -- confirm whether that omission is intentional.
    defaults = Munch(
        include_training_set=False,
        n_neighbors=8,
        dt_score_bias=0.1,
        include_sigproc=False,
        run_against_all_dyetracks=False,
        run_row_k_fit=True,
        scoring_verbose=False,
        scoring_verbose_cc=False,
        dyetrack_n_counts=None,
        dyetrack_n_cycles=None,
        row_k_score_factor=0.05,
        cycle_balance=None,
        n_rows_limit=None,
        use_lognormal_model=False,
    )

    # Type constraints for each parameter; `noneable=True` marks the
    # parameters whose default above is None.
    schema = s(
        s.is_kws_r(
            prior_desc=Priors.priors_desc_schema,
            include_training_set=s.is_bool(),
            n_neighbors=s.is_int(),
            dt_score_bias=s.is_float(),
            include_sigproc=s.is_bool(),
            run_row_k_fit=s.is_bool(),
            run_against_all_dyetracks=s.is_bool(),
            scoring_verbose=s.is_bool(),
            scoring_verbose_cc=s.is_bool(),
            dyetrack_n_counts=s.is_int(noneable=True),
            dyetrack_n_cycles=s.is_int(noneable=True),
            row_k_score_factor=s.is_float(),
            n_rows_limit=s.is_int(noneable=True),
            use_lognormal_model=s.is_bool(),
        )
    )
class TestNNParams(Params):
    """Default values and validation schema for the test-NN parameters."""

    # Values used for any parameter the caller does not supply.
    defaults = Munch(
        include_training_set=False,
        n_neighbors=8,
        dt_score_mode="gmm_normalized_wpdf_dist_sigma",
        dt_score_metric="",
        dt_score_bias=0.1,
        dt_filter_threshold=0,
        rare_penalty=0.8,
        penalty_coefs=None,
        radius=15.0,
        random_seed=None,
    )

    schema = s(
        s.is_kws_r(
            include_training_set=s.is_bool(),
            n_neighbors=s.is_int(),
            dt_score_bias=s.is_float(),
            # Closed set of accepted scoring modes.
            dt_score_mode=s.is_str(
                options=[
                    "gmm_normalized_wpdf",
                    "gmm_normalized_wpdf_dist_sigma",
                    "gmm_normalized_wpdf_no_inv_var",
                    "one",
                    "dt_freq_log_weight",
                    "cdist_normalized",
                    "cdist_weighted_sqrt",
                    "cdist_weighted_log",
                    "cdist_weighted_normalized",
                    "cdist_weighted_normalized_sqrt",
                    "cdist_weighted_normalized_log",
                ]
            ),
            # Closed set of accepted metric names; the empty string (the
            # default) is also accepted.
            dt_score_metric=s.is_str(
                options=[
                    "",
                    "braycurtis",
                    "canberra",
                    "chebyshev",
                    "cityblock",
                    "correlation",
                    "cosine",
                    "euclidean",
                    "jensenshannon",
                    "minkowski",
                    "seuclidean",
                    "sqeuclidean",
                ]
            ),
            dt_filter_threshold=s.is_int(),
            # Either None or a list of exactly two floats.
            penalty_coefs=s.is_list(
                elems=s.is_float(), min_len=2, max_len=2, noneable=True
            ),
            rare_penalty=s.is_float(noneable=True),
            radius=s.is_float(),
            random_seed=s.is_int(noneable=True),
        )
    )
class RadFilterParams(Params):
    """Default values and validation schema for the rad-filter parameters."""

    # Values used for any threshold the caller does not supply.
    defaults = Munch(
        field_quality_thresh=450.0,
        dark_thresh_in_stds=4.0,
        noi_thresh_in_stds=2.5,
    )

    # All three thresholds must be floats.
    schema = s(
        s.is_kws_r(
            field_quality_thresh=s.is_float(),
            dark_thresh_in_stds=s.is_float(),
            noi_thresh_in_stds=s.is_float(),
        )
    )
def it_validates_float():
    """A float schema accepts a float and rejects both a str and an int."""
    float_schema = s(s.is_float())

    # A genuine float validates without raising.
    float_schema.validate(1.0)

    # Anything that is not a float -- including an int -- must be rejected.
    for not_a_float in ("a str", 1):
        with zest.raises(SchemaValidationFailed):
            float_schema.validate(not_a_float)
def it_returns_required_elems():
    """requirements() lists (name, type, help, userdata) per top-level element."""
    userdata = dict(some_key=1)

    # One element of each kind, plus a nested all-required dict ("f").
    nested_elems = dict(d=s.is_int(), e=s.is_int())
    top_level_elems = dict(
        a=s.is_int(),
        b=s.is_float(help="A float"),
        c=s.is_number(),
        d=s.is_str(userdata=userdata),
        e=s.is_list(),
        f=s.is_dict(all_required=True, elems=nested_elems),
    )
    test_s = s(s.is_dict(all_required=True, elems=top_level_elems))

    # A number is reported as float, and the nested dict is reported as a
    # single "f" entry rather than being expanded.
    expected = [
        ("a", int, None, None),
        ("b", float, "A float", None),
        ("c", float, None, None),
        ("d", str, None, userdata),
        ("e", list, None, None),
        ("f", dict, None, None),
    ]

    reqs = test_s.requirements()
    assert reqs == expected
class ClassifyV1Generator(BaseGenerator):
    """
    General-purpose generator for classifying peptides/proteins.
    May be used to search for one or more "needle" peptides.

    Assumptions:

    Generator-specific arguments:
    @--protein_of_interest="P10636-8"     # Only affects reporting downstream
    """

    # The schema is assembled from the shared schema fragments declared on
    # BaseGenerator. Those fragments are, in general, subsets of the "params"
    # of different plaster tasks. Where a given parameter lives is somewhat
    # arbitrary because it may be shared by two tasks that both run as part of
    # a classify job: this generator classifies either simulations alone or
    # simulations plus real scope data, and both need n_edmans, n_mocks and
    # n_pres. Each key may appear in only one fragment, otherwise duplicate
    # keyword names would be passed into the schema below.
    schema = s(
        s.is_kws_r(
            **BaseGenerator.job_setup_schema.schema(),
            **BaseGenerator.protein_schema.schema(),
            **BaseGenerator.label_set_schema.schema(),
            **BaseGenerator.lnfit_schema.schema(),
            **BaseGenerator.scope_run_schema.schema(),
            **BaseGenerator.peptide_setup_schema.schema(),
            **BaseGenerator.sigproc_source_schema.schema(),
            **BaseGenerator.sigproc_v1_schema.schema(),
            **BaseGenerator.error_model_schema.schema(),
            **BaseGenerator.sim_schema.schema(),
            **BaseGenerator.scheme_schema.schema(),
            rf=s.is_bool(help="Include rf classifier", noneable=True),
            report_prec=s.is_list(
                elems=s.is_float(bounds=(0.001, 0.999)),
                help="The precision for classifier reporting",
            ),
        )
    )

    defaults = Munch(
        n_edmans=10,
        n_pres=0,
        n_mocks=1,
        n_samples_train=5_000,
        n_samples_test=1_000,
        decoys="none",
        random_seed=None,
        rf=True,
        sigproc_source=None,
        protein_of_interest=None,
        lnfit_name=None,
        lnfit_params=None,
        lnfit_dye_on_threshold=None,
        movie=False,
        radial_filter=None,
        peak_find_n_cycles=4,
        peak_find_start=0,
        anomaly_iqr_cutoff=95,
        # Disabled defaults, kept for reference:
        # dye_beta=[7500.0],
        # dye_sigma=[0.16],
        n_ptms_limit=5,
        report_prec=[0.95, 0.9, 0.8],
    )

    def apply_defaults(self):
        """Apply the schema defaults, then patch up empty list switches."""
        super().apply_defaults()

        # Plumbum creates empty lists on list switches, so the generic
        # default mechanism never replaces them.
        # TASK: Find a cleaner solution. For now the fallback is hard-coded.
        # (The same treatment for err_dye_beta / dye_sigma is disabled.)
        if len(self.report_prec) > 0:
            return
        self.report_prec = self.defaults.report_prec

    def validate(self):
        """Run schema validation; additionally `rf` must be truthy."""
        super().validate()
        assert self.rf

    def _deduce_report_kind(self):
        """
        Return the tuple (ptm_report, mhc_report, pro_report).

        At most one flag is True. PTM, MHC and PRO are the specialized
        job-level reports covering all runs of a job together. The default
        classify report emits one section per run, which becomes unwieldy for
        jobs with 50+ (or hundreds of) runs; the specialized templates give a
        high-level view instead. The kind is deduced from the inputs:

        * PTM: PTM locations were specified for any protein.
        * MHC: the job name or sample name contains the string "mhc". This is
          meant for jobs whose runs all share identical peptides (peptides
          given up front with no protease, or a single protease with many
          label schemes), which allows comparisons that are impossible when
          peptides differ from run to run. Detecting it by name is a hack.
        * PRO: there are proteins of interest.
        """
        if any([pro.get("ptm_locs") for pro in self.protein]):
            return True, False, False

        if "mhc" in self.job.lower() or "mhc" in self.sample.lower():
            return False, True, False

        has_poi = bool(self.protein_of_interest) or any(
            [pro.get("is_poi") for pro in self.protein]
        )
        return False, False, has_poi

    def generate(self):
        """
        Build and return the list of run descriptions for the job.

        One run is produced per (parameter permutation, sigproc task) pair.
        Report sections are emitted along the way.
        """
        self.report_section_user_config()

        # The empty placeholder guarantees the inner loop is traversed once
        # even when there is no sigproc source.
        sigproc_tasks = self.sigprocs_v1() or [{}]

        # TODO: 'default' reporting needs to be rethought. Maybe a gen switch
        # should select the report type, or there should be separate
        # generators instead of complex reporting logic here. Each project of
        # any substance wants its own kind of report; as more projects
        # arrive, they tend to group into a handful of basic types.
        # Every type here involves classification (photobleaching and other
        # sigproc-only jobs have their own report logic elsewhere). Currently
        # the types are: 'standard' sigproc with classify, spike-in sigproc
        # with classify, and the VFS-only ones: 'standard classify', PTM
        # classify and MHC classify (big, no protease, small uniform-length
        # peptides).
        ptm_report, mhc_report, pro_report = self._deduce_report_kind()
        wants_per_run_sections = not (ptm_report or mhc_report or pro_report)

        multiple_sigprocs = len(sigproc_tasks) > 1

        run_descs = []
        for protease, aa_list, err_set in self.run_parameter_permutator():
            for sigproc_i, sigproc_v1_task in enumerate(sigproc_tasks):
                prep_task = task_templates.prep(
                    self.protein,
                    protease,
                    self.decoys,
                    pois=self.protein_of_interest,
                    n_ptms_limit=self.n_ptms_limit,
                )

                train_rf_task = task_templates.train_rf()
                test_rf_task = task_templates.rf_v2()

                # Classification of real data only applies when a sigproc
                # task is present.
                classify_rf_task = {}
                if sigproc_v1_task:
                    classify_rf_task = task_templates.classify_rf_v1(
                        prep_relative_path="../prep",
                        sim_relative_path="../sim_v1",
                        train_relative_path="../train_rf",
                        sigproc_relative_path="../sigproc_v1",
                    )

                sim_v1_task = task_templates.sim_v1(
                    list(aa_list),
                    err_set,
                    n_pres=self.n_pres,
                    n_mocks=self.n_mocks,
                    n_edmans=self.n_edmans,
                    n_samples_train=self.n_samples_train,
                    n_samples_test=self.n_samples_test,
                )
                sim_v1_task.sim_v1.parameters.random_seed = self.random_seed

                lnfit_task = self.lnfits("v2")

                e_block = self.erisyon_block(aa_list, protease, err_set)

                # Only disambiguate run names when there are several sigprocs.
                sigproc_suffix = f"_sigproc_{sigproc_i}" if multiple_sigprocs else ""
                if self.force_run_name is not None:
                    run_name = self.force_run_name
                else:
                    run_name = f"{e_block._erisyon.run_name}{sigproc_suffix}"

                run_desc = Munch(
                    run_name=run_name,
                    **e_block,
                    **prep_task,
                    **sim_v1_task,
                    **train_rf_task,
                    **test_rf_task,
                    **sigproc_v1_task,
                    **lnfit_task,
                    **classify_rf_task,
                )
                run_descs.append(run_desc)

                # Jobs with a specialized (PTM/MHC/PRO) report do not get a
                # section per run.
                if wants_per_run_sections:
                    self.report_section_markdown(f"# RUN {run_desc.run_name}")
                    self.report_section_run_object(run_desc)
                    if test_rf_task:
                        self.report_section_from_template(
                            "train_and_test_template.ipynb"
                        )

        self.report_section_markdown(f"# JOB {self.job}")
        self.report_section_job_object()

        # Exactly one job-level template is appended.
        if ptm_report:
            job_template = "train_and_test_template_ptm.ipynb"
        elif mhc_report:
            job_template = "train_and_test_template_mhc.ipynb"
        elif pro_report:
            job_template = "train_and_test_template_pro.ipynb"
        else:
            job_template = "train_and_test_epilog_template.ipynb"
        self.report_section_from_template(job_template)

        n_runs = len(run_descs)
        has_sigproc = sigproc_tasks[0]

        if n_runs > 1 and has_sigproc:
            # TASK: better logic for when to include spike_template. --spike?
            self.report_section_from_template("spike_template.ipynb")

        sigproc_imports_desc = ""
        if has_sigproc:
            import_lines = [
                f"\t* {s.ims_import.inputs.src_dir}" for s in sigproc_tasks
            ]
            sigproc_imports_desc = "## Sigproc imports:\n" + "\n".join(import_lines)

            self.report_section_first_run_object()
            self.report_section_from_template("sigproc_v1_template.ipynb")
            self.report_section_from_template("classify_template.ipynb")

        self.report_preamble(
            utils.smart_wrap(
                f"""
                # Classify Overview
                ## {n_runs} run_desc(s) processed.
                ## Sample: {self.sample}
                ## Job: {self.job}
                {sigproc_imports_desc}
            """,
                width=None,
            )
        )

        return run_descs
class BaseGenerator(report_builder.ReportBuilder, Munch):
    """
    Base of all generators.

    Sub-classes provide the class members "schema" (used to apply defaults
    to, and validate, the kwargs given to __init__()) and "defaults".

    Inherits from ReportBuilder for backwards compatibility with
    generators which expect to find report methods on the generator class.
    """

    schema = None  # Should be overloaded in any sub-class
    defaults = {}  # Should be overloaded in any sub-class

    job_setup_schema = s(
        s.is_kws_r(
            job=s.is_str(help="See Main Help"),
            sample=s.is_str(allow_empty_string=False, help="See Main Help"),
        )
    )

    protein_schema = s(
        s.is_kws_r(
            protein=s.is_list(
                elems=s.is_kws_r(
                    id=s.is_str(),
                    seqstr=s.is_str(),
                )
            ),
            protein_of_interest=s.is_list(
                s.is_str(allow_empty_string=False),
                noneable=True,
                help="The id of the protein(s) of interest, used in survey and reporting",
            ),
        )
    )

    label_set_schema = s(
        s.is_kws_r(label_set=s.is_list(elems=s.is_str(), help="See Main Help"))
    )

    lnfit_schema = s(
        s.is_kws_r(
            lnfit_name=s.is_list(s.is_str(), noneable=True, help="See Main Help"),
            lnfit_params=s.is_list(s.is_str(), noneable=True, help="See Main Help"),
            lnfit_dye_on_threshold=s.is_list(
                s.is_int(), noneable=True, help="See Main Help"
            ),
            lnfit_photometry_only=s.is_list(
                s.is_str(), noneable=True, help="See Main Help"
            ),
        )
    )

    scope_run_schema = s(
        s.is_kws_r(
            n_edmans=s.is_int(help="See Main Help"),
            n_pres=s.is_int(help="See Main Help"),
            n_mocks=s.is_int(help="See Main Help"),
        )
    )

    peptide_setup_schema = s(
        s.is_kws_r(
            protease=s.is_list(elems=s.is_str(), help="See Main Help"),
            decoys=s.is_str(help="See Main Help"),
            random_seed=s.is_int(noneable=True, help="See Main Help"),
            n_ptms_limit=s.is_int(
                bounds=(0, 12),
                help="Max number of PTMs per peptide to allow. Peptides with more PTM sites than this will not consider any PTM permutations.",
            ),
        )
    )

    sim_schema = s(
        s.is_kws_r(
            n_samples_train=s.is_int(bounds=(1, None), help="See Main Help"),
            n_samples_test=s.is_int(bounds=(1, None), help="See Main Help"),
            allow_edman_cterm=s.is_bool(
                noneable=True,
                help="Edman cycles can remove final C-terminal AA from peptides at plate boundary.",
            ),
            use_lognormal_model=s.is_bool(
                help="Use older lognormal radiometry model",
            ),
            is_photobleaching_run=s.is_bool(),
            photobleaching_run_n_dye_count=s.is_int(noneable=True),
        )
    )

    sigproc_source_schema = s(
        s.is_kws_r(
            movie=s.is_bool(noneable=True, help="See Main Help"),
            n_cycles_limit=s.is_int(noneable=True, help="See Main Help"),
            start_cycle=s.is_int(noneable=True, help="See Main Help"),
            dst_ch_i_to_src_ch_i=s.is_str(noneable=True, help="Comma separated"),
        )
    )

    sigproc_v1_schema = s(
        s.is_kws_r(
            sigproc_source=s.is_str(noneable=True, help="See Main Help"),
            radial_filter=s.is_float(
                noneable=True, bounds=(0.01, 1.0), help="See Main Help"
            ),
            peak_find_n_cycles=s.is_int(bounds=(1, 10000), help="See Main Help"),
            peak_find_start=s.is_int(bounds=(0, 10000), help="See Main Help"),
            anomaly_iqr_cutoff=s.is_int(bounds=(1, 100), help="See Main Help"),
        )
    )

    sigproc_v2_schema = s(
        s.is_kws_r(
            calibration_job=s.is_str(noneable=True),
            sigproc_source=s.is_str(noneable=True, help="See Main Help"),
            self_calib=s.is_bool(noneable=True),
            ch_aln=s.is_str(noneable=True, help="comma delimited in x0,y0,x1,y1,..."),
            ch_for_alignment=s.is_int(noneable=True),
            calib_dst_ch_i_to_src_ch_i=s.is_str(noneable=True, help="Comma separated"),
        )
    )

    sigproc_v2_calib_schema = s(
        s.is_kws_r(
            sigproc_source=s.is_str(noneable=True, help="See Main Help"),
            movie=s.is_bool(noneable=True),
            mode=s.is_str(options=["illum"]),
            # mode will eventually have a second option "dye calib"
        )
    )

    # TODO: Remove all error_model_schema
    error_model_schema = s(
        s.is_kws_r(
            err_p_edman_failure=s.is_list(elems=s.is_str(help="See Main Help")),
            err_p_detach=s.is_list(elems=s.is_str(help="See Main Help")),
            err_p_bleach=s.is_list(elems=s.is_str(help="See Main Help")),
            err_p_non_fluorescent=s.is_list(elems=s.is_str(help="See Main Help")),
            err_row_k_sigma=s.is_list(elems=s.is_str(help="See Main Help")),
            # For lognormal: to be deprecated
            err_dye_beta=s.is_list(elems=s.is_str(help="See Main Help")),
            err_dye_sigma=s.is_list(elems=s.is_str(help="See Main Help")),
            err_dye_zero_beta=s.is_list(elems=s.is_str(help="See Main Help")),
            err_dye_zero_sigma=s.is_list(elems=s.is_str(help="See Main Help")),
            # For normal
            err_gain_mu=s.is_list(elems=s.is_str(help="See Main Help")),
            err_gain_sigma=s.is_list(elems=s.is_str(help="See Main Help")),
            err_bg_mu=s.is_list(elems=s.is_str(help="See Main Help")),
            err_bg_sigma=s.is_list(elems=s.is_str(help="See Main Help")),
        )
    )

    # Scheme is a flag that allows passing a pair of (protease, label_set) in directly,
    # rather than passing them separately and getting permutations
    scheme_schema = s(
        s.is_kws_r(scheme=s.is_list(elems=s.is_str(), help="See Main Help"))
    )

    classifier_choice_schema = s(s.is_kws_r(classifier=s.is_str()))

    error_model_defaults_chemistry = Munch(
        err_p_edman_failure=0.06,
        err_p_detach=0.05,
        err_p_bleach=0.05,
        err_p_non_fluorescent=0.07,
    )

    error_model_defaults_lognormal = Munch(
        err_row_k_sigma=0.16,
        err_dye_beta=7500.0,
        err_dye_sigma=0.16,
        err_dye_zero_beta=0.0,
        err_dye_zero_sigma=400.0,
    )

    error_model_defaults_normal = Munch(
        # Based on eye-balling val18_2t
        err_row_k_sigma=0.16,
        err_gain_mu=15_000.0,
        err_gain_sigma=1_200.0,
        err_bg_mu=0.0,
        err_bg_sigma=400.0,
    )

    has_report = True

    def __init__(self, **kwargs):
        """
        Store kwargs on the generator, apply defaults, parse the error-model
        terms, validate against the schema and set up the report containers.
        """
        super().__init__(**kwargs)
        self.apply_defaults()
        self.setup_err_model()
        self.validate()

        self.reports = Munch()
        self.add_report("report", self)

        # static reports are ipynb files that are placed in the _reports
        # folder under a job and are executed by the indexer.
        # self.static_reports is a list of file names (without paths)
        self.static_reports = []

        self._validate_protein_of_interest()

    def add_report(self, report_name, builder):
        """Register `builder` under `report_name`; the name must be unused."""
        assert report_name not in self.reports
        self.reports[report_name] = builder

    def _validate_protein_of_interest(self):
        """
        Check that every protein_of_interest names an id in the protein list.

        Does nothing when the generator has no "protein" key. A missing or
        None protein_of_interest (the schema allows None) means there is
        nothing to check.

        Raises:
            ValueError: if a protein of interest is not a known protein id.
        """
        if "protein" not in self:
            return

        seq_ids = {seq["id"] for seq in self.protein}
        for poi in self.get("protein_of_interest") or []:
            if poi not in seq_ids:
                raise ValueError(
                    f"protein_of_interest '{poi}' is not in the protein id list. "
                    f"Confirm you specified a Name and not a UniprotAC"
                )

    def setup_err_model(self):
        """
        Parse the err_* arguments into self.err_param_dict.

        Each value is a string of the form "[dye|]low[:high[:n_steps]]".
        The numeric part expands, via np.linspace, to n_steps values from low
        to high (a single value when only low is given). The result maps
        "<name>:<dye or 0>" to the de-duplicated list of values.

        Raises:
            SchemaValidationFailed: if a dye index is given for a term that
                does not take one, or is missing for a term that needs one.
        """
        terms_without_dye_index = (
            "err_p_edman_failure",
            "err_p_detach",
            "err_row_k_beta",
            "err_row_k_sigma",
        )

        err_param_dict = defaultdict(list)
        for name, _type, _help, _userdata in self.error_model_schema.requirements():
            # A term may be absent or None; both mean "no values given".
            for value in self.get(name) or []:
                low_prob, high_prob, step_prob = None, None, 1

                parts = value.split("|")
                if len(parts) == 2:
                    dye_part, prob_parts = parts
                else:
                    dye_part, prob_parts = None, parts[0]
                prob_parts = prob_parts.split(":")

                if name in terms_without_dye_index:
                    if dye_part:
                        raise SchemaValidationFailed(
                            f"error model term '{name}' is not allowed to have a dye-index."
                        )
                elif dye_part is None:
                    raise SchemaValidationFailed(
                        f"error model term '{name}' expected a dye-index."
                    )

                low_prob = float(prob_parts[0])
                if len(prob_parts) > 1:
                    high_prob = float(prob_parts[1])
                if len(prob_parts) > 2:
                    step_prob = int(prob_parts[2])
                if high_prob is None:
                    high_prob = low_prob

                key = f"{name}:{dye_part if dye_part is not None else 0}"
                err_param_dict[key] += np.linspace(
                    low_prob, high_prob, step_prob
                ).tolist()
                # Repeated values would only produce duplicate runs.
                err_param_dict[key] = list(set(err_param_dict[key]))

        self.err_param_dict = err_param_dict

    def apply_defaults(self):
        """Overloadable by sub-classes."""
        self.schema.apply_defaults(self.defaults, self, override_nones=True)

    def validate(self):
        """Overloadable by sub-classes for extra validation"""
        self.schema.validate(self, context=self.__class__.__name__)

    def sigprocs_v1(self):
        """
        Return a list of sigproc_v1 task blocks.

        The list holds one Munch (ims_import + sigproc_v1) when a
        sigproc_source is set, and is empty otherwise.
        """
        tasks = []
        if self.sigproc_source:
            ims_import = task_templates.ims_import(
                self.sigproc_source,
                is_movie=self.movie,
                n_cycles_limit=self.n_cycles_limit,
                start_cycle=self.start_cycle,
                dst_ch_i_to_src_ch_i=self.dst_ch_i_to_src_ch_i,
            )

            sigproc = task_templates.sigproc_v1()
            parameters = sigproc.sigproc_v1.parameters
            parameters.radial_filter = self.radial_filter
            parameters.peak_find_n_cycles = self.peak_find_n_cycles
            parameters.peak_find_start = self.peak_find_start
            parameters.anomaly_iqr_cutoff = self.anomaly_iqr_cutoff

            tasks += [Munch(**ims_import, **sigproc)]

        return tasks

    def tasks_for_sigproc_v2(self):
        """
        Return the ims_import + sigproc_v2 task block.

        Returns an empty dict when no sigproc_source is set. When a
        calibration job is given its priors are loaded, and optionally
        remapped to a different channel order.
        """
        tasks = {}
        if not self.sigproc_source:
            return tasks

        ims_import_task = task_templates.ims_import(
            self.sigproc_source,
            is_movie=self.movie,
            n_cycles_limit=self.n_cycles_limit,
            start_cycle=self.start_cycle,
            dst_ch_i_to_src_ch_i=self.dst_ch_i_to_src_ch_i,
        )

        calib_priors = None
        if self.calibration_job is not None:
            calib_src_path = (
                local.path(self.calibration_job)
                / "sigproc_v2_calib/plaster_output/sigproc_v2"
            )
            calib_result = SigprocV2Result.load_from_folder(
                calib_src_path, prop_list=["calib_priors"]
            )
            calib_priors = calib_result.calib_priors

            # The remap is applied only here, where a calibration job has
            # supplied priors to remap.
            if self.calib_dst_ch_i_to_src_ch_i is not None:
                # Convert a string like 2,1,0 and remap
                check.t(self.calib_dst_ch_i_to_src_ch_i, str)
                calib_dst_ch_i_to_src_ch_i = [
                    int(ch_i) for ch_i in self.calib_dst_ch_i_to_src_ch_i.split(",")
                ]

                ch_remapped_priors = Priors.copy(calib_priors)
                ch_remapped_priors.delete_ch_specific_records()

                ch_aln_prior = ch_remapped_priors.get_exact("ch_aln")
                if ch_aln_prior is not None:
                    # NOTE(review): the remapped alignment prior is never
                    # stored back into ch_remapped_priors -- confirm whether
                    # ch_remap works in place or the result should be added.
                    ch_aln_prior = ChannelAlignPrior.ch_remap(
                        ch_aln_prior.prior, calib_dst_ch_i_to_src_ch_i
                    )

                def remap(src_key, dst_key):
                    # Copy one per-channel prior from its source key in the
                    # original priors to its destination key.
                    prior = calib_priors.get_exact(src_key)
                    if prior is not None:
                        ch_remapped_priors.add(
                            dst_key, prior.prior, "remapped channel in gen"
                        )

                for dst_ch_i, src_ch_i in enumerate(calib_dst_ch_i_to_src_ch_i):
                    remap(f"reg_illum.ch_{src_ch_i}", f"reg_illum.ch_{dst_ch_i}")
                    remap(f"reg_psf.ch_{src_ch_i}", f"reg_psf.ch_{dst_ch_i}")

                calib_priors = ch_remapped_priors

        ch_aln = None
        if self.ch_aln is not None:
            # "x0,y0,x1,y1,..." becomes one (x, y) row per pair.
            ch_aln = np.array([float(i) for i in self.ch_aln.split(",")])
            assert ch_aln.shape[0] % 2 == 0
            ch_aln = ch_aln.reshape((-1, 2))

        sigproc_v2_task = task_templates.sigproc_v2_analyze(
            calib_priors=calib_priors,
            self_calib=self.self_calib,
            ch_aln=ch_aln,
            ch_for_alignment=self.ch_for_alignment,
        )

        tasks = Munch(**ims_import_task, **sigproc_v2_task)
        return tasks

    def lnfits(self, sigproc_version):
        """
        Return a dict of lnfit tasks keyed by task name.

        It is common to have multiple lnfit tasks for a single run, so the
        tasks get unique names ("lnfit_<i>" or the user-given name) when more
        than one is present. A single dye-on threshold is shared by all fits.
        The result is empty when no lnfit_params are set.

        Raises:
            ValueError: if lnfit_params is given without a dye-on threshold.
        """
        lnfit_tasks = {}
        if not self.lnfit_params:
            return lnfit_tasks

        if not self.lnfit_dye_on_threshold:
            raise ValueError(
                f"You must specify a --lnfit_dye_on_threshold when --lnfit_params is given"
            )

        n_fits = len(self.lnfit_params)

        # Work on a copy: this method runs once per run and must not grow
        # the generator's own threshold list.
        dye_thresholds = list(self.lnfit_dye_on_threshold)
        if n_fits > 1 and len(dye_thresholds) == 1:
            dye_thresholds = dye_thresholds * n_fits

        lnfit_names = self.lnfit_name or ([None] * n_fits)
        photometries_only = self.get("lnfit_photometry_only") or ([True] * n_fits)

        assert n_fits == len(dye_thresholds)
        assert n_fits == len(lnfit_names)

        for i, (params, thresh, name, photometry_only) in enumerate(
            zip(self.lnfit_params, dye_thresholds, lnfit_names, photometries_only)
        ):
            task = task_templates.lnfit(sigproc_version=sigproc_version)
            task.lnfit.parameters["lognormal_fitter_v2_params"] = params
            task.lnfit.parameters["dye_on_threshold"] = thresh
            # User values are strings, the fallback values are bools;
            # str() lets both go through the same parse.
            task.lnfit.parameters["photometry_only"] = str(
                photometry_only
            ).lower() in ("true", "1")

            task_name = "lnfit"
            if n_fits > 1 or name:
                task_name = name or f"lnfit_{i}"
                helpers.task_rename(task, task_name)
            lnfit_tasks[task_name] = task[task_name]

        return lnfit_tasks

    def run_name(self, aa_list, protease=None, err_set=None):
        """
        A helper for run folder names based on aa_list and protease.
        Note, not all generators will use this convention.

        Compose a run_name from protease and aa_list in normalized form,
        followed by "_" and the first 4 hex digits of the md5 of the
        JSON-encoded err_set (nothing follows the "_" when err_set is None).
        Square brackets are removed from the labels, "bleach" stands in for a
        None aa_list, and every run of characters outside [0-9a-z_] becomes
        a single "_".

        Eg: protease="trypsin", aa_list=("DE", "K"), err_set=None
            => "trypsin_de_k_"
        """
        if protease is None:
            protease = ""

        if aa_list is not None:
            aas = "_".join(a.replace("[", "").replace("]", "") for a in aa_list)
        else:
            aas = "bleach"

        if err_set is not None:
            err_str = hashlib.md5(json.dumps(err_set).encode()).hexdigest()[0:4]
        else:
            err_str = ""

        prefix = protease + ("_" if protease != "" else "")
        return re.sub(
            "[^0-9a-z_]+",
            "_",
            (prefix + aas).lower() + "_" + err_str,
        )

    def _label_str_permutate(self, label_str):
        """
        Return list of permutations of a label_str such as:
        "A,B,C:2" => ("A", "B"), ("A", "C"), ("B", "C")

        Without a ":count" all labels form a single set:
        "A,B" => ("A", "B")

        A suffix label set may be added to each permutation with +:
        "A,B,C:2+S" => ("A", "B", "S"), ("A", "C", "S"), ("B", "C", "S")
        "A,B,C:2+S,T" => ("A", "B", "S", "T"), ("A", "C", "S", "T"), ("B", "C", "S", "T")

        Raises:
            ValueError: on more than one ":" or "+", a non-integer count, or
                a count that is not between 1 and len(labels) - 1.
        """
        check.t(label_str, str)

        semi_split = label_str.split(":")
        if len(semi_split) > 2:
            raise ValueError(f"Label-set '{label_str}' has >1 colon.")

        suffix_labels = []
        if len(semi_split) == 2:
            suffix_split = semi_split[1].split("+")
            if len(suffix_split) > 2:
                raise ValueError(f"Label-set '{label_str}' has >1 plus.")
            if len(suffix_split) == 2:
                semi_split = [semi_split[0], suffix_split[0]]
                suffix_labels = [
                    slabel.strip() for slabel in suffix_split[1].split(",")
                ]

        labels = [label.strip() for label in semi_split[0].split(",")]

        if len(semi_split) == 1:
            perm_count = len(labels)
        else:
            perm_count = int(semi_split[1])
            if not 0 < perm_count < len(labels):
                raise ValueError(
                    f"Label-set '{label_str}' has a permutation count "
                    f"of {perm_count}; needs to be between 0 and {len(labels) - 1}"
                )

        perms = list(itertools.combinations(labels, perm_count))

        if suffix_labels:
            perms = [p + tuple(suffix_labels) for p in perms]

        return perms

    def label_set_permutate(self) -> List[Tuple[str, ...]]:
        """
        Returns a list of label sets, where each label set is a tuple of strings
        """
        check.list_t(self.label_set, str)
        return utils.flatten(
            [self._label_str_permutate(label_str) for label_str in self.label_set],
            1,
        )

    def error_set_permutate(self):
        """
        Return one list per error-model key, each holding a (key, value)
        tuple for every value that key may take.
        """
        tuples = [
            [(key, val) for val in vals] for key, vals in self.err_param_dict.items()
        ]
        return tuples

    def scheme_set_permutate(self) -> List[Scheme]:
        """
        Unparsed schemes are of form: protease/label_set, where protease is a str,
        and label_set is a str parseable by self._label_str_permutate

        A missing or None scheme list yields no schemes.

        Raises:
            ValueError: if a scheme is not of the form protease/label_set.
        """
        parsed_schemes = []
        for scheme in self.get("scheme") or []:
            split = scheme.split("/")
            if len(split) != 2 or not all(split):
                raise ValueError(f"Scheme {scheme} must be of form: protease/label_set")
            parsed_label_set = self._label_str_permutate(split[1])
            parsed_schemes += [
                Scheme(split[0], label_set) for label_set in parsed_label_set
            ]
        return parsed_schemes

    def _build_err_set(self, n_channels, use_lognormal_model, chemistry):
        """
        Assemble an error set from the `chemistry` probabilities and the
        radiometry defaults of the selected (lognormal or normal) model.

        The key order of the result is significant: run_name() hashes the
        JSON encoding of the error set.
        """
        if use_lognormal_model:
            # TODO: No longer correct
            radiometry = self.error_model_defaults_lognormal
            gain_mu = radiometry.err_dye_beta
            gain_sigma = radiometry.err_dye_sigma
            bg_mu = radiometry.err_dye_zero_beta
            bg_sigma = radiometry.err_dye_zero_sigma
        else:
            radiometry = self.error_model_defaults_normal
            gain_mu = radiometry.err_gain_mu
            gain_sigma = radiometry.err_gain_sigma
            bg_mu = radiometry.err_bg_mu
            bg_sigma = radiometry.err_bg_sigma

        return Munch(
            p_edman_failure=[chemistry.err_p_edman_failure] * 1,
            p_detach=[chemistry.err_p_detach] * 1,
            p_bleach=[chemistry.err_p_bleach] * n_channels,
            p_non_fluorescent=[chemistry.err_p_non_fluorescent] * n_channels,
            row_k_sigma=[radiometry.err_row_k_sigma] * 1,
            gain_mu=[gain_mu] * n_channels,
            gain_sigma=[gain_sigma] * n_channels,
            bg_mu=[bg_mu] * n_channels,
            bg_sigma=[bg_sigma] * n_channels,
        )

    def default_err_set(self, n_channels, use_lognormal_model):
        """
        Return the default error set for `n_channels` channels, using the
        chemistry defaults plus the lognormal or normal radiometry defaults.
        """
        return self._build_err_set(
            n_channels, use_lognormal_model, self.error_model_defaults_chemistry
        )

    def photobleaching_err_set(self, n_channels, use_lognormal_model):
        """
        Return the error set for a photobleaching run: like default_err_set()
        but with every chemistry probability set to 0.0.
        """
        no_chemistry = Munch(
            err_p_edman_failure=0.0,
            err_p_detach=0.0,
            err_p_bleach=0.0,
            err_p_non_fluorescent=0.0,
        )
        return self._build_err_set(n_channels, use_lognormal_model, no_chemistry)

    def run_parameter_permutator(self, use_lognormal_model=True):
        """
        Generate permutations of all the variable parameters
        Defaults all arguments to self.*
        Gracefully handles lack of protease.

        Yields (protease, label_set, err_set) once per permutation of
        protease x label set x error-model values, followed by one per
        directly specified scheme (which uses the default error set).
        """
        proteases = utils.non_none(self.get("protease"), [None])
        proteases = [("protease", p) for p in proteases]

        label_sets = [("label_set", ls) for ls in self.label_set_permutate()]

        if len(proteases) == 0:
            proteases = [("protease", None)]

        err_sets = self.error_set_permutate()

        combined = [proteases, label_sets] + err_sets

        # Schemes is a list of schemes, where each scheme is a tuple containing:
        # - A protease, in the form of Tuple['protease', str]
        # - A label set, in the form of Tuple['label_set', Tuple[str, ...]]
        # - Optionally, error-model (key, value) tuples
        # Build scheme set from protease and label set args
        schemes = list(itertools.product(*combined))

        # Add in directly specified schemes
        schemes += [
            (("protease", scheme.protease), ("label_set", scheme.label_set))
            for scheme in self.scheme_set_permutate()
        ]

        for params in schemes:
            protease = utils.filt_first(params, lambda i: i[0] == "protease")[1]
            label_set = utils.filt_first(params, lambda i: i[0] == "label_set")[1]

            # Given that the label_set is now known, the error model can be setup
            n_channels = len(label_set)
            err_set = self.default_err_set(n_channels, use_lognormal_model)

            for param in params:
                if param[0].startswith("err_"):
                    # Keys look like "err_<term>:<dye index>"
                    term, dye_i = param[0].split(":")
                    # The 4: removes the "err_"
                    err_set[term[4:]][int(dye_i)] = param[1]

            yield protease, label_set, err_set

    def erisyon_block(self, aa_list, protease=None, err_set=None):
        """Return the erisyon task block for a run, named via run_name()."""
        return task_templates.erisyon(
            run_name=self.run_name(aa_list, protease, err_set),
            sample=self.sample,
            generator_name=self.__class__.__name__,
        )

    def report_section_user_config(self, report=None):
        """
        Emit report configuration parameters specified by the user via gen so that
        they can be further edited if desired, and used by reporting functions
        in the templates.

        Both the markdown header and the code cell go to `report`, which
        defaults to this generator. Nothing is emitted when neither
        protein_of_interest nor report_prec is set.
        """
        if report is None:
            report = self

        config = []
        if self.protein_of_interest:
            config += [f"PGEN_protein_of_interest = {self.protein_of_interest}\n"]
        if self.report_prec:
            config += [f"PGEN_report_precisions = {self.report_prec}\n"]

        if config:
            report.report_section_markdown("# PGEN-controlled report config")
            config = [
                f"# These values were or can be specified by the user at gen time:\n"
            ] + config
            report.add_report_section("code", config)

    def report_assemble(self):
        """
        Overrides report_assemble in ReportBuilder to implement the
        self.has_report behavior: returns None when has_report is false.
        """
        if not self.has_report:
            return None
        return super().report_assemble()

    def generate(self):
        """
        Abstract method to be overloaded.
        Expected to return a list of runs.
        """
        pass
class SigprocV1Params(Params):
    """
    Parameters for the sigproc_v1 task.

    ``defaults`` supplies the fallback value of every parameter and ``schema``
    declares the accepted type and bounds of each one.
    """

    defaults = {
        "hat_rad": 2,
        "iqr_rng": 96,
        "threshold_abs": 1.0,
        "channel_indices_for_alignment": None,
        "channel_indices_for_peak_finding": None,
        "radiometry_channels": None,
        "save_debug": False,
        "peak_find_n_cycles": 4,
        "peak_find_start": 0,
        "radial_filter": None,
        "anomaly_iqr_cutoff": 95,
        "n_fields_limit": None,
        "save_full_signal_radmat_npy": False,
    }

    schema = s(
        s.is_kws_r(
            anomaly_iqr_cutoff=s.is_number(noneable=True, bounds=(0, 100)),
            radial_filter=s.is_float(noneable=True, bounds=(0, 1)),
            peak_find_n_cycles=s.is_int(bounds=(1, None), noneable=True),
            peak_find_start=s.is_int(bounds=(0, None), noneable=True),
            save_debug=s.is_bool(),
            hat_rad=s.is_int(bounds=(1, 3)),
            iqr_rng=s.is_number(noneable=True, bounds=(0, 100)),
            # Not sure of a reasonable bound
            threshold_abs=s.is_number(bounds=(0, 100)),
            channel_indices_for_alignment=s.is_list(s.is_int(), noneable=True),
            channel_indices_for_peak_finding=s.is_list(s.is_int(), noneable=True),
            radiometry_channels=s.is_dict(noneable=True),
            n_fields_limit=s.is_int(noneable=True),
            save_full_signal_radmat_npy=s.is_bool(),
        )
    )

    def validate(self):
        """
        Apply defaults, validate against the schema, then check that every
        radiometry channel has a well-formed name and an integer index.
        """
        # super().validate() is deliberately not called: defaults must be
        # applied with override_nones=False here.
        self.schema.apply_defaults(
            self.defaults, apply_to=self, override_nones=False
        )
        self.schema.validate(self, context=self.__class__.__name__)

        if self.radiometry_channels is None:
            return

        name_pattern = re.compile(r"[0-9a-z_]+")
        for channel_name, channel_index in self.radiometry_channels.items():
            self._validate(
                name_pattern.fullmatch(channel_name),
                "radiometry_channels name must be lower-case alphanumeric (including underscore)",
            )
            self._validate(
                isinstance(channel_index, int), "channel_i must be an integer"
            )

    def set_radiometry_channels_from_input_channels_if_needed(self, n_channels):
        """
        When no radiometry channels were given, create one per input channel,
        named "ch_0", "ch_1", ... and mapped to the same index.
        """
        if self.radiometry_channels is not None:
            return

        # Assume channels from nd2 manifest
        self.radiometry_channels = {
            f"ch_{index}": index for index in range(n_channels)
        }

    @property
    def n_output_channels(self):
        """Number of entries in radiometry_channels."""
        channel_names = self.radiometry_channels.keys()
        return len(channel_names)

    @property
    def n_input_channels(self):
        """Number of entries in radiometry_channels (same as n_output_channels)."""
        channel_names = self.radiometry_channels.keys()
        return len(channel_names)

    @property
    def channels_cycles_dim(self):
        """
        Return the cached value stored in _outchannels_inchannels_cycles_dim.

        The cache is set in sigproc_v1 and serves the repeated unpacking:
            n_outchannels, n_inchannels, n_cycles, dim = ...
        """
        return self._outchannels_inchannels_cycles_dim

    def _input_channels(self):
        """
        Return the input channel index of each output channel, where output
        channels are ordered by sorted radiometry channel name.

        Example:
            radiometry_channels = {"foo": 1, "bar": 0}
            The sorted names are ["bar", "foo"], so this returns [0, 1]:
            output channel 0 ("bar") reads input channel 0 and
            output channel 1 ("foo") reads input channel 1.
        """
        # Names are unique dict keys, so sorting the (name, index) pairs
        # orders them by name alone.
        pairs_by_name = sorted(self.radiometry_channels.items())
        return [input_index for _, input_index in pairs_by_name]

    # def input_names(self):
    #     return sorted(self.radiometry_channels.keys())

    def output_channel_to_input_channel(self, out_ch):
        """Return the input channel index that feeds output channel out_ch."""
        input_by_output = self._input_channels()
        return input_by_output[out_ch]

    def input_channel_to_output_channel(self, in_ch):
        """Not every input channel necessarily has an output; can return None"""
        return utils.filt_first_arg(
            self._input_channels(), lambda candidate: candidate == in_ch
        )
class ErrorModel(Params):
    """
    Error-model parameters: global Edman/detach probabilities plus one record
    per dye and one per label.

    Alternate constructors:
        no_errors      every probability is zero
        from_err_set   built from the err_set used by the pgen error iterators
        from_defaults  built from the default values
    """

    schema = s(
        s.is_kws_r(
            p_dud=s.is_deprecated(),
            p_edman_failure=s.is_float(bounds=(0, 1)),
            p_detach=s.is_float(bounds=(0, 1)),
            dyes=s.is_list(
                elems=s.is_kws_r(
                    dye_name=s.is_str(),
                    p_bleach_per_cycle=s.is_float(bounds=(0, 1)),
                    p_non_fluorescent=s.is_float(bounds=(0, 1)),
                    # gain and vpd are the new parameters and beta, sigma are the legacy
                    gain=s.is_float(required=False, bounds=(0, None)),
                    vpd=s.is_float(required=False, bounds=(0, None)),
                    beta=s.is_float(required=False, bounds=(0, None)),
                    sigma=s.is_float(required=False, bounds=(0, None)),
                )
            ),
            labels=s.is_list(
                elems=s.is_kws_r(
                    label_name=s.is_str(),
                    p_failure_to_bind_amino_acid=s.is_float(bounds=(0, 1)),
                    p_failure_to_attach_to_dye=s.is_float(bounds=(0, 1)),
                )
            ),
        )
    )

    defaults = Munch(p_edman_failure=0.06, p_detach=0.05, dyes=[], labels=[])

    def __init__(self, **kwargs):
        """
        Fill in missing per-dye and per-label probabilities, then hand the
        kwargs to the parent constructor.

        A dye or label that lacks one of its probabilities receives the
        same-named channel-wide keyword argument when the caller passed one,
        otherwise the built-in fallback (0.05 / 0.07 for dyes, 0.0 for labels).

        The dye and label records are mutated in place and must support both
        attribute assignment and ``.get`` (the constructors of this class pass
        Munch instances).
        """
        dyes = kwargs["dyes"] = kwargs.pop("dyes", [])
        if dyes:
            # Pop each channel-wide value ONCE. Popping inside the loop handed
            # the caller's value to the first dye only; every later dye fell
            # back to the hard-coded default.
            # With no dyes the kwargs are left untouched, as before.
            p_bleach_per_cycle = kwargs.pop("p_bleach_per_cycle", 0.05)
            p_non_fluorescent = kwargs.pop("p_non_fluorescent", 0.07)
            for dye in dyes:
                dye.p_bleach_per_cycle = dye.get(
                    "p_bleach_per_cycle", p_bleach_per_cycle
                )
                dye.p_non_fluorescent = dye.get(
                    "p_non_fluorescent", p_non_fluorescent
                )

        labels = kwargs["labels"] = kwargs.pop("labels", [])
        if labels:
            # Same single-pop rule as for the dyes above.
            p_failure_to_bind_amino_acid = kwargs.pop(
                "p_failure_to_bind_amino_acid", 0.0
            )
            p_failure_to_attach_to_dye = kwargs.pop(
                "p_failure_to_attach_to_dye", 0.0
            )
            for label in labels:
                label.p_failure_to_bind_amino_acid = label.get(
                    "p_failure_to_bind_amino_acid", p_failure_to_bind_amino_acid
                )
                label.p_failure_to_attach_to_dye = label.get(
                    "p_failure_to_attach_to_dye", p_failure_to_attach_to_dye
                )

        super().__init__(**kwargs)

    @classmethod
    def no_errors(cls, n_channels, **kwargs):
        """
        Build a model with n_channels dyes/labels in which every probability
        is zero.

        beta, sigma, gain and vpd may be passed as keyword arguments (defaults
        7500.0, 0.0, 10.0, 0.1) and are applied to every dye; any remaining
        kwargs are forwarded to the constructor.
        """
        beta = kwargs.pop("beta", 7500.0)
        sigma = kwargs.pop("sigma", 0.0)
        gain = kwargs.pop("gain", 10.0)
        vpd = kwargs.pop("vpd", 0.1)
        return cls(
            p_edman_failure=0.0,
            p_detach=0.0,
            dyes=[
                Munch(
                    dye_name=f"dye_{ch}",
                    p_bleach_per_cycle=0.0,
                    p_non_fluorescent=0.0,
                    sigma=sigma,
                    beta=beta,
                    gain=gain,
                    vpd=vpd,
                )
                for ch in range(n_channels)
            ],
            labels=[
                Munch(
                    label_name=f"label_{ch}",
                    p_failure_to_bind_amino_acid=0.0,
                    p_failure_to_attach_to_dye=0.0,
                )
                for ch in range(n_channels)
            ],
            **kwargs,
        )

    @classmethod
    def from_err_set(cls, err_set, **kwargs):
        """err_set is a construct used by the error iterators in pgen"""
        # The channel count is taken from the length of p_non_fluorescent;
        # the other per-dye lists are zipped against it below.
        n_channels = len(err_set.p_non_fluorescent)
        return cls(
            p_edman_failure=err_set.p_edman_failure[0],
            p_detach=err_set.p_detach[0],
            dyes=[
                Munch(
                    dye_name=f"dye_{ch}",
                    p_bleach_per_cycle=p_bleach_per_cycle,
                    p_non_fluorescent=p_non_fluorescent,
                    sigma=dye_sigma,
                    beta=dye_beta,
                    gain=dye_gain,
                    vpd=dye_vpd,
                )
                for (
                    ch,
                    dye_beta,
                    dye_sigma,
                    dye_gain,
                    dye_vpd,
                    p_bleach_per_cycle,
                    p_non_fluorescent,
                ) in zip(
                    range(n_channels),
                    err_set.dye_beta,
                    err_set.dye_sigma,
                    err_set.dye_gain,
                    err_set.dye_vpd,
                    err_set.p_bleach_per_cycle,
                    err_set.p_non_fluorescent,
                )
            ],
            labels=[
                Munch(
                    label_name=f"label_{ch}",
                    p_failure_to_bind_amino_acid=0.0,
                    p_failure_to_attach_to_dye=0.0,
                )
                for ch in range(n_channels)
            ],
            **kwargs,
        )

    @classmethod
    def from_defaults(cls, n_channels):
        """Build a model with n_channels dyes/labels using the default values."""
        return cls(
            p_edman_failure=cls.defaults.p_edman_failure,
            p_detach=cls.defaults.p_detach,
            dyes=[
                Munch(
                    dye_name=f"dye_{ch}",
                    p_bleach_per_cycle=0.05,
                    p_non_fluorescent=0.07,
                    sigma=0.16,
                    beta=7500.0,
                    gain=7500.0,
                    vpd=0.10,
                )
                for ch in range(n_channels)
            ],
            labels=[
                Munch(
                    label_name=f"label_{ch}",
                    p_failure_to_bind_amino_acid=0.0,
                    p_failure_to_attach_to_dye=0.0,
                )
                for ch in range(n_channels)
            ],
        )

    def scale_dyes(self, key, scalar):
        """Multiply the `key` entry of every dye by `scalar`, in place."""
        for dye in self.dyes:
            dye[key] *= scalar

    def set_dye_param(self, key, val):
        """Set the `key` entry of every dye to `val`, in place."""
        for dye in self.dyes:
            dye[key] = val
class BaseGenerator(Munch):
    """
    Base of all generators.

    Sub-classes are expected to overload the class members "schema" (used to
    apply defaults to, and validate, the kwargs given to __init__()) and
    "defaults". The other *_schema members are reusable schema fragments.
    """

    schema = None  # Should be overloaded in any sub-class
    defaults = {}  # Should be overloaded in any sub-class

    job_setup_schema = s(
        s.is_kws_r(
            job=s.is_str(help="See Main Help"),
            sample=s.is_str(allow_empty_string=False, help="See Main Help"),
        )
    )

    protein_schema = s(
        s.is_kws_r(
            protein=s.is_list(
                elems=s.is_kws_r(
                    id=s.is_str(),
                    seqstr=s.is_str(),
                )
            ),
            protein_of_interest=s.is_list(
                s.is_str(allow_empty_string=False),
                noneable=True,
                help="The id of the protein(s) of interest, used in survey and reporting",
            ),
        )
    )

    label_set_schema = s(
        s.is_kws_r(label_set=s.is_list(elems=s.is_str(), help="See Main Help"))
    )

    lnfit_schema = s(
        s.is_kws_r(
            lnfit_name=s.is_list(s.is_str(), noneable=True, help="See Main Help"),
            lnfit_params=s.is_list(s.is_str(), noneable=True, help="See Main Help"),
            lnfit_dye_on_threshold=s.is_list(
                s.is_int(), noneable=True, help="See Main Help"
            ),
            lnfit_photometry_only=s.is_list(
                s.is_str(), noneable=True, help="See Main Help"
            ),
        )
    )

    scope_run_schema = s(
        s.is_kws_r(
            n_edmans=s.is_int(help="See Main Help"),
            n_pres=s.is_int(help="See Main Help"),
            n_mocks=s.is_int(help="See Main Help"),
        )
    )

    peptide_setup_schema = s(
        s.is_kws_r(
            protease=s.is_list(elems=s.is_str(), help="See Main Help"),
            decoys=s.is_str(help="See Main Help"),
            random_seed=s.is_int(noneable=True, help="See Main Help"),
            n_ptms_limit=s.is_int(
                bounds=(0, 12),
                help="Max number of PTMs per peptide to allow. Peptides with more PTM sites than this will not consider any PTM permutations.",
            ),
        )
    )

    sim_schema = s(
        s.is_kws_r(
            n_samples_train=s.is_int(bounds=(1, None), help="See Main Help"),
            n_samples_test=s.is_int(bounds=(1, None), help="See Main Help"),
        )
    )

    classify_schema = s(
        s.is_kws_r(
            classify_skip_nn=s.is_bool(
                help="Skips Nearest Neighbor classifier if set"
            ),
            classify_skip_rf=s.is_bool(help="Skips Random Forest classifier if set"),
            report_prec=s.is_list(
                elems=s.is_float(bounds=(0.001, 0.999)),
                help="The precision for classifier reporting",
            ),
        )
    )

    sigproc_source_schema = s(
        s.is_kws_r(
            sigproc_source=s.is_list(s.is_str(), noneable=True, help="See Main Help"),
            movie=s.is_bool(help="See Main Help"),
            n_frames_limit=s.is_int(
                bounds=(1, 500), noneable=True, help="See Main Help"
            ),
        )
    )

    sigproc_v1_schema = s(
        s.is_kws_r(
            radial_filter=s.is_float(
                noneable=True, bounds=(0.01, 1.0), help="See Main Help"
            ),
            peak_find_n_cycles=s.is_int(bounds=(1, 10000), help="See Main Help"),
            peak_find_start=s.is_int(bounds=(0, 10000), help="See Main Help"),
            anomaly_iqr_cutoff=s.is_int(bounds=(1, 100), help="See Main Help"),
        )
    )

    sigproc_v2_schema = s(
        s.is_kws_r(
            calibration_file=s.is_str(),
            instrument_subject_id=s.is_str(),
        )
    )

    # Notebook-level metadata copied into every assembled report
    report_metadata = Munch(
        metadata=Munch(
            kernelspec=Munch(
                display_name="Python 3", language="python", name="python3"
            ),
            language_info=Munch(
                codemirror_mode=Munch(name="ipython", version=3),
                file_extension=".py",
                mimetype="text/x-python",
                name="python",
                nbconvert_exporter="python",
                pygments_lexer="ipython3",
                version="3.6.7",
            ),
        ),
        nbformat=4,
        nbformat_minor=2,
    )

    error_model_schema = s(
        s.is_kws_r(
            err_p_edman_failure=s.is_list(elems=s.is_str(help="See Main Help")),
            err_p_detach=s.is_list(elems=s.is_str(help="See Main Help")),
            err_dye_beta=s.is_list(elems=s.is_str(help="See Main Help")),
            err_dye_sigma=s.is_list(elems=s.is_str(help="See Main Help")),
            err_p_bleach_per_cycle=s.is_list(elems=s.is_str(help="See Main Help")),
            err_p_non_fluorescent=s.is_list(elems=s.is_str(help="See Main Help")),
        )
    )

    error_model_defaults = Munch(
        err_p_edman_failure=0.06,
        err_p_detach=0.05,
        err_dye_beta=7500.0,
        err_dye_sigma=0.16,
        err_dye_gain=7500.0,
        err_dye_vpd=0.1,
        err_p_bleach_per_cycle=0.05,
        err_p_non_fluorescent=0.07,
    )

    # Prototype notebook cells; copied (shallowly) for every emitted cell
    code_block = Munch(
        cell_type="code",
        execution_count=None,
        metadata=Munch(),
        outputs=[],
        source=[],
    )

    markdown_block = Munch(cell_type="markdown", metadata=Munch(), source=[])

    def __init__(self, **kwargs):
        """
        Store kwargs, apply defaults, parse the error-model terms, validate
        against the schema and initialize the (empty) report state.
        """
        # APPLY defaults and then ask user for any elements that are not declared
        super().__init__(**kwargs)
        self.apply_defaults()
        debug(self)
        self.setup_err_model()
        self.validate()
        self._report_sections = []
        self._report_preamble = None
        self._validate_protein_of_interest()

    def _validate_protein_of_interest(self):
        """
        Check that every protein_of_interest is the id of a given protein.

        Does nothing when the generator has no "protein" entry.

        Raises:
            ValueError: a protein of interest is not among the protein ids.
        """
        if "protein" in self:
            seq_ids = {seq["id"] for seq in self.protein}
            # protein_of_interest is declared noneable in protein_schema, so a
            # None (or absent) value means there is nothing to check.
            for poi in self.get("protein_of_interest") or []:
                if poi not in seq_ids:
                    raise ValueError(
                        f"protein_of_interest '{poi}' is not in the protein id list. "
                        f"Confirm you specified a Name and not a UniprotAC"
                    )

    def setup_err_model(self):
        """
        Parse the user's err_* terms into self.err_param_dict.

        Each term is a string of the form "[dye|]low[:high[:steps]]":
          * the "dye|" prefix is mandatory for the per-dye terms and forbidden
            for err_p_edman_failure and err_p_detach;
          * low/high/steps are passed to np.linspace; high defaults to low and
            steps defaults to 1.

        The result maps "<term name>:<dye index>" (index 0 for the terms
        without a dye) to the list of distinct values to try.

        Raises:
            SchemaValidationFailed: a dye index is present where it is not
                allowed, or missing where it is required.
        """
        err_param_dict = defaultdict(list)
        # Only the name of each requirement is used; the remaining tuple fields
        # get underscore names so the builtin `type` is not shadowed.
        for name, _type, _, _user_data in self.error_model_schema.requirements():
            # This runs before validate(), so tolerate an explicit None here
            # and leave the verdict on it to the schema.
            values = self.get(name) or []
            for value in values:
                low_prob, high_prob, step_prob = None, None, 1

                parts = value.split("|")
                if len(parts) == 2:
                    dye_part = parts[0]
                    prob_parts = parts[1]
                else:
                    dye_part = None
                    prob_parts = parts[0]

                prob_parts = prob_parts.split(":")

                if name in ("err_p_edman_failure", "err_p_detach"):
                    if dye_part:
                        raise SchemaValidationFailed(
                            f"error model term '{name}' is not allowed to have a dye-index."
                        )
                else:
                    if dye_part is None:
                        raise SchemaValidationFailed(
                            f"error model term '{name}' expected a dye-index."
                        )

                low_prob = float(prob_parts[0])
                if len(prob_parts) > 1:
                    high_prob = float(prob_parts[1])
                if len(prob_parts) > 2:
                    step_prob = int(prob_parts[2])
                if high_prob is None:
                    high_prob = low_prob

                key = f"{name}:{dye_part if dye_part is not None else 0}"
                err_param_dict[key] += np.linspace(
                    low_prob, high_prob, step_prob
                ).tolist()
                # Drop values repeated across overlapping terms
                err_param_dict[key] = list(set(err_param_dict[key]))

        self.err_param_dict = err_param_dict

    def apply_defaults(self):
        """Overloadable by sub-classes."""
        self.schema.apply_defaults(self.defaults, self, override_nones=True)

    def validate(self):
        """Overloadable by sub-classes for extra validation"""
        self.schema.validate(self, context=self.__class__.__name__)

    def ims_imports(self, sigproc_source):
        """
        Return an ims_import task for sigproc_source.

        In movie mode the number of cycles is limited to self.n_frames_limit.
        """
        if self.movie:
            ims_import = task_templates.ims_import(
                sigproc_source, is_movie=True, n_cycles_limit=self.n_frames_limit
            )
        else:
            ims_import = task_templates.ims_import(sigproc_source, is_movie=False)
        return ims_import

    def sigprocs_v1(self):
        """
        Return one combined ims_import + sigproc_v1 task per sigproc_source
        (an empty list when there are no sources).
        """
        sigproc_tasks = []
        if self.sigproc_source:
            for ss in self.sigproc_source:
                ims_import = self.ims_imports(ss)
                sigproc = task_templates.sigproc_v1()
                # task_templates returns a generic sigproc_v1 task, and we can fill in some
                # parameters that any sigproc_v1 task might have based on the CliSwitches for
                # BaseVFSCommand. So any subclass will automatically get these params set.
                # Where should the schema check for them?
                sigproc.sigproc_v1.parameters.radial_filter = self.radial_filter
                sigproc.sigproc_v1.parameters.peak_find_n_cycles = (
                    self.peak_find_n_cycles
                )
                sigproc.sigproc_v1.parameters.peak_find_start = self.peak_find_start
                sigproc.sigproc_v1.parameters.anomaly_iqr_cutoff = (
                    self.anomaly_iqr_cutoff
                )
                sigproc_task = Munch(**ims_import, **sigproc)
                sigproc_tasks += [sigproc_task]
        return sigproc_tasks

    def sigprocs_v2(self, **kwargs):
        """
        Return one combined ims_import + sigproc_v2 task per sigproc_source
        (an empty list when there are no sources).

        kwargs are forwarded to task_templates.sigproc_v2.
        """
        sigproc_tasks = []
        if self.sigproc_source:
            for ss in self.sigproc_source:
                ims_import = self.ims_imports(ss)
                sigproc = task_templates.sigproc_v2(**kwargs)
                # task_templates returns a generic sigprocv2 task, and we can fill in some
                # parameters that any sigprocv2 task might have based on the CliSwitches for
                # BaseVFSCommand. So any subclass will automatically get these params set.
                # Where should the schema check for them?
                sigproc_task = Munch(**ims_import, **sigproc)
                sigproc_tasks += [sigproc_task]
        return sigproc_tasks

    def lnfits(self):
        """
        Return a dict of lnfit tasks keyed by task name, one per lnfit_params
        entry (empty when lnfit_params is not set).

        It is common to have multiple lnfit tasks for a single run, so unique
        task names are used when more than one is present: the matching
        lnfit_name when given, otherwise "lnfit_<index>". A single unnamed
        task keeps the name "lnfit".

        Raises:
            ValueError: lnfit_params is given without lnfit_dye_on_threshold.
        """
        lnfit_tasks = {}
        if self.lnfit_params:
            if not self.lnfit_dye_on_threshold:
                raise ValueError(
                    "You must specify a --lnfit_dye_on_threshold when --lnfit_params is given"
                )

            n_fits = len(self.lnfit_params)
            dye_thresholds = self.lnfit_dye_on_threshold
            lnfit_names = self.lnfit_name or ([None] * n_fits)
            photometries_only = self.lnfit_photometry_only or ([True] * n_fits)

            if n_fits > 1 and len(dye_thresholds) == 1:
                # A single threshold applies to every fit. Build a NEW list:
                # `*=` would grow self.lnfit_dye_on_threshold in place.
                dye_thresholds = list(dye_thresholds) * n_fits

            assert n_fits == len(dye_thresholds)
            assert n_fits == len(lnfit_names)

            for i, (params, thresh, name, photometry_only) in enumerate(
                zip(self.lnfit_params, dye_thresholds, lnfit_names, photometries_only)
            ):
                task = task_templates.lnfit()
                task.lnfit.parameters["lognormal_fitter_v2_params"] = params
                task.lnfit.parameters["dye_on_threshold"] = thresh
                # User values arrive as strings, but the fallback above is the
                # bool True; str() lets both go through the same test
                # ("True".lower() == "true") instead of failing on bool.lower.
                task.lnfit.parameters["photometry_only"] = str(
                    photometry_only
                ).lower() in ("true", "1")

                task_name = "lnfit"
                if n_fits > 1 or name:
                    task_name = name or f"lnfit_{i}"
                    helpers.task_rename(task, task_name)
                lnfit_tasks[task_name] = task[task_name]
        return lnfit_tasks

    def run_name(self, aa_list, protease=None, err_set=None):
        """
        A helper for run folder names based on aa_list and protease.
        Note, not all generators will use this convention.

        Compose a run_name from protease and aa_list in normalized form,
        followed by "_" and, when err_set is given, the first 4 hex digits of
        the md5 of its JSON dump:
            protease="trypsin", aa_list=("DE", "K") => "trypsin_de_k_"

        Square brackets are removed from the labels, and every run of
        characters other than 0-9, a-z and "_" becomes a single "_".
        """
        if protease is None:
            protease = ""
        aa_list = [a.replace("[", "").replace("]", "") for a in aa_list]
        aa = "_".join(aa_list)
        if err_set is not None:
            # md5 only serves as a short, stable tag for the name here
            err_str = hashlib.md5(json.dumps(err_set).encode()).hexdigest()[0:4]
        else:
            err_str = ""
        return re.sub(
            "[^0-9a-z_]+",
            "_",
            (protease + ("_" if protease != "" else "") + aa).lower() + "_" + err_str,
        )

    def _label_str_permutate(self, label_str):
        """
        Return list of permutations of a label_str such as:
            "A,B,C:2" => ("A", "B"), ("A", "C"), ("B", "C")

        Without a ":count" all labels are taken together:
            "A,B,C" => ("A", "B", "C")

        A suffix label set may be added to each permutation with +:
            "A,B,C:2+S" => ("A", "B", "S"), ("A", "C", "S"), ("B", "C", "S")
            "A,B,C:2+S,T" => ("A", "B", "S", "T"), ("A", "C", "S", "T"), ("B", "C", "S", "T")

        Raises:
            ValueError: more than one ":" or "+", or a count that is not
                between 1 and len(labels) - 1.
        """
        check.t(label_str, str)
        semi_split = label_str.split(":")

        if len(semi_split) > 2:
            raise ValueError(f"Label-set '{label_str}' has >1 colon.")

        suffix_labels = ""
        if len(semi_split) == 2:
            suffix_split = semi_split[1].split("+")
            if len(suffix_split) > 2:
                raise ValueError(f"Label-set '{label_str}' has >1 plus.")
            if len(suffix_split) == 2:
                semi_split = [semi_split[0], suffix_split[0]]
                suffix_labels = suffix_split[1].split(",")
                suffix_labels = [slabel.strip() for slabel in suffix_labels]

        labels = semi_split[0].split(",")
        labels = [label.strip() for label in labels]

        if len(semi_split) == 1:
            perm_count = len(labels)
        else:
            perm_count = int(semi_split[1])
            if not 0 < perm_count < len(labels):
                raise ValueError(
                    f"Label-set '{label_str}' has a permutation count "
                    f"of {perm_count}; needs to be between 0 and {len(labels) - 1}"
                )

        perms = list(itertools.combinations(labels, perm_count))

        if suffix_labels:
            perms = [p + tuple(suffix_labels) for p in perms]

        return perms

    def label_set_permutate(self):
        """Return the permutations of every label_set entry as one flat list."""
        check.list_t(self.label_set, str)
        return utils.flatten(
            [self._label_str_permutate(label_str) for label_str in self.label_set],
            1,
        )

    def error_set_permutate(self):
        """
        Return one list per err_param_dict key, each holding the (key, value)
        pairs of that key.
        """
        tuples = [
            [(key, val) for val in vals] for key, vals in self.err_param_dict.items()
        ]
        return tuples

    def run_parameter_permutator(self):
        """
        Generate permutations of all the variable parameters
        Defaults all arguments to self.*
        Gracefully handles lack of protease.

        Yields (protease, label_set, err_set) for every combination of
        protease, label set and error-model value.
        """
        proteases = utils.non_none(self.get("protease"), [None])
        if len(proteases) == 0:
            proteases = [None]
        proteases = [("protease", p) for p in proteases]

        label_sets = self.label_set_permutate()
        label_sets = [("label_set", s) for s in label_sets]

        err_sets = self.error_set_permutate()

        combined = [proteases, label_sets] + err_sets

        for params in itertools.product(*combined):
            protease = utils.filt_first(params, lambda i: i[0] == "protease")
            protease = protease[1]
            label_set = utils.filt_first(params, lambda i: i[0] == "label_set")
            label_set = label_set[1]

            # Given that the label_set is now known, the error model can be setup
            n_channels = len(label_set)
            err_set = Munch(
                p_edman_failure=[self.error_model_defaults.err_p_edman_failure] * 1,
                p_detach=[self.error_model_defaults.err_p_detach] * 1,
                dye_beta=[self.error_model_defaults.err_dye_beta] * n_channels,
                dye_sigma=[self.error_model_defaults.err_dye_sigma] * n_channels,
                dye_gain=[self.error_model_defaults.err_dye_gain] * n_channels,
                dye_vpd=[self.error_model_defaults.err_dye_vpd] * n_channels,
                p_bleach_per_cycle=[self.error_model_defaults.err_p_bleach_per_cycle]
                * n_channels,
                p_non_fluorescent=[self.error_model_defaults.err_p_non_fluorescent]
                * n_channels,
            )

            # Overlay the swept values: a key looks like "err_<field>:<index>"
            for param in params:
                if param[0].startswith("err_"):
                    parts = param[0].split(":")
                    # The 4: removes the "err_"
                    err_set[parts[0][4:]][int(parts[1])] = param[1]

            yield protease, label_set, err_set

    def erisyon_block(self, aa_list, protease=None, err_set=None):
        """Return the erisyon task block for the run named by run_name()."""
        return task_templates.erisyon(
            run_name=self.run_name(aa_list, protease, err_set),
            sample=self.sample,
            generator_name=self.__class__.__name__,
        )

    def _markdown_to_markdown_block(self, markdown):
        """Return a markdown cell whose source is `markdown`, one entry per line."""
        lines = [f"{line}\n" for line in markdown.split("\n")]
        block = Munch(**self.markdown_block)
        block.source = lines
        return block

    def report_preamble(self, markdown):
        """Set the preamble, in markdown format"""
        self._report_preamble = markdown

    def report_section_markdown(self, markdown):
        """Append a markdown section to the report."""
        self._report_sections += [("markdown", markdown)]

    def report_section_run_object(self, run):
        """Append a code section that loads `run` as a RunResult."""
        self._report_sections += [
            (
                "code",
                [f'run = RunResult("./{run.run_name}")'],
            ),
        ]

    def report_section_job_object(self):
        """Append a code section that loads this job as a JobResult."""
        self._report_sections += [
            (
                "code",
                [f'job = JobResult("//jobs_folder/{self.job}")'],
            ),
        ]

    def report_section_user_config(self):
        """
        Emit report configuration parameters specified by the user via gen so that they
        can be further edited if desired, and used by reporting functions in the templates.
        """
        config = []
        if self.protein_of_interest:
            config += [f"PGEN_protein_of_interest = {self.protein_of_interest}\n"]
        if self.report_prec:
            config += [f"PGEN_report_precisions = {self.report_prec}\n"]
        if config:
            self.report_section_markdown("# PGEN-controlled report config")
            config = [
                "# These values were or can be specified by the user at gen time:\n"
            ] + config
            self._report_sections += [("code", config)]

    def report_section_run_array(self, runs, to_load=None):
        """
        Append a code section that loads every run in `runs` with RunLoader,
        passing to_load through when it is given.
        """
        to_load_string = "" if to_load is None else f", to_load={to_load}"
        run_names = [run.run_name for run in runs]
        self._report_sections += [
            (
                "code",
                [
                    f"run_names = {run_names}\n"
                    f'runs = [RunLoader(f"./{{name}}"{to_load_string}) for name in run_names]'
                ],
            )
        ]

    def report_section_from_template(self, template_name):
        """Append a section that is filled from the named notebook template"""
        self._report_sections += [("template", template_name)]

    def report_assemble(self):
        """
        Assemble the report from its pieces. A giant Munch is returned.

        The cells are, in order: the preamble, one merged import cell, then
        every section in the order it was added.
        """
        report = Munch(**self.report_metadata)
        report.cells = []

        # NOTE: report_preamble() must have been called before this point;
        # the initial None preamble cannot be split into lines.
        preamble_block = self._markdown_to_markdown_block(self._report_preamble)
        report.cells += [preamble_block]

        # LOAD all templates
        templates_by_name = {}
        for section_type, section_data in self._report_sections:
            if section_type == "template":
                file_path = section_data
                templates_by_name[file_path] = utils.json_load_munch(
                    f"./plaster/gen/nb_templates/{file_path}"
                )

        # FIND all of the @IMPORT-MERGE blocks
        import_merge = []
        for _, template in templates_by_name.items():
            for cell in template.cells:
                if cell.cell_type == "code":
                    first_line = utils.safe_list_get(cell.source, 0, "")
                    if "# @IMPORT-MERGE" in first_line:
                        for line in cell.source:
                            if "import" in line:
                                import_merge += [line]

        import_merge += ["from plaster.tools.zplots import zplots\n"]
        # De-duplicate and sort the imports; the zplots setup call goes last
        import_merge = sorted(list(set(import_merge))) + ["z=zplots.setup()"]
        import_block = Munch(**self.code_block)
        import_block.source = import_merge
        report.cells += [import_block]

        for section_type, section_data in self._report_sections:
            if section_type == "code":
                lines = section_data
                block = Munch(**self.code_block)
                block.source = lines
                report.cells += [block]

            elif section_type == "markdown":
                block = self._markdown_to_markdown_block(section_data)
                report.cells += [block]

            elif section_type == "template":
                file_path = section_data
                template = templates_by_name[file_path]
                for cell in template.cells:
                    if cell.cell_type == "code":
                        first_line = utils.safe_list_get(cell.source, 0, "")
                        # Import cells were merged above; cells marked
                        # @REMOVE-FROM-TEMPLATE are dropped.
                        if (
                            "@IMPORT-MERGE" not in first_line
                            and "@REMOVE-FROM-TEMPLATE" not in first_line
                        ):
                            block = Munch(**self.code_block)
                            block.source = cell.source
                            report.cells += [block]

                    if cell.cell_type == "markdown":
                        block = Munch(**self.markdown_block)
                        block.source = cell.source
                        report.cells += [block]

        return report

    def report_task(self):
        """No-op in the base class."""
        pass

    def generate(self):
        """
        Abstract method to be overloaded.
        Expected to return a list of runs.
        """
        pass
class SigprocV2Params(ParamsAndPriors):
    """
    Parameters for the sigproc_v2 task.

    About Calibration:
        The long term goal of the calibration files is to dissociate
        the name of the file from the records (subjects) in the file.
        For now, we're going to load all records from the calibration file
    """

    defaults = {
        "divs": 5,
        "peak_mea": 11,
        "n_fields_limit": None,
        "run_regional_balance": True,
        "run_analysis_gauss2_fitter": False,
        "run_aligner": True,
        "run_per_cycle_peakfinder": False,
        # TODO: Derive the following during calibration by spectral analysis
        #   (ie, 2 std of the power spectrum)
        # ALSO: This needs to be moved into the calibration because it can not
        #   be allowed to differ from the calibration results: the calibration
        #   bakes in the PSF as a function of these parameters.
        "low_inflection": 0.03,
        "low_sharpness": 50.0,
        "high_inflection": 0.50,
        "high_sharpness": 50.0,
        "self_calib": False,
        "no_calib": False,
        "instrument_identity": None,
        "save_full_signal_radmat_npy": True,
        "calibration_file": None,
        "channel_align_bounds": None,
        "n_cycles_limit": None,
        "ch_aln_override": None,
        "ch_for_alignment": None,
        "run_fast_peak_finder": False,
        "run_minimal_analysis_gauss2_fitter": True,
    }

    schema = s(
        s.is_kws_r(
            calibration_file=s.is_str(noneable=True, required=False),
            instrument_identity=s.is_str(noneable=True),
            mode=s.is_str(options=common.SIGPROC_V2_MODES),
            divs=s.is_int(),
            peak_mea=s.is_int(),
            n_fields_limit=s.is_int(noneable=True),
            run_regional_balance=s.is_bool(),
            run_analysis_gauss2_fitter=s.is_bool(),
            run_aligner=s.is_bool(),
            run_per_cycle_peakfinder=s.is_bool(),
            low_inflection=s.is_float(),
            low_sharpness=s.is_float(),
            high_inflection=s.is_float(),
            high_sharpness=s.is_float(),
            self_calib=s.is_bool(noneable=True),
            no_calib=s.is_bool(noneable=True),
            save_full_signal_radmat_npy=s.is_bool(),
            channel_align_bounds=s.is_int(noneable=True),
            n_cycles_limit=s.is_int(noneable=True),
            # ch_aln_override allows for a temporarily needed hack to bypass
            # the calibration system
            ch_aln_override=s.is_list(
                elems=s.is_list(elems=s.is_float()), noneable=True
            ),
            ch_for_alignment=s.is_int(noneable=True),
            run_fast_peak_finder=s.is_bool(),
            run_minimal_analysis_gauss2_fitter=s.is_bool(),
        )
    )

    def validate(self):
        """
        Apply defaults, validate against the schema, then check that the
        calibration options are compatible with each other.

        Returns True; an incompatible combination fails an assertion.
        """
        # super().validate() is deliberately not called: defaults must be
        # applied with override_nones=False here.
        self.schema.apply_defaults(
            self.defaults, apply_to=self, override_nones=False
        )
        self.schema.validate(self, context=self.__class__.__name__)

        if self.mode == common.SIGPROC_V2_ILLUM_CALIB:
            # Nothing further is checked in illumination-calibration mode.
            # ZBS: At the moment these checks are more trouble than they are worth
            # if local.path(self.calibration_file).exists():
            #     if not log.confirm_yn(
            #         f"\nCalibration file '{self.calibration_file}' already exists "
            #         "when creating a SIGPROC_V2_PSF_CALIB. Overwrite?",
            #         "y",
            #     ):
            #         raise SchemaValidationFailed(
            #             f"Not overwriting calibration file '{self.calibration_file}'"
            #         )
            return True

        # Analyzing
        if self.self_calib:
            assert (
                self.calibration_file is None
            ), "In self-calibration mode you may not specify a calibration file"
            assert (
                self.instrument_identity is None
            ), "In self-calibration mode you may not specify an instrument identity"
            assert (
                self.no_calib is not True
            ), "In self-calibration mode you may not specify the no_calib option"
        # Disabled branch, kept for reference:
        # elif (
        #     not self.no_calib
        #     and self.calibration_file != ""
        #     and self.calibration_file is not None
        # ):
        #     self.calibration = Calib.load_file(
        #         self.calibration_file, self.instrument_identity
        #     )
        elif self.no_calib:
            assert (
                self.no_calib_psf_sigma is not None
            ), "In no_calib mode you must specify an estimated no_calib_psf_sigma"

        return True