Example no. 1
class ImsImportParams(Params):
    defaults = Munch(
        is_movie=False,
        start_field=0,
        n_fields_limit=None,
        start_cycle=0,
        n_cycles_limit=None,
        dst_ch_i_to_src_ch_i=None,
        is_z_stack_single_file=False,
        z_stack_n_slices_per_field=None,
    )

    # Note that in movie mode, what is called a "field" is really a "frame",
    # since the stage does not move between shots.
    # The single .nd2 file in movie mode therefore treats the "fields" as if
    # they were "cycles" of a single field.

    schema = s(
        s.is_kws_r(
            is_movie=s.is_bool(noneable=True),
            start_field=s.is_int(),
            n_fields_limit=s.is_int(noneable=True),
            start_cycle=s.is_int(noneable=True),
            n_cycles_limit=s.is_int(noneable=True),
            dst_ch_i_to_src_ch_i=s.is_list(elems=s.is_int(), noneable=True),
            is_z_stack_single_file=s.is_bool(),
            z_stack_n_slices_per_field=s.is_int(noneable=True),
        ))
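
A minimal usage sketch (hypothetical values; assumes Params subclasses accept their schema fields as keyword arguments, as SimParams does below):

# Movie-mode import: per the note above, the .nd2 frames are treated as
# "cycles" of a single field because the stage does not move.
ims_import_params = ImsImportParams(
    is_movie=True,
    n_cycles_limit=100,  # hypothetical cap on frames treated as cycles
)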
Example no. 2
class PrepParams(Params):
    defaults = Munch(
        protease=None,
        decoy_mode=None,
        include_misses=0,
        n_peps_limit=None,
        drop_duplicates=False,
        n_ptms_limit=None,
    )

    schema = s(
        s.is_kws_r(
            protease=s.is_list(noneable=True, elems=s.is_str()),
            decoy_mode=s.is_str(noneable=True),
            include_misses=s.is_int(),
            n_peps_limit=s.is_int(noneable=True),
            drop_duplicates=s.is_bool(),
            n_ptms_limit=s.is_int(noneable=True),
            proteins=s.is_list(
                s.is_kws(
                    name=s.is_str(required=True),
                    sequence=s.is_str(required=True),
                    ptm_locs=s.is_str(noneable=True),
                    report=s.is_int(noneable=True),
                    abundance=s.is_number(noneable=True),
                )),
        ))
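
A sketch (hypothetical values; assumes Params accepts its schema fields as keyword arguments) of the nested shape the proteins entry must satisfy under the schema above:

prep_params = PrepParams(
    protease=["trypsin"],
    proteins=[
        # name and sequence are required; the remaining fields are noneable
        dict(
            name="P10636-8",
            sequence="MAEPRQEFEVME",  # hypothetical fragment
            ptm_locs=None,
            report=1,
            abundance=None,
        ),
    ],
)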
Example no. 3
class NNV2Params(ParamsAndPriors):
    defaults = Munch(
        include_training_set=False,
        n_neighbors=8,
        dt_score_bias=0.1,
        include_sigproc=False,
        run_against_all_dyetracks=False,
        run_row_k_fit=True,
        scoring_verbose=False,
        scoring_verbose_cc=False,
        dyetrack_n_counts=None,
        dyetrack_n_cycles=None,
        row_k_score_factor=0.05,
        cycle_balance=None,
        n_rows_limit=None,
        use_lognormal_model=False,
    )

    schema = s(
        s.is_kws_r(
            prior_desc=Priors.priors_desc_schema,
            include_training_set=s.is_bool(),
            n_neighbors=s.is_int(),
            dt_score_bias=s.is_float(),
            include_sigproc=s.is_bool(),
            run_row_k_fit=s.is_bool(),
            run_against_all_dyetracks=s.is_bool(),
            scoring_verbose=s.is_bool(),
            scoring_verbose_cc=s.is_bool(),
            dyetrack_n_counts=s.is_int(noneable=True),
            dyetrack_n_cycles=s.is_int(noneable=True),
            row_k_score_factor=s.is_float(),
            n_rows_limit=s.is_int(noneable=True),
            use_lognormal_model=s.is_bool(),
        ))
Example no. 4
class LNFitParams(Params):
    defaults = Munch(photometry_only=False)

    schema = s(
        s.is_kws_r(
            dye_on_threshold=s.is_int(),
            photometry_only=s.is_bool(),
            lognormal_fitter_v2_params=s.is_str(),
        ))
Example no. 5
class TestNNParams(Params):
    defaults = Munch(
        include_training_set=False,
        n_neighbors=8,
        dt_score_mode="gmm_normalized_wpdf_dist_sigma",
        dt_score_metric="",
        dt_score_bias=0.1,
        dt_filter_threshold=0,
        rare_penalty=0.8,
        penalty_coefs=None,
        radius=15.0,
        random_seed=None,
    )

    schema = s(
        s.is_kws_r(
            include_training_set=s.is_bool(),
            n_neighbors=s.is_int(),
            dt_score_bias=s.is_float(),
            dt_score_mode=s.is_str(options=[
                "gmm_normalized_wpdf",
                "gmm_normalized_wpdf_dist_sigma",
                "gmm_normalized_wpdf_no_inv_var",
                "one",
                "dt_freq_log_weight",
                "cdist_normalized",
                "cdist_weighted_sqrt",
                "cdist_weighted_log",
                "cdist_weighted_normalized",
                "cdist_weighted_normalized_sqrt",
                "cdist_weighted_normalized_log",
            ]),
            dt_score_metric=s.is_str(options=[
                "",
                "braycurtis",
                "canberra",
                "chebyshev",
                "cityblock",
                "correlation",
                "cosine",
                "euclidean",
                "jensenshannon",
                "minkowski",
                "seuclidean",
                "sqeuclidean",
            ]),
            dt_filter_threshold=s.is_int(),
            penalty_coefs=s.is_list(elems=s.is_float(),
                                    min_len=2,
                                    max_len=2,
                                    noneable=True),
            rare_penalty=s.is_float(noneable=True),
            radius=s.is_float(),
            random_seed=s.is_int(noneable=True),
        ))
Example no. 6
class SigprocV2Params(Params):
    defaults = dict(
        radiometry_channels=None,
        n_fields_limit=None,
        save_full_signal_radmat_npy=False,
        # use_cycle_zero_psfs_only=False,
    )

    schema = s(
        s.is_kws_r(
            radiometry_channels=s.is_dict(noneable=True),
            n_fields_limit=s.is_int(noneable=True),
            save_full_signal_radmat_npy=s.is_bool(),
            calibration=s.is_dict(),
            instrument_subject_id=s.is_str(noneable=True),
            # use_cycle_zero_psfs_only=s.is_bool(),
        ))

    def validate(self):
        # Note: does not call super() because override_nones is set to False here
        self.schema.apply_defaults(self.defaults,
                                   apply_to=self,
                                   override_nones=False)
        self.schema.validate(self, context=self.__class__.__name__)

        self.calibration = Calibration(self.calibration)
        if self.instrument_subject_id is not None:
            self.calibration.filter_subject_ids(self.instrument_subject_id)
            if len(self.calibration.keys()) == 0:
                raise ValueError(
                    f"All calibration records removed after filter_subject_ids on subject_id '{self.instrument_subject_id}'"
                )

        assert not self.calibration.has_subject_ids()

        if self.radiometry_channels is not None:
            pat = re.compile(r"[0-9a-z_]+")
            for name, channel_i in self.radiometry_channels.items():
                self._validate(
                    pat.fullmatch(name),
                    "radiometry_channels name must be lower-case alphanumeric (including underscore)",
                )
                self._validate(isinstance(channel_i, int),
                               "channel_i must be an integer")

    def set_radiometry_channels_from_input_channels_if_needed(
            self, n_channels):
        if self.radiometry_channels is None:
            # Assume channels from nd2 manifest
            channels = list(range(n_channels))
            self.radiometry_channels = {f"ch_{ch}": ch for ch in channels}

    @property
    def n_output_channels(self):
        return len(self.radiometry_channels.keys())

    @property
    def n_input_channels(self):
        return len(self.radiometry_channels.keys())

    # @property
    # def channels_cycles_dim(self):
    #     # This is a cache set in sigproc_v1.
    #     # It is a helper for the repetitive call:
    #     # n_outchannels, n_inchannels, n_cycles, dim =
    #     return self._outchannels_inchannels_cycles_dim

    def _input_channels(self):
        """
        Return a list that converts channel number of the output to the channel of the input
        Example:
            input might have channels ["foo", "bar"]
            the radiometry_channels has: {"bar": 0}]
            Thus this function returns [1] because the 0th output channel is mapped
            to the "1" input channel
        """
        return [
            self.radiometry_channels[name]
            for name in sorted(self.radiometry_channels.keys())
        ]

    # def input_names(self):
    #     return sorted(self.radiometry_channels.keys())

    def output_channel_to_input_channel(self, out_ch):
        return self._input_channels()[out_ch]

    def input_channel_to_output_channel(self, in_ch):
        """Not every input channel necessarily has an output; can return None"""
        return utils.filt_first_arg(self._input_channels(),
                                    lambda x: x == in_ch)
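
A standalone sketch (hypothetical values) of the channel-mapping rule implemented by _input_channels(): names are sorted, so the 0th output channel is whichever radiometry_channels key sorts first.

# radiometry_channels maps an output channel name to an input channel index.
radiometry_channels = {"bar": 1}
input_channels = [
    radiometry_channels[name] for name in sorted(radiometry_channels.keys())
]
assert input_channels == [1]  # output channel 0 reads input channel 1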
Example no. 7
class ClassifyV1Generator(BaseGenerator):
    """
    General-purpose generator for classifying peptides/proteins.
    May be used to search for one or more "needle" peptides.

    Assumptions:

    Generator-specific arguments:
    @--protein_of_interest="P10636-8"           # Only affects reporting downstream

    """

    # These schemas are in general subsets of the "params" for different plaster tasks,
    # and for convenience in sharing among generators they are defined in BaseGenerator.
    # It's a bit arbitrary where some parameters end up, because they might be shared
    # by two different tasks that both get run as part of a classify run.  For example,
    # this classify generator supports runs that classify either just simulations, or
    # additionally actual data from a scope.  Both sims and scope runs need n_edmans,
    # n_mocks, n_pres.  But the schemas for each cannot both contain these, else we'd
    # pass duplicate key names into the schema below.

    schema = s(
        s.is_kws_r(
            **BaseGenerator.job_setup_schema.schema(),
            **BaseGenerator.protein_schema.schema(),
            **BaseGenerator.label_set_schema.schema(),
            **BaseGenerator.lnfit_schema.schema(),
            **BaseGenerator.scope_run_schema.schema(),
            **BaseGenerator.peptide_setup_schema.schema(),
            **BaseGenerator.sigproc_source_schema.schema(),
            **BaseGenerator.sigproc_v1_schema.schema(),
            **BaseGenerator.error_model_schema.schema(),
            **BaseGenerator.sim_schema.schema(),
            **BaseGenerator.scheme_schema.schema(),
            rf=s.is_bool(help="Include rf classifier", noneable=True),
            report_prec=s.is_list(
                elems=s.is_float(bounds=(0.001, 0.999)),
                help="The precision for classifier reporting",
            ),
        )
    )

    defaults = Munch(
        n_edmans=10,
        n_pres=0,
        n_mocks=1,
        n_samples_train=5_000,
        n_samples_test=1_000,
        decoys="none",
        random_seed=None,
        rf=True,
        sigproc_source=None,
        protein_of_interest=None,
        lnfit_name=None,
        lnfit_params=None,
        lnfit_dye_on_threshold=None,
        movie=False,
        radial_filter=None,
        peak_find_n_cycles=4,
        peak_find_start=0,
        anomaly_iqr_cutoff=95,
        # dye_beta=[7500.0],
        # dye_sigma=[0.16],
        n_ptms_limit=5,
        report_prec=[0.95, 0.9, 0.8],
    )

    def apply_defaults(self):
        super().apply_defaults()

        # Plumbum creates empty lists on list switches. This means
        # that apply_defaults doesn't quite work right.
        # TASK: Find a cleaner solution. For now, hard-code:
        # if len(self.err_dye_beta) == 0:
        #     self.err_dye_beta = self.defaults.dye_beta
        # if len(self.dye_sigma) == 0:
        #     self.dye_sigma = self.defaults.dye_sigma
        if len(self.report_prec) == 0:
            self.report_prec = self.defaults.report_prec

    def validate(self):
        super().validate()
        assert self.rf

    def generate(self):
        self.report_section_user_config()

        sigproc_tasks = self.sigprocs_v1() or [{}]  # guarantee the loop below runs at least once

        # TODO: 'default' reporting needs to be rethought.  Maybe we just add a
        # gen switch that says which report type to use.  The pattern that has developed
        # is that each project of any substance wants a special type of report.  These
        # projects are different enough that you always want to include custom stuff.
        # Presumably as we do more collabs/projects, they tend to group into a
        # handful of basic types.
        #
        # Bear in mind that we're in the classify generator, so all of these
        # refer to jobs that involve classification. (jobs like photobleaching
        # or other sigprocv2-only tasks don't -- those have their own hacky
        # report logic similar to what you'll see below).
        #
        # Currently those types are: 'standard' sigprocv2 with classify,
        # spike-in sigprocv2 with classify.
        #
        # VFS-only types: 'standard classify', PTM classify,
        # MHC classify (perhaps this is really standard classify, but is big, and
        # does not use a protease, and has all small uniform-length peptides)
        #
        # See all the hacky logic after these loops that patch together
        # a report by trying to deduce which of the above we're looking
        # at.
        #
        # Maybe we just need different generators instead of including
        # complex reporting logic?
        #
        # Etc.
        #

        # PTM, MHC, and PRO are the three classes of highest-level specialized reports
        # that report on all of the runs in a job taken together.  Whereas the default
        # report that comes out of classify will emit a long report with one section per
        # run, this became totally unwieldy when a job has 50+ (or hundreds!) of runs.
        # In that case you really only want a high-level report with a way to explore
        # the runs, and that's exactly what the specialized PTM, MHC, and PRO templates
        # are created for.  Here we try to cleverly deduce what kind of report we should
        # do based on whether there are PTMs present, Proteins-of-interest present, or
        # in the hackiest case, whether the sample or job name contains a given string.
        #
        # A PTM report is done if PTMs have been specified for any of the proteins
        ptm_report = any([pro.get("ptm_locs") for pro in self.protein])

        # A MHC-style report (which is special in that we know ahead of time that
        # the peptides are identical for all runs -- because we started with a list
        # of peptides -- so we can do lots of interesting comparisons that you can't
        # do when the peptides differ from run-to-run) is created for jobs which have
        # the string 'mhc' in their job-name or sample-name.  This needs to change,
        # but our Broad MHC project is the only one of this class for a year now.
        # This report is useful for any job that contains runs whose peptides are
        # identical -- this means either peptides were provided in the first place
        # and no protease was given to the "prep" task, or that only one protease,
        # and potentially lots of label schemes, is used.
        mhc_report = not ptm_report and (
            "mhc" in self.job.lower() or "mhc" in self.sample.lower()
        )

        # A protein-identification report is done if there are proteins of interest
        pro_report = (
            not ptm_report
            and not mhc_report
            and (
                bool(self.protein_of_interest)
                or any([pro.get("is_poi") for pro in self.protein])
            )
        )

        run_descs = []
        for protease, aa_list, err_set in self.run_parameter_permutator():
            for sigproc_i, sigproc_v1_task in enumerate(sigproc_tasks):
                prep_task = task_templates.prep(
                    self.protein,
                    protease,
                    self.decoys,
                    pois=self.protein_of_interest,
                    n_ptms_limit=self.n_ptms_limit,
                )

                sim_v2_task = {}
                classify_rf_task = {}

                train_rf_task = task_templates.train_rf()
                test_rf_task = task_templates.rf_v2()
                if sigproc_v1_task:
                    classify_rf_task = task_templates.classify_rf_v1(
                        prep_relative_path="../prep",
                        sim_relative_path="../sim_v1",
                        train_relative_path="../train_rf",
                        sigproc_relative_path=f"../sigproc_v1",
                    )

                sim_v1_task = task_templates.sim_v1(
                    list(aa_list),
                    err_set,
                    n_pres=self.n_pres,
                    n_mocks=self.n_mocks,
                    n_edmans=self.n_edmans,
                    n_samples_train=self.n_samples_train,
                    n_samples_test=self.n_samples_test,
                )
                sim_v1_task.sim_v1.parameters.random_seed = self.random_seed

                lnfit_task = self.lnfits("v2")

                e_block = self.erisyon_block(aa_list, protease, err_set)

                sigproc_suffix = (
                    f"_sigproc_{sigproc_i}" if len(sigproc_tasks) > 1 else ""
                )

                run_name = f"{e_block._erisyon.run_name}{sigproc_suffix}"
                if self.force_run_name is not None:
                    run_name = self.force_run_name

                run_desc = Munch(
                    run_name=run_name,
                    **e_block,
                    **prep_task,
                    **sim_v1_task,
                    **sim_v2_task,
                    **train_rf_task,
                    **test_rf_task,
                    **sigproc_v1_task,
                    **lnfit_task,
                    **classify_rf_task,
                )
                run_descs += [run_desc]

                # For classify jobs that get a specialized PTM, MHC, or protein
                # report, reporting is done differently rather than emitting a
                # section for each run.
                if not ptm_report and not mhc_report and not pro_report:
                    self.report_section_markdown(f"# RUN {run_desc.run_name}")
                    self.report_section_run_object(run_desc)
                    if test_rf_task:
                        self.report_section_from_template(
                            "train_and_test_template.ipynb"
                        )

        self.report_section_markdown(f"# JOB {self.job}")
        self.report_section_job_object()

        if ptm_report:
            self.report_section_from_template("train_and_test_template_ptm.ipynb")
        elif mhc_report:
            self.report_section_from_template("train_and_test_template_mhc.ipynb")
        elif pro_report:
            self.report_section_from_template("train_and_test_template_pro.ipynb")
        else:
            self.report_section_from_template("train_and_test_epilog_template.ipynb")

        n_runs = len(run_descs)
        if n_runs > 1 and sigproc_tasks[0]:
            # TASK: better logic for when to include spike_template.  --spike?
            self.report_section_from_template("spike_template.ipynb")

        sigproc_imports_desc = ""
        if sigproc_tasks[0]:
            sigproc_imports_desc = "## Sigproc imports:\n"
            sigproc_imports_desc += "\n".join(
                [f"\t* {s.ims_import.inputs.src_dir}" for s in sigproc_tasks]
            )

            self.report_section_first_run_object()
            self.report_section_from_template("sigproc_v1_template.ipynb")
            self.report_section_from_template("classify_template.ipynb")

        self.report_preamble(
            utils.smart_wrap(
                f"""
                # Classify Overview
                ## {n_runs} run_desc(s) processed.
                ## Sample: {self.sample}
                ## Job: {self.job}
                {sigproc_imports_desc}
            """,
                width=None,
            )
        )

        return run_descs
Example no. 8
class BaseGenerator(report_builder.ReportBuilder, Munch):
    """
    Base of all generators.

    Expects sub-classes to provide a class member "required_schema"
    which is used for parsing the kwargs on the __init__()

    Inherits from ReportBuilder for backwards compatibility with generators
    which expect to find report methods on the generator class.
    """

    schema = None  # Should be overloaded in any sub-class
    defaults = {}  # Should be overloaded in any sub-class

    job_setup_schema = s(
        s.is_kws_r(
            job=s.is_str(help="See Main Help"),
            sample=s.is_str(allow_empty_string=False, help="See Main Help"),
        ))

    protein_schema = s(
        s.is_kws_r(
            protein=s.is_list(elems=s.is_kws_r(
                id=s.is_str(),
                seqstr=s.is_str(),
            )),
            protein_of_interest=s.is_list(
                s.is_str(allow_empty_string=False),
                noneable=True,
                help=
                "The id of the protein(s) of interest, used in survey and reporting",
            ),
        ))

    label_set_schema = s(
        s.is_kws_r(
            label_set=s.is_list(elems=s.is_str(), help="See Main Help")))

    lnfit_schema = s(
        s.is_kws_r(
            lnfit_name=s.is_list(s.is_str(),
                                 noneable=True,
                                 help="See Main Help"),
            lnfit_params=s.is_list(s.is_str(),
                                   noneable=True,
                                   help="See Main Help"),
            lnfit_dye_on_threshold=s.is_list(s.is_int(),
                                             noneable=True,
                                             help="See Main Help"),
            lnfit_photometry_only=s.is_list(s.is_str(),
                                            noneable=True,
                                            help="See Main Help"),
        ))

    scope_run_schema = s(
        s.is_kws_r(
            n_edmans=s.is_int(help="See Main Help"),
            n_pres=s.is_int(help="See Main Help"),
            n_mocks=s.is_int(help="See Main Help"),
        ))

    peptide_setup_schema = s(
        s.is_kws_r(
            protease=s.is_list(elems=s.is_str(), help="See Main Help"),
            decoys=s.is_str(help="See Main Help"),
            random_seed=s.is_int(noneable=True, help="See Main Help"),
            n_ptms_limit=s.is_int(
                bounds=(0, 12),
                help=
                "Max number of PTMs per peptide to allow.  Peptides with more PTM sites than this will not consider any PTM permutations.",
            ),
        ))

    sim_schema = s(
        s.is_kws_r(
            n_samples_train=s.is_int(bounds=(1, None), help="See Main Help"),
            n_samples_test=s.is_int(bounds=(1, None), help="See Main Help"),
            allow_edman_cterm=s.is_bool(
                noneable=True,
                help=
                "Edman cycles can remove final C-terminal AA from peptides at plate boundary.",
            ),
            use_lognormal_model=s.is_bool(
                help="Use older lognormal radiometry model", ),
            is_photobleaching_run=s.is_bool(),
            photobleaching_run_n_dye_count=s.is_int(noneable=True),
        ))

    sigproc_source_schema = s(
        s.is_kws_r(
            movie=s.is_bool(noneable=True, help="See Main Help"),
            n_cycles_limit=s.is_int(noneable=True, help="See Main Help"),
            start_cycle=s.is_int(noneable=True, help="See Main Help"),
            dst_ch_i_to_src_ch_i=s.is_str(noneable=True,
                                          help="Comma separated"),
        ))

    sigproc_v1_schema = s(
        s.is_kws_r(
            sigproc_source=s.is_str(noneable=True, help="See Main Help"),
            radial_filter=s.is_float(noneable=True,
                                     bounds=(0.01, 1.0),
                                     help="See Main Help"),
            peak_find_n_cycles=s.is_int(bounds=(1, 10000),
                                        help="See Main Help"),
            peak_find_start=s.is_int(bounds=(0, 10000), help="See Main Help"),
            anomaly_iqr_cutoff=s.is_int(bounds=(1, 100), help="See Main Help"),
        ))

    sigproc_v2_schema = s(
        s.is_kws_r(
            calibration_job=s.is_str(noneable=True),
            sigproc_source=s.is_str(noneable=True, help="See Main Help"),
            self_calib=s.is_bool(noneable=True),
            ch_aln=s.is_str(noneable=True,
                            help="comma delimited in x0,y0,x1,y1,..."),
            ch_for_alignment=s.is_int(noneable=True),
            calib_dst_ch_i_to_src_ch_i=s.is_str(noneable=True,
                                                help="Comma separated"),
        ))

    sigproc_v2_calib_schema = s(
        s.is_kws_r(
            sigproc_source=s.is_str(noneable=True, help="See Main Help"),
            movie=s.is_bool(noneable=True),
            mode=s.is_str(options=["illum"]),
            # mode will eventually have a second option "dye calib"
        ))

    # TODO: Remove all error_model_schema
    error_model_schema = s(
        s.is_kws_r(
            err_p_edman_failure=s.is_list(elems=s.is_str(
                help="See Main Help")),
            err_p_detach=s.is_list(elems=s.is_str(help="See Main Help")),
            err_p_bleach=s.is_list(elems=s.is_str(help="See Main Help")),
            err_p_non_fluorescent=s.is_list(elems=s.is_str(
                help="See Main Help")),
            err_row_k_sigma=s.is_list(elems=s.is_str(help="See Main Help")),
            # For lognormal: to be deprecated
            err_dye_beta=s.is_list(elems=s.is_str(help="See Main Help")),
            err_dye_sigma=s.is_list(elems=s.is_str(help="See Main Help")),
            err_dye_zero_beta=s.is_list(elems=s.is_str(help="See Main Help")),
            err_dye_zero_sigma=s.is_list(elems=s.is_str(help="See Main Help")),
            # For normal
            err_gain_mu=s.is_list(elems=s.is_str(help="See Main Help")),
            err_gain_sigma=s.is_list(elems=s.is_str(help="See Main Help")),
            err_bg_mu=s.is_list(elems=s.is_str(help="See Main Help")),
            err_bg_sigma=s.is_list(elems=s.is_str(help="See Main Help")),
        ))

    # "scheme" is a flag that allows passing a (protease, label_set) pair in directly,
    # rather than passing them separately and getting permutations.
    scheme_schema = s(
        s.is_kws_r(scheme=s.is_list(elems=s.is_str(), help="See Main Help")))

    classifier_choice_schema = s(s.is_kws_r(classifier=s.is_str()))

    error_model_defaults_chemistry = Munch(
        err_p_edman_failure=0.06,
        err_p_detach=0.05,
        err_p_bleach=0.05,
        err_p_non_fluorescent=0.07,
    )

    error_model_defaults_lognormal = Munch(
        err_row_k_sigma=0.16,
        err_dye_beta=7500.0,
        err_dye_sigma=0.16,
        err_dye_zero_beta=0.0,
        err_dye_zero_sigma=400.0,
    )

    error_model_defaults_normal = Munch(
        # Based on eye-balling val18_2t
        err_row_k_sigma=0.16,
        err_gain_mu=15_000.0,
        err_gain_sigma=1_200.0,
        err_bg_mu=0.0,
        err_bg_sigma=400.0,
    )

    has_report = True

    def __init__(self, **kwargs):
        # APPLY defaults and then ask user for any elements that are not declared

        super().__init__(**kwargs)
        self.apply_defaults()
        self.setup_err_model()
        self.validate()

        self.reports = Munch()
        self.add_report("report", self)

        # static reports are ipynb files that are placed in the _reports
        # folder under a job and are executed by the indexer.
        # self.static_reports is a list of file names (without paths)
        self.static_reports = []

        self._validate_protein_of_interest()

    def add_report(self, report_name, builder):
        assert report_name not in self.reports
        self.reports[report_name] = builder

    def _validate_protein_of_interest(self):
        if "protein" in self:
            seq_ids = {seq["id"] for seq in self.protein}
            # protein_of_interest is noneable in the schema, so guard against None
            for poi in (self.protein_of_interest or []):
                if poi not in seq_ids:
                    raise ValueError(
                        f"protein_of_interest '{poi}' is not in the protein id list. "
                        f"Confirm you specified a Name and not a UniprotAC")

    def setup_err_model(self):
        err_param_dict = defaultdict(list)
        for name, type, _, user_data in self.error_model_schema.requirements():
            values = self.get(name, [])
            for value in values:
                low_prob, high_prob, step_prob = None, None, 1

                parts = value.split("|")
                if len(parts) == 2:
                    dye_part = parts[0]
                    prob_parts = parts[1]
                else:
                    dye_part = None
                    prob_parts = parts[0]

                prob_parts = prob_parts.split(":")

                if name in (
                        "err_p_edman_failure",
                        "err_p_detach",
                        "err_row_k_beta",
                        "err_row_k_sigma",
                ):
                    if dye_part:
                        raise SchemaValidationFailed(
                            f"error model term '{name}' is not allowed to have a dye-index."
                        )
                else:
                    if dye_part is None:
                        raise SchemaValidationFailed(
                            f"error model term '{name}' expected a dye-index.")

                low_prob = float(prob_parts[0])
                if len(prob_parts) > 1:
                    high_prob = float(prob_parts[1])
                if len(prob_parts) > 2:
                    step_prob = int(prob_parts[2])
                if high_prob is None:
                    high_prob = low_prob

                key = f"{name}:{dye_part if dye_part is not None else 0}"
                err_param_dict[key] += np.linspace(low_prob, high_prob,
                                                   step_prob).tolist()
                err_param_dict[key] = list(set(err_param_dict[key]))
        self.err_param_dict = err_param_dict
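
    # Worked example (hypothetical values): a switch value "1|0.03:0.09:3"
    # splits into dye_part "1" and prob_parts "0.03:0.09:3", so
    #   err_param_dict["err_p_bleach:1"] == [0.03, 0.06, 0.09]
    # via np.linspace(0.03, 0.09, 3).  A bare "0.05" (no "|") is only legal
    # for terms like err_p_detach that take no dye-index; it lands on key
    # "err_p_detach:0" with the single value [0.05].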

    def apply_defaults(self):
        """Overloadable by sub-classes."""
        self.schema.apply_defaults(self.defaults, self, override_nones=True)

    def validate(self):
        """Overloadable by sub-classes for extra validation"""
        self.schema.validate(self, context=self.__class__.__name__)

    def sigprocs_v1(self):
        tasks = []
        if self.sigproc_source:
            ims_import = task_templates.ims_import(
                self.sigproc_source,
                is_movie=self.movie,
                n_cycles_limit=self.n_cycles_limit,
                start_cycle=self.start_cycle,
                dst_ch_i_to_src_ch_i=self.dst_ch_i_to_src_ch_i,
            )
            sigproc = task_templates.sigproc_v1()
            sigproc.sigproc_v1.parameters.radial_filter = self.radial_filter
            sigproc.sigproc_v1.parameters.peak_find_n_cycles = self.peak_find_n_cycles
            sigproc.sigproc_v1.parameters.peak_find_start = self.peak_find_start
            sigproc.sigproc_v1.parameters.anomaly_iqr_cutoff = self.anomaly_iqr_cutoff
            tasks += [Munch(**ims_import, **sigproc)]
        return tasks

    def tasks_for_sigproc_v2(self):
        tasks = {}
        if self.sigproc_source:

            ims_import_task = task_templates.ims_import(
                self.sigproc_source,
                is_movie=self.movie,
                n_cycles_limit=self.n_cycles_limit,
                start_cycle=self.start_cycle,
                dst_ch_i_to_src_ch_i=self.dst_ch_i_to_src_ch_i,
            )

            calib_priors = None
            if self.calibration_job is not None:
                calib_src_path = (local.path(self.calibration_job) /
                                  "sigproc_v2_calib/plaster_output/sigproc_v2")
                calib_result = SigprocV2Result.load_from_folder(
                    calib_src_path, prop_list=["calib_priors"])
                calib_priors = calib_result.calib_priors

                if self.calib_dst_ch_i_to_src_ch_i is not None:
                    # Convert a string like 2,1,0 and remap
                    check.t(self.calib_dst_ch_i_to_src_ch_i, str)
                    calib_dst_ch_i_to_src_ch_i = [
                        int(ch_i)
                        for ch_i in self.calib_dst_ch_i_to_src_ch_i.split(",")
                    ]

                    ch_remapped_priors = Priors.copy(calib_priors)
                    ch_remapped_priors.delete_ch_specific_records()

                    ch_aln_prior = ch_remapped_priors.get_exact(f"ch_aln")
                    if ch_aln_prior is not None:
                        ch_aln_prior = ChannelAlignPrior.ch_remap(
                            ch_aln_prior.prior, calib_dst_ch_i_to_src_ch_i)

                    for dst_ch_i, src_ch_i in enumerate(
                            calib_dst_ch_i_to_src_ch_i):

                        def remap(src_key, dst_key):
                            prior = calib_priors.get_exact(src_key)
                            if prior is not None:
                                ch_remapped_priors.add(
                                    dst_key, prior.prior,
                                    "remapped channel in gen")

                        remap(f"reg_illum.ch_{src_ch_i}",
                              f"reg_illum.ch_{dst_ch_i}")
                        remap(f"reg_psf.ch_{src_ch_i}",
                              f"reg_psf.ch_{dst_ch_i}")

                    calib_priors = ch_remapped_priors

            ch_aln = None
            if self.ch_aln is not None:
                ch_aln = np.array([float(i) for i in self.ch_aln.split(",")])
                assert ch_aln.shape[0] % 2 == 0
                ch_aln = ch_aln.reshape((-1, 2))
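                # e.g. ch_aln="0,0,1.5,-0.5" (hypothetical) reshapes to
                # [[0.0, 0.0], [1.5, -0.5]]: one (x, y) offset per channel.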

            sigproc_v2_task = task_templates.sigproc_v2_analyze(
                calib_priors=calib_priors,
                self_calib=self.self_calib,
                ch_aln=ch_aln,
                ch_for_alignment=self.ch_for_alignment,
            )

            tasks = Munch(**ims_import_task, **sigproc_v2_task)

        return tasks

    def lnfits(self, sigproc_version):
        # It is common to have multiple lnfit tasks for a single run, so this fn returns a
        # block with potentially multiple lnfit tasks using unique task names when more
        # than one is present.
        lnfit_tasks = {}
        if self.lnfit_params:
            if not self.lnfit_dye_on_threshold:
                raise ValueError(
                    f"You must specify a --lnfit_dye_on_threshold when --lnfit_params is given"
                )

            dye_thresholds = self.lnfit_dye_on_threshold
            lnfit_names = self.lnfit_name or ([None] * len(self.lnfit_params))
            photometries_only = self.lnfit_photometry_only or (
                [True] * len(self.lnfit_params))

            if len(self.lnfit_params) > 1 and len(dye_thresholds) == 1:
                dye_thresholds *= len(self.lnfit_params)

            assert len(self.lnfit_params) == len(dye_thresholds)
            assert len(self.lnfit_params) == len(lnfit_names)

            for i, (params, thresh, name, photometry_only) in enumerate(
                    zip(self.lnfit_params, dye_thresholds, lnfit_names,
                        photometries_only)):
                task = task_templates.lnfit(sigproc_version=sigproc_version)
                task.lnfit.parameters["lognormal_fitter_v2_params"] = params
                task.lnfit.parameters["dye_on_threshold"] = thresh
                # photometry_only may arrive as a bool (from the default above)
                # or as a string from the CLI, so normalize via str()
                task.lnfit.parameters["photometry_only"] = str(
                    photometry_only).lower() in (
                        "true",
                        "1",
                    )

                task_name = "lnfit"
                if len(self.lnfit_params) > 1 or name:
                    task_name = name or f"lnfit_{i}"
                    helpers.task_rename(task, task_name)
                lnfit_tasks[task_name] = task[task_name]
        return lnfit_tasks

    def run_name(self, aa_list, protease=None, err_set=None):
        """
        A helper for run folder names based on aa_list and protease.
        Note, not all generators will use this convention.

        Compose a run_name from protease and aa_list in normalized form:
        Eg: protease="trypsin", aa_list=("DE", "K") => "trypsin_de_k"
        """
        if protease is None:
            protease = ""

        if aa_list is not None:
            aa_list = [a.replace("[", "").replace("]", "") for a in aa_list]
            aas = "_".join(aa_list)
        else:
            aas = "bleach"

        if err_set is not None:
            err_str = hashlib.md5(
                json.dumps(err_set).encode()).hexdigest()[0:4]
        else:
            err_str = ""

        return re.sub(
            "[^0-9a-z_]+",
            "_",
            (protease + ("_" if protease != "" else "") + aas).lower() + "_" +
            err_str,
        )
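
    # e.g. protease="trypsin", aa_list=("DE", "K"), err_set given ->
    # "trypsin_de_k_3f2a", where the 4-hex suffix is the start of the md5
    # of the err_set (digits here are hypothetical).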

    def _label_str_permutate(self, label_str):
        """
        Return list of permutations of a label_str such as:
        "A,B,C:2" => ("A", "B"), ("A", "C"), ("B", "C")

        A suffix label set may be added to each permutation with +:
        "A,B,C:2+S" => ("A", "B", "S"), ("A", "C", "S"), ("B", "C", "S")
        "A,B,C:2+S,T" => ("A", "B", "S", "T"), ("A", "C", "S", "T"), ("B", "C", "S", "T")
        """

        check.t(label_str, str)
        semi_split = label_str.split(":")

        if len(semi_split) > 2:
            raise ValueError(f"Label-set '{label_str}' has >1 colon.")

        suffix_labels = ""
        if len(semi_split) == 2:
            suffix_split = semi_split[1].split("+")

            if len(suffix_split) > 2:
                raise ValueError(f"Label-set '{label_str}' has >1 plus.")

            if len(suffix_split) == 2:
                semi_split = [semi_split[0], suffix_split[0]]
                suffix_labels = suffix_split[1].split(",")
                suffix_labels = [slabel.strip() for slabel in suffix_labels]

        labels = semi_split[0].split(",")
        labels = [label.strip() for label in labels]

        if len(semi_split) == 1:
            perm_count = len(labels)
        else:
            perm_count = int(semi_split[1])
            if not 0 < perm_count < len(labels):
                raise ValueError(
                    f"Label-set '{label_str}' has a permutation count "
                    f"of {perm_count}; needs to be between 0 and {len(labels) - 1}"
                )

        perms = list(itertools.combinations(labels, perm_count))

        if suffix_labels:
            perms = [p + tuple(suffix_labels) for p in perms]

        return perms

    def label_set_permutate(self) -> List[Tuple[str, ...]]:
        """
        Returns a list of label sets, where each label set is a tuple of strings
        """
        check.list_t(self.label_set, str)
        return utils.flatten([
            self._label_str_permutate(label_str)
            for label_str in self.label_set
        ], 1)

    def error_set_permutate(self):
        tuples = [[(key, val) for val in vals]
                  for key, vals in self.err_param_dict.items()]
        return tuples
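
    # e.g. err_param_dict == {"err_p_bleach:0": [0.03, 0.06]} (hypothetical)
    # yields [[("err_p_bleach:0", 0.03), ("err_p_bleach:0", 0.06)]]; each
    # inner list is one axis of the itertools.product in
    # run_parameter_permutator.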

    def scheme_set_permutate(self) -> List[Scheme]:
        """
        Unparsed schemes are of form: protease/label_set, where protease is a str,
        and label_set is a str parseable by self._label_str_permutate
        """
        parsed_schemes = []
        for scheme in self.scheme:
            split = scheme.split("/")
            if len(split) != 2 or not all(split):
                raise ValueError(
                    f"Scheme {scheme} must be of form: protease/label_set")

            parsed_label_set = self._label_str_permutate(split[1])
            parsed_schemes += [
                Scheme(split[0], label_set) for label_set in parsed_label_set
            ]
        return parsed_schemes

    def default_err_set(self, n_channels, use_lognormal_model):

        if use_lognormal_model:
            defaults = Munch(
                **self.error_model_defaults_chemistry,
                **self.error_model_defaults_lognormal,
            )

            # TODO: No longer correct
            return Munch(
                p_edman_failure=[defaults.err_p_edman_failure] * 1,
                p_detach=[defaults.err_p_detach] * 1,
                p_bleach=[defaults.err_p_bleach] * n_channels,
                p_non_fluorescent=[defaults.err_p_non_fluorescent] *
                n_channels,
                row_k_sigma=[defaults.err_row_k_sigma] * 1,
                gain_mu=[defaults.err_dye_beta] * n_channels,
                gain_sigma=[defaults.err_dye_sigma] * n_channels,
                bg_mu=[defaults.err_dye_zero_beta] * n_channels,
                bg_sigma=[defaults.err_dye_zero_sigma] * n_channels,
            )
        else:
            defaults = Munch(
                **self.error_model_defaults_chemistry,
                **self.error_model_defaults_normal,
            )
            return Munch(
                p_edman_failure=[defaults.err_p_edman_failure] * 1,
                p_detach=[defaults.err_p_detach] * 1,
                p_bleach=[defaults.err_p_bleach] * n_channels,
                p_non_fluorescent=[defaults.err_p_non_fluorescent] *
                n_channels,
                row_k_sigma=[defaults.err_row_k_sigma] * 1,
                gain_mu=[defaults.err_gain_mu] * n_channels,
                gain_sigma=[defaults.err_gain_sigma] * n_channels,
                bg_mu=[defaults.err_bg_mu] * n_channels,
                bg_sigma=[defaults.err_bg_sigma] * n_channels,
            )

    def photobleaching_err_set(self, n_channels, use_lognormal_model):

        if use_lognormal_model:
            defaults = Munch(
                **self.error_model_defaults_chemistry,
                **self.error_model_defaults_lognormal,
            )

            # TODO: No longer correct
            return Munch(
                p_edman_failure=[0.0] * 1,
                p_detach=[0.0] * 1,
                p_bleach=[0.0] * n_channels,
                p_non_fluorescent=[0.0] * n_channels,
                row_k_sigma=[defaults.err_row_k_sigma] * 1,
                gain_mu=[defaults.err_dye_beta] * n_channels,
                gain_sigma=[defaults.err_dye_sigma] * n_channels,
                bg_mu=[defaults.err_dye_zero_beta] * n_channels,
                bg_sigma=[defaults.err_dye_zero_sigma] * n_channels,
            )
        else:
            defaults = Munch(
                **self.error_model_defaults_chemistry,
                **self.error_model_defaults_normal,
            )
            return Munch(
                p_edman_failure=[0.0] * 1,
                p_detach=[0.0] * 1,
                p_bleach=[0.0] * n_channels,
                p_non_fluorescent=[0.0] * n_channels,
                row_k_sigma=[defaults.err_row_k_sigma] * 1,
                gain_mu=[defaults.err_gain_mu] * n_channels,
                gain_sigma=[defaults.err_gain_sigma] * n_channels,
                bg_mu=[defaults.err_bg_mu] * n_channels,
                bg_sigma=[defaults.err_bg_sigma] * n_channels,
            )

    def run_parameter_permutator(self, use_lognormal_model=True):
        """
        Generate permutations of all the variable parameters
        Defaults all arguments to self.*
        Gracefully handles lack of protease.
        """

        proteases = utils.non_none(self.get("protease"), [None])
        proteases = [("protease", p) for p in proteases]

        label_sets = self.label_set_permutate()
        label_sets = [("label_set", s) for s in label_sets]

        if len(proteases) == 0:
            proteases = [("protease", None)]

        err_sets = self.error_set_permutate()

        combined = [proteases, label_sets] + err_sets

        # schemes is a list of schemes, where each scheme is a tuple containing:
        # - A protease, in the form of Tuple["protease", str]
        # - A label set, in the form of Tuple["label_set", Tuple[str, ...]]
        # - Zero or more error-model entries contributed by err_sets

        # Build scheme set from protease and label set args
        schemes = list(itertools.product(*combined))

        # Add in directly specified schemes
        schemes += [(("protease", scheme.protease), ("label_set",
                                                     scheme.label_set))
                    for scheme in self.scheme_set_permutate()]
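
        # e.g. a scheme of "trypsin/DE,Y" (hypothetical) contributes
        # (("protease", "trypsin"), ("label_set", ("DE", "Y"))) here.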

        for params in schemes:
            protease = utils.filt_first(params, lambda i: i[0] == "protease")
            protease = protease[1]
            label_set = utils.filt_first(params, lambda i: i[0] == "label_set")
            label_set = label_set[1]

            # Given that the label_set is now known, the error model can be set up
            n_channels = len(label_set)
            err_set = self.default_err_set(n_channels, use_lognormal_model)

            for param in params:
                if param[0].startswith("err_"):
                    parts = param[0].split(":")
                    err_set[parts[0][4:]][int(
                        parts[1])] = param[1]  # The 4: removes the "err_"

            yield protease, label_set, err_set

    def erisyon_block(self, aa_list, protease=None, err_set=None):
        return task_templates.erisyon(
            run_name=self.run_name(aa_list, protease, err_set),
            sample=self.sample,
            generator_name=self.__class__.__name__,
        )

    def report_section_user_config(self, report=None):
        """
        Emit report configuration parameters specified by the user via gen so that they
        can be further edited if desired, and used by reporting functions in the templates.
        """
        if report is None:
            report = self

        config = []
        if self.protein_of_interest:
            config += [
                f"PGEN_protein_of_interest = {self.protein_of_interest}\n"
            ]
        if self.report_prec:
            config += [f"PGEN_report_precisions = {self.report_prec}\n"]
        if config:
            self.report_section_markdown("# PGEN-controlled report config")
            config = [
                f"# These values were or can be specified by the user at gen time:\n"
            ] + config
            report.add_report_section("code", config)

    def report_assemble(self):
        """
        Overrides report_assemble in ReportBuilder to implement the self.has_report behavior
        """
        if not self.has_report:
            return None
        else:
            return super().report_assemble()

    def generate(self):
        """
        Abstract method to be overloaded.
        Expected to return a list of runs.
        """
        pass
Example no. 9
class SigprocV1Params(Params):
    defaults = dict(
        hat_rad=2,
        iqr_rng=96,
        threshold_abs=1.0,
        channel_indices_for_alignment=None,
        channel_indices_for_peak_finding=None,
        radiometry_channels=None,
        save_debug=False,
        peak_find_n_cycles=4,
        peak_find_start=0,
        radial_filter=None,
        anomaly_iqr_cutoff=95,
        n_fields_limit=None,
        save_full_signal_radmat_npy=False,
    )

    schema = s(
        s.is_kws_r(
            anomaly_iqr_cutoff=s.is_number(noneable=True, bounds=(0, 100)),
            radial_filter=s.is_float(noneable=True, bounds=(0, 1)),
            peak_find_n_cycles=s.is_int(bounds=(1, None), noneable=True),
            peak_find_start=s.is_int(bounds=(0, None), noneable=True),
            save_debug=s.is_bool(),
            hat_rad=s.is_int(bounds=(1, 3)),
            iqr_rng=s.is_number(noneable=True, bounds=(0, 100)),
            threshold_abs=s.is_number(
                bounds=(0, 100)),  # Not sure of a reasonable bound
            channel_indices_for_alignment=s.is_list(s.is_int(), noneable=True),
            channel_indices_for_peak_finding=s.is_list(s.is_int(),
                                                       noneable=True),
            radiometry_channels=s.is_dict(noneable=True),
            n_fields_limit=s.is_int(noneable=True),
            save_full_signal_radmat_npy=s.is_bool(),
        ))

    def validate(self):
        # Note: does not call super() because override_nones is set to False here
        self.schema.apply_defaults(self.defaults,
                                   apply_to=self,
                                   override_nones=False)
        self.schema.validate(self, context=self.__class__.__name__)

        if self.radiometry_channels is not None:
            pat = re.compile(r"[0-9a-z_]+")
            for name, channel_i in self.radiometry_channels.items():
                self._validate(
                    pat.fullmatch(name),
                    "radiometry_channels name must be lower-case alphanumeric (including underscore)",
                )
                self._validate(isinstance(channel_i, int),
                               "channel_i must be an integer")

    def set_radiometry_channels_from_input_channels_if_needed(
            self, n_channels):
        if self.radiometry_channels is None:
            # Assume channels from nd2 manifest
            channels = list(range(n_channels))
            self.radiometry_channels = {f"ch_{ch}": ch for ch in channels}

    @property
    def n_output_channels(self):
        return len(self.radiometry_channels.keys())

    @property
    def n_input_channels(self):
        return len(self.radiometry_channels.keys())

    @property
    def channels_cycles_dim(self):
        # This is a cache set in sigproc_v1.
        # It is a helper for the repetitive call:
        # n_outchannels, n_inchannels, n_cycles, dim =
        return self._outchannels_inchannels_cycles_dim

    def _input_channels(self):
        """
        Return a list that converts channel number of the output to the channel of the input
        Example:
            input might have channels ["foo", "bar"]
            the radiometry_channels has: {"bar": 0}]
            Thus this function returns [1] because the 0th output channel is mapped
            to the "1" input channel
        """
        return [
            self.radiometry_channels[name]
            for name in sorted(self.radiometry_channels.keys())
        ]

    # def input_names(self):
    #     return sorted(self.radiometry_channels.keys())

    def output_channel_to_input_channel(self, out_ch):
        return self._input_channels()[out_ch]

    def input_channel_to_output_channel(self, in_ch):
        """Not every input channel necessarily has an output; can return None"""
        return utils.filt_first_arg(self._input_channels(),
                                    lambda x: x == in_ch)
Example no. 10
class SimParams(Params):
    """
    Simulation parameters: an ErrorModel plus the parameters for sim.
    """

    defaults = Munch(
        n_pres=1,
        n_mocks=0,
        n_edmans=1,
        n_samples_train=5_000,
        n_samples_test=1_000,
        dyes=[],
        labels=[],
        random_seed=None,
        train_n_sample_multiplier=None,  # This does not appear to be used anywhere. tfb
        allow_train_test_to_be_identical=False,
        enable_ptm_labels=False,
        is_survey=False,
    )

    schema = s(
        s.is_kws_r(
            is_survey=s.is_bool(),
            error_model=s.is_kws(**ErrorModel.schema.schema()),
            n_pres=s.is_int(bounds=(0, None)),
            n_mocks=s.is_int(bounds=(0, None)),
            n_edmans=s.is_int(bounds=(0, None)),
            n_samples_train=s.is_int(bounds=(1, None)),
            n_samples_test=s.is_int(bounds=(1, None)),
            dyes=s.is_list(elems=s.is_kws_r(dye_name=s.is_str(),
                                            channel_name=s.is_str())),
            labels=s.is_list(elems=s.is_kws_r(
                amino_acid=s.is_str(),
                dye_name=s.is_str(),
                label_name=s.is_str(),
                ptm_only=s.is_bool(required=False, noneable=True),
            )),
            random_seed=s.is_int(required=False, noneable=True),
            allow_train_test_to_be_identical=s.is_bool(required=False,
                                                       noneable=True),
            enable_ptm_labels=s.is_bool(required=False, noneable=True),
        ))

    def copy(self):
        # REMOVE everything that _build_join_dfs put in
        utils.safe_del(self, "df")
        utils.safe_del(self, "by_channel")
        utils.safe_del(self, "ch_by_aa")
        utils.safe_del(self, "channel_i_to_gain")
        utils.safe_del(self, "channel_i_to_vpd")

        dst = utils.munch_deep_copy(self, klass_set={SimParams})
        dst.error_model = ErrorModel(**dst.error_model)
        assert isinstance(dst, SimParams)
        return dst

    def __init__(self, include_dfs=True, **kwargs):
        kwargs["error_model"] = kwargs.pop("error_model", ErrorModel())
        super().__init__(**kwargs)
        if include_dfs:
            self._build_join_dfs()

    def validate(self):
        super().validate()

        all_dye_names = list(set([d.dye_name for d in self.dyes]))

        # No duplicate dye names
        self._validate(
            len(all_dye_names) == len(self.dyes),
            "The dye list contains a duplicate")

        # No duplicate labels
        self._validate(
            len(list(set(utils.listi(self.labels,
                                     "amino_acid")))) == len(self.labels),
            "There is a duplicate label",
        )

        # All labels have a legit dye name
        for label in self.labels:
            self._validate(
                label.dye_name in all_dye_names,
                f"Label {label.label_name} does not have a valid matching dye_name",
            )

    @property
    def n_cycles(self):
        return self.n_pres + self.n_mocks + self.n_edmans

    def channels(self):
        return sorted(list(set(utils.listi(self.dyes, "channel_name"))))

    def channel_i_by_name(self):
        channels = self.channels()
        return {
            channel_name: channel_i
            for channel_i, channel_name in enumerate(channels)
        }

    @property
    def n_channels(self):
        return len(self.channel_i_by_name().keys())

    @property
    def n_channels_and_cycles(self):
        return self.n_channels, self.n_cycles

    def _build_join_dfs(self):
        """
        The error model contains information about the dyes and labels and other terms.
        Those error model parameters are wired together by names which are useful
        for reconciling calibrations.

        But here, these "by name" parameters are all put into a dataframe so that
        they can be indexed by integers.
        """
        sim_dyes_df = pd.DataFrame(self.dyes)
        assert len(sim_dyes_df) > 0

        sim_labels_df = pd.DataFrame(self.labels)
        assert len(sim_labels_df) > 0

        error_model_dyes_df = pd.DataFrame(self.error_model.dyes)
        assert len(error_model_dyes_df) > 0

        error_model_labels_df = pd.DataFrame(self.error_model.labels)
        assert len(error_model_labels_df) > 0

        if len(sim_dyes_df) > 0:
            channel_df = (sim_dyes_df[[
                "channel_name"
            ]].drop_duplicates().reset_index(
                drop=True).rename_axis("ch_i").reset_index())

            label_df = pd.merge(left=sim_labels_df,
                                right=error_model_labels_df,
                                on="label_name")

            dye_df = pd.merge(left=sim_dyes_df,
                              right=error_model_dyes_df,
                              on="dye_name")
            dye_df = pd.merge(left=dye_df, right=channel_df, on="channel_name")

            self.df = (pd.merge(
                left=label_df, right=dye_df,
                on="dye_name").drop_duplicates().reset_index(drop=True))
        else:
            self.df = pd.DataFrame()

        assert np.all(
            self.df.groupby("ch_i").p_bleach_per_cycle.nunique() == 1)
        assert np.all(self.df.groupby("ch_i").beta.nunique() == 1)
        assert np.all(self.df.groupby("ch_i").sigma.nunique() == 1)

        self.by_channel = [
            Munch(
                p_bleach_per_cycle=self.df[self.df.ch_i ==
                                           ch].iloc[0].p_bleach_per_cycle,
                beta=self.df[self.df.ch_i == ch].iloc[0].beta,
                sigma=self.df[self.df.ch_i == ch].iloc[0].sigma,
                gain=self.df[self.df.ch_i == ch].iloc[0].gain,
                vpd=self.df[self.df.ch_i == ch].iloc[0].vpd,
            ) for ch in range(self.n_channels)
        ]

        self.ch_by_aa = {
            row.amino_acid: row.ch_i
            for row in self.df.itertuples()
        }

        # These two need to be lists (not ndarrays) because they have to be duplicated
        self.channel_i_to_gain = [
            self.by_channel[i].gain for i in range(self.n_channels)
        ]
        self.channel_i_to_vpd = [
            self.by_channel[i].vpd for i in range(self.n_channels)
        ]
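        # Join sketch: sim-side dyes/labels merge with error_model.dyes/labels
        # on dye_name / label_name, and channel_name maps to an integer ch_i,
        # so downstream code can index per-channel terms as self.by_channel[ch_i].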

    def to_label_list(self):
        """Summarize labels like: ["DE", "C"]"""
        return [
            "".join([
                label.amino_acid for label in self.labels
                if label.dye_name == dye.dye_name
            ]) for dye in self.dyes
        ]

    def to_label_str(self):
        """Summarize labels like: DE,C"""
        return ",".join(self.to_label_list())

    @classmethod
    def construct_from_aa_list(cls, aa_list, **kwargs):
        """
        This is a helper to generate channels when you have a list of aas.
        For example, ["DE", "Y"] gives two channels where ch0 is D&E
        and ch1 is Y.

        If you pass in an error model, it needs to match channels and labels.
        """

        check.list_or_tuple_t(aa_list, str)

        allowed_aa_mods = ["[", "]"]
        assert all([(aa.isalpha() or aa in allowed_aa_mods) for aas in aa_list
                    for aa in list(aas)])

        dyes = [
            Munch(dye_name=f"dye_{ch}", channel_name=f"ch_{ch}")
            for ch, _ in enumerate(aa_list)
        ]

        # Note the extra for loop because "DE" needs to be split into "D" & "E"
        # which is done by aa_str_to_list() - which also handles PTMs like S[p]
        labels = [
            Munch(
                amino_acid=aa,
                dye_name=f"dye_{ch}",
                label_name=f"label_{ch}",
                ptm_only=False,
            ) for ch, aas in enumerate(aa_list) for aa in aa_str_to_list(aas)
        ]

        return cls(dyes=dyes, labels=labels, **kwargs)
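# Usage sketch (hypothetical; assumes an ErrorModel `em` whose dye and label
# names match so that _build_join_dfs() can join them):
#
#   sp = SimParams.construct_from_aa_list(["DE", "Y"], error_model=em)
#   sp.to_label_list()   # ["DE", "Y"]
#   sp.to_label_str()    # "DE,Y"
#   sp.n_channels        # 2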
Example n. 11
0
class BaseGenerator(Munch):
    """
    Base of all generators.

    Expects sub-classes to provide a class member "required_schema"
    which is used for parsing the kwargs on the __init__()
    """

    schema = None  # Should be overloaded in any sub-class
    defaults = {}  # Should be overloaded in any sub-class

    job_setup_schema = s(
        s.is_kws_r(
            job=s.is_str(help="See Main Help"),
            sample=s.is_str(allow_empty_string=False, help="See Main Help"),
        ))

    protein_schema = s(
        s.is_kws_r(
            protein=s.is_list(elems=s.is_kws_r(
                id=s.is_str(),
                seqstr=s.is_str(),
            )),
            protein_of_interest=s.is_list(
                s.is_str(allow_empty_string=False),
                noneable=True,
                help=
                "The id of the protein(s) of interest, used in survey and reporting",
            ),
        ))

    label_set_schema = s(
        s.is_kws_r(
            label_set=s.is_list(elems=s.is_str(), help="See Main Help")))

    lnfit_schema = s(
        s.is_kws_r(
            lnfit_name=s.is_list(s.is_str(),
                                 noneable=True,
                                 help="See Main Help"),
            lnfit_params=s.is_list(s.is_str(),
                                   noneable=True,
                                   help="See Main Help"),
            lnfit_dye_on_threshold=s.is_list(s.is_int(),
                                             noneable=True,
                                             help="See Main Help"),
            lnfit_photometry_only=s.is_list(s.is_str(),
                                            noneable=True,
                                            help="See Main Help"),
        ))

    scope_run_schema = s(
        s.is_kws_r(
            n_edmans=s.is_int(help="See Main Help"),
            n_pres=s.is_int(help="See Main Help"),
            n_mocks=s.is_int(help="See Main Help"),
        ))

    peptide_setup_schema = s(
        s.is_kws_r(
            protease=s.is_list(elems=s.is_str(), help="See Main Help"),
            decoys=s.is_str(help="See Main Help"),
            random_seed=s.is_int(noneable=True, help="See Main Help"),
            n_ptms_limit=s.is_int(
                bounds=(0, 12),
                help=
                "Max number of PTMs per peptide to allow.  Peptides with more PTM sites than this will not consider any PTM permutations.",
            ),
        ))

    sim_schema = s(
        s.is_kws_r(
            n_samples_train=s.is_int(bounds=(1, None), help="See Main Help"),
            n_samples_test=s.is_int(bounds=(1, None), help="See Main Help"),
        ))

    classify_schema = s(
        s.is_kws_r(
            classify_skip_nn=s.is_bool(
                help="Skips Nearest Neighbor classifier if set"),
            classify_skip_rf=s.is_bool(
                help="Skips Random Forest classifier if set"),
            report_prec=s.is_list(
                elems=s.is_float(bounds=(0.001, 0.999)),
                help="The precision for classifier reporting",
            ),
        ))

    sigproc_source_schema = s(
        s.is_kws_r(
            sigproc_source=s.is_list(s.is_str(),
                                     noneable=True,
                                     help="See Main Help"),
            movie=s.is_bool(help="See Main Help"),
            n_frames_limit=s.is_int(bounds=(1, 500),
                                    noneable=True,
                                    help="See Main Help"),
        ))

    sigproc_v1_schema = s(
        s.is_kws_r(
            radial_filter=s.is_float(noneable=True,
                                     bounds=(0.01, 1.0),
                                     help="See Main Help"),
            peak_find_n_cycles=s.is_int(bounds=(1, 10000),
                                        help="See Main Help"),
            peak_find_start=s.is_int(bounds=(0, 10000), help="See Main Help"),
            anomaly_iqr_cutoff=s.is_int(bounds=(1, 100), help="See Main Help"),
        ))

    sigproc_v2_schema = s(
        s.is_kws_r(
            calibration_file=s.is_str(),
            instrument_subject_id=s.is_str(),
        ))

    report_metadata = Munch(
        metadata=Munch(
            kernelspec=Munch(display_name="Python 3",
                             language="python",
                             name="python3"),
            language_info=Munch(
                codemirror_mode=Munch(name="ipython", version=3),
                file_extension=".py",
                mimetype="text/x-python",
                name="python",
                nbconvert_exporter="python",
                pygments_lexer="ipython3",
                version="3.6.7",
            ),
        ),
        nbformat=4,
        nbformat_minor=2,
    )

    error_model_schema = s(
        s.is_kws_r(
            err_p_edman_failure=s.is_list(elems=s.is_str(
                help="See Main Help")),
            err_p_detach=s.is_list(elems=s.is_str(help="See Main Help")),
            err_dye_beta=s.is_list(elems=s.is_str(help="See Main Help")),
            err_dye_sigma=s.is_list(elems=s.is_str(help="See Main Help")),
            err_p_bleach_per_cycle=s.is_list(elems=s.is_str(
                help="See Main Help")),
            err_p_non_fluorescent=s.is_list(elems=s.is_str(
                help="See Main Help")),
        ))

    error_model_defaults = Munch(
        err_p_edman_failure=0.06,
        err_p_detach=0.05,
        err_dye_beta=7500.0,
        err_dye_sigma=0.16,
        err_dye_gain=7500.0,
        err_dye_vpd=0.1,
        err_p_bleach_per_cycle=0.05,
        err_p_non_fluorescent=0.07,
    )

    code_block = Munch(cell_type="code",
                       execution_count=None,
                       metadata=Munch(),
                       outputs=[],
                       source=[])

    markdown_block = Munch(cell_type="markdown", metadata=Munch(), source=[])

    def __init__(self, **kwargs):
        # APPLY defaults and then ask user for any elements that are not declared

        super().__init__(**kwargs)
        self.apply_defaults()
        debug(self)
        self.setup_err_model()
        self.validate()

        self._report_sections = []
        self._report_preamble = None
        self._validate_protein_of_interest()

    def _validate_protein_of_interest(self):
        if "protein" in self:
            seq_ids = {seq["id"] for seq in self.protein}
            for poi in self.protein_of_interest:
                if poi not in seq_ids:
                    raise ValueError(
                        f"protein_of_interest '{poi}' is not in the protein id list. "
                        f"Confirm you specified a Name and not a UniprotAC")

    def setup_err_model(self):
        err_param_dict = defaultdict(list)
        for name, type, _, user_data in self.error_model_schema.requirements():
            values = self.get(name, [])
            for value in values:
                low_prob, high_prob, step_prob = None, None, 1

                parts = value.split("|")
                if len(parts) == 2:
                    dye_part = parts[0]
                    prob_parts = parts[1]
                else:
                    dye_part = None
                    prob_parts = parts[0]

                prob_parts = prob_parts.split(":")

                if name in ("err_p_edman_failure", "err_p_detach"):
                    if dye_part:
                        raise SchemaValidationFailed(
                            f"error model term '{name}' is not allowed to have a dye-index."
                        )
                else:
                    if dye_part is None:
                        raise SchemaValidationFailed(
                            f"error model term '{name}' expected a dye-index.")

                low_prob = float(prob_parts[0])
                if len(prob_parts) > 1:
                    high_prob = float(prob_parts[1])
                if len(prob_parts) > 2:
                    step_prob = int(prob_parts[2])
                if high_prob is None:
                    high_prob = low_prob

                key = f"{name}:{dye_part if dye_part is not None else 0}"
                err_param_dict[key] += np.linspace(low_prob, high_prob,
                                                   step_prob).tolist()
                err_param_dict[key] = list(set(err_param_dict[key]))
        self.err_param_dict = err_param_dict
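        # Grammar sketch of the accepted strings (inferred from the parsing above):
        #   "<low>"                        -> a single value
        #   "<low>:<high>:<steps>"         -> np.linspace(low, high, steps)
        #   "<dye_i>|<low>:<high>:<steps>" -> same sweep, keyed to a dye index
        # e.g. err_dye_sigma=["0|0.10:0.20:3"] puts {0.10, 0.15, 0.20} under
        # err_param_dict["err_dye_sigma:0"] (order unspecified after the set()).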

    def apply_defaults(self):
        """Overloadable by sub-classes."""
        self.schema.apply_defaults(self.defaults, self, override_nones=True)

    def validate(self):
        """Overloadable by sub-classes for extra validation"""
        self.schema.validate(self, context=self.__class__.__name__)

    def ims_imports(self, sigproc_source):
        if self.movie:
            ims_import = task_templates.ims_import(
                sigproc_source,
                is_movie=True,
                n_cycles_limit=self.n_frames_limit)
        else:
            ims_import = task_templates.ims_import(sigproc_source,
                                                   is_movie=False)

        return ims_import

    def sigprocs_v1(self):
        sigproc_tasks = []
        if self.sigproc_source:
            for ss in self.sigproc_source:
                ims_import = self.ims_imports(ss)
                sigproc = task_templates.sigproc_v1()
                # task_templates returns a generic sigproc_v1 task, and we can fill in some
                # parameters that any sigproc_v1 task might have based on the CliSwitches for
                # BaseVFSCommand.  So any subclass will automatically get these params set.
                # Where should the schema check for them?
                sigproc.sigproc_v1.parameters.radial_filter = self.radial_filter
                sigproc.sigproc_v1.parameters.peak_find_n_cycles = (
                    self.peak_find_n_cycles)
                sigproc.sigproc_v1.parameters.peak_find_start = self.peak_find_start
                sigproc.sigproc_v1.parameters.anomaly_iqr_cutoff = (
                    self.anomaly_iqr_cutoff)

                sigproc_task = Munch(**ims_import, **sigproc)
                sigproc_tasks += [sigproc_task]
        return sigproc_tasks

    def sigprocs_v2(self, **kwargs):
        sigproc_tasks = []
        if self.sigproc_source:
            for ss in self.sigproc_source:
                ims_import = self.ims_imports(ss)
                sigproc = task_templates.sigproc_v2(**kwargs)
                # task_templates returns a generic sigprocv2 task, and we can fill in some
                # parameters that any sigprocv2 task might have based on the CliSwitches for
                # BaseVFSCommand.  So any subclass will automatically get these params set.
                # Where should the schema check for them?
                sigproc_task = Munch(**ims_import, **sigproc)
                sigproc_tasks += [sigproc_task]
        return sigproc_tasks

    def lnfits(self):
        # It is common to have multiple lnfit tasks for a single run, so this fn returns a
        # block with potentially multiple lnfit tasks using unique task names when more
        # than one is present.
        lnfit_tasks = {}
        if self.lnfit_params:
            if not self.lnfit_dye_on_threshold:
                raise ValueError(
                    f"You must specify a --lnfit_dye_on_threshold when --lnfit_params is given"
                )

            dye_thresholds = self.lnfit_dye_on_threshold
            lnfit_names = self.lnfit_name or ([None] * len(self.lnfit_params))
            photometries_only = self.lnfit_photometry_only or (
                [True] * len(self.lnfit_params))

            if len(self.lnfit_params) > 1 and len(dye_thresholds) == 1:
                dye_thresholds *= len(self.lnfit_params)

            assert len(self.lnfit_params) == len(dye_thresholds)
            assert len(self.lnfit_params) == len(lnfit_names)

            for i, (params, thresh, name, photometry_only) in enumerate(
                    zip(self.lnfit_params, dye_thresholds, lnfit_names,
                        photometries_only)):
                task = task_templates.lnfit()
                task.lnfit.parameters["lognormal_fitter_v2_params"] = params
                task.lnfit.parameters["dye_on_threshold"] = thresh
                # str() so both the bool defaults and CLI strings work here
                task.lnfit.parameters["photometry_only"] = (
                    str(photometry_only).lower() in ("true", "1"))

                task_name = "lnfit"
                if len(self.lnfit_params) > 1 or name:
                    task_name = name or f"lnfit_{i}"
                    helpers.task_rename(task, task_name)
                lnfit_tasks[task_name] = task[task_name]
        return lnfit_tasks
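    # Naming sketch: a single entry yields {"lnfit": task}; multiple entries
    # (or an explicit lnfit_name) yield {"lnfit_0": ..., "lnfit_1": ...} via
    # helpers.task_rename(), keeping task names unique within a run.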

    def run_name(self, aa_list, protease=None, err_set=None):
        """
        A helper for run folder names based on aa_list and protease.
        Note, not all generators will use this convention.

        Compose a run_name from protease and aa_list in normalized form:
        Eg: protease="trypsin", aa_list=("DE", "K") => "trypsin_de_k"
        """
        if protease is None:
            protease = ""
        aa_list = [a.replace("[", "").replace("]", "") for a in aa_list]
        aa = "_".join(aa_list)
        if err_set is not None:
            err_str = hashlib.md5(
                json.dumps(err_set).encode()).hexdigest()[0:4]
        else:
            err_str = ""
        return re.sub(
            "[^0-9a-z_]+",
            "_",
            (protease + ("_" if protease != "" else "") + aa).lower() + "_" +
            err_str,
        )
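    # e.g. run_name(("DE", "K"), "trypsin", err_set) -> "trypsin_de_k_ab12",
    # where the 4-char suffix is the md5 of the err_set JSON ("ab12" is
    # illustrative only).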

    def _label_str_permutate(self, label_str):
        """
        Return list of permutations of a label_str such as:
        "A,B,C:2" => ("A", "B"), ("A", "C"), ("B", "C")

        A suffix label set may be added to each permutation with +:
        "A,B,C:2+S" => ("A", "B", "S"), ("A", "C", "S"), ("B", "C", "S")
        "A,B,C:2+S,T" => ("A", "B", "S", "T"), ("A", "C", "S", "T"), ("B", "C", "S", "T")
        """

        check.t(label_str, str)
        semi_split = label_str.split(":")

        if len(semi_split) > 2:
            raise ValueError(f"Label-set '{label_str}' has >1 colon.")

        suffix_labels = ""
        if len(semi_split) == 2:
            suffix_split = semi_split[1].split("+")

            if len(suffix_split) > 2:
                raise ValueError(f"Label-set '{label_str}' has >1 plus.")

            if len(suffix_split) == 2:
                semi_split = [semi_split[0], suffix_split[0]]
                suffix_labels = suffix_split[1].split(",")
                suffix_labels = [slabel.strip() for slabel in suffix_labels]

        labels = semi_split[0].split(",")
        labels = [label.strip() for label in labels]

        if len(semi_split) == 1:
            perm_count = len(labels)
        else:
            perm_count = int(semi_split[1])
            if not 0 < perm_count < len(labels):
                raise ValueError(
                    f"Label-set '{label_str}' has a permutation count "
                    f"of {perm_count}; needs to be between 1 and {len(labels) - 1}"
                )

        perms = list(itertools.combinations(labels, perm_count))

        if suffix_labels:
            perms = [p + tuple(suffix_labels) for p in perms]

        return perms

    def label_set_permutate(self):
        check.list_t(self.label_set, str)
        return utils.flatten([
            self._label_str_permutate(label_str)
            for label_str in self.label_set
        ], 1)
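    # Sketch: with self.label_set == ["A,B,C:2"] this returns
    # [("A", "B"), ("A", "C"), ("B", "C")] -- each _label_str_permutate()
    # result flattened one level into a single list of schemes.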

    def error_set_permutate(self):
        tuples = [[(key, val) for val in vals]
                  for key, vals in self.err_param_dict.items()]
        return tuples

    def run_parameter_permutator(self):
        """
        Generate permutations of all the variable parameters
        Defaults all arguments to self.*
        Gracefully handles lack of protease.
        """
        proteases = utils.non_none(self.get("protease"), [None])
        if len(proteases) == 0:
            proteases = [None]
        proteases = [("protease", p) for p in proteases]

        label_sets = self.label_set_permutate()
        label_sets = [("label_set", s) for s in label_sets]

        err_sets = self.error_set_permutate()

        combined = [proteases, label_sets] + err_sets

        for params in itertools.product(*combined):
            protease = utils.filt_first(params, lambda i: i[0] == "protease")
            protease = protease[1]
            label_set = utils.filt_first(params, lambda i: i[0] == "label_set")
            label_set = label_set[1]

            # Given that the label_set is now known, the error model can be setup
            n_channels = len(label_set)
            err_set = Munch(
                p_edman_failure=[
                    self.error_model_defaults.err_p_edman_failure
                ] * 1,
                p_detach=[self.error_model_defaults.err_p_detach] * 1,
                dye_beta=[self.error_model_defaults.err_dye_beta] * n_channels,
                dye_sigma=[self.error_model_defaults.err_dye_sigma] *
                n_channels,
                dye_gain=[self.error_model_defaults.err_dye_gain] * n_channels,
                dye_vpd=[self.error_model_defaults.err_dye_vpd] * n_channels,
                p_bleach_per_cycle=[
                    self.error_model_defaults.err_p_bleach_per_cycle
                ] * n_channels,
                p_non_fluorescent=[
                    self.error_model_defaults.err_p_non_fluorescent
                ] * n_channels,
            )

            for param in params:
                if param[0].startswith("err_"):
                    parts = param[0].split(":")
                    err_set[parts[0][4:]][int(parts[1])] = param[1]
                    # The 4: removes the "err_"

            yield protease, label_set, err_set
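    # Usage sketch (hypothetical generator instance `gen`):
    #   for protease, label_set, err_set in gen.run_parameter_permutator():
    #       ...  # one run per (protease, label_set, err_set) combination;
    #            # protease is None when no protease was specified.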

    def erisyon_block(self, aa_list, protease=None, err_set=None):
        return task_templates.erisyon(
            run_name=self.run_name(aa_list, protease, err_set),
            sample=self.sample,
            generator_name=self.__class__.__name__,
        )

    def _markdown_to_markdown_block(self, markdown):
        lines = [f"{line}\n" for line in markdown.split("\n")]
        block = Munch(**self.markdown_block)
        block.source = lines
        return block

    def report_preamble(self, markdown):
        """A a preamble in markdown format"""
        self._report_preamble = markdown

    def report_section_markdown(self, markdown):
        self._report_sections += [("markdown", markdown)]

    def report_section_run_object(self, run):
        self._report_sections += [
            (
                "code",
                [f'run = RunResult("./{run.run_name}")'],
            ),
        ]

    def report_section_job_object(self):
        self._report_sections += [
            (
                "code",
                [f'job = JobResult("//jobs_folder/{self.job}")'],
            ),
        ]

    def report_section_user_config(self):
        """
        Emit report configuration parameters specified by the user via gen so that they
        can be further edited if desired, and used by reporting functions in the templates.
        """
        config = []
        if self.protein_of_interest:
            config += [
                f"PGEN_protein_of_interest = {self.protein_of_interest}\n"
            ]
        if self.report_prec:
            config += [f"PGEN_report_precisions = {self.report_prec}\n"]
        if config:
            self.report_section_markdown("# PGEN-controlled report config")
            config = [
                f"# These values were or can be specified by the user at gen time:\n"
            ] + config
            self._report_sections += [("code", config)]

    def report_section_run_array(self, runs, to_load=None):
        to_load_string = "" if to_load is None else f", to_load={to_load}"
        run_names = [run.run_name for run in runs]
        self._report_sections += [(
            "code",
            [
                f"run_names = {run_names}\n"
                f'runs = [RunLoader(f"./{{name}}"{to_load_string}) for name in run_names]'
            ],
        )]

    def report_section_from_template(self, template_name):
        """Write the report from its pieces"""
        self._report_sections += [("template", template_name)]

    def report_assemble(self):
        """Assemble the report from its pieces. A giant Munch is returned"""
        report = Munch(**self.report_metadata)
        report.cells = []

        preamble_block = self._markdown_to_markdown_block(
            self._report_preamble)
        report.cells += [preamble_block]

        # LOAD all templates
        templates_by_name = {}
        for section_type, section_data in self._report_sections:
            if section_type == "template":
                file_path = section_data
                templates_by_name[file_path] = utils.json_load_munch(
                    f"./plaster/gen/nb_templates/{file_path}")

        # FIND all of the @IMPORT-MERGE blocks
        import_merge = []
        for _, template in templates_by_name.items():
            for cell in template.cells:
                if cell.cell_type == "code":
                    first_line = utils.safe_list_get(cell.source, 0, "")
                    if "# @IMPORT-MERGE" in first_line:
                        for line in cell.source:
                            if "import" in line:
                                import_merge += [line]

        import_merge += ["from plaster.tools.zplots import zplots\n"]
        import_merge = sorted(list(set(import_merge))) + ["z=zplots.setup()"]
        import_block = Munch(**self.code_block)
        import_block.source = import_merge
        report.cells += [import_block]

        for section_type, section_data in self._report_sections:
            if section_type == "code":
                lines = section_data
                block = Munch(**self.code_block)
                block.source = lines
                report.cells += [block]

            elif section_type == "markdown":
                block = self._markdown_to_markdown_block(section_data)
                report.cells += [block]

            elif section_type == "template":
                file_path = section_data
                template = templates_by_name[file_path]
                for cell in template.cells:
                    if cell.cell_type == "code":
                        first_line = utils.safe_list_get(cell.source, 0, "")

                        if ("@IMPORT-MERGE" not in first_line
                                and "@REMOVE-FROM-TEMPLATE" not in first_line):
                            block = Munch(**self.code_block)
                            block.source = cell.source
                            report.cells += [block]

                    if cell.cell_type == "markdown":
                        block = Munch(**self.markdown_block)
                        block.source = cell.source
                        report.cells += [block]

        return report

    def report_task(self):
        pass

    def generate(self):
        """
        Abstract method to be overloaded.
        Expected to return a list of runs.
        """
        pass
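    # Sketch of how the assembled report maps onto an .ipynb file (an
    # assumption based on report_metadata above, not code from this module):
    #
    #   import json
    #   report = gen.report_assemble()      # `gen` is any concrete generator
    #   with open("report.ipynb", "w") as f:
    #       json.dump(report, f, indent=2)  # nbformat-4 JSON, opens in Jupyter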
Example n. 12
0
class SigprocV2Params(ParamsAndPriors):
    """
    About Calibration:
        The long term goal of the calibration files is to dissociate
        the name of the file from the records (subjects) in the file.
        For now, we're going to load all records from the calibration file
    """

    defaults = dict(
        divs=5,
        peak_mea=11,
        n_fields_limit=None,
        run_regional_balance=True,
        run_analysis_gauss2_fitter=False,
        run_aligner=True,
        run_per_cycle_peakfinder=False,
        # TODO: Derive the following during calibration by spectral analysis (ie, 2 std of the power spectrum)
        # ALSO: This needs to be moved into the calibration because it cannot be
        # allowed to differ from the calibration results; the calibration bakes
        # in the PSF as a function of these parameters.
        low_inflection=0.03,
        low_sharpness=50.0,
        high_inflection=0.50,
        high_sharpness=50.0,
        self_calib=False,
        no_calib=False,
        instrument_identity=None,
        save_full_signal_radmat_npy=True,
        calibration_file=None,
        channel_align_bounds=None,
        n_cycles_limit=None,
        ch_aln_override=None,
        ch_for_alignment=None,
        run_fast_peak_finder=False,
        run_minimal_analysis_gauss2_fitter=True,
    )

    schema = s(
        s.is_kws_r(
            calibration_file=s.is_str(noneable=True, required=False),
            instrument_identity=s.is_str(noneable=True),
            mode=s.is_str(options=common.SIGPROC_V2_MODES),
            divs=s.is_int(),
            peak_mea=s.is_int(),
            n_fields_limit=s.is_int(noneable=True),
            run_regional_balance=s.is_bool(),
            run_analysis_gauss2_fitter=s.is_bool(),
            run_aligner=s.is_bool(),
            run_per_cycle_peakfinder=s.is_bool(),
            low_inflection=s.is_float(),
            low_sharpness=s.is_float(),
            high_inflection=s.is_float(),
            high_sharpness=s.is_float(),
            self_calib=s.is_bool(noneable=True),
            no_calib=s.is_bool(noneable=True),
            save_full_signal_radmat_npy=s.is_bool(),
            channel_align_bounds=s.is_int(noneable=True),
            n_cycles_limit=s.is_int(noneable=True),
            # ch_aln_override allows for a temporarily needed hack to bypass the calibration system
            ch_aln_override=s.is_list(elems=s.is_list(elems=s.is_float()),
                                      noneable=True),
            ch_for_alignment=s.is_int(noneable=True),
            run_fast_peak_finder=s.is_bool(),
            run_minimal_analysis_gauss2_fitter=s.is_bool(),
        ))

    def validate(self):
        # Note: does not call super because the override_nones is set to false here
        self.schema.apply_defaults(self.defaults,
                                   apply_to=self,
                                   override_nones=False)
        self.schema.validate(self, context=self.__class__.__name__)

        if self.mode == common.SIGPROC_V2_ILLUM_CALIB:
            pass
            # ZBS: At the moment these checks are more trouble than they are worth
            # if local.path(self.calibration_file).exists():
            #     if not log.confirm_yn(
            #         f"\nCalibration file '{self.calibration_file}' already exists "
            #         "when creating a SIGPROC_V2_PSF_CALIB. Overwrite?",
            #         "y",
            #     ):
            #         raise SchemaValidationFailed(
            #             f"Not overwriting calibration file '{self.calibration_file}'"
            #         )

        else:
            # Analyzing
            if self.self_calib:
                assert (
                    self.calibration_file is None
                ), "In self-calibration mode you may not specify a calibration file"
                assert (
                    self.instrument_identity is None
                ), "In self-calibration mode you may not specify an instrument identity"
                assert (
                    self.no_calib is not True
                ), "In self-calibration mode you may not specify the no_calib option"

            # elif (
            #     not self.no_calib
            #     and self.calibration_file != ""
            #     and self.calibration_file is not None
            # ):
            #     self.calibration = Calib.load_file(
            #         self.calibration_file, self.instrument_identity
            #     )

            elif self.no_calib:
                assert (
                    self.no_calib_psf_sigma is not None
                ), "In no_calib mode you must specify an estimated no_calib_psf_sigma"

        return True
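# Validation sketch (hypothetical values; `mode` must be one of
# common.SIGPROC_V2_MODES): in self-calibration mode neither a calibration
# file nor an instrument identity may be supplied.
#
#   SigprocV2Params(mode=analyze_mode, self_calib=True)   # passes validate()
#   SigprocV2Params(mode=analyze_mode, self_calib=True,
#                   calibration_file="calib.json")        # asserts in validate()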
Example n. 13
0
def it_validates_bool():
    test_s = s(s.is_bool())
    test_s.validate(True)
    with zest.raises(SchemaValidationFailed):
        test_s.validate(1)
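# A companion sketch in the same style (assuming is_bool(noneable=True)
# accepts None, as the schemas above suggest):
#
#   def it_validates_noneable_bool():
#       test_s = s(s.is_bool(noneable=True))
#       test_s.validate(None)
#       test_s.validate(False)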
Example n. 14
0
class SigprocV2Generator(BaseGenerator):
    """
    Examine sigproc_v2 runs and study their results.
    Note that this requires a calibration file produced by the run
    generated by the sigproc_v2_calibration generator.
    """

    schema = s(
        s.is_kws_r(
            **BaseGenerator.sigproc_source_schema.schema(),
            **BaseGenerator.sigproc_v2_schema.schema(),
            **BaseGenerator.lnfit_schema.schema(),
            **BaseGenerator.error_model_schema.schema(),
            **RadFilterParams.schema.schema(),
            classify_dyetracks=s.is_bool(
                help="If true then compare to dyetracks"),
            dyetrack_n_cycles=s.is_int(
                noneable=True, help="Number of cycles of simulated dyetracks"),
            dyetrack_n_counts=s.is_int(noneable=True,
                                       help="Number of dyes max."),
            is_timelapse=s.is_bool(help="Is a timelapse experiment"),
        ))

    defaults = Munch(
        classify_dyetracks=False,
        movie=False,
        is_timelapse=False,
        start_cycle=0,
        **RadFilterParams.defaults,
    )

    def generate(self):
        runs = []

        lnfit_tasks = self.lnfits(sigproc_version="v2")

        assert isinstance(self.sigproc_source, str)

        sigproc_tasks = self.tasks_for_sigproc_v2()

        rad_filter_task = task_templates.rad_filter(
            field_quality_thresh=self.field_quality_thresh,
            dark_thresh_in_stds=self.dark_thresh_in_stds,
            noi_thresh_in_stds=self.noi_thresh_in_stds,
        )

        nn_n2_task = {}
        if self.classify_dyetracks:
            # TODO: This is a bit of a hacked up mess, this mode is
            # used for calibration purposes and might not be a long-term
            # feature and so is using hard-coded n_channels for example
            self.label_set = [""]
            self.scheme = []
            n_schemes = 0
            for protease, label_set, err_set in self.run_parameter_permutator(
            ):
                nn_n2_task = task_templates.nn_v2(
                    "../sigproc_v2",
                    err_set,
                    prep_folder=None,
                    sim_v2_folder=None,
                    rad_filter_folder=f"../rad_filter",
                    run_against_all_dyetracks=True,
                    run_row_k_fit=True,
                    include_sigproc=True,
                    dyetrack_n_cycles=self.dyetrack_n_cycles,
                    dyetrack_n_counts=self.dyetrack_n_counts,
                )

                n_schemes += 1

            assert n_schemes == 1

        run = Munch(
            run_name=f"sigproc_v2",
            **sigproc_tasks,
            **rad_filter_task,
            **lnfit_tasks,
            **nn_n2_task,
        )

        if self.force_run_name is not None:
            run.run_name = self.force_run_name

        # self.report_section_run_object(run)
        # template = "sigproc_v2_analyze_template.ipynb"
        # self.report_section_from_template(template)
        #
        # if lnfit_tasks:
        #     self.report_section_from_template("lnfit_template.ipynb")

        runs += [run]

        n_runs = len(runs)
        # self.report_preamble(
        #     utils.smart_wrap(
        #         f"""
        #         # Sigproc V2 Analyze
        #         ## {n_runs} run(s) processed.
        #         This file generated by {current_file_and_line_str()}.
        #         """,
        #         width=None,
        #     )
        # )

        if self.classify_dyetracks:
            rb = ReportBuilder()
            rb.report_section_run_object(run)
            template = "sigproc_v2_classify_dyetracks_template.ipynb"
            rb.report_section_from_template(template)
            self.add_report("sigproc_v2_classify_dyetracks", rb)

        # if self.is_timelapse and self.dyetrack_n_counts == 1:
        #     rb = ReportBuilder()
        #     rb.report_section_run_object(run)
        #     template = "sigproc_v2_timelapse_template.ipynb"
        #     rb.report_section_from_template(template)
        #     self.add_report("sigproc_v2_timelapse", rb)

        self.static_reports += ["sigproc_primary", "sigproc_secondary"]

        return runs
Example n. 15
0
class SimV1Params(ParamsAndPriors):
    """
    Simulation parameters: an ErrorModel plus parameters for the sim
    """

    defaults = Munch(
        n_pres=1,
        n_mocks=0,
        n_edmans=1,
        n_samples_train=5_000,
        n_samples_test=1_000,
        dyes=[],
        labels=[],
        random_seed=None,
        train_n_sample_multiplier=None,  # This does not appear to be used anywhere. tfb
        allow_train_test_to_be_identical=False,
        enable_ptm_labels=False,
        is_survey=False,
    )

    schema = s(
        s.is_kws_r(
            is_survey=s.is_bool(),
            priors_desc=Priors.priors_desc_schema,
            n_pres=s.is_int(bounds=(0, None)),
            n_mocks=s.is_int(bounds=(0, None)),
            n_edmans=s.is_int(bounds=(0, None)),
            n_samples_train=s.is_int(bounds=(1, None)),
            n_samples_test=s.is_int(bounds=(1, None)),
            dyes=s.is_list(elems=s.is_kws_r(dye_name=s.is_str(),
                                            channel_name=s.is_str())),
            labels=s.is_list(elems=s.is_kws_r(
                aa=s.is_str(),
                dye_name=s.is_str(),
                label_name=s.is_str(),
                ptm_only=s.is_bool(required=False, noneable=True),
            )),
            channels=s.is_dict(required=False),
            random_seed=s.is_int(required=False, noneable=True),
            allow_train_test_to_be_identical=s.is_bool(required=False,
                                                       noneable=True),
            enable_ptm_labels=s.is_bool(required=False, noneable=True),
        ))

    # def copy(self):
    #     # REMOVE everything that _build_join_dfs put in
    #     utils.safe_del(self, "df")
    #     utils.safe_del(self, "by_channel")
    #     utils.safe_del(self, "ch_by_aa")
    #
    #     dst = utils.munch_deep_copy(self, klass_set={SimV1Params})
    #     dst.error_model = ErrorModel(**dst.error_model)
    #     assert isinstance(dst, SimV1Params)
    #     return dst

    def __init__(self, **kwargs):
        super().__init__(source="SimV1Params", **kwargs)
        self._setup_dfs()

    def validate(self):
        super().validate()

        all_dye_names = list(set([d.dye_name for d in self.dyes]))

        # No duplicate dye names
        self._validate(
            len(all_dye_names) == len(self.dyes),
            "The dye list contains a duplicate")

        # No duplicate labels
        self._validate(
            len(list(set(utils.listi(self.labels, "aa")))) == len(self.labels),
            "There is a duplicate label",
        )

        # All labels have a legit dye name
        for label in self.labels:
            self._validate(
                label.dye_name in all_dye_names,
                f"Label {label.label_name} does not have a valid matching dye_name",
            )

        # Channel mappings
        mentioned_channels = {dye.channel_name: False for dye in self.dyes}
        if "channels" in self:
            # Validate that channel mapping is complete
            for channel_name, ch_i in self.channels.items():
                self._validate(
                    channel_name in mentioned_channels,
                    f"Channel name '{channel_name}' was not found in dyes",
                )
                mentioned_channels[channel_name] = True

            self._validate(
                all([mentioned
                     for _, mentioned in mentioned_channels.items()]),
                "Not all channels in dyes were enumerated in channels",
            )
        else:
            # No channel mapping: assign them
            self["channels"] = {
                ch_name: i
                for i, ch_name in enumerate(sorted(mentioned_channels.keys()))
            }

    @property
    def n_cycles(self):
        return self.n_pres + self.n_mocks + self.n_edmans

    def channel_names(self):
        return sorted(list(set(utils.listi(self.dyes, "channel_name"))))

    def channel_i_by_name(self):
        channels = self.channel_names()
        return {
            channel_name: channel_i
            for channel_i, channel_name in enumerate(channels)
        }

    @property
    def n_channels(self):
        return len(self.channel_i_by_name().keys())

    @property
    def n_channels_and_cycles(self):
        return self.n_channels, self.n_cycles

    def _setup_dfs(self):
        """
        The error model contains information about the dyes and labels and other terms.
        Those error model parameters are wired together by names which are useful
        for reconciling calibrations.

        But here, these "by name" parameters are all put into a dataframe so that
        they can be indexed by integers.
        """
        dyes_df = pd.DataFrame(self.dyes)
        assert len(dyes_df) > 0

        labels_df = pd.DataFrame(self.labels)
        assert len(labels_df) > 0

        # LOOKUP dye priors
        dye_priors = []
        for dye in self.dyes:
            # SEARCH priors by dye name and if not found by channel
            p_non_fluorescent = self.priors.get_exact(
                f"p_non_fluorescent.{dye.dye_name}")
            if p_non_fluorescent is None:
                p_non_fluorescent = self.priors.get(
                    f"p_non_fluorescent.ch_{dye.channel_name}")

            dye_priors += [
                Munch(
                    dye_name=dye.dye_name,
                    p_non_fluorescent=p_non_fluorescent.prior,
                )
            ]
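        # Fallback sketch: a dye-specific prior such as
        # "p_non_fluorescent.dye_0" wins if present; otherwise the channel-keyed
        # prior "p_non_fluorescent.ch_<channel_name>" is used (note the literal
        # "ch_" prefix prepended above).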

        dye_priors_df = pd.DataFrame(dye_priors)
        # dye_priors_df: (dye_name, p_non_fluorescent)

        dyes_df = utils.easy_join(dyes_df, dye_priors_df, "dye_name")
        # dyes_df: (dye_name, channel_name, p_non_fluorescent)

        # TODO: LOOKUP label priors
        #       (p_failure_to_bind_aa, p_failure_to_attach_to_dye)

        # LOOKUP channel priors
        ch_priors = pd.DataFrame([
            dict(
                channel_name=channel_name,
                ch_i=ch_i,
                bg_mu=self.priors.get(f"bg_mu.ch_{ch_i}").prior,
                bg_sigma=self.priors.get(f"bg_sigma.ch_{ch_i}").prior,
                gain_mu=self.priors.get(f"gain_mu.ch_{ch_i}").prior,
                gain_sigma=self.priors.get(f"gain_sigma.ch_{ch_i}").prior,
                row_k_sigma=self.priors.get(f"row_k_sigma.ch_{ch_i}").prior,
                p_bleach=self.priors.get(f"p_bleach.ch_{ch_i}").prior,
            ) for channel_name, ch_i in self.channels.items()
        ])
        # ch_priors: (channel_name, ch_i, ...)

        self._channel__priors = (utils.easy_join(
            dyes_df, ch_priors, "channel_name").drop(
                columns=["p_non_fluorescent"]).drop_duplicates().reset_index())
        # self._channel__priors: (
        #    'ch_i', 'channel_name', 'bg_mu', 'bg_sigma', 'dye_name',
        #    'gain_mu', 'gain_sigma', 'index', 'p_bleach', 'row_k_sigma',
        # )

        # SANITY check channel__priors
        group_by_ch = self._channel__priors.groupby("ch_i")
        for field in (
                "bg_mu",
                "bg_sigma",
                "gain_mu",
                "gain_sigma",
                "row_k_sigma",
        ):
            assert np.all(group_by_ch[field].nunique() == 1)
        assert "p_non_fluorescent" not in self._channel__priors.columns

        labels_dyes_df = utils.easy_join(labels_df, dyes_df, "dye_name")
        self._dye__label__priors = utils.easy_join(
            labels_dyes_df, ch_priors, "channel_name").reset_index(drop=True)

        # self._dye__label__priors: (
        #     'channel_name', 'dye_name', 'aa', 'label_name',
        #     'ptm_only', 'p_non_fluorescent', 'ch_i', 'bg_mu', 'bg_sigma',
        #     'gain_mu', 'gain_sigma', 'row_k_sigma', 'p_bleach'
        # )

        self._ch_by_aa = {
            row.aa: row.ch_i
            for row in self._dye__label__priors.itertuples()
        }

    def dye__label__priors(self):
        """
        DataFrame(
            'channel_name', 'dye_name', 'aa', 'label_name',
            'ptm_only', 'p_non_fluorescent', 'ch_i', 'bg_mu', 'bg_sigma',
            'gain_mu', 'gain_sigma', 'row_k_sigma', 'p_bleach'
        )
        """
        return self._dye__label__priors

    def channel__priors(self):
        """
        DataFrame(
            'ch_i', 'channel_name', 'bg_mu', 'bg_sigma', 'dye_name',
            'gain_mu', 'gain_sigma', 'index', 'p_bleach', 'row_k_sigma',
        )
        """
        return self._channel__priors

    def by_channel(self):
        return self._channel__priors.set_index("ch_i")

    def to_label_list(self):
        """Summarize labels like: ["DE", "C"]"""
        return [
            "".join([
                label.aa for label in self.labels
                if label.dye_name == dye.dye_name
            ]) for dye in self.dyes
        ]

    def to_label_str(self):
        """Summarize labels like: DE,C"""
        return ",".join(self.to_label_list())

    @classmethod
    def construct_from_aa_list(cls, aa_list, **kwargs):
        """
        This is a helper to generate channels when you have a list of aas.
        For example, ["DE", "Y"] gives two channels where ch0 is D&E
        and ch1 is Y.

        If you pass in an error model, it needs to match channels and labels.
        """

        check.list_or_tuple_t(aa_list, str)

        allowed_aa_mods = ["[", "]"]
        assert all([(aa.isalpha() or aa in allowed_aa_mods) for aas in aa_list
                    for aa in list(aas)])

        dyes = [
            Munch(dye_name=f"dye_{ch}", channel_name=f"ch_{ch}")
            for ch, _ in enumerate(aa_list)
        ]

        # Note the extra for loop because "DE" needs to be split into "D" & "E"
        # which is done by aa_str_to_list() - which also handles PTMs like S[p]
        labels = [
            Munch(
                aa=aa,
                dye_name=f"dye_{ch}",
                label_name=f"label_{ch}",
                ptm_only=False,
            ) for ch, aas in enumerate(aa_list) for aa in aa_str_to_list(aas)
        ]

        return cls(dyes=dyes, labels=labels, **kwargs)
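# Usage sketch (hypothetical; assumes the per-channel priors referenced in
# _setup_dfs() resolve):
#
#   sp = SimV1Params.construct_from_aa_list(["DE", "Y"])
#   sp.channel_i_by_name()   # {"ch_0": 0, "ch_1": 1}
#   sp.to_label_str()        # "DE,Y"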
Example n. 16
0
class PrepParams(Params):
    PHOTOBLEACHING_PSEUDO_AA = "X"
    ALLOW_NONES_AND_NANS_IN_ABUNDANCE = False
    NORMALIZE_ABUNDANCE = False  # Abundance is normalized in gen

    defaults = Munch(
        protease=None,
        decoy_mode=None,
        include_misses=0,
        n_peps_limit=None,
        drop_duplicates=False,
        n_ptms_limit=None,
        is_photobleaching_run=False,
        photobleaching_n_cycles=None,
        photobleaching_run_n_dye_count=None,
    )

    schema = s(
        s.is_kws_r(
            protease=s.is_list(noneable=True, elems=s.is_str()),
            decoy_mode=s.is_str(noneable=True),
            include_misses=s.is_int(),
            n_peps_limit=s.is_int(noneable=True),
            drop_duplicates=s.is_bool(),
            n_ptms_limit=s.is_int(noneable=True),
            proteins=s.is_list(
                s.is_kws(
                    name=s.is_str(required=True),
                    sequence=s.is_str(required=True),
                    ptm_locs=s.is_str(noneable=True),
                    is_poi=s.is_int(noneable=True),
                    abundance=s.is_number(noneable=True),
                )),
            is_photobleaching_run=s.is_bool(),
            photobleaching_n_cycles=s.is_int(noneable=True),
            photobleaching_run_n_dye_count=s.is_int(noneable=True),
        ))

    def validate(self):
        super().validate()

        # If abundance values are provided, do basic validation and (when enabled)
        # normalize them; if no abundance values are provided, do nothing.
        # When a protein csv with no abundance columns is provided, it will come through as all nans
        # Note that self.proteins is likely a list of Munches, but could be a list of dicts, so don't assume we can access items as attrs

        abundance_info_present = any(
            "abundance" in protein and protein["abundance"] is not None
            and not math.isnan(protein["abundance"])
            for protein in self.proteins)

        if abundance_info_present:
            abundance_criteria = [
                (lambda protein: "abundance" in protein, "Abundance missing"),
                (
                    lambda protein: protein["abundance"] >= 0
                    if protein["abundance"] is not None else True,
                    "Abundance must be greater than or equal to zero",
                ),
            ]

            if not self.ALLOW_NONES_AND_NANS_IN_ABUNDANCE:
                abundance_criteria += [
                    (
                        lambda protein: protein["abundance"] is not None,
                        "Abundance must not be None",
                    ),
                    (
                        lambda protein: not math.isnan(protein["abundance"]),
                        "Abundance must not be NaN",
                    ),
                ]

            # Find the min positive abundance value; invalid values (per the
            # criteria above) raise SchemaValidationFailed
            min_abundance = None
            for protein in self.proteins:
                # Check to make sure abundance passes criteria
                for criteria_fn, msg in abundance_criteria:
                    if not criteria_fn(protein):
                        abundance_value = protein.get("abundance")
                        raise SchemaValidationFailed(
                            f"Protein {protein.get('name')} has invalid abundance: {abundance_value} - {msg}"
                        )

                # Find min abundance value
                if (min_abundance is None or protein["abundance"] <
                        min_abundance) and protein["abundance"] > 0:
                    min_abundance = protein["abundance"]

            if self.NORMALIZE_ABUNDANCE:
                if min_abundance != 1:
                    log.info("abundance data is not normalized, normalizing.")
                    # normalize abundance by min value
                    for protein in self.proteins:
                        if protein["abundance"] is not None:
                            protein["abundance"] /= min_abundance
        else:
            # Abundance information is missing from all proteins
            # Set abundance to 1
            for protein in self.proteins:
                protein["abundance"] = 1
Example n. 17
0
class SurveyV2Result(BaseResult):
    name = "survey_v2"
    filename = "survey_v2.pkl"

    required_props = dict(params=SurveyV2Params, _survey=(pd.DataFrame))

    survey_columns = [
        "pro_i",
        "pep_i",
        "pep_start",
        "pep_stop",
        "pep_len",
        "seqstr",
        "P2",
        "flustr",
        "n_dyes_max_any_ch",
        "flu_count",
        "nn_pep_i",
        "nn_dist",
    ]

    survey_filter_schema = s(
        s.is_kws_r(
            allow_proline_at_2=s.is_bool(),
            run_exclude=s.is_list(s.is_str()),
            run_include=s.is_list(s.is_str()),
            max_dyes_per_ch=s.is_int(noneable=True),
            max_pep_len=s.is_int(noneable=True),
            max_ptms_per_pep=s.is_int(noneable=True),
            multi_peptide_metric=s.is_str(noneable=True,
                                          options=["dist_avg", "dist_min"]),
            n_best_schemes=s.is_int(),
            n_peps_per_scheme=s.is_int(),
            objective=s.is_str(options=["protein_id", "coverage", "ptms"]),
            poi_only=s.is_bool(),
            pro_subset=s.is_list(s.is_str()),
            ptm_subset=s.is_list(s.is_int()),
        ))

    defaults = Munch(
        allow_proline_at_2=False,
        run_exclude=[],
        run_include=[],
        max_dyes_per_ch=None,
        max_pep_len=None,
        max_ptms_per_pep=None,
        multi_peptide_metric="dist_min",
        n_best_schemes=50,
        n_peps_per_scheme=1,
        objective="protein_id",
        poi_only=False,
        pro_subset=[],
        ptm_subset=[],
    )

    @classmethod
    def validate_filters(cls, filters):
        """
        Validates filters against schema, and fills in defaults where missing.
        This is a class method so that higher level objects like JobResult can make
        use of filtering that is more logically grouped with our filters but is
        applied at a higher level (e.g. objective).
        """
        check.t(filters, Munch)
        cls.survey_filter_schema.apply_defaults(cls.defaults, filters)
        cls.survey_filter_schema.validate(filters)

    def survey(self):
        return self._survey

    def _domain_loss(self, df, filters, msg):  # debug aid
        if filters.verbose:
            if filters.objective == "ptms":
                domain_loss = set(filters.requested_ptms) - set(
                    list(df.ptm.astype(int)))
                # print(filters.requested_ptms)
            else:
                domain_loss = set(filters.requested_proteins) - set(
                    list(df.pro_i))
                # print(filters.requested_proteins)

            if domain_loss:
                print(
                    f"  {filters.objective} {msg} domain_loss: {sorted(domain_loss)}"
                )

    def _apply_filters(self, filters, prep=None):
        """
        filters may be used to reduce the rows of _survey.
        # TODO: can we just require prep to avoid all the exception logic below?
        """
        if filters.verbose:
            print(f"\n{(self._folder / '..' / '..').name}")

        df = self.survey()

        # If the caller is optimizing for PTMs, we need to add PTM information for
        # the peptides.  This is done here so that protein PTMs can be changed
        # after a run is complete.  the PrepResult is required.  Note the inner
        # join which causes the resulting df to only contain entries which have
        # PTM locations specified.
        if filters.objective == "ptms":
            if prep:
                peps__ptms = prep.peps__ptms(ptm_peps_only=True,
                                             ptms_to_rows=True)[[
                                                 "pep_i", "n_pep_ptms", "ptm"
                                             ]]
                if len(peps__ptms) > 0:
                    df = (df.set_index("pep_i").join(
                        peps__ptms.set_index("pep_i"),
                        how="inner").reset_index())
                # Write down which ptms were explicitly or implicitly requested by the caller,
                # so we can know later which ones were removed by filtering.  Note that looking
                # at the unique values in df.ptm is not quite right if more than one protein has a
                # PTM at the same location.  In theory this is OK, but it means our PTM accounting
                # for "domain_loss" for PTMs is not quite right, so assert here and deal with that
                # if necessary.
                assert len(df.ptm) == len(df.ptm.unique(
                )), "More than one protein has the same PTM location?"
                filters.requested_ptms = filters.ptm_subset or sorted(
                    list(df.ptm.unique().astype(int)))
                if filters.verbose:
                    print(f"  ptms domain: {filters.requested_ptms}")
            else:
                raise ValueError("Must supply PrepResult to optimize for PTMs")

        if filters is not None:
            self.validate_filters(filters)
            # Do protein subset or POI which substantially reduces df
            if len(filters.pro_subset) > 0:
                if prep is None:
                    raise ValueError(
                        "Must supply PrepResult to filter by pro_subset")
                pros = prep.pros()
                pro_iz = pros[pros.pro_id.isin(
                    filters.pro_subset)].pro_i.values
                df = df[df.pro_i.isin(pro_iz)]
            if filters.poi_only:
                if prep is None:
                    raise ValueError(
                        "Must supply PrepResult to filter by proteins-of-interest"
                    )
                poi_iz = prep.pros__pois().pro_i.values
                if len(poi_iz) > 0:
                    # If there are no entries, then all are considered "of interest",
                    # so only filter here if there are some specifically marked.
                    df = df[df.pro_i.isin(poi_iz)]

            # Write down requested proteins so we can tell the user which ones got
            # removed by filtering.
            filters.requested_proteins = sorted(list(df.pro_i.unique()))
            if filters.verbose:
                print(f"  proteins domain: {filters.requested_proteins}")

            self._domain_loss(df, filters, "post-protein-filtering")

            # remove rows per filtering
            if filters.max_pep_len is not None:
                df = df[df.pep_len <= filters.max_pep_len]
                self._domain_loss(df, filters, "max_pep_len")
            if filters.max_dyes_per_ch is not None:
                df = df[df.n_dyes_max_any_ch <= filters.max_dyes_per_ch]
                self._domain_loss(df, filters, "max_dyes_per_ch")
            if filters.max_ptms_per_pep is not None:
                df = df[df.n_pep_ptms <= filters.max_ptms_per_pep]
                self._domain_loss(df, filters, "max_ptms_per_pep")
            if len(filters.ptm_subset) > 0:
                # WARNING: this affects PTMs for ALL proteins that have them.
                # This is typically OK, since you're looking for PTMs on a single
                # protein of interest, but if you had a PTM at location 100 on two
                # different proteins, this filter would apply to both of them.
                df = df[df.ptm.astype(int).isin(filters.ptm_subset)]
                self._domain_loss(df, filters, "ptm_subset")
            if not filters.allow_proline_at_2:
                df = df[~df.P2]
                self._domain_loss(df, filters, "allow_proline_at_2")

        return df.copy()
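
    # Hedged usage sketch for _apply_filters. The filter fields below are the
    # ones referenced above; the Munch values themselves are illustrative
    # assumptions, not a confirmed fixture:
    #
    #   filters = Munch(
    #       objective="proteins",   # anything other than "ptms" skips the PTM branch
    #       pro_subset=[],          # no explicit protein-id subset
    #       poi_only=True,          # keep only proteins-of-interest
    #       max_pep_len=30,
    #       max_dyes_per_ch=4,
    #       max_ptms_per_pep=None,
    #       ptm_subset=[],
    #       allow_proline_at_2=False,
    #       verbose=True,
    #   )
    #   df = survey_result._apply_filters(filters, prep=prep_result)
    #
    # Each filter both narrows df and records a "domain loss" entry so the
    # caller can see which proteins/PTMs each step eliminated.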

    def n_uniques(self, filters=None, df=None):
        """
        Returns number of peptides with unique flus.  This is probably actually not
        very interesting if you are comparing different proteases, but it may be if
        you are comparing different labeling schemes for a single protease, or for
        pre-specified peptide sets like MHC.
        """
        df = self._apply_filters(filters) if df is None else df
        return len(df[df.flu_count == 1])
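
    # Illustrative only: if df.flu_count were [1, 2, 2, 1, 1], three peptides
    # carry a unique flu and this would return 3.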

    def protein_coverage(self, prep_result, filters=None, df=None):
        """
        Returns the percentage coverage of proteins with peptides that have unique flus.
        If any proteins are marked "of interest" via the pro_report flag, then we
        compute the coverage only for those proteins, else all proteins are used.
        """
        df = self._apply_filters(filters,
                                 prep=prep_result) if df is None else df
        df = df[df.flu_count == 1]  # only use peptides that have unique flus
        n_poi = prep_result.n_pois
        poi_iz = (prep_result.pros__pois().pro_i.values
                  if n_poi > 0 else prep_result.pros().pro_i.values)

        # OLD - returns average coverage of proteins in domain
        # poi_percent_coverage = np.zeros_like(poi_iz).astype(float)
        # proseq_groups = prep_result.proseqs().groupby("pro_i")
        # pep_coverage_groups = df.groupby("pro_i")
        # for i, poi_i in enumerate(poi_iz):
        #     try:
        #         poi_percent_coverage[i] = (
        #             pep_coverage_groups.get_group(poi_i).pep_len.sum()
        #             / proseq_groups.get_group(poi_i).aa.count()
        #         )
        #     except KeyError:
        #         pass  # protein not covered at all by peps
        # avg_coverage = np.mean(poi_percent_coverage)
        # return avg_coverage

        # NEW - returns the total percentage coverage pooled across multiple
        # proteins, for the case where the domain of interest contains more
        # than one protein.
        proseq_groups = prep_result.proseqs().groupby("pro_i")
        pep_coverage_groups = df.groupby("pro_i")
        total_aa_covered = 0
        total_proteins_len = 0
        for poi_i in poi_iz:
            try:
                total_aa_covered += pep_coverage_groups.get_group(
                    poi_i).pep_len.sum()
            except KeyError:
                pass  # protein not covered at all, no length added to total_aa_covered
            total_proteins_len += proseq_groups.get_group(poi_i).aa.count()
        return total_aa_covered / total_proteins_len
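
    # Toy illustration of the NEW coverage math (made-up numbers): two
    # proteins of length 100 and 50, with unique-flu peptides covering 40 aa
    # of the first and none of the second:
    #   total_aa_covered   = 40 + 0   = 40
    #   total_proteins_len = 100 + 50 = 150
    #   coverage           = 40 / 150 ≈ 0.267
    # i.e. coverage is pooled over the whole domain rather than averaged
    # per protein as in the OLD version above.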

    def max_nn_dist(self, unique_flus_only=True, filters=None, df=None):
        """
        Returns the maximum nearest-neighbor distance over all perfect dyetracks
        from the set of peptides.  We will probably want more nuanced information
        here, or via some other fn -- something that gets at more than just the
        max, perhaps including information for the top N, and some measure of
        'separated-ness' across that set.  A single non-normalized max value feels
        kind of fragile.
        """
        df = self._apply_filters(filters) if df is None else df
        if unique_flus_only:
            df = df[df.flu_count == 1]
        return df.nn_dist.max()

    def max_nn_dist_peps(self,
                         prep=None,
                         unique_flus_only=True,
                         filters=None,
                         df=None):
        """
        Like max_nn_dist(), but instead of returning just the max dist, returns information
        about the peptide(s) as well, in a DataFrame.  filters.n_peps_per_scheme controls how
        many rows are returned for non-ptm filtering.  For ptm-filtering, the number of peps
        is determined by how many peptides in this scheme contain ptms -- each will be
        returned.

        prep : a PrepResult.  If provided, we'll include the protein_coverage in the results.
        """
        df = self._apply_filters(filters, prep=prep) if df is None else df
        if unique_flus_only:
            df = df[df.flu_count == 1]
            self._domain_loss(df, filters, "unique_flus_only")

        if prep is not None:
            cols = list(df.columns)
            cols.insert(1, "pro_id")
            df = (df.set_index("pro_i").join(prep.pros().set_index("pro_i"),
                                             how="left").reset_index()[cols])
            df["nn_coverage"] = self.protein_coverage(prep, df=df)

        df["nn_unique"] = self.n_uniques(df=df)

        df = df.sort_values(
            by=["nn_dist", "pep_len", "n_dyes_max_any_ch"],
            ascending=[False, True, True],
        )

        if filters.objective != "ptms":
            # If we only need to know a single max dist across all peps/proteins, we're done.
            # This is the case if multi_peptide_metric is None - we're not trying to take into
            # account the performance on multiple protein distances.  Return the n best peps.
            if filters.multi_peptide_metric is None:
                return df[:filters.n_peps_per_scheme].reset_index(drop=True)

            # Otherwise we need a composite metric that considers the nn_dist
            # of peptides from multiple proteins.  To start, we rank
            # the peptides from each protein based on the sort order already
            # established above, and take the top n based on filters.n_peps_per_scheme,
            # leaving us with the top n peptides, ranked, from each protein
            df["nn_rank"] = (df.groupby("pro_i").nn_dist.rank(
                "first", ascending=False).astype("int"))
            df = df[df.nn_rank <= filters.n_peps_per_scheme]

            # Then we compute a couple of composite metrics which are fns
            # of the nn_dist from the peptides of each rank.  The caller
            # can sort on these across multiple runs.
            df["nn_dist_avg"] = df.groupby("nn_rank").nn_dist.transform("mean")
            df["nn_dist_min"] = df.groupby("nn_rank").nn_dist.transform("min")

        else:
            # For PTMs, we have already filtered out the peptides that
            # don't contain PTMs, so we just need these composite metrics
            # computed for the single set of all of the peptides in the df.
            df["nn_rank"] = 1
            df["nn_dist_avg"] = df.nn_dist.mean()
            df["nn_dist_min"] = df.nn_dist.min()

        # It can be that the caller is interested in N proteins or PTMs, but some
        # of those have been lost due to filtering etc.  This will cause the
        # mean and stats above to be w.r.t. too few set members, so adjust
        # these in a way that makes sense.  In the initial application of
        # filtering, the number of proteins/ptms the caller is interested in
        # has been saved. (If a protein or PTM was "lost" due to filtering,
        # it effectively merged with the background, is not observable,
        # and its nn_dist is therefore 0 -- indistinguishable from some
        # neighbor)
        filter_pass = 1.0
        domain_loss = ""  # either lost proteins, or lost ptms
        if filters.objective == "ptms":
            filter_pass = len(df) / len(filters.requested_ptms)
            domain_loss = set(filters.requested_ptms) - set(
                list(df.ptm.astype(int)))
            if filters.verbose and domain_loss:
                print(f"  ** final domain_loss: {sorted(domain_loss)}")
        elif len(filters.requested_proteins) == 0:
            filter_pass = 0
        else:
            filter_pass = len(df.pro_i.unique()) / len(
                filters.requested_proteins)
            domain_loss = set(filters.requested_proteins) - set(list(df.pro_i))

        domain_loss = str(sorted(domain_loss)) if domain_loss else ""
        df["domain_loss"] = domain_loss
        assert filter_pass <= 1.0
        if filter_pass != 1.0:
            df.nn_dist_avg *= 1.0 - filter_pass
            df.nn_dist_min = 0

        if filters.verbose:
            print(f"  filter_pass is {filter_pass}")

        return df.sort_values(by=["nn_rank", "pro_i"],
                              ascending=[True, True]).reset_index(drop=True)
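
    # Minimal standalone sketch of the nn_rank bookkeeping above, on a toy
    # frame (plain pandas; column values are made up):
    #
    #   import pandas as pd
    #   toy = pd.DataFrame(dict(
    #       pro_i=[1, 1, 1, 2, 2],
    #       nn_dist=[9.0, 7.0, 5.0, 8.0, 6.0],
    #   ))
    #   toy["nn_rank"] = (toy.groupby("pro_i").nn_dist
    #                     .rank("first", ascending=False).astype(int))
    #   toy = toy[toy.nn_rank <= 2]  # top 2 peps per protein
    #   toy["nn_dist_avg"] = toy.groupby("nn_rank").nn_dist.transform("mean")
    #   # rank-1 rows get mean(9.0, 8.0) = 8.5; rank-2 rows get mean(7.0, 6.0) = 6.5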

    def nn_stats(self, prep_result, filters=None):
        """
        Returns a tuple that gives the main stats for this survey run
        that can be used to pick from a list of such survey runs:
        nn_uniques - the number of unique peptides
        nn_coverage - percent coverage of protein(s) by unique peptides
        nn_dist - distance to neighbor for most isolated dyetrack
        """
        df = self._apply_filters(filters, prep=prep_result)
        return (
            self.n_uniques(df=df),
            self.protein_coverage(prep_result, df=df),
            self.max_nn_dist(df=df),
        )
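
    # Hedged usage sketch -- the tuple unpacks in the order documented above:
    #   n_unique, coverage, nn_dist = survey_result.nn_stats(prep_result, filters)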
Example 18
class SimV2Params(ParamsAndPriors):
    # The following constants are repeated in sim_v2.h because it
    # is hard to get constants like this to be shared between
    # the two languages. This shouldn't be a problem as they are stable.
    # TODO: Move these to an import from the pyx
    CycleKindType = np.uint8
    CYCLE_TYPE_PRE = 0
    CYCLE_TYPE_MOCK = 1
    CYCLE_TYPE_EDMAN = 2

    channel__priors__columns = (
        "ch_i",
        "channel_name",
        "bg_mu",
        "bg_sigma",
        "dye_name",
        "gain_mu",
        "gain_sigma",
        "index",
        "p_bleach",
        "row_k_sigma",
    )

    dye__label__priors__columns = (
        "channel_name",
        "dye_name",
        "aa",
        "label_name",
        "ptm_only",
        "p_non_fluorescent",
        "ch_i",
        "bg_mu",
        "bg_sigma",
        "gain_mu",
        "gain_sigma",
        "row_k_sigma",
        "p_bleach",
    )

    defaults = Munch(
        n_pres=1,
        n_mocks=0,
        n_edmans=1,
        n_samples_train=5_000,
        n_samples_test=1_000,
        dyes=[],
        labels=[],
        random_seed=None,
        allow_train_test_to_be_identical=False,
        allow_edman_cterm=False,
        enable_ptm_labels=False,
        is_survey=False,
        train_includes_radmat=False,
        test_includes_dyemat=False,
        dump_debug=False,
        generate_flus=True,
        use_lognormal_model=False,
    )

    schema = s(
        s.is_kws_r(
            priors_desc=Priors.priors_desc_schema,
            is_survey=s.is_bool(),
            n_pres=s.is_int(bounds=(0, None)),
            n_mocks=s.is_int(bounds=(0, None)),
            n_edmans=s.is_int(bounds=(0, None)),
            n_samples_train=s.is_int(bounds=(1, None)),
            n_samples_test=s.is_int(bounds=(1, None)),
            dyes=s.is_list(elems=s.is_kws_r(
                dye_name=s.is_str(),
                channel_name=s.is_str(),
            )),
            labels=s.is_list(elems=s.is_kws_r(
                aa=s.is_str(),
                dye_name=s.is_str(),
                label_name=s.is_str(),
                ptm_only=s.is_bool(required=False, noneable=True),
            )),
            channels=s.is_dict(required=False),
            random_seed=s.is_int(required=False, noneable=True),
            allow_train_test_to_be_identical=s.is_bool(required=False,
                                                       noneable=True),
            allow_edman_cterm=s.is_bool(required=False, noneable=True),
            enable_ptm_labels=s.is_bool(required=False, noneable=True),
            train_includes_radmat=s.is_bool(required=False, noneable=True),
            test_includes_dyemat=s.is_bool(required=False, noneable=True),
            dump_debug=s.is_bool(),
            generate_flus=s.is_bool(),
            use_lognormal_model=s.is_bool(),
        ))

    # def copy(self):
    #     dst = utils.munch_deep_copy(self, klass_set={SimV2Params})
    #     assert isinstance(dst, SimV2Params)
    #     return dst

    def __init__(self, **kwargs):
        # _skip_setup_dfs is True in fixture mode
        super().__init__(source="SimV2Params", **kwargs)
        self._setup_dfs()

    def validate(self):
        super().validate()

        all_dye_names = list(set([d.dye_name for d in self.dyes]))

        # No duplicate dye names
        self._validate(
            len(all_dye_names) == len(self.dyes),
            "The dye list contains a duplicate")

        # No duplicate labels
        self._validate(
            len(list(set(utils.listi(self.labels, "aa")))) == len(self.labels),
            "There is a duplicate label in the label_set",
        )

        # All labels have a legit dye name
        for label in self.labels:
            self._validate(
                label.dye_name in all_dye_names,
                f"Label {label.label_name} does not have a valid matching dye_name",
            )

        # Channel mappings
        mentioned_channels = {dye.channel_name: False for dye in self.dyes}
        if "channels" in self:
            # Validate that channel mapping is complete
            for channel_name, ch_i in self.channels.items():
                self._validate(
                    channel_name in mentioned_channels,
                    f"Channel name '{channel_name}' was not found in dyes",
                )
                mentioned_channels[channel_name] = True

            self._validate(
                all(mentioned_channels.values()),
                "Not all channels in dyes were enumerated in channels",
            )
        else:
            # No channel mapping: assign them
            self["channels"] = {
                ch_name: i
                for i, ch_name in enumerate(sorted(mentioned_channels.keys()))
            }
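            # Illustrative fallback assignment: dyes on channels "ch_red" and
            # "ch_blue" with no explicit mapping yield
            #   {"ch_blue": 0, "ch_red": 1}
            # i.e. indices follow the sorted channel names.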

    @property
    def n_cycles(self):
        return self.n_pres + self.n_mocks + self.n_edmans

    def channel_names(self):
        return [
            ch_name for ch_name, _ in sorted(self.channels.items(),
                                             key=lambda item: item[1])
        ]

    def ch_i_by_name(self):
        return self.channels

    @property
    def n_channels(self):
        # if self.is_photobleaching_run:
        #     return 1
        return len(self.channels)

    @property
    def n_channels_and_cycles(self):
        return self.n_channels, self.n_cycles

    def _setup_dfs(self):
        """
        Assemble all of the priors into several dataframes indexed differently.
        (Call after validate)

        * self.channel__priors:
            ch_i,
            ch_name,
            bg_mu,
            bg_sigma,
            gain_mu,
            gain_sigma,
            row_k_sigma,
            p_bleach
            --> Note, does NOT have p_non_fluorescent because this is a dye property

        * self.dye__label__priors:
            aa,
            label_name,
            dye_name,
            ch_i,
            ch_name,
            bg_mu,
            bg_sigma,
            gain_mu,
            gain_sigma,
            row_k_sigma,
            p_bleach
            p_non_fluorescent,
        """

        # if self.is_photobleaching_run:
        #     # Not sure what these should be yet
        #     # self._ch_by_aa = {}
        #     # self._channel__priors = pd.DataFrame(columns=self.channel__priors__columns)
        #     # self._dye__label__priors = pd.DataFrame(columns=self.dye__label__priors__columns)
        #     self.dyes = [Munch(dye_name="zero", channel_name="zero")]
        #     self.channels = Munch(zero=0)
        #     self.labels = [
        #         dict(aa=".", dye_name="zero", label_name="zero", ptm_only=False)
        #     ]

        labels_df = pd.DataFrame(self.labels)
        # labels_df: (aa, dye_name, label_name, ptm_only)
        # assert len(labels_df) > 0

        dyes_df = pd.DataFrame(self.dyes)
        # dyes_df: (dye_name, channel_name)
        # assert len(dyes_df) > 0

        # LOOKUP dye priors
        dye_priors = []
        for dye in self.dyes:
            # SEARCH priors by dye name and if not found by channel
            p_non_fluorescent = self.priors.get_exact(
                f"p_non_fluorescent.{dye.dye_name}")
            if p_non_fluorescent is None:
                p_non_fluorescent = self.priors.get(
                    f"p_non_fluorescent.ch_{dye.channel_name}")

            dye_priors += [
                Munch(
                    dye_name=dye.dye_name,
                    p_non_fluorescent=p_non_fluorescent.prior,
                )
            ]

        dye_priors_df = pd.DataFrame(dye_priors)
        # dye_priors_df: (dye_name, p_non_fluorescent)
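        # Key-convention sketch (names illustrative, not from a real run): a
        # dye "dye_A" on channel "red" is looked up first by the exact key
        # "p_non_fluorescent.dye_A", then by the channel fallback key
        # "p_non_fluorescent.ch_red".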

        dyes_df = utils.easy_join(dyes_df, dye_priors_df, "dye_name")
        # dyes_df: (dye_name, channel_name, p_non_fluorescent)

        # TODO: LOOKUP label priors
        #       (p_failure_to_bind_aa, p_failure_to_attach_to_dye)

        # LOOKUP channel priors
        ch_priors = pd.DataFrame([
            dict(
                channel_name=channel_name,
                ch_i=ch_i,
                bg_mu=self.priors.get(f"bg_mu.ch_{ch_i}").prior,
                bg_sigma=self.priors.get(f"bg_sigma.ch_{ch_i}").prior,
                gain_mu=self.priors.get(f"gain_mu.ch_{ch_i}").prior,
                gain_sigma=self.priors.get(f"gain_sigma.ch_{ch_i}").prior,
                row_k_sigma=self.priors.get(f"row_k_sigma.ch_{ch_i}").prior,
                p_bleach=self.priors.get(f"p_bleach.ch_{ch_i}").prior,
            ) for channel_name, ch_i in self.channels.items()
        ])
        # ch_priors: (channel_name, ch_i, ...)

        self._channel__priors = (utils.easy_join(
            dyes_df, ch_priors, "channel_name").drop(
                columns=["p_non_fluorescent"]).drop_duplicates().reset_index())
        # self._channel__priors: (
        #    'ch_i', 'channel_name', 'bg_mu', 'bg_sigma', 'dye_name',
        #    'gain_mu', 'gain_sigma', 'index', 'p_bleach', 'row_k_sigma',
        # )

        # SANITY check channel__priors
        group_by_ch = self._channel__priors.groupby("ch_i")
        for field in (
                "bg_mu",
                "bg_sigma",
                "gain_mu",
                "gain_sigma",
                "row_k_sigma",
        ):
            assert np.all(group_by_ch[field].nunique() == 1)
        assert "p_non_fluorescent" not in self._channel__priors.columns

        labels_dyes_df = utils.easy_join(labels_df, dyes_df, "dye_name")
        self._dye__label__priors = utils.easy_join(
            labels_dyes_df, ch_priors, "channel_name").reset_index(drop=True)

        # self._dye__label__priors: (
        #     'channel_name', 'dye_name', 'aa', 'label_name',
        #     'ptm_only', 'p_non_fluorescent', 'ch_i', 'bg_mu', 'bg_sigma',
        #     'gain_mu', 'gain_sigma', 'row_k_sigma', 'p_bleach'
        # )

        self._ch_by_aa = {
            row.aa: row.ch_i
            for row in self._dye__label__priors.itertuples()
        }

    def ch_by_aa(self):
        return self._ch_by_aa

    def dye__label__priors(self):
        """
        DataFrame(
            'channel_name', 'dye_name', 'aa', 'label_name',
            'ptm_only', 'p_non_fluorescent', 'ch_i', 'bg_mu', 'bg_sigma',
            'gain_mu', 'gain_sigma', 'row_k_sigma', 'p_bleach'
        )
        """
        return self._dye__label__priors

    def channel__priors(self):
        """
        DataFrame(
            'ch_i', 'channel_name', 'bg_mu', 'bg_sigma', 'dye_name',
            'gain_mu', 'gain_sigma', 'index', 'p_bleach', 'row_k_sigma',
        )
        """
        return self._channel__priors

    def by_channel(self):
        return self._channel__priors.set_index("ch_i")

    def to_label_list(self):
        """Summarize labels like: ["DE", "C"]"""
        return [
            "".join([
                label.aa for label in self.labels
                if label.dye_name == dye.dye_name
            ]) for dye in self.dyes
        ]

    def to_label_str(self):
        """Summarize labels like: DE,C"""
        return ",".join(self.to_label_list())

    def cycles_array(self):
        cycles = np.zeros((self.n_cycles, ), dtype=self.CycleKindType)
        i = 0
        for _ in range(self.n_pres):
            cycles[i] = self.CYCLE_TYPE_PRE
            i += 1
        for _ in range(self.n_mocks):
            cycles[i] = self.CYCLE_TYPE_MOCK
            i += 1
        for _ in range(self.n_edmans):
            cycles[i] = self.CYCLE_TYPE_EDMAN
            i += 1
        return cycles
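
    # Illustrative result, assuming n_pres=1, n_mocks=1, n_edmans=2:
    #   cycles_array() -> array([0, 1, 2, 2], dtype=uint8)
    # i.e. one CYCLE_TYPE_PRE, one CYCLE_TYPE_MOCK, then two CYCLE_TYPE_EDMAN.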

    def pcbs(self, pep_seq_df):
        """
        pcb stands for (p)ep_i, (c)hannel_i, (b)right_probability

        This is a structure that is like a "flu" but with an extra bright probability.

        Each peptide has a row for each amino acid
            That row has columns (pep_i, ch_i, p_bright)
            And it will have np.nan for ch_i and p_bright **IF THERE IS NO LABEL**

        bright_probability is the complement of the probability that the dye
        fails to be visible in any of the modeled ways, i.e. the probability
        that a dye is active.

        pep_seq_df: Any DataFrame with an "aa" column

        Returns:
            contiguous ndarray(:, 3) where the 3 columns are:
                pep_i, ch_i, p_bright
        """
        labelled_pep_df = pep_seq_df.join(
            self.dye__label__priors().set_index("aa"), on="aa", how="left")

        # p_bright is the product of (1.0 - p_fail) over all the ways the dye can fail to be visible.
        labelled_pep_df["p_bright"] = (
            # TODO: Sim needs to be converted to use priors sampling
            #       at which point this function needs to be refactored
            #       so that the parameters of the priors can be sampled in C.
            1.0 - np.array([
                i.sample() if isinstance(i, Prior) else np.nan
                for i in labelled_pep_df.p_non_fluorescent
            ])
            # TODO: Add label priors
            # * (1.0 - labelled_pep_df.p_failure_to_attach_to_dye)
            # * (1.0 - labelled_pep_df.p_failure_to_bind_aa)
        )

        labelled_pep_df.sort_values(by=["pep_i", "pep_offset_in_pro"],
                                    inplace=True)
        return np.ascontiguousarray(
            labelled_pep_df[["pep_i", "ch_i", "p_bright"]].values)
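
    # Toy p_bright arithmetic (made-up numbers): if a dye's only modeled
    # failure mode is p_non_fluorescent = 0.1, then p_bright = 1.0 - 0.1 = 0.9.
    # If the commented-out label priors were enabled (say each 0.05):
    #   p_bright = (1 - 0.1) * (1 - 0.05) * (1 - 0.05) ≈ 0.812
    # Rows with no label keep np.nan for both ch_i and p_bright.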

    @classmethod
    def from_aa_list_fixture(cls, aa_list, priors=None, **kwargs):
        """
        This is a helper to generate channels when you have a list of aas.
        For example, ["DE", "Y"] gives two channels where ch0 labels D & E
        and ch1 labels Y.

        check.list_or_tuple_t(aa_list, str)

        allowed_aa_mods = ["[", "]"]
        assert all(
            aa.isalpha() or aa in allowed_aa_mods
            for aas in aa_list
            for aa in aas
        )

        dyes = [
            Munch(dye_name=f"dye_{ch}", channel_name=f"ch_{ch}")
            for ch, _ in enumerate(aa_list)
        ]

        # Note the extra for loop because "DE" needs to be split into "D" & "E"
        # which is done by aa_str_to_list() - which also handles PTMs like S[p]
        labels = [
            Munch(
                aa=aa,
                dye_name=f"dye_{ch}",
                label_name=f"label_{ch}",
                ptm_only=False,
            ) for ch, aas in enumerate(aa_list) for aa in aa_str_to_list(aas)
        ]

        return cls(dyes=dyes, labels=labels, priors=priors, **kwargs)
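
    # Hedged usage sketch (priors handling elided; fixtures normally supply it):
    #   params = SimV2Params.from_aa_list_fixture(
    #       ["DE", "Y"],  # ch_0 labels D and E, ch_1 labels Y
    #       n_pres=1, n_mocks=0, n_edmans=10,
    #   )
    #   params.to_label_str()  # -> "DE,Y"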