Ejemplo n.º 1
0
    def construct_from_aa_list(cls, aa_list, **kwargs):
        """
        Build an instance from a list of amino-acid strings, one per channel.

        Each entry of aa_list names the amino acids labeled in one channel,
        in channel order.  For example ["DE", "Y"] describes two channels
        where ch0 labels D and E and ch1 labels Y.

        Any extra keyword arguments are forwarded unchanged to the class
        constructor.  If an error model is passed that way, it needs to
        match the channels and labels generated here.
        """

        check.list_or_tuple_t(aa_list, str)

        # Besides letters, only the square brackets used to write
        # modifications (as in "S[p]") may appear in a channel string.
        bracket_chars = ["[", "]"]
        assert all(
            char.isalpha() or char in bracket_chars
            for channel_aas in aa_list
            for char in channel_aas
        )

        dyes = []
        labels = []
        for ch_i, channel_aas in enumerate(aa_list):
            # One dye per channel.
            dyes.append(
                Munch(dye_name=f"dye_{ch_i}", channel_name=f"ch_{ch_i}")
            )

            # A channel string such as "DE" is split by aa_str_to_list()
            # into its individual amino acids (it also handles PTMs like
            # S[p]); every one of them gets a label tied to this
            # channel's dye.
            for amino_acid in aa_str_to_list(channel_aas):
                labels.append(
                    Munch(
                        amino_acid=amino_acid,
                        dye_name=f"dye_{ch_i}",
                        label_name=f"label_{ch_i}",
                        ptm_only=False,
                    )
                )

        return cls(dyes=dyes, labels=labels, **kwargs)
Ejemplo n.º 2
0
def survey_nn(survey_nn_params, prep_result, sim_result, progress=None, pipeline=None):
    """
    Compute a distance between peptides that exist in prep_result
    using the dye-tracks employed by nearest-neighbor.  Create a DF that
    collects these distances with other information useful in surveying
    a number of protease-label schemes to determine which ones are well
    suited to some informatics objective, such as identifying a protein(s).

    Arguments:
        survey_nn_params: stored as `params` on the returned SurveyNNResult.
        prep_result: supplies pepstrs() and is handed to
            sim_result.peps__flus().
        sim_result: handed to euc_dist() and supplies peps__flus().
        progress, pipeline: accepted but not used by this function.

    Returns:
        A SurveyNNResult whose `_survey` DataFrame is restricted to
        SurveyNNResult.survey_columns.

    Notes:
        - We are not including decoys.  If you want to include decoys (assuming they
          were used in the simulation) use the test dyemat rather than train.

    """

    def _has_proline_at_p2(seqstr):
        # Decide on the number of parsed residues, not on the number of
        # characters: a modified residue (e.g. "S[p]") is several
        # characters long but is presumably a single entry in the list
        # returned by aa_str_to_list(), so a character-count guard would
        # let the [1] lookup run past the end of a one-residue peptide.
        if not seqstr:
            return False
        aas = aa_str_to_list(seqstr)
        return len(aas) > 1 and aas[1] == "P"

    # get simple euclidean nearest-neighbor info & store in Dataframe
    pep_iz, nn_pep_iz, nn_dist = euc_dist(sim_result)
    df = pd.DataFrame()
    df["pep_i"] = pep_iz
    df["nn_pep_i"] = nn_pep_iz
    df["nn_dist"] = nn_dist

    # Join this to some flu information so we have it all in one place, especially
    # info about degeneracy (when more than one pep has the same dyetrack)
    # This isn't very DRY, since this data already lives in the prep and sim results.
    # But it makes downstream report-code simpler and faster to filter and search
    # these results if everything you need is already joined in one DF.
    # My approach is to put everything into the SurveyResult that you want
    # to be able to filter on to minimize computation in the report.
    # This is possible for nearly everything, except things you want to
    # be able to change at report time, like what PTMs you're interested in
    # if this survey involves PTMs.
    #
    peps__flus = sim_result.peps__flus(prep_result)

    # Column arithmetic instead of a row-wise apply(): apply(axis=1) on an
    # empty frame hands back a DataFrame, which cannot be assigned to a
    # single new column.
    # NOTE(review): the trailing "- 1" is carried over unchanged; confirm
    # whether pep_stop is inclusive or exclusive before relying on pep_len.
    peps__flus["pep_len"] = peps__flus["pep_stop"] - peps__flus["pep_start"] - 1

    # include the peptide sequence, and whether it has Proline at position 2
    # (built as a plain list so that an empty frame is handled as well)
    pepstrs = prep_result.pepstrs()
    pepstrs["P2"] = [_has_proline_at_p2(seqstr) for seqstr in pepstrs["seqstr"]]

    df = (
        df.set_index("pep_i")
        .join(peps__flus.set_index("pep_i"), how="left")
        .join(pepstrs.set_index("pep_i"), how="left")
        .reset_index()
    )[SurveyNNResult.survey_columns]

    return SurveyNNResult(params=survey_nn_params, _survey=df)
Ejemplo n.º 3
0
 def ptms_in_peptide(row, only_active_ptms=ptms_column_active_only):
     """
     Return the global PTM locations of `row.ptms` that lie inside the
     row's peptide, as a ";"-joined string ("" when there are none).

     When only_active_ptms is truthy, a location is kept only if the
     corresponding residue of row.seqstr contains a "[".
     """
     # PTM locations are 1-based while pep_start/pep_stop are 0-based, so
     # the peptide covers locations pep_start + 1 .. pep_stop.
     local_offsets = []
     for token in row.ptms.split(";"):
         if not token:
             # Skip empty fragments, e.g. from an empty ptms string.
             continue
         global_loc = int(token)
         first_loc = row.pep_start + 1
         if global_loc in range(first_loc, row.pep_stop + 1):
             # Remember the 0-based offset of the PTM within the peptide.
             local_offsets.append(global_loc - first_loc)

     if len(local_offsets) == 0:
         return ""

     residues = aa_str_to_list(row.seqstr)
     kept_locs = []
     for offset in local_offsets:
         if only_active_ptms and "[" not in residues[offset]:
             continue
         # Convert back to the global, 1-based location for reporting.
         kept_locs.append(str(offset + row.pep_start + 1))
     return ";".join(kept_locs)
Ejemplo n.º 4
0
    def peps__pepstrs__flustrs__p2(
        self,
        include_decoys=False,
        in_report_only=False,
        ptm_peps_only=False,
        ptms_to_rows=True,
    ):
        """
        This collects a variety of information for reporting and is fairly configurable, thus
        the options.  How else to support these options?  The pattern of a function per join-type
        would create lots of functions in this case... Maybe only a few are needed though.

        Arguments:
            include_decoys, ptm_peps_only, ptms_to_rows: forwarded under the
                same names to self._prep_result.peps__ptms().
            in_report_only: forwarded to peps__ptms() as `poi_only`.

        Returns:
            A DataFrame of the peptides left-joined (on pep_i) with their
            pepstrs and with the flustr, flu_count and n_dyes_max_any_ch
            columns of the sim result's flus.  An "abundance" column is
            joined in from pros (on pro_i) when pros has one and the frame
            does not, and a boolean "P2" column flags peptides whose second
            residue is "P".
        """

        def _has_proline_at_p2(seqstr):
            # Decide on the number of parsed residues, not on the number of
            # characters: a modified residue (e.g. "S[p]") is several
            # characters long but is presumably a single entry in the list
            # returned by aa_str_to_list(), so a character-count guard would
            # let the [1] lookup run past the end of a one-residue peptide.
            if not seqstr:
                return False
            aas = aa_str_to_list(seqstr)
            return len(aas) > 1 and aas[1] == "P"

        peps = self._prep_result.peps__ptms(
            include_decoys=include_decoys,
            poi_only=in_report_only,
            ptm_peps_only=ptm_peps_only,
            ptms_to_rows=ptms_to_rows,
        )
        pepstrs = self._prep_result.pepstrs()
        flus = self._sim_result.flus()
        flustrs = flus[["pep_i", "flustr", "flu_count", "n_dyes_max_any_ch"]]

        df = (
            peps.set_index("pep_i")
            .join(pepstrs.set_index("pep_i"), how="left")
            .join(flustrs.set_index("pep_i"), how="left")
            .reset_index()
        )

        pros = self._prep_result.pros()
        if "abundance" in pros.columns and "abundance" not in df.columns:
            df = (
                df.set_index("pro_i")
                .join(pros.set_index("pro_i")[["abundance"]])
                .reset_index()
            )

        # Build the column from a plain list rather than a row-wise apply():
        # apply(axis=1) on an empty frame hands back a DataFrame instead of a
        # Series, whereas a list works for empty and non-empty frames alike,
        # so no placeholder column is needed.
        df["P2"] = [_has_proline_at_p2(seqstr) for seqstr in df["seqstr"]]
        return df
Ejemplo n.º 5
0
    def peps__pepstrs__flustrs__p2(
        self,
        include_decoys=False,
        in_report_only=False,
        ptm_peps_only=False,
        ptms_to_rows=True,
    ):
        """
        This collects a variety of information for reporting and is fairly configurable, thus
        the options.  How else to support these options?  The pattern of a function per join-type
        would create lots of functions in this case... Maybe only a few are needed though.

        Arguments:
            include_decoys, in_report_only, ptm_peps_only, ptms_to_rows:
                forwarded under the same names to
                self._prep_result.peps__ptms().

        Returns:
            A DataFrame of the peptides left-joined (on pep_i) with their
            pepstrs and with the flustr, flu_count and n_dyes_max_any_ch
            columns of the sim result's flus, plus a boolean "P2" column
            that flags peptides whose second residue is "P".
        """

        def _has_proline_at_p2(seqstr):
            # Decide on the number of parsed residues, not on the number of
            # characters: a modified residue (e.g. "S[p]") is several
            # characters long but is presumably a single entry in the list
            # returned by aa_str_to_list(), so a character-count guard would
            # let the [1] lookup run past the end of a one-residue peptide.
            if not seqstr:
                return False
            aas = aa_str_to_list(seqstr)
            return len(aas) > 1 and aas[1] == "P"

        peps = self._prep_result.peps__ptms(
            include_decoys=include_decoys,
            in_report_only=in_report_only,
            ptm_peps_only=ptm_peps_only,
            ptms_to_rows=ptms_to_rows,
        )
        pepstrs = self._prep_result.pepstrs()
        flus = self._sim_result.flus()
        if "n_dyes_max_any_ch" not in flus.columns:
            # I've added this to sim but compute on demand if not in this run.
            # Remove after it's no longer needed for older runs. tfb 8 apr 2020
            self._sim_result._generate_flu_info(self._prep_result)
            flus = self._sim_result.flus()

        flustrs = flus[["pep_i", "flustr", "flu_count", "n_dyes_max_any_ch"]]

        df = (
            peps.set_index("pep_i")
            .join(pepstrs.set_index("pep_i"), how="left")
            .join(flustrs.set_index("pep_i"), how="left")
            .reset_index()
        )

        # Build the column from a plain list rather than a row-wise apply():
        # apply(axis=1) on an empty frame hands back a DataFrame instead of a
        # Series, which cannot be assigned to a single new column.  A list
        # works for empty and non-empty frames alike.
        df["P2"] = [_has_proline_at_p2(seqstr) for seqstr in df["seqstr"]]
        return df
Ejemplo n.º 6
0
    def false_rates_all_peps__ptm_info(
        self,
        at_prec,
        n_false=4,
        protein_of_interest_only=True,
        ptms_column_active_only=False,
    ):
        """
        Adds some additional info requested by Angela.  I'm placing this in a separate fn
        because this is really ad-hoc info for the way we're doing PTMs at the moment and
        doesn't really belong in a more generic "false_rates..." call.

        Arguments:
            at_prec, n_false, protein_of_interest_only: passed positionally,
                in that order, to self.false_rates_all_peps__flus().
            ptms_column_active_only: when truthy, the "ptms" column lists
                only those locations whose residue in seqstr contains a "[".

        Returns:
            The DataFrame from false_rates_all_peps__flus() with three
            columns added:
              ptms   - ";"-joined global PTM locations of the protein that
                       fall inside the peptide ("" when there are none)
              P2     - True when the peptide's second residue is "P"
              seqlen - number of residues in seqstr per aa_str_to_list()
        """

        df = self.false_rates_all_peps__flus(
            at_prec, n_false, protein_of_interest_only
        )

        #
        # Add global PTM locations that occur for each peptide
        #
        pros = self._prep_result.pros()
        df = pd.merge(
            df,
            pros[["pro_i", "pro_ptm_locs"]].rename(
                columns=dict(pro_ptm_locs="ptms")
            ),
            on="pro_i",
            how="left",
        )

        # Parse every sequence once and reuse the result for all three
        # derived columns.  The columns are built from plain lists rather
        # than row-wise apply() calls, because apply(axis=1) on an empty
        # frame hands back a DataFrame that cannot be assigned to a column.
        aas_by_row = [aa_str_to_list(seqstr) for seqstr in df["seqstr"]]

        def ptms_in_peptide(
            ptms, pep_start, pep_stop, aas, only_active_ptms=ptms_column_active_only
        ):
            # set ptms to global ptms that fall into this peptide and are active.
            # ptms are 1-based but start/stop are 0-based.
            first_loc = pep_start + 1
            local_ptm_indices = [
                int(loc) - first_loc
                for loc in ptms.split(";")
                if loc and int(loc) in range(first_loc, pep_stop + 1)
            ]
            # Joining an empty selection yields "", the value reported for
            # peptides without any PTM location.
            return ";".join(
                str(i + first_loc)
                for i in local_ptm_indices
                if not only_active_ptms or "[" in aas[i]
            )

        df["ptms"] = [
            ptms_in_peptide(ptms, pep_start, pep_stop, aas)
            for ptms, pep_start, pep_stop, aas in zip(
                df["ptms"], df["pep_start"], df["pep_stop"], aas_by_row
            )
        ]

        #
        # Add column for "Proline in 2nd position"
        #
        # Decided on the number of parsed residues, not on the number of
        # characters in seqstr: a modified residue (e.g. "S[p]") is several
        # characters long but a single entry in the parsed list, so a
        # character-count guard would let the [1] lookup run past the end.
        df["P2"] = [len(aas) > 1 and aas[1] == "P" for aas in aas_by_row]

        #
        # Add seqlen column
        #
        df["seqlen"] = [len(aas) for aas in aas_by_row]

        return df