Exemple #1
0
class RetentionTime:
    """
    Predict peptide retention times and add them to a peprec DataFrame.

    Only the DeepLC predictor is implemented. DeepLC is imported, configured and
    calibrated lazily, the first time predictions are requested.
    """

    def __init__(self, predictor="deeplc", config=None):
        """
        Initialize peptide retention time predictor

        Parameters
        ----------
        predictor: str, optional
            Retention time predictor to employ. Currently only 'deeplc' is supported.
        config: dict, optional
            Dictionary with configuration. DeepLC settings are read from the
            optional 'deeplc' top-level key: its optional 'calibration_file' entry
            is the path to a comma-separated file with calibration peptides, all
            other entries are passed to the DeepLC constructor as keyword
            arguments. The dictionary is never modified.
        """
        self.predictor = predictor
        self.deeplc_predictor = None
        self.config = config if config else {}

    def _get_irt_peptides(self):
        """
        Return DeepLC DataFrame with iRT peptides

        Returns
        -------
        pandas.DataFrame
            Columns 'seq' (peptide sequence), 'tr' (reference retention time value)
            and 'modifications' (empty string for every peptide).
        """
        irt_peptides = {
            "LGGNEQVTR": -24.92,
            "GAGSSEPVTGLDAK": 0.00,
            "VEATFGVDESNAK": 12.39,
            "YILAGVENSK": 19.79,
            "TPVISGGPYEYR": 28.71,
            "TPVITGAPYEYR": 33.38,
            "DGLDAASYYAPVR": 42.26,
            "ADVTPADFSEWSK": 54.62,
            "GTFIIDPGGVIR": 70.52,
            "GTFIIDPAAVIR": 87.23,
            "LFLQFGAQGSPFLK": 100.00,
        }

        irt_df = pd.DataFrame(list(irt_peptides.items()), columns=["seq", "tr"])
        irt_df["modifications"] = ""

        return irt_df

    def _get_deeplc_params(self):
        """
        Resolve DeepLC constructor arguments and calibration file from the config

        Works on a copy of the 'deeplc' config section, so the configuration
        passed by the caller is left untouched.

        Returns
        -------
        tuple of (dict, str or None)
            Keyword arguments for the DeepLC constructor, and the path to the
            calibration file (None if no calibration file was configured).
        """
        if "deeplc" in self.config:
            deeplc_params = dict(self.config["deeplc"])
            calibration_file = deeplc_params.pop("calibration_file", None)
        else:
            deeplc_params = {"verbose": False}
            calibration_file = None

        # TODO: Remove when fixed upstream in DeepLC
        # Applied whenever the built-in iRT peptides are used for calibration.
        if not calibration_file:
            deeplc_params["split_cal"] = 9

        return deeplc_params, calibration_file

    def _init_deeplc(self):
        """
        Initialize DeepLC: import, configurate and calibrate
        """
        # Only import if DeepLC will be used, otherwise lot's of extra heavy
        # dependencies (e.g. Tensorflow) are imported as well
        from deeplc import DeepLC

        deeplc_params, calibration_file = self._get_deeplc_params()
        predictor = DeepLC(**deeplc_params)

        if calibration_file:
            cal_df = pd.read_csv(calibration_file, sep=",")
        else:
            cal_df = self._get_irt_peptides()

        predictor.calibrate_preds(seq_df=cal_df)

        # Only publish the predictor once calibration succeeded, so that a failed
        # calibration is retried on the next call instead of silently reusing an
        # uncalibrated predictor.
        self.deeplc_predictor = predictor

    def _prepare_deeplc_peptide_df(self):
        """
        Prepare DeepLC peptide DataFrame

        Selects the 'peptide' and 'modifications' columns of the peprec and
        renames 'peptide' to 'seq'. The peprec itself is not modified.
        """
        column_map = {"peptide": "seq", "modifications": "modifications"}
        self.deeplc_pep_df = self.peprec[list(column_map)].rename(
            columns=column_map)

    def _run_deeplc(self):
        """
        Run DeepLC on the prepared peptide DataFrame (missing values become "")
        """
        self.deeplc_preds = self.deeplc_predictor.make_preds(
            seq_df=self.deeplc_pep_df.fillna(""))

    def _parse_deeplc_preds(self):
        """
        Add DeepLC predictions to peprec DataFrame as column 'rt'
        """
        self.peprec["rt"] = self.deeplc_preds

    def _predict_deeplc(self):
        """
        Predict retention times using DeepLC

        DeepLC is initialized and calibrated on first use only; subsequent calls
        reuse the same predictor.
        """
        if self.deeplc_predictor is None:
            self._init_deeplc()
        self._prepare_deeplc_peptide_df()
        self._run_deeplc()
        self._parse_deeplc_preds()

    def add_rt_predictions(self, peprec):
        """
        Run RT predictor and add predictions to peprec DataFrame.

        The predictions are written in place to the 'rt' column of `peprec`.

        Parameters
        ----------
        peprec: pandas.DataFrame
            MS2PIP-style peprec DataFrame with peptides for which to predict retention
            times

        Raises
        ------
        NotImplementedError
            If the configured predictor is not 'deeplc'.
        """
        self.peprec = peprec

        if self.predictor == "deeplc":
            self._predict_deeplc()
        else:
            raise NotImplementedError(self.predictor)
Exemple #2
0
                            standard_feat=False,
                            chem_descr_feat=False,
                            add_comp_feat=False,
                            cnn_feats=True,
                            verbose=False)
# Example script: calibrate a DeepLC model on (artificially transformed)
# retention times and compare calibrated against uncalibrated predictions.
# NOTE(review): `df`, `f_extractor`, `DeepLC` and `plt` are defined earlier in
# the script, outside this excerpt; `df` is assumed to be a peptide DataFrame
# with an observed retention time column "tr" -- confirm against the full file.

# Initiate a DeepLC instance that will perform the calibration and predictions
dlc = DeepLC(path_model="deeplc/mods/full_hc_dia_fixed_mods.hdf5",
             cnn_model=True,
             f_extractor=f_extractor,
             verbose=False)

# To demonstrate DeepLC's calibration, we'll introduce an artificial
# (non-linear) transformation into the retention times
df["tr"] = df["tr"]**0.85

# Calibrate the original model based on the new retention times
dlc.calibrate_preds(seq_df=df)

# Make predictions; calibrated and uncalibrated
preds_cal = dlc.make_preds(seq_df=df)
preds_uncal = dlc.make_preds(seq_df=df, calibrate=False)

# Compare calibrated and uncalibrated predictions
# (uncomment to print the raw prediction values)
#print("Predictions (calibrated): ", preds_cal)
#print("Predictions (uncalibrated): ", preds_uncal)

# Scatter both prediction sets against the transformed retention times and
# save the figure to the current working directory
plt.scatter(df["tr"], preds_cal, label="Calibrated", s=1)
plt.scatter(df["tr"], preds_uncal, label="Uncalibrated", s=1)
plt.legend()
plt.savefig('deeplc_calibrated_vs_uncalibrated.png')
Exemple #3
0
def run(file_pred="",
        file_cal="",
        file_pred_out="",
        file_model=None,
        n_threads=None,
        verbose=False,
        split_cal=50,
        dict_divider=50,
        batch_num=50000,
        plot_predictions=False):
    """
    Main function to run the DeepLC code

    Reads the peptides to predict, optionally calibrates DeepLC on a calibration
    file, writes the predictions to a CSV file and optionally saves a scatter
    plot of predicted versus observed retention times.

    Parameters
    ----------
    file_pred : str
        the file in peprec format that we need to make predictions for
        this file is not required to contain a tr column
    file_cal : str
        the file in peprec format that we use for calibrating the prediction
        model. This file is required to contain a tr column. When empty (or
        None), no calibration is performed and uncalibrated predictions are
        written
    file_pred_out : str
        outfile for predictions, the file is in peprec format and predictions
        are added in the column ``predicted_tr``
    file_model : str | list | None
        the model(s) to try for retention time prediction can be a single
        location or several locations for multiple models to try
    n_threads : int
        number of threads to run mainly the feature extraction on
    verbose : bool
        passed on to the feature extractor and to DeepLC
    split_cal : int
        number of splits or divisions to use for the calibration
    dict_divider : int
        passed to DeepLC as ``dict_cal_divider``; see the DeepLC documentation
        for its exact meaning
    batch_num : int
        passed to DeepLC as ``batch_num``; see the DeepLC documentation for
        its exact meaning
    plot_predictions : bool
        Save scatter plot of predictions vs observations

    Returns
    -------
    None
    """

    logging.info("Using DeepLC version %s", __version__)

    # An empty string and None both mean "no calibration file supplied"
    use_calibration = bool(file_cal)

    # Read input files
    df_pred = pd.read_csv(file_pred).fillna("")
    df_cal = pd.read_csv(file_cal).fillna("") if use_calibration else None

    # Make a feature extraction object; you can skip this if you do not want to
    # use the default settings for DeepLC. Here we want to use a model that does
    # not use RDKit features so we skip the chemical descriptor making
    # procedure.
    f_extractor = FeatExtractor(add_sum_feat=False,
                                ptm_add_feat=False,
                                ptm_subtract_feat=False,
                                standard_feat=False,
                                chem_descr_feat=False,
                                add_comp_feat=False,
                                cnn_feats=True,
                                verbose=verbose)

    # Make the DeepLC object that will handle making predictions and
    # calibration. The calibration settings are forwarded explicitly so that
    # the `split_cal` and `dict_divider` arguments actually take effect.
    dlc = DeepLC(path_model=file_model,
                 f_extractor=f_extractor,
                 cnn_model=True,
                 split_cal=split_cal,
                 dict_cal_divider=dict_divider,
                 n_jobs=n_threads,
                 verbose=verbose,
                 batch_num=batch_num)

    # Calibrate the original model based on the new retention times
    if use_calibration:
        logging.info("Selecting best model and calibrating predictions...")
        dlc.calibrate_preds(seq_df=df_cal)

    # Make predictions; calibrated or uncalibrated
    logging.info("Making predictions using model: %s", dlc.model)
    if use_calibration:
        preds = dlc.make_preds(seq_df=df_pred)
    else:
        preds = dlc.make_preds(seq_df=df_pred, calibrate=False)

    df_pred["predicted_tr"] = preds
    logging.debug("Writing predictions to file: %s", file_pred_out)
    df_pred.to_csv(file_pred_out)

    if plot_predictions:
        # Plotting needs observed retention times to compare against, and is
        # only done for calibrated predictions
        if use_calibration and "tr" in df_pred.columns:
            file_pred_figure = os.path.splitext(file_pred_out)[0] + '.png'
            logging.debug("Saving scatterplot of predictions to file: %s",
                          file_pred_figure)
            fig = plt.figure(figsize=(11.5, 9))
            plt.scatter(df_pred["tr"], df_pred["predicted_tr"], s=3)
            plt.title("DeepLC predictions")
            plt.xlabel("Observed retention times")
            plt.ylabel("Predicted retention times")
            plt.savefig(file_pred_figure, dpi=300)
            # Release the figure; otherwise it stays registered with pyplot
            plt.close(fig)
        else:
            logging.warning("No observed retention time in input data. Cannot "
                            "plot predictions")

    logging.info("DeepLC finished!")
class RetentionTimeIntegration:
    """Retention time integration for MS²ReScore, using DeepLC."""

    def __init__(
        self,
        peprec_path: str,
        feature_path: str,
        higher_psm_score_better: bool = True,
        calibration_set_size: Optional[Union[int, float]] = 0.20,
        num_cpu: Optional[int] = None,
    ):
        """
        Retention time integration for MS²ReScore, using DeepLC.

        Parameters
        ----------
        peprec_path: str
            Path to PEPREC file with PSMs. If empty, no PEPREC is loaded.
        feature_path: str
            Path to feature file to write features to.
        higher_psm_score_better: bool
            Whether a higher PSM score (`psm_score` column in PEPREC) denotes a
            better score. (default: True)
        calibration_set_size: int or float
            Amount of best PSMs to use for DeepLC calibration. A float is
            interpreted as a fraction (0 < fraction <= 1) of the number of PSMs,
            an int as an absolute number of PSMs. If an int value is higher than
            the number of available PSMs, all PSMs will be used. (default: 0.20)
        num_cpu: {int, None}
            Number of processes to use in DeepLC

        Methods
        -------
        get_calibration_data(peprec)
            Get calibration peptides (N best PSMs in PEPREC).
        get_prediction_data(peprec)
            Get prediction peptides.
        run()
            Get retention time predictions for PEPREC and calculate features.

        """
        self.peprec_path = peprec_path
        self.feature_path = feature_path
        self.higher_psm_score_better = higher_psm_score_better
        self.calibration_set_size = calibration_set_size
        self.num_cpu = num_cpu

        # Until fixed upstream: https://github.com/compomics/DeepLC/issues/19
        # Only set the variable when an actual number is known; writing the
        # string "None" would leave an invalid value in the environment.
        if self.num_cpu is not None and "NUMEXPR_MAX_THREADS" not in os.environ:
            os.environ["NUMEXPR_MAX_THREADS"] = str(self.num_cpu)

        self.peprec = None
        self.feature_df = None

        if self.peprec_path:
            self.peprec = PeptideRecord(path=self.peprec_path)

    def num_calibration_psms(self, peprec):
        """
        Get number of calibration PSMs given `calibration_set_size` and total number of PSMs.

        Parameters
        ----------
        peprec: pandas.DataFrame
            PEPREC DataFrame; only its number of rows is used.

        Returns
        -------
        int
            Number of PSMs to use for calibration.

        Raises
        ------
        ValueError
            If `calibration_set_size` is a float outside of (0, 1].
        TypeError
            If `calibration_set_size` is neither a float nor an int.
        """
        size = self.calibration_set_size
        total = len(peprec)

        if isinstance(size, float):
            if size > 1:
                raise ValueError(
                    "`calibration_set_size` cannot be larger than 1.")
            if size <= 0:
                raise ValueError(
                    "`calibration_set_size` cannot be smaller than or equal to 0."
                )
            num_calibration_psms = round(total * size)
        elif isinstance(size, int):
            if size > total:
                logger.warning(
                    "Requested number of calibration PSMs (%s) is larger than total number "
                    "of PSMs in PEPREC (%s). Using all PSMs for calibration.",
                    size,
                    total,
                )
                num_calibration_psms = total
            else:
                num_calibration_psms = size
        else:
            raise TypeError(
                "Expected float or int for `calibration_set_size`. Got "
                f"{type(size)} instead")

        logger.debug("Using %i PSMs for calibration", num_calibration_psms)
        return num_calibration_psms

    def get_calibration_data(self, peprec):
        """
        Get calibration peptides (N best PSMs in PEPREC).

        Only PSMs with label 1 are considered; they are ranked on `psm_score`
        and the best N are returned, where N is derived from the total number of
        rows in `peprec`.

        Returns
        -------
        pandas.DataFrame
            Columns 'tr' (observed retention time), 'seq' and 'modifications'.

        Raises
        ------
        ValueError
            If `peprec` has neither a 'label' nor a 'Label' column.
        """
        label_col = next(
            (col for col in ("label", "Label") if col in peprec.columns), None)
        if label_col is None:
            raise ValueError("No label column found in peptide record.")

        best_first = peprec[peprec[label_col] == 1].sort_values(
            ["psm_score"], ascending=not self.higher_psm_score_better)
        selected = best_first.head(self.num_calibration_psms(peprec=peprec))
        renamed = selected.rename(columns={
            "observed_retention_time": "tr",
            "peptide": "seq",
        })
        return renamed[["tr", "seq", "modifications"]].copy()

    def get_prediction_data(self, peprec):
        """Get prediction peptides (columns 'seq' and 'modifications')."""
        return peprec[["peptide", "modifications"]].rename(columns={
            "peptide": "seq",
        })

    def _calculate_features(self):
        """
        Calculate retention time features for rescoring.

        Builds `self.feature_df` from `self.peprec.df`, which must contain the
        'observed_retention_time' and 'predicted_retention_time' columns.
        """
        # Absolute difference between observed and predicted
        self.feature_df = self.peprec.df.copy()
        self.feature_df["rt_diff"] = (
            self.feature_df["observed_retention_time"] -
            self.feature_df["predicted_retention_time"]).abs()

        # Minimum RT difference for a peptidoform: after sorting on rt_diff, the
        # first row of each (peptide, modifications) pair is its best PSM
        peptidoform = ["peptide", "modifications"]
        min_rt_diff = (self.feature_df[peptidoform + [
            "observed_retention_time",
            "predicted_retention_time",
            "rt_diff",
        ]].sort_values("rt_diff", ascending=True).drop_duplicates(
            subset=peptidoform, keep="first").rename(
                columns={
                    "rt_diff": "rt_diff_best",
                    "observed_retention_time": "observed_retention_time_best",
                    "predicted_retention_time": "predicted_retention_time_best",
                }))

        # Merging minimum RT difference features to full set
        self.feature_df = self.feature_df.merge(
            min_rt_diff, on=peptidoform, how="left")

        # Only keep feature columns
        id_columns = ["spec_id", "charge", "peptide", "modifications"]
        feature_columns = [
            "observed_retention_time",
            "predicted_retention_time",
            "rt_diff",
            "rt_diff_best",
            "observed_retention_time_best",
            "predicted_retention_time_best",
        ]
        self.feature_df = self.feature_df[id_columns + feature_columns].copy()

    def _make_deeplc(self, path_model=None):
        """
        Create a DeepLC predictor with the settings used for rescoring.

        `path_model` is only passed to DeepLC when given, so that DeepLC falls
        back to its own default model selection otherwise.
        """
        # Only import if DeepLC will be used; it pulls in heavy dependencies
        from deeplc import DeepLC

        deeplc_kwargs = {
            "split_cal": 10,
            "n_jobs": self.num_cpu,
            "cnn_model": True,
            "verbose": False,
            "pygam_calibration": True,
        }
        if path_model is not None:
            deeplc_kwargs["path_model"] = path_model
        return DeepLC(**deeplc_kwargs)

    def run(self):
        """
        Get retention time predictions for PEPREC and calculate features.

        If the PEPREC has a 'Raw file' column, DeepLC is calibrated separately
        for each raw file; the model(s) selected while calibrating on the first
        raw file are reused for all following raw files. The resulting features
        are written to `feature_path` as CSV.
        """
        if "Raw file" in self.peprec.df.columns:
            raw_specific_predicted_dfs = []
            selected_models = None
            for raw_file, df in self.peprec.df.groupby("Raw file"):
                logger.info("Calibrating %s", raw_file)

                peprec_raw_df = df.reset_index(drop=True)

                self.deeplc_predictor = self._make_deeplc(
                    path_model=selected_models)
                self.deeplc_predictor.calibrate_preds(
                    seq_df=self.get_calibration_data(peprec_raw_df))
                if selected_models is None:
                    # First raw file: remember which model(s) DeepLC ended up
                    # with after calibration, to reuse for the other raw files
                    selected_models = list(
                        self.deeplc_predictor.model.keys())
                    self.deeplc_model = selected_models

                predicted_rts = self.deeplc_predictor.make_preds(
                    seq_df=self.get_prediction_data(peprec_raw_df))
                raw_specific_predicted_dfs.append(
                    pd.DataFrame({
                        "spec_id":
                        peprec_raw_df["spec_id"],
                        "predicted_retention_time":
                        pd.Series(list(predicted_rts),
                                  index=peprec_raw_df.index),
                    }))

            self.peprec.df = pd.merge(
                self.peprec.df,
                pd.concat(raw_specific_predicted_dfs, ignore_index=True),
                on="spec_id",
                how="inner",
            )
        else:
            self.deeplc_predictor = self._make_deeplc()
            self.deeplc_predictor.calibrate_preds(
                seq_df=self.get_calibration_data(self.peprec.df))
            predicted_rts = self.deeplc_predictor.make_preds(
                seq_df=self.get_prediction_data(self.peprec.df))
            # Predictions are in row order of the PEPREC; attach them by
            # position so a non-default index cannot misalign them
            self.peprec.df["predicted_retention_time"] = pd.Series(
                list(predicted_rts), index=self.peprec.df.index)

        self._calculate_features()
        self.feature_df.to_csv(self.feature_path, index=False)