class RetentionTime:
    """Predict peptide retention times and add them to a peprec DataFrame."""

    def __init__(self, predictor="deeplc", config=None):
        """
        Initialize peptide retention time predictor

        Parameters
        ----------
        predictor: str, optional
            Retention time predictor to employ. Currently only 'deeplc' is
            supported.
        config: dict, optional
            Dictionary with configuration. An optional 'deeplc' top-level key
            holds the keyword arguments passed to the DeepLC constructor. Its
            optional 'calibration_file' entry is the path to a comma-separated
            calibration file; without it, the built-in iRT peptides are used
            for calibration. The dictionary is never modified.

        """
        self.predictor = predictor
        self.deeplc_predictor = None
        self.config = config if config else dict()

    def _get_irt_peptides(self):
        """
        Return DeepLC DataFrame with iRT peptides

        Returns
        -------
        pandas.DataFrame
            DataFrame with columns 'seq', 'tr' and (empty) 'modifications'.
        """
        irt_peptides = {
            "LGGNEQVTR": -24.92,
            "GAGSSEPVTGLDAK": 0.00,
            "VEATFGVDESNAK": 12.39,
            "YILAGVENSK": 19.79,
            "TPVISGGPYEYR": 28.71,
            "TPVITGAPYEYR": 33.38,
            "DGLDAASYYAPVR": 42.26,
            "ADVTPADFSEWSK": 54.62,
            "GTFIIDPGGVIR": 70.52,
            "GTFIIDPAAVIR": 87.23,
            "LFLQFGAQGSPFLK": 100.00,
        }
        irt_df = pd.DataFrame(
            {
                "seq": list(irt_peptides.keys()),
                "tr": list(irt_peptides.values()),
            }
        )
        irt_df["modifications"] = ""
        return irt_df

    def _get_deeplc_params(self):
        """
        Resolve DeepLC constructor arguments and calibration file from config

        Works on a copy of the 'deeplc' configuration, so the user-supplied
        dictionary stays intact and the predictor can be re-initialized.

        Returns
        -------
        tuple of (dict, str or None)
            Keyword arguments for the DeepLC constructor and the calibration
            file path (None if not configured).
        """
        if "deeplc" in self.config:
            deeplc_params = dict(self.config["deeplc"])
            calibration_file = deeplc_params.pop("calibration_file", None)
        else:
            deeplc_params = {"verbose": False}
            calibration_file = None

        # TODO: Remove when fixed upstream in DeepLC
        if not calibration_file:
            deeplc_params["split_cal"] = 9

        return deeplc_params, calibration_file

    def _init_deeplc(self):
        """
        Initialize DeepLC: import, configure and calibrate
        """
        # Only import if DeepLC will be used, otherwise lots of extra heavy
        # dependencies (e.g. Tensorflow) are imported as well
        from deeplc import DeepLC

        deeplc_params, calibration_file = self._get_deeplc_params()
        predictor = DeepLC(**deeplc_params)

        if calibration_file:
            cal_df = pd.read_csv(calibration_file, sep=",")
        else:
            cal_df = self._get_irt_peptides()
        predictor.calibrate_preds(seq_df=cal_df)

        # Only store the predictor once calibration succeeded, so a failed
        # calibration is retried on the next call instead of silently reusing
        # an uncalibrated predictor.
        self.deeplc_predictor = predictor

    def _prepare_deeplc_peptide_df(self):
        """
        Prepare DeepLC peptide DataFrame

        Selects the 'peptide' and 'modifications' columns of the peprec and
        renames 'peptide' to 'seq'.
        """
        column_map = {"peptide": "seq", "modifications": "modifications"}
        self.deeplc_pep_df = self.peprec[list(column_map)].rename(
            columns=column_map
        )

    def _run_deeplc(self):
        """
        Run DeepLC
        """
        self.deeplc_preds = self.deeplc_predictor.make_preds(
            seq_df=self.deeplc_pep_df.fillna("")
        )

    def _parse_deeplc_preds(self):
        """
        Add DeepLC predictions to peprec DataFrame
        """
        self.peprec["rt"] = self.deeplc_preds

    def _predict_deeplc(self):
        """
        Predict retention times using DeepLC
        """
        # DeepLC is initialized and calibrated lazily, on first use only
        if not self.deeplc_predictor:
            self._init_deeplc()
        self._prepare_deeplc_peptide_df()
        self._run_deeplc()
        self._parse_deeplc_preds()

    def add_rt_predictions(self, peprec):
        """
        Run RT predictor and add predictions to peprec DataFrame.

        Parameters
        ----------
        peprec: pandas.DataFrame
            MS2PIP-style peprec DataFrame with peptides for which to predict
            retention times. Predictions are added in place in the 'rt'
            column.

        Raises
        ------
        NotImplementedError
            If the configured predictor is not 'deeplc'.
        """
        self.peprec = peprec

        if self.predictor == "deeplc":
            self._predict_deeplc()
        else:
            raise NotImplementedError(self.predictor)
standard_feat=False, chem_descr_feat=False, add_comp_feat=False, cnn_feats=True, verbose=False) # Initiate a DeepLC instance that will perform the calibration and predictions dlc = DeepLC(path_model="deeplc/mods/full_hc_dia_fixed_mods.hdf5", cnn_model=True, f_extractor=f_extractor, verbose=False) # To demonstrate DeepLC's callibration, we'll induce some an artificial # transformation into the retention times df["tr"] = df["tr"]**0.85 # Calibrate the original model based on the new retention times dlc.calibrate_preds(seq_df=df) # Make predictions; calibrated and uncalibrated preds_cal = dlc.make_preds(seq_df=df) preds_uncal = dlc.make_preds(seq_df=df, calibrate=False) # Compare calibrated and uncalibrated predictions #print("Predictions (calibrated): ", preds_cal) #print("Predictions (uncalibrated): ", preds_uncal) plt.scatter(df["tr"], preds_cal, label="Calibrated", s=1) plt.scatter(df["tr"], preds_uncal, label="Uncalibrated", s=1) plt.legend() plt.savefig('deeplc_calibrated_vs_uncalibrated.png')
def run(file_pred="",
        file_cal="",
        file_pred_out="",
        file_model=None,
        n_threads=None,
        verbose=False,
        split_cal=50,
        dict_divider=50,
        batch_num=50000,
        plot_predictions=False):
    """
    Main function to run the DeepLC code

    Parameters
    ----------
    file_pred : str
        the file in peprec format that we need to make predictions for
        this file is not required to contain a tr column
    file_cal : str
        the file in peprec format that we use for calibrating the prediction
        model. This file is required to contain a tr column. When empty, no
        calibration is performed and uncalibrated predictions are written.
    file_pred_out : str
        outfile for predictions, the file is in peprec format and predictions
        are added in the column `predicted_tr`
    file_model : str | list | None
        the model(s) to try for retention time prediction can be a single
        location or several locations for multiple models to try
    n_threads : int
        number of threads to run mainly the feature extraction on
    verbose : bool
        passed on to the feature extractor and to DeepLC
    split_cal : int
        number of splits or divisions to use for the calibration
    dict_divider : int
        currently not used by this function
    batch_num : int
        passed on to DeepLC as `batch_num`
    plot_predictions : bool
        Save scatter plot of predictions vs observations

    Returns
    -------
    None
    """
    logging.info("Using DeepLC version %s", __version__)

    # Any non-empty path enables calibration
    has_calibration = bool(file_cal)

    # Read input files
    df_pred = pd.read_csv(file_pred).fillna("")
    if has_calibration:
        df_cal = pd.read_csv(file_cal).fillna("")

    # Make a feature extraction object; you can skip this if you do not want to
    # use the default settings for DeepLC. Here we want to use a model that does
    # not use RDKit features so we skip the chemical descriptor making
    # procedure.
    f_extractor = FeatExtractor(add_sum_feat=False,
                                ptm_add_feat=False,
                                ptm_subtract_feat=False,
                                standard_feat=False,
                                chem_descr_feat=False,
                                add_comp_feat=False,
                                cnn_feats=True,
                                verbose=verbose)

    # Make the DeepLC object that will handle making predictions and
    # calibration
    # NOTE(review): `dict_divider` is still not forwarded; the matching DeepLC
    # keyword is not visible from here - confirm before wiring it up.
    dlc = DeepLC(path_model=file_model,
                 f_extractor=f_extractor,
                 cnn_model=True,
                 n_jobs=n_threads,
                 verbose=verbose,
                 batch_num=batch_num,
                 split_cal=split_cal)

    # Calibrate the original model based on the new retention times
    if has_calibration:
        logging.info("Selecting best model and calibrating predictions...")
        dlc.calibrate_preds(seq_df=df_cal)

    # Make predictions; calibrated or uncalibrated
    logging.info("Making predictions using model: %s", dlc.model)
    if has_calibration:
        preds = dlc.make_preds(seq_df=df_pred)
    else:
        preds = dlc.make_preds(seq_df=df_pred, calibrate=False)

    df_pred["predicted_tr"] = preds
    logging.debug("Writing predictions to file: %s", file_pred_out)
    df_pred.to_csv(file_pred_out)

    if plot_predictions:
        if has_calibration and "tr" in df_pred.columns:
            file_pred_figure = os.path.splitext(file_pred_out)[0] + '.png'
            logging.debug("Saving scatterplot of predictions to file: %s",
                          file_pred_figure)
            plt.figure(figsize=(11.5, 9))
            plt.scatter(df_pred["tr"], df_pred["predicted_tr"], s=3)
            plt.title("DeepLC predictions")
            plt.xlabel("Observed retention times")
            plt.ylabel("Predicted retention times")
            plt.savefig(file_pred_figure, dpi=300)
            # Release the figure so repeated calls do not accumulate figures
            plt.close()
        else:
            logging.warning("No observed retention time in input data. "
                            "Cannot plot predictions")

    logging.info("DeepLC finished!")
class RetentionTimeIntegration:
    """Retention time integration for MS²ReScore, using DeepLC."""

    def __init__(
        self,
        peprec_path: str,
        feature_path: str,
        higher_psm_score_better: bool = True,
        calibration_set_size: Optional[Union[int, float]] = 0.20,
        num_cpu: Optional[int] = None,
    ):
        """
        Retention time integration for MS²ReScore, using DeepLC.

        Parameters
        ----------
        peprec_path: str
            Path to PEPREC file with PSMs
        feature_path: str
            Path to feature file to write features to.
        higher_psm_score_better: bool
            Whether a higher PSM score (`psm_score` column in PEPREC) denotes a
            better score. (default: True)
        calibration_set_size: int or float
            Amount of best PSMs to use for DeepLC calibration: a float is
            interpreted as a fraction of all PSMs (0 < value <= 1), an int as
            an absolute number. If an int higher than the number of available
            PSMs is given, all PSMs will be used. (default: 0.20)
        num_cpu: {int, None}
            Number of processes to use in DeepLC

        Methods
        -------
        get_calibration_data(peprec)
            Get calibration peptides (N best PSMs in PEPREC).
        get_prediction_data(peprec)
            Get prediction peptides.
        run()
            Get retention time predictions for PEPREC and calculate features.

        """
        self.peprec_path = peprec_path
        self.feature_path = feature_path
        self.higher_psm_score_better = higher_psm_score_better
        self.calibration_set_size = calibration_set_size
        self.num_cpu = num_cpu

        # Until fixed upstream: https://github.com/compomics/DeepLC/issues/19
        # Skipped when `num_cpu` is None: the variable would otherwise be set
        # to the literal string "None".
        if self.num_cpu is not None and "NUMEXPR_MAX_THREADS" not in os.environ:
            os.environ["NUMEXPR_MAX_THREADS"] = str(self.num_cpu)

        self.peprec = None
        self.feature_df = None

        if self.peprec_path:
            self.peprec = PeptideRecord(path=self.peprec_path)

    def num_calibration_psms(self, peprec):
        """
        Get number of calibration PSMs given `calibration_set_size` and total
        number of PSMs.

        Raises
        ------
        ValueError
            If a float `calibration_set_size` is not in the interval (0, 1].
        TypeError
            If `calibration_set_size` is neither a float nor an int.
        """
        set_size = self.calibration_set_size

        if isinstance(set_size, float):
            # Fraction of the total number of PSMs
            if set_size > 1:
                raise ValueError(
                    "`calibration_set_size` cannot be larger than 1.")
            if set_size <= 0:
                raise ValueError(
                    "`calibration_set_size` cannot be smaller than or equal to 0."
                )
            num_calibration_psms = round(len(peprec) * set_size)
        elif isinstance(set_size, int):
            # Absolute number of PSMs, capped at the number available
            total_psms = len(peprec)
            if set_size > total_psms:
                logger.warning(
                    "Requested number of calibration PSMs (%s) is larger than total number "
                    "of PSMs in PEPREC (%s). Using all PSMs for calibration.",
                    set_size,
                    total_psms,
                )
                num_calibration_psms = total_psms
            else:
                num_calibration_psms = set_size
        else:
            raise TypeError(
                "Expected float or int for `calibration_set_size`. Got "
                f"{type(self.calibration_set_size)} instead")

        logger.debug("Using %i PSMs for calibration", num_calibration_psms)
        return num_calibration_psms

    def get_calibration_data(self, peprec):
        """
        Get calibration peptides (N best PSMs in PEPREC).

        Only rows with label 1 are considered. They are ranked on `psm_score`
        and returned with the columns `tr`, `seq` and `modifications`.

        Raises
        ------
        ValueError
            If the peptide record has neither a `label` nor a `Label` column.
        """
        if "label" in peprec.columns:
            label_col = "label"
        elif "Label" in peprec.columns:
            label_col = "Label"
        else:
            raise ValueError("No label column found in peptide record.")

        # Best PSMs first: descending if a higher score is better
        ascending = not self.higher_psm_score_better
        ranked = peprec[peprec[label_col] == 1].sort_values(
            ["psm_score"], ascending=ascending)
        best = ranked.head(self.num_calibration_psms(peprec=peprec))
        renamed = best.rename(columns={
            "observed_retention_time": "tr",
            "peptide": "seq",
        })
        return renamed[["tr", "seq", "modifications"]].copy()

    def get_prediction_data(self, peprec):
        """Get prediction peptides (columns `seq` and `modifications`)."""
        return peprec[["peptide", "modifications"]].rename(columns={
            "peptide": "seq",
        })

    def _calculate_features(self):
        """Calculate retention time features for rescoring."""
        peptidoform_columns = ["peptide", "modifications"]
        rt_columns = ["observed_retention_time", "predicted_retention_time"]

        # Absolute difference between observed and predicted
        self.feature_df = self.peprec.df.copy()
        self.feature_df["rt_diff"] = (
            self.feature_df["observed_retention_time"] -
            self.feature_df["predicted_retention_time"]).abs()

        # Minimum RT difference for a peptidoform: after sorting on rt_diff,
        # the first occurrence of each peptidoform is its best match
        min_rt_diff = self.feature_df[
            peptidoform_columns + rt_columns + ["rt_diff"]].copy()
        min_rt_diff = (min_rt_diff.sort_values(
            "rt_diff", ascending=True).drop_duplicates(
                subset=peptidoform_columns, keep="first").rename(
                    columns={
                        "rt_diff": "rt_diff_best",
                        "observed_retention_time":
                        "observed_retention_time_best",
                        "predicted_retention_time":
                        "predicted_retention_time_best",
                    }))

        # Merging minimum RT difference features to full set
        self.feature_df = self.feature_df.merge(
            min_rt_diff, on=peptidoform_columns, how="left")

        # Only keep feature columns
        id_columns = ["spec_id", "charge", "peptide", "modifications"]
        feature_columns = [
            "observed_retention_time",
            "predicted_retention_time",
            "rt_diff",
            "rt_diff_best",
            "observed_retention_time_best",
            "predicted_retention_time_best",
        ]
        self.feature_df = self.feature_df[id_columns + feature_columns].copy()

    def _new_deeplc_predictor(self, path_model=None):
        """Create a DeepLC instance; `path_model` is only passed if given."""
        # Imported here so DeepLC is only required when predictions are run
        from deeplc import DeepLC

        deeplc_kwargs = dict(
            split_cal=10,
            n_jobs=self.num_cpu,
            cnn_model=True,
            verbose=False,
            pygam_calibration=True,
        )
        if path_model is not None:
            deeplc_kwargs["path_model"] = path_model
        return DeepLC(**deeplc_kwargs)

    def _predict_per_raw_file(self):
        """Calibrate and predict per raw file; return spec_id/prediction rows."""
        raw_specific_predicted_dfs = []
        grouped = self.peprec.df.groupby("Raw file")
        for i, (raw_file, df) in enumerate(grouped):
            logger.info("Calibrating %s", raw_file)
            peprec_raw_df = df.copy().reset_index()

            if i == 0:
                # First raw file: DeepLC is created without `path_model`; the
                # model keys found after calibration are reused for all other
                # raw files.
                self.deeplc_predictor = self._new_deeplc_predictor()
                self.deeplc_predictor.calibrate_preds(
                    seq_df=self.get_calibration_data(peprec_raw_df))
                self.deeplc_model = list(self.deeplc_predictor.model.keys())
            else:
                self.deeplc_predictor = self._new_deeplc_predictor(
                    path_model=self.deeplc_model)
                self.deeplc_predictor.calibrate_preds(
                    seq_df=self.get_calibration_data(peprec_raw_df))

            predicted_rts = pd.Series(
                self.deeplc_predictor.make_preds(
                    seq_df=self.get_prediction_data(peprec_raw_df)))

            # Both columns are combined by position: predictions are in the
            # row order of `peprec_raw_df`
            raw_specific_predicted_dfs.append(
                pd.DataFrame({
                    "spec_id": peprec_raw_df["spec_id"].values,
                    "predicted_retention_time": predicted_rts.values,
                }))

        return pd.concat(raw_specific_predicted_dfs, ignore_index=True)

    def run(self):
        """
        Get retention time predictions for PEPREC and calculate features.

        If the PEPREC has a `Raw file` column, DeepLC is calibrated separately
        for each raw file. The resulting features are written to
        `feature_path` as CSV.
        """
        if "Raw file" in self.peprec.df.columns:
            self.peprec.df = pd.merge(
                self.peprec.df,
                self._predict_per_raw_file(),
                on="spec_id",
                how="inner",
            )
        else:
            self.deeplc_predictor = self._new_deeplc_predictor()
            self.deeplc_predictor.calibrate_preds(
                seq_df=self.get_calibration_data(self.peprec.df))
            predicted_rts = pd.Series(
                self.deeplc_predictor.make_preds(
                    seq_df=self.get_prediction_data(self.peprec.df)))
            # Assign by position: a Series with a fresh RangeIndex would be
            # index-aligned and yield NaN / misplaced values if the PEPREC
            # index is not 0..n-1
            self.peprec.df["predicted_retention_time"] = predicted_rts.values

        self._calculate_features()
        self.feature_df.to_csv(self.feature_path, index=False)