def get_stats(self, set_name=None):
    """Summarise the m/z error of the collected peaks for each evaluated molecule.

    For every ``mol_idx`` in ``self.collected_peak_sets[set_name]`` the collected
    peak DataFrames are concatenated and compared against the ``db_mz`` value stored
    in ``self.eval_peaks`` for that molecule.

    :param set_name: key into ``self.collected_peak_sets``; an unknown key is treated
        as an empty set.
    :return: DataFrame indexed by ``mol_idx`` with the columns ``mz_offset``,
        ``ppm_offset``, ``mz_spread``, ``ppm_spread``, ``in_1ppm``, ``in_2ppm``,
        ``in_3ppm`` and ``n_spectra``. The frame is empty (but still has this index
        name and these columns) when no molecule has any collected peaks.
    """
    # Declared up-front so that an empty result still has a 'mol_idx' column to index by;
    # pd.DataFrame([]) has no columns at all and set_index('mol_idx') would raise KeyError.
    columns = [
        'mol_idx',
        'mz_offset',
        'ppm_offset',
        'mz_spread',
        'ppm_spread',
        'in_1ppm',
        'in_2ppm',
        'in_3ppm',
        'n_spectra',
    ]
    peaks_dict = self.collected_peak_sets.get(set_name, {})

    stats = []
    for mol_idx, dfs in peaks_dict.items():
        peak_df = pd.concat(dfs)
        n_peaks = len(peak_df)
        if not n_peaks:
            continue

        mz_offset = peak_df.mz - self.eval_peaks.at[mol_idx, 'db_mz']
        abs_offset = np.abs(mz_offset)
        # Intensity-weighted mean so that strong peaks dominate the offset estimate
        avg_mz_offset = np.average(mz_offset, weights=peak_df.ints)
        # A single width (taken at the first peak's m/z) is used to normalise all offsets
        width = peak_width(peak_df.mz.iloc[0], self.params.analyzer, self.ppm_sigma_1)
        # Spread is the 5th-95th percentile range of the offsets
        mz_lo, mz_hi = np.percentile(mz_offset, [5, 95])
        mz_spread = mz_hi - mz_lo

        stats.append(
            {
                'mol_idx': mol_idx,
                'mz_offset': avg_mz_offset,
                'ppm_offset': avg_mz_offset / width,
                'mz_spread': mz_spread,
                'ppm_spread': mz_spread / width,
                # Fraction of peaks whose offset is within 1/2/3 widths
                'in_1ppm': np.count_nonzero(abs_offset <= 1 * width) / n_peaks,
                'in_2ppm': np.count_nonzero(abs_offset <= 2 * width) / n_peaks,
                'in_3ppm': np.count_nonzero(abs_offset <= 3 * width) / n_peaks,
                'n_spectra': n_peaks,
            }
        )

    return pd.DataFrame(stats, columns=columns).set_index('mol_idx')
def _make_mx_nodes(self, X):
    """Create the MSIWarp warping nodes spanning the m/z range of ``X``.

    :param X: peaks table; only its ``mz`` column is read.
    :return: whatever ``mx.initialize_nodes`` returns for the computed node m/z values,
        per-node slacks and ``self.n_steps``.
    """
    # MSIWarp discards peaks outside the node_mzs range, so pad the range by a safety margin
    # at both ends in case other spectra in the dataset cover a wider m/z range than the
    # sample spectra. The bounds are snapped to multiples of 10 Da.
    lowest_mz = X.mz.min()
    highest_mz = X.mz.max()
    range_start = np.floor(lowest_mz / 10 - 1) * 10
    range_end = np.ceil(highest_mz / 10 + 1) * 10

    # Round nodes to whole Da for consistency and interpretability, and de-duplicate them
    # in case n_segments is too high or the mass range is too small.
    evenly_spaced = np.linspace(range_start, range_end, self.n_segments + 1)
    node_mzs = np.unique(np.round(evenly_spaced))

    # Each node may move by up to half a peak width at its own m/z
    node_slacks = peak_width(node_mzs, self.analyzer, self.align_sigma_1) / 2
    return mx.initialize_nodes(node_mzs, node_slacks, self.n_steps)
def _align_ransac_inner(self, sp, mzs, ints):
    """Fit a linear m/z mapping from one spectrum onto ``self.target_spectrum`` with RANSAC.

    :param sp: spectrum identifier, passed through unchanged into the result.
    :param mzs: m/z values of the spectrum's peaks.
    :param ints: intensities of the spectrum's peaks.
    :return: dict with ``sp``, slope ``M``, intercept ``C`` and ``score``. When a model was
        fitted it also contains ``inliers``, ``align_peaks``, ``align_min`` and
        ``align_max``. With 10 or fewer matched peaks no model is fitted and the identity
        mapping (``M=1``, ``C=0``) with ``score=0`` is returned.
    """
    sample_peaks = pd.DataFrame({'sample_mz': mzs, 'sample_ints': ints})
    hits = join_by_mz(
        self.target_spectrum,
        'mz',
        sample_peaks,
        'sample_mz',
        self.analyzer,
        self.align_sigma_1,
    )

    if len(hits) <= 10:
        # Too few matched peaks for a meaningful fit
        return {'sp': sp, 'M': 1, 'C': 0, 'score': 0}

    # Rescale the sample intensities by the median target/sample intensity ratio
    intensity_scale = np.median(hits.ints / hits.sample_ints)
    scaled_ints = hits.sample_ints * intensity_scale
    ints_accuracy = 0.5 - (scaled_ints / (scaled_ints + 1))
    hits['weight'] = np.log(hits.sample_ints) * ints_accuracy

    # Only the 100 highest-weighted matches take part in the fit
    ranked = hits.sort_values('weight', ascending=False, ignore_index=True)
    hits = ranked.iloc[:100]

    X = hits.sample_mz.values.reshape(-1, 1)
    y = hits.mz.values
    half_range_edges = np.histogram_bin_edges(X, 2)
    threshold = peak_width(X[:, 0], self.analyzer, self.jitter_sigma_1)

    def _covers_both_halves(X_subset, y_subset):
        # Require subsets include values from both the higher and lower end of the mass range
        return np.histogram(X_subset, half_range_edges)[0].all()

    # NOTE(review): loss='absolute_loss' is the pre-1.0 scikit-learn spelling (later
    # renamed 'absolute_error') - confirm against the pinned scikit-learn version.
    ransac = RANSACRegressor(
        # max_trials=10000,
        min_samples=max(0.1, 3 / len(X)),
        residual_threshold=threshold,
        is_data_valid=_covers_both_halves,
        loss='absolute_loss',
        stop_probability=1,
    )
    ransac.fit(X, y)

    fitted_line = ransac.estimator_
    return {
        'sp': sp,
        'M': fitted_line.coef_[0],
        'C': fitted_line.intercept_,
        'score': ransac.score(X, y),
        'inliers': np.count_nonzero(ransac.inlier_mask_),
        'align_peaks': len(hits),
        'align_min': hits.mz.min(),
        'align_max': hits.mz.max(),
    }
def representative_spectrum(
    spectra_df: pd.DataFrame,
    mean_spectrum: pd.DataFrame,
    analyzer: AnalyzerType,
    sigma_1: float,
    denoise=False,
):
    """Pick the individual spectrum that best matches the mean spectrum.

    Each spectrum in ``spectra_df`` (grouped by its ``sp`` column) is left-joined onto the
    mean spectrum by m/z and scored as the cosine similarity of the intensities divided by
    the summed, clipped relative m/z error.

    :param spectra_df: peaks of all candidate spectra; read columns are ``sp``, ``mz``
        and ``ints``.
    :param mean_spectrum: reference peaks with ``mz`` and ``ints`` columns; when
        ``denoise`` is set, ``coverage``, ``mz_stddev`` and ``mz_tol`` are read as well.
    :param analyzer: analyzer type forwarded to the m/z matching helpers.
    :param sigma_1: tolerance parameter forwarded to the m/z matching helpers.
    :param denoise: when true, the mean spectrum is first reduced to its best-scoring
        peaks and the returned spectrum only keeps peaks matched to those.
    :return: the peaks of the best-scoring spectrum, sorted by ``mz``.
    """
    if denoise:
        n_peaks_before = len(mean_spectrum)
        # Exclude peaks that only exist in small number of spectra, have high m/z variability
        # (which suggests that multiple peaks were grouped together), or are near other more
        # intense peaks
        # mean_spectrum = mean_spectrum[mean_spectrum.n_hits > 1]
        mean_ints = mean_spectrum.ints.values
        mean_mzs = mean_spectrum.mz.values
        window_lo, window_hi = mass_accuracy_bound_indices(
            mean_mzs, mean_mzs, analyzer, sigma_1 * 2
        )
        # A peak counts as a local maximum if it is alone in its window or the most
        # intense peak inside it
        is_local_max = np.array([
            lo >= hi - 1 or idx == lo + np.argmax(mean_ints[lo:hi])
            for idx, (lo, hi) in enumerate(zip(window_lo, window_hi))
        ])
        mz_stability = 1 - np.clip(mean_spectrum.mz_stddev / mean_spectrum.mz_tol, 0, 1)
        peak_score = mean_spectrum.coverage * (0.1 + is_local_max) * mz_stability
        mean_spectrum = sample_across_mass_range(mean_spectrum, peak_score, n_per_bin=500)

        n_peaks_after = len(mean_spectrum)
        logger.debug(f'Denoising reduced peaks from {n_peaks_before} to {n_peaks_after}')

    # Find the spectrum that's most similar to the background spectrum
    reference = mean_spectrum.rename(columns={'mz': 'mean_mz', 'ints': 'mean_ints'})

    scores_by_sp = {}
    peaks_by_sp = {}
    for sp, sp_peaks in spectra_df.groupby('sp'):
        joined = join_by_mz(
            reference, 'mean_mz', sp_peaks, 'mz', analyzer, sigma_1, how='left'
        )

        # m/z error relative to half a peak width, clipped to [-1, 1]
        half_width = peak_width(joined.mz, analyzer, sigma_1) / 2
        relative_err = (joined.mean_mz - joined.mz.fillna(0)) / half_width
        joined['mz_err'] = np.clip(relative_err, -1, 1)
        # Floor the summed error so the division below can never be by zero
        total_mz_err = max(joined.mz_err.abs().sum(), 0.0001)

        ref_ints = joined.mean_ints
        matched_ints = joined.ints.fillna(0)
        # score = cosine_similarity(mean_ints, ints) / mz_err.sum()
        norm_product = np.linalg.norm(ref_ints) * np.linalg.norm(matched_ints)
        scores_by_sp[sp] = np.dot(ref_ints, matched_ints) / norm_product / total_mz_err

        if denoise:
            # Keep only the peaks that matched a (denoised) mean-spectrum peak
            peaks_by_sp[sp] = joined[['sp', 'mz', 'ints']][joined.ints.notna()]
        else:
            peaks_by_sp[sp] = sp_peaks

    # Return the best scoring spectrum
    best_sp = pd.Series(scores_by_sp).idxmax()
    logger.debug(f'Choose representative spectrum: {best_sp}')
    return peaks_by_sp[best_sp].sort_values('mz')
def fit(self, X):
    """Fit a linear m/z recalibration model from database-matched candidate peaks.

    Candidate peaks come from ``get_recal_candidates``; a RANSAC regression from the
    observed ``mz`` to the database ``db_mz`` is stored in ``self.model``. The database
    hits are stored in ``self.db_hits`` and annotated with ``recal_inlier``,
    ``ppm_err_before`` and ``ppm_err_after`` columns.

    :param X: peaks table that must contain the columns ``sp``, ``mz`` and ``ints``.
    :return: ``self``.
    :raises AssertionError: if ``X`` lacks a required column.
    :raises ValueError: if RANSAC cannot fit a model even at the loosest tolerance.
    """
    missing_cols = {'sp', 'mz', 'ints'}.difference(X.columns)
    assert not missing_cols, f'X is missing columns: {", ".join(missing_cols)}'

    recal_candidates, self.db_hits, _mean_spectrum = get_recal_candidates(
        X, self.params, self.recal_sigma_1)

    if len(recal_candidates) < 10:
        logger.warning(
            f'Too few peaks for recalibration ({len(recal_candidates)} < 10). Skipping.'
        )
        # Make a fake RANSACRegressor just in case
        linear_data = np.arange(3).reshape(-1, 1)
        self.model = RANSACRegressor(min_samples=2).fit(linear_data, linear_data)
        return self

    _X = np.array(recal_candidates.mz).reshape(-1, 1)
    _y = np.array(recal_candidates.db_mz)
    _weights = np.array(recal_candidates.weight)
    threshold = peak_width(recal_candidates.db_mz.values, self.analyzer,
                           self.jitter_sigma_1)

    # Require subsets include values from both the higher and lower end of the mass range
    # but define the bins such that at least 20% of peaks are included in each, to guard
    # against cases where the upper half of the mass range is almost empty.
    bins = np.histogram_bin_edges(recal_candidates.db_mz, 2)
    bins[1] = np.clip(bins[1], *np.percentile(_X, [20, 80]))

    # Each random subset must hold at least 3 of the candidate peaks being fitted. The
    # fraction is therefore relative to the candidates (_X), not to all input peaks (X).
    min_samples = max(0.05, 3 / len(_X))

    # sum-spectrum peaks should be much more consistent than jitter_sigma_1, so try running
    # RANSAC with a much tighter tolerance, increasing the tolerance if the model can't converge
    # TODO: With this logic can max_trials be lowered?
    # NOTE(review): loss='squared_loss' is the pre-1.0 scikit-learn spelling (later renamed
    # 'squared_error') - confirm against the pinned scikit-learn version.
    for i in [0.125, 0.25, 0.5, 1.0]:
        try:
            self.model = RANSACRegressor(
                max_trials=10000,
                min_samples=min_samples,
                # use ** 2 only if loss = squared_loss
                residual_threshold=(threshold * i) ** 2,
                is_data_valid=_IsValidSubset(bins),
                loss='squared_loss',
                stop_probability=1,
            )
            self.model.fit(_X, _y, _weights)
        except ValueError:
            if i == 1:
                raise
            logger.info(
                f'RANSAC couldn\'t converge with sigma={self.jitter_sigma_1 * i}, '
                f'trying again with a higher tolerance')
        else:
            # Keep the tightest tolerance that converged instead of overwriting it with
            # the looser fits of the remaining iterations.
            break

    y_db_pred = self.model.estimator_.predict(
        self.db_hits.mz.values.reshape(-1, 1))
    db_threshold = peak_width(self.db_hits.db_mz.values, self.analyzer,
                              self.jitter_sigma_1)
    db_inliers = np.abs(self.db_hits.db_mz.values - y_db_pred) < db_threshold
    self.db_hits['recal_inlier'] = db_inliers

    # ppm error = (m/z error / reference m/z) * 1e6
    mz_err_before = self.db_hits.db_mz - self.db_hits.mz
    self.db_hits['ppm_err_before'] = mz_err_before / self.db_hits.db_mz * 1e6
    mz_err_after = self.db_hits.db_mz - y_db_pred
    self.db_hits['ppm_err_after'] = mz_err_after / self.db_hits.db_mz * 1e6

    n_inliers = np.count_nonzero(self.db_hits.recal_inlier
                                 & self.db_hits.used_for_recal)
    logger.debug(f'RANSAC model hit {n_inliers} inliers out of {len(_y)}')

    # Log how the ends of the (10 Da-rounded) mass range are moved by the model
    min_mz = np.floor(X.mz.min() / 10) * 10
    max_mz = np.ceil(X.mz.max() / 10) * 10
    new_min, new_max = self.model.predict([[min_mz], [max_mz]])
    logger.debug(f'Warping {min_mz:.6f} -> {new_min:.6f}')
    logger.debug(f'Warping {max_mz:.6f} -> {new_max:.6f}')

    return self