コード例 #1
0
ファイル: evaluate.py プロジェクト: metaspace2020/metaspace
    def get_stats(self, set_name=None):
        """Summarise the m/z error of the collected peaks, one row per molecule.

        For every molecule in ``self.collected_peak_sets[set_name]`` the collected
        peak DataFrames are concatenated and compared against the molecule's
        ``db_mz`` from ``self.eval_peaks``. Molecules without any collected peaks
        are skipped.

        Args:
            set_name: Key of the peak set to summarise. An unknown key is treated
                as an empty peak set.

        Returns:
            A DataFrame indexed by ``mol_idx`` with the columns ``mz_offset``
            (intensity-weighted mean offset), ``ppm_offset``, ``mz_spread``
            (5th-95th percentile range of the offsets), ``ppm_spread``,
            ``in_1ppm``/``in_2ppm``/``in_3ppm`` (fraction of peaks whose absolute
            offset is within 1/2/3 peak widths) and ``n_spectra``. The ``ppm_*``
            values are the ``mz_*`` values divided by
            ``peak_width(first_mz, analyzer, ppm_sigma_1)``. When nothing was
            collected, an empty DataFrame with the same index and columns is
            returned.
        """
        stat_columns = [
            'mol_idx',
            'mz_offset',
            'ppm_offset',
            'mz_spread',
            'ppm_spread',
            'in_1ppm',
            'in_2ppm',
            'in_3ppm',
            'n_spectra',
        ]
        peaks_dict = self.collected_peak_sets.get(set_name, {})
        stats = []

        for mol_idx, dfs in peaks_dict.items():
            peak_df = pd.concat(dfs)
            n_peaks = len(peak_df)
            if not n_peaks:
                continue

            mz_offset = peak_df.mz - self.eval_peaks.at[mol_idx, 'db_mz']
            avg_mz_offset = np.average(mz_offset, weights=peak_df.ints)
            # The width is evaluated once, at the m/z of the first collected peak
            width = peak_width(peak_df.mz.iloc[0], self.params.analyzer, self.ppm_sigma_1)
            mz_lo, mz_hi = np.percentile(mz_offset, [5, 95])
            mz_spread = mz_hi - mz_lo
            abs_offset = np.abs(mz_offset)
            stats.append(
                {
                    'mol_idx': mol_idx,
                    'mz_offset': avg_mz_offset,
                    'ppm_offset': avg_mz_offset / width,
                    'mz_spread': mz_spread,
                    'ppm_spread': mz_spread / width,
                    'in_1ppm': np.count_nonzero(abs_offset <= 1 * width) / n_peaks,
                    'in_2ppm': np.count_nonzero(abs_offset <= 2 * width) / n_peaks,
                    'in_3ppm': np.count_nonzero(abs_offset <= 3 * width) / n_peaks,
                    'n_spectra': n_peaks,
                }
            )

        # Passing the columns explicitly keeps set_index() from raising KeyError
        # when no molecule produced any stats.
        return pd.DataFrame(stats, columns=stat_columns).set_index('mol_idx')
コード例 #2
0
 def _make_mx_nodes(self, X):
     """Build the MSIWarp node list covering the m/z range of ``X``.

     Args:
         X: Peaks with an ``mz`` column; only its minimum and maximum are used.

     Returns:
         The result of ``mx.initialize_nodes`` for the computed node positions,
         their slacks and ``self.n_steps``.
     """
     # Peaks outside the node range are dropped by MSIWarp, so pad the observed
     # range by one extra 10 Da step on each side. This leaves room for spectra
     # elsewhere in the dataset that reach further than the sampled ones.
     lowest_mz = X.mz.min()
     highest_mz = X.mz.max()
     range_start = 10 * np.floor(lowest_mz / 10 - 1)
     range_end = 10 * np.ceil(highest_mz / 10 + 1)

     # Whole-number node positions are easier to interpret. Duplicates are removed
     # because rounding can collapse neighbouring nodes when n_segments is large
     # relative to the mass range.
     evenly_spaced = np.linspace(range_start, range_end, self.n_segments + 1)
     node_positions = np.unique(np.round(evenly_spaced))

     # Each node's slack is half of the peak width at that node
     half_widths = peak_width(node_positions, self.analyzer, self.align_sigma_1) / 2
     return mx.initialize_nodes(node_positions, half_widths, self.n_steps)
コード例 #3
0
    def _align_ransac_inner(self, sp, mzs, ints):
        """Fit a linear m/z correction for one spectrum against the target spectrum.

        Args:
            sp: Identifier of the spectrum; copied into the result as ``'sp'``.
            mzs: m/z values of the spectrum's peaks.
            ints: Intensities of the spectrum's peaks.

        Returns:
            A dict with the fitted slope ``'M'``, intercept ``'C'`` and ``'score'``.
            When more than 10 peaks match the target spectrum it also holds
            ``'inliers'``, ``'align_peaks'``, ``'align_min'`` and ``'align_max'``;
            otherwise the identity transform (``M=1``, ``C=0``, ``score=0``) is
            returned without those extra keys.
        """
        sample_peaks = pd.DataFrame({'sample_mz': mzs, 'sample_ints': ints})
        matched = join_by_mz(
            self.target_spectrum,
            'mz',
            sample_peaks,
            'sample_mz',
            self.analyzer,
            self.align_sigma_1,
        )

        # Too few matches to fit anything meaningful: fall back to the identity
        if len(matched) <= 10:
            return {'sp': sp, 'M': 1, 'C': 0, 'score': 0}

        # Rescale the sample intensities by the median target/sample ratio, then
        # combine them with the log sample intensity into a ranking weight.
        median_ratio = np.median(matched.ints / matched.sample_ints)
        scaled_ints = matched.sample_ints * median_ratio
        ints_accuracy = 0.5 - (scaled_ints / (scaled_ints + 1))
        matched['weight'] = np.log(matched.sample_ints) * ints_accuracy

        # Only the 100 highest-weighted matches take part in the fit
        ranked = matched.sort_values('weight', ascending=False, ignore_index=True)
        best = ranked.iloc[:100]

        X = best.sample_mz.values.reshape(-1, 1)
        y = best.mz.values
        half_range_edges = np.histogram_bin_edges(X, 2)
        threshold = peak_width(X[:, 0], self.analyzer, self.jitter_sigma_1)

        def covers_both_halves(X_subset, y_subset):
            # Require subsets include values from both the higher and lower end
            # of the mass range
            counts = np.histogram(X_subset, half_range_edges)[0]
            return counts.all()

        # NOTE(review): 'absolute_loss' was renamed to 'absolute_error' in newer
        # scikit-learn releases - confirm against the pinned version.
        ransac = RANSACRegressor(
            min_samples=max(0.1, 3 / len(X)),
            residual_threshold=threshold,
            is_data_valid=covers_both_halves,
            loss='absolute_loss',
            stop_probability=1,
        )
        ransac.fit(X, y)

        fitted_line = ransac.estimator_
        return {
            'sp': sp,
            'M': fitted_line.coef_[0],
            'C': fitted_line.intercept_,
            'score': ransac.score(X, y),
            'inliers': np.count_nonzero(ransac.inlier_mask_),
            'align_peaks': len(best),
            'align_min': best.mz.min(),
            'align_max': best.mz.max(),
        }
コード例 #4
0
def representative_spectrum(
    spectra_df: pd.DataFrame,
    mean_spectrum: pd.DataFrame,
    analyzer: AnalyzerType,
    sigma_1: float,
    denoise=False,
):
    """Finds the single spectrum that is most similar to the mean spectrum

    Each spectrum in ``spectra_df`` (one group per ``sp`` value) is matched against
    ``mean_spectrum`` by m/z and scored by the cosine similarity of the matched
    intensities divided by the summed, clipped m/z error. The highest-scoring
    spectrum is returned.

    Args:
        spectra_df: Peaks of all candidate spectra. Grouped by the ``sp`` column;
            the ``mz`` and ``ints`` columns are read.
        mean_spectrum: Reference spectrum with ``mz`` and ``ints`` columns. When
            ``denoise`` is set, the ``coverage``, ``mz_stddev`` and ``mz_tol``
            columns are read as well.
        analyzer: Passed through to the m/z matching and peak-width helpers.
        sigma_1: Passed through to the m/z matching and peak-width helpers
            (doubled for the local-maximum window used when denoising).
        denoise: If true, the mean spectrum is first reduced with
            ``sample_across_mass_range`` using a per-peak quality score, and the
            returned spectrum only contains the peaks that matched the reduced
            mean spectrum.

    Returns:
        The rows of the best-scoring spectrum, sorted by ``mz``.
    """

    # Kept only so the debug message below can report the pre-denoising size
    orig_mean_spectrum = mean_spectrum

    if denoise:
        # Exclude peaks that only exist in small number of spectra, have high m/z variability
        # (which suggests that multiple peaks were grouped together), or are near other more
        # intense peaks
        # mean_spectrum = mean_spectrum[mean_spectrum.n_hits > 1]
        _ints = mean_spectrum.ints.values
        _mz = mean_spectrum.mz.values
        # For every peak, the [lo, hi) index window of mean-spectrum peaks around it
        # (window computed with twice the usual sigma)
        local_lo, local_hi = mass_accuracy_bound_indices(
            _mz, _mz, analyzer, sigma_1 * 2)
        # True when the window holds at most one peak, or when this peak is the most
        # intense one inside its window
        local_maximum_score = np.array([
            lo >= hi - 1 or i == lo + np.argmax(_ints[lo:hi])
            for i, (lo, hi) in enumerate(zip(local_lo, local_hi))
        ])

        # coverage, weighted up for local maxima (1.1 vs 0.1) and down as mz_stddev
        # approaches mz_tol (factor reaches 0 once mz_stddev >= mz_tol)
        peak_score = (mean_spectrum.coverage * (0.1 + local_maximum_score) * (
            1 - np.clip(mean_spectrum.mz_stddev / mean_spectrum.mz_tol, 0, 1)))

        mean_spectrum = sample_across_mass_range(mean_spectrum,
                                                 peak_score,
                                                 n_per_bin=500)
        logger.debug(
            f'Denoising reduced peaks from {len(orig_mean_spectrum)} to {len(mean_spectrum)}'
        )

    # Find the spectrum that's most similar to the background spectrum
    # (columns are renamed so they don't clash with the spectrum's own mz/ints in the join)
    mean_spectrum = mean_spectrum.rename(columns={
        'mz': 'mean_mz',
        'ints': 'mean_ints'
    })
    spectrum_scores = {}
    processed_spectra = {}
    for sp, grp in spectra_df.groupby('sp'):
        joined = join_by_mz(mean_spectrum,
                            'mean_mz',
                            grp,
                            'mz',
                            analyzer,
                            sigma_1,
                            how='left')
        # NOTE(review): with how='left', mean-spectrum peaks without a match presumably
        # have NaN in joined.mz, so mz_tol and mz_err would be NaN for those rows and be
        # skipped by .sum() below - confirm that unmatched peaks are meant to add no error.
        mz_tol = peak_width(joined.mz, analyzer, sigma_1) / 2
        # m/z error in units of the tolerance, clipped to [-1, 1]
        joined['mz_err'] = np.clip(
            (joined.mean_mz - joined.mz.fillna(0)) / mz_tol, -1, 1)
        a = joined.mean_ints
        b = joined.ints.fillna(0)
        # Floor the summed error so the division below never divides by zero
        mz_err = max(joined.mz_err.abs().sum(), 0.0001)
        # score = cosine_similarity(mean_ints, ints) / mz_err.sum()
        # NOTE(review): if b is all zeros the norm product is 0 - confirm how the
        # resulting score is expected to behave in idxmax() below.
        spectrum_scores[sp] = np.dot(
            a, b) / (np.linalg.norm(a) * np.linalg.norm(b)) / mz_err
        if denoise:
            # Keep only the spectrum's peaks that matched the (denoised) mean spectrum
            processed_spectra[sp] = joined[['sp', 'mz',
                                            'ints']][~joined.ints.isna()]
        else:
            processed_spectra[sp] = grp

    # Return the best scoring spectrum
    best_sp = pd.Series(spectrum_scores).idxmax()
    logger.debug(f'Choose representative spectrum: {best_sp}')
    return processed_spectra[best_sp].sort_values('mz')
コード例 #5
0
    def fit(self, X):
        """Fit a RANSAC linear model mapping observed m/z values to database m/z values.

        Recalibration candidates are obtained from ``get_recal_candidates``. RANSAC
        is tried with progressively looser residual tolerances (1/8, 1/4, 1/2 and 1x
        the ``jitter_sigma_1`` peak width) and the first tolerance that converges is
        kept.

        Args:
            X: DataFrame of peaks with at least the ``sp``, ``mz`` and ``ints`` columns.

        Returns:
            ``self``. ``self.model`` holds the fitted ``RANSACRegressor`` and
            ``self.db_hits`` gains the ``recal_inlier``, ``ppm_err_before`` and
            ``ppm_err_after`` columns. If fewer than 10 candidates are found, a
            placeholder identity model is stored instead and ``self.db_hits`` is
            left without those columns.

        Raises:
            AssertionError: If ``X`` is missing a required column.
            ValueError: If RANSAC cannot converge even at the loosest tolerance.
        """
        missing_cols = {'sp', 'mz', 'ints'}.difference(X.columns)
        assert not missing_cols, f'X is missing columns: {", ".join(missing_cols)}'

        recal_candidates, self.db_hits, mean_spectrum = get_recal_candidates(
            X, self.params, self.recal_sigma_1)

        if len(recal_candidates) < 10:
            logger.warning(
                f'Too few peaks for recalibration ({len(recal_candidates)} < 10). Skipping.'
            )
            # Make a fake RANSACRegressor just in case (fitted on y == x, i.e. identity)
            linear_data = np.arange(3).reshape(-1, 1)
            self.model = RANSACRegressor(min_samples=2).fit(
                linear_data, linear_data)
            return self

        _X = np.array(recal_candidates.mz).reshape(-1, 1)
        _y = np.array(recal_candidates.db_mz)
        _weights = np.array(recal_candidates.weight)
        threshold = peak_width(recal_candidates.db_mz.values, self.analyzer,
                               self.jitter_sigma_1)

        # Require subsets include values from both the higher and lower end of the mass range
        # but define the bins such that at least 20% of peaks are included in each, to guard
        # against cases where the upper half of the mass range is almost empty.
        bins = np.histogram_bin_edges(recal_candidates.db_mz, 2)
        bins[1] = np.clip(bins[1], *np.percentile(_X, [20, 80]))
        # sum-spectrum peaks should be much more consistent than jitter_sigma_1, so try running
        # RANSAC with a much tighter tolerance, increasing the tolerance if the model can't converge
        # TODO: With this logic can max_trials be lowered?
        # NOTE(review): 'squared_loss' was renamed to 'squared_error' in newer
        # scikit-learn releases - confirm against the pinned version.
        for i in [0.125, 0.25, 0.5, 1.0]:
            try:
                self.model = RANSACRegressor(
                    max_trials=10000,
                    # Use at least 3 of the candidates being fitted. The fraction
                    # must be relative to _X (the candidates), not to the raw
                    # peak list X, otherwise the 3-sample floor never applies.
                    min_samples=max(0.05, 3 / len(_X)),
                    residual_threshold=(
                        threshold *
                        i)**2,  # use ** 2 only if loss = squared_loss
                    is_data_valid=_IsValidSubset(bins),
                    loss='squared_loss',
                    stop_probability=1,
                )
                self.model.fit(_X, _y, _weights)
            except ValueError:
                if i == 1:
                    raise
                else:
                    logger.info(
                        f'RANSAC couldn\'t converge with sigma={self.jitter_sigma_1 * i}, '
                        f'trying again with a higher tolerance')
            else:
                # Converged: keep the tightest tolerance that works instead of
                # overwriting it with the looser fits that would follow.
                break

        # Diagnostics: how well the fitted line explains all DB hits
        y_db_pred = self.model.estimator_.predict(
            self.db_hits.mz.values.reshape(-1, 1))
        db_threshold = peak_width(self.db_hits.db_mz.values, self.analyzer,
                                  self.jitter_sigma_1)
        db_inliers = np.abs(self.db_hits.db_mz.values -
                            y_db_pred) < db_threshold
        self.db_hits['recal_inlier'] = db_inliers
        # ppm = (delta m / m) * 1e6
        mz_err_before = self.db_hits.db_mz - self.db_hits.mz
        self.db_hits['ppm_err_before'] = mz_err_before / self.db_hits.db_mz * 1e6
        mz_err_after = self.db_hits.db_mz - y_db_pred
        self.db_hits['ppm_err_after'] = mz_err_after / self.db_hits.db_mz * 1e6

        n_inliers = np.count_nonzero(self.db_hits.recal_inlier
                                     & self.db_hits.used_for_recal)
        logger.debug(f'RANSAC model hit {n_inliers} inliers out of {len(_y)}')
        # Log how the ends of the (10 Da-rounded) mass range are shifted by the model
        min_mz = np.floor(X.mz.min() / 10) * 10
        max_mz = np.ceil(X.mz.max() / 10) * 10
        new_min, new_max = self.model.predict([[min_mz], [max_mz]])
        logger.debug(f'Warping {min_mz:.6f} -> {new_min:.6f}')
        logger.debug(f'Warping {max_mz:.6f} -> {new_max:.6f}')
        return self