def __call__(self, spectrum: SpectrumType, reference_spectrum: SpectrumType) -> bool: """Compare parent masses""" parentmass = spectrum.get("parent_mass") parentmass_ref = reference_spectrum.get("parent_mass") assert parentmass is not None and parentmass_ref is not None, "Missing parent mass." return abs(parentmass - parentmass_ref) <= self.tolerance
def pair(self, reference: SpectrumType, query: SpectrumType) -> float:
    """Compare the metadata entry ``self.field`` of two spectra.

    Parameters
    ----------
    reference
        Single reference spectrum.
    query
        Single query spectrum.

    Returns
    -------
    Zero-dimensional numpy array of dtype ``self.score_datatype``. It holds
    False when either entry is missing. With ``self.matching_type`` set to
    "equal_match" it holds the result of ``==`` on both entries. Otherwise
    it holds whether two int/float entries differ by at most
    ``self.tolerance``; non-numerical entries log a warning and give False.
    """
    value_ref = reference.get(self.field)
    value_query = query.get(self.field)
    numeric_types = (int, float)

    if value_ref is None or value_query is None:
        # A missing entry on either side can never count as a match.
        matched = False
    elif self.matching_type == "equal_match":
        matched = value_ref == value_query
    elif isinstance(value_ref, numeric_types) and isinstance(value_query, numeric_types):
        matched = abs(value_ref - value_query) <= self.tolerance
    else:
        logger.warning(
            "Non-numerical entry not compatible with 'difference' method")
        matched = False

    return np.asarray(matched, dtype=self.score_datatype)
def pair(self, reference: SpectrumType, query: SpectrumType) -> float:
    """Compare parent masses between reference and query spectrum.

    Parameters
    ----------
    reference
        Single reference spectrum.
    query
        Single query spectrum.

    Returns
    -------
    Result of comparing the absolute parent mass difference against
    ``self.tolerance``: True when the difference is at most the tolerance.

    Raises
    ------
    AssertionError
        If either spectrum has no "parent_mass" entry.
    """
    parentmass_ref = reference.get("parent_mass")
    parentmass_query = query.get("parent_mass")

    # Raise explicitly instead of using ``assert``: assert statements are
    # removed under ``python -O``, which would turn a missing value into an
    # opaque TypeError. The exception type and message stay the same.
    if parentmass_ref is None or parentmass_query is None:
        raise AssertionError("Missing parent mass.")

    return abs(parentmass_ref - parentmass_query) <= self.tolerance
def pair(self, reference: SpectrumType, query: SpectrumType) -> float:
    """Compare precursor m/z between reference and query spectrum.

    Parameters
    ----------
    reference
        Single reference spectrum.
    query
        Single query spectrum.

    Returns
    -------
    Zero-dimensional numpy array of dtype ``self.score_datatype`` telling
    whether the precursor m/z values agree within ``self.tolerance``. When
    ``self.type`` is "Dalton" the absolute difference is compared; for any
    other type the difference relative to the mean m/z is compared.

    Raises
    ------
    AssertionError
        If either spectrum has no "precursor_mz" entry.
    """
    precursormz_ref = reference.get("precursor_mz")
    precursormz_query = query.get("precursor_mz")

    # Raise explicitly instead of using ``assert``: assert statements are
    # removed under ``python -O``. Exception type and message are unchanged.
    if precursormz_ref is None or precursormz_query is None:
        raise AssertionError("Missing precursor m/z.")

    difference = abs(precursormz_ref - precursormz_query)
    if self.type == "Dalton":
        score = difference <= self.tolerance
    else:
        # NOTE(review): the relative difference is compared to the tolerance
        # as a plain fraction (no 1e6 scaling) - confirm the tolerance unit
        # expected by callers for non-Dalton types.
        mean_mz = (precursormz_ref + precursormz_query) / 2
        score = difference / mean_mz <= self.tolerance

    # Both branches return the same array type, so callers get a consistent
    # result regardless of the tolerance type.
    return numpy.asarray(score, dtype=self.score_datatype)
def pair(self, reference: SpectrumType, query: SpectrumType) -> float:
    """Score the similarity of the fingerprints of two spectra.

    Parameters
    ----------
    reference
        Single reference spectrum.
    query
        Single query spectrum.

    Returns
    -------
    Value returned by the similarity function selected through
    ``self.similarity_measure`` ("jaccard", "dice" or "cosine"), applied to
    the "fingerprint" entries of both spectra.

    Raises
    ------
    NotImplementedError
        If ``self.similarity_measure`` is none of the supported names.
    """
    fp_reference = reference.get("fingerprint")
    fp_query = query.get("fingerprint")

    measure = self.similarity_measure
    if measure == "jaccard":
        score_function = jaccard_index
    elif measure == "dice":
        score_function = dice_similarity
    elif measure == "cosine":
        score_function = cosine_similarity
    else:
        raise NotImplementedError

    return score_function(fp_reference, fp_query)
def get_tanimoto_for_spectrum_ids(
        self,
        query_spectrum: SpectrumType,
        spectra_ids_list: List[str]) -> pd.DataFrame:
    """Returns a dataframe with tanimoto scores

    Spectra in spectra_ids_list without inchikey are removed.

    Args:
    ------
    query_spectrum:
        Single Spectrum, the tanimoto scores are calculated between this
        spectrum and the spectra in spectra_ids_list.
    spectra_ids_list:
        list of spectrum_ids, which are preselected matches of the
        query_spectrum

    Returns:
    ------
    Dataframe with a single column "Tanimoto_score", indexed by the
    spectrum ids that have a 14 character inchikey prefix. Each value is
    looked up in self.tanimoto_scores at row = inchikey prefix of the
    spectrum and column = inchikey prefix of the query spectrum.

    Raises:
    ------
    AssertionError
        If the query inchikey has fewer than 14 characters.
    """
    query_inchikey14 = query_spectrum.get("inchikey")[:14]
    # Raise explicitly instead of using ``assert`` so the check is not
    # removed under ``python -O``; exception type and message are unchanged.
    if len(query_inchikey14) != 14:
        raise AssertionError(
            f"Expected inchikey of length 14, "
            f"got inchikey = {query_inchikey14}")

    # Get inchikeys belonging to spectra ids
    metadata_dict = get_metadata_from_sqlite(
        self.sqlite_file_name,
        spectra_ids_list,
        self.settings["spectrum_id_column_name"])

    inchikey14s_dict = {}
    for spectrum_id in spectra_ids_list:
        # Only get the first 14 characters of the inchikeys
        inchikey14 = metadata_dict[spectrum_id]["inchikey"][:14]
        # Don't save spectra that do not have an inchikey. If a spectra has
        # no inchikey it is stored as "", so it will not be stored.
        if len(inchikey14) == 14:
            inchikey14s_dict[spectrum_id] = inchikey14

    tanimoto_scores_spectra_ids = pd.DataFrame(
        columns=["Tanimoto_score"],
        index=list(inchikey14s_dict))
    for spectrum_id, inchikey14 in inchikey14s_dict.items():
        tanimoto_scores_spectra_ids.at[spectrum_id, "Tanimoto_score"] = \
            self.tanimoto_scores.loc[inchikey14, query_inchikey14]
    return tanimoto_scores_spectra_ids