def test_is_valid_inchikey():
    """Check that is_valid_inchikey accepts a well-formed key and rejects malformed ones."""
    accepted = ("XYLJNLCSTIOKRM-UHFFFAOYSA-N",)
    rejected = (
        "XYLJNLCSTIOKRM-UHFFFAOYSA",     # trailing "-N" block missing
        "XYLJNLCSTIOKRMRUHFFFAOYSASN",   # letters where the hyphens should be
        "XYLJNLCSTIOKR-MUHFFFAOYSA-N",   # first hyphen shifted one position left
        "XYLJNLCSTIOKRM-UHFFFAOYSA-NN",  # one extra trailing character
        "Brcc(NC2=NCN2)-ccc3nccnc1-3",   # not an inchikey-shaped string at all
        "2YLJNLCSTIOKRM-UHFFFAOYSA-N",   # digit as first character
        "XYLJNLCSTIOKRM-aaaaaaaaaa-a",   # lower-case second and third blocks
    )

    for candidate in accepted:
        assert is_valid_inchikey(candidate), "Expected inchikey is True."

    for candidate in rejected:
        assert not is_valid_inchikey(candidate), "Expected inchikey is False."
def test_is_valid_inchikey_none_input():
    """Check that a None input is not classified as a valid inchikey."""
    outcome = is_valid_inchikey(None)
    assert not outcome, "Expected None entry to give False."
def pubchem_metadata_lookup(spectrum_in, name_search_depth=10, formula_search=False,
                            min_formula_length=6, formula_search_depth=25, verbose=1):
    """Try to fill in inchikey, inchi and smiles of a spectrum from PubChem searches.

    The input is cloned and the clone is returned as-is when it already has a
    valid inchikey, or when its compound name is not a string of more than
    4 characters. Otherwise a compound name search is run and, if that gives
    no accepted match and `formula_search` is enabled, a formula search.
    Search results are matched against the spectrum's inchi when it likely has
    one, and against its parent mass otherwise (or when the inchi matching
    found nothing). A match is only accepted when both an inchikey and an
    inchi were found.

    Parameters
    ----------
    spectrum_in
        Matchms type spectrum as input.
    name_search_depth: int
        How many of the most relevant name matches to explore deeper. Default = 10.
    formula_search: bool
        If True, also search by formula when the name search gave no accepted
        match. Default = False.
    min_formula_length: int
        Formula strings shorter than this are not used for a formula search.
        Default = 6.
    formula_search_depth: int
        Passed on to the formula search as its search depth. Default = 25.
    verbose: int
        Passed on to the search and matching helpers. In this function,
        values >= 1 print accepted matches and values >= 2 also print when
        search results were found but none of them matched. Default = 1.

    Returns
    -------
    None if `spectrum_in` is None, otherwise a clone of the input spectrum with
    "inchikey", "inchi" and "smiles" set if a match was accepted.
    """
    if spectrum_in is None:
        return None

    spectrum = spectrum_in.clone()
    if is_valid_inchikey(spectrum.get("inchikey")):
        return spectrum

    def _plausible_name(compound_name):
        return (isinstance(compound_name, str) and len(compound_name) > 4)

    compound_name = spectrum.get("compound_name")
    if not _plausible_name(compound_name):
        return spectrum

    # Start pubchem search
    inchi = spectrum.get("inchi")
    parent_mass = spectrum.get("parent_mass")
    if isinstance(parent_mass, np.ndarray):
        parent_mass = parent_mass[0]
    formula = spectrum.get("formula")

    def _find_match(results_pubchem):
        """Return (inchi, inchikey, smiles) for the given search results.

        Every call starts from None values, so the outcome of an earlier
        search step can never leak into a later one.
        """
        inchi_pubchem = inchikey_pubchem = smiles_pubchem = None
        # a) Search for matching inchi
        if likely_has_inchi(inchi):
            inchi_pubchem, inchikey_pubchem, smiles_pubchem = find_pubchem_inchi_match(
                results_pubchem, inchi, verbose=verbose)
        # b) Search for matching mass (no usable inchi, or inchi matching failed)
        if inchikey_pubchem is None:
            inchi_pubchem, inchikey_pubchem, smiles_pubchem = find_pubchem_mass_match(
                results_pubchem, parent_mass, verbose=verbose)
        return inchi_pubchem, inchikey_pubchem, smiles_pubchem

    # 1) Search for matching compound name
    results_pubchem = pubchem_name_search(compound_name,
                                          name_search_depth=name_search_depth,
                                          verbose=verbose)
    if len(results_pubchem) > 0:
        inchi_pubchem, inchikey_pubchem, smiles_pubchem = _find_match(results_pubchem)
        if inchikey_pubchem is not None and inchi_pubchem is not None:
            logging.info("Matching compound name: %s", compound_name)
            if verbose >= 1:
                print(f"Matching compound name: {compound_name}")
            spectrum.set("inchikey", inchikey_pubchem)
            spectrum.set("inchi", inchi_pubchem)
            spectrum.set("smiles", smiles_pubchem)
            return spectrum

        if verbose >= 2:
            print(f"No matches found for compound name: {compound_name}")

    # 2) Search for matching formula
    if formula_search and formula and len(formula) >= min_formula_length:
        results_pubchem = pubchem_formula_search(formula,
                                                 formula_search_depth=formula_search_depth,
                                                 verbose=verbose)
        if len(results_pubchem) > 0:
            inchi_pubchem, inchikey_pubchem, smiles_pubchem = _find_match(results_pubchem)
            if inchikey_pubchem is not None and inchi_pubchem is not None:
                logging.info("Matching formula: %s", formula)
                if verbose >= 1:
                    print(f"Matching formula: {formula}")
                spectrum.set("inchikey", inchikey_pubchem)
                spectrum.set("inchi", inchi_pubchem)
                spectrum.set("smiles", smiles_pubchem)
                return spectrum

            if verbose >= 2:
                print(f"No matches found for formula: {formula}")

    return spectrum