def test_dissociation_reactions():
    """Check the dissociation reaction table for internal and database consistency.

    Verifies that each electrolyte formula appears once, that electrolytes,
    anions and cations resolve to the expected database entries, that the ion
    charges of every reaction sum to zero, and that the atoms of the produced
    ions add up to the atoms of the electrolyte.
    """
    # Only one dissociation reaction is allowed per electrolyte formula
    electrolyte_formulas = df['Electrolyte Formula']
    assert len(electrolyte_formulas) == len(set(electrolyte_formulas.values.tolist()))

    # Electrolytes: the CAS must resolve to itself and the stored formula must
    # match the database formula
    for _name, cas, formula in zip(df['Electrolyte name'], df['Electrolyte CAS'],
                                   df['Electrolyte Formula']):
        assert CAS_from_any(cas) == cas
        assert pubchem_db.search_CAS(cas).formula == serialize_formula(formula)

    # Anions first, then cations: both get the same set of database checks
    ion_columns = (
        ('Anion formula', 'Anion CAS', 'Anion charge'),
        ('Cation formula', 'Cation CAS', 'Cation charge'),
    )
    for formula_column, cas_column, charge_column in ion_columns:
        for formula, cas, charge in zip(df[formula_column], df[cas_column],
                                        df[charge_column]):
            assert CAS_from_any(cas) == cas
            assert CAS_from_any(formula) == cas
            record = pubchem_db.search_CAS(cas)
            assert record.charge == charge
            assert record.formula == serialize_formula(formula)

    # The charges weighted by the ion counts must cancel out
    charge_rows = zip(df['Anion charge'].tolist(), df['Anion count'].tolist(),
                      df['Cation charge'].tolist(), df['Cation count'].tolist())
    for anion_charge, anion_count, cation_charge, cation_count in charge_rows:
        net_charge = anion_charge*anion_count + cation_charge*cation_count
        assert net_charge == 0

    # The atoms of the products must equal the atoms of the electrolyte
    reaction_rows = zip(df['Electrolyte Formula'].tolist(),
                        df['Cation formula'].tolist(), df['Cation count'].tolist(),
                        df['Anion formula'].tolist(), df['Anion count'].tolist())
    for electrolyte, cation, cation_count, anion, anion_count in reaction_rows:
        expected_atoms = nested_formula_parser(electrolyte)
        cation_atoms = nested_formula_parser(cation)
        anion_atoms = nested_formula_parser(anion)

        produced_atoms = Counter()
        for ion_atoms, ion_count in ((cation_atoms, cation_count),
                                     (anion_atoms, anion_count)):
            # Add the atoms of one ion once per ion produced
            for _ in range(ion_count):
                produced_atoms.update(ion_atoms)
        assert dict(produced_atoms.items()) == expected_atoms
def test_organic_user_db():
    """Load the example user database on its own and check its entries.

    Each entry is checked for identifier lookups (CAS, SMILES, InChI, InChI
    key, PubChem id), formula formatting, CAS validity and molecular weight,
    and the indexes are checked to all have the same number of entries.
    """
    db_path = os.path.join(folder, 'chemical identifiers example user db.tsv')
    db = ChemicalMetadataDB(elements=False, main_db=None, user_dbs=[db_path])

    for entry in db.CAS_index.values():
        assert CAS_from_any(entry.CASs) == entry.CASs

    # Make sure the file actually contributed entries
    assert len(db.CAS_index) > 100

    # SMILES must be unique / usable as a lookup key; blank SMILES are skipped
    for smiles, entry in db.smiles_index.items():
        if smiles:
            assert CAS_from_any('smiles=' + smiles) == entry.CASs

    # Stored formulas must already be in serialized form
    formulas_serialized = [
        entry.formula == serialize_formula(entry.formula)
        for entry in db.CAS_index.values()
    ]
    assert all(formulas_serialized)

    # Every CAS string must pass the validity check
    cas_valid = [check_CAS(entry.CASs) for entry in db.CAS_index.values()]
    assert all(cas_valid)

    # The stored MW must agree with the MW calculated from the formula
    for entry in db.CAS_index.values():
        atoms = nested_formula_parser(serialize_formula(entry.formula), check=False)
        assert_allclose(molecular_weight(atoms), entry.MW, atol=0.05)

    for cas_key, entry in db.CAS_index.items():
        assert CAS_from_any('InChI=1S/' + entry.InChI) == int_to_CAS(cas_key)

    for cas_key, entry in db.CAS_index.items():
        assert CAS_from_any('InChIKey=' + entry.InChI_key) == int_to_CAS(cas_key)

    # Only PubChem ids other than the -1 placeholder can be looked up
    for cas_key, entry in db.CAS_index.items():
        if entry.pubchemid != -1:
            assert CAS_from_any('PubChem=' + str(entry.pubchemid)) == int_to_CAS(cas_key)

    # All of the indexes must hold the same number of entries
    entry_count = len(db.CAS_index)
    assert entry_count == len(db.smiles_index)
    assert entry_count == len(db.InChI_index)
    assert entry_count == len(db.InChI_key_index)
def test_inorganic_db():
    """Load the inorganic database file on its own and check its entries.

    Each entry is checked for CAS, formula and SMILES lookups, formula
    formatting, CAS validity and molecular weight.
    """
    db_path = os.path.join(folder, 'Inorganic db.tsv')
    db = ChemicalMetadataDB(elements=False, main_db=None, user_dbs=[db_path])

    # Lookup by CAS
    for entry in db.CAS_index.values():
        assert CAS_from_any(entry.CASs) == entry.CASs

    # Lookup by formula, except for formulas which are not unique by design
    ambiguous_formulas = set(['H2MgO2', 'F2N2'])
    for formula, entry in db.formula_index.items():
        if formula not in ambiguous_formulas:
            assert CAS_from_any(formula) == entry.CASs

    # SMILES must be unique / usable as a lookup key; blank SMILES are skipped
    for smiles, entry in db.smiles_index.items():
        if smiles:
            assert CAS_from_any('smiles=' + smiles) == entry.CASs

    # Stored formulas must already be in serialized form
    formulas_serialized = [
        entry.formula == serialize_formula(entry.formula)
        for entry in db.CAS_index.values()
    ]
    assert all(formulas_serialized)

    # Every CAS string must pass the validity check
    cas_valid = [check_CAS(entry.CASs) for entry in db.CAS_index.values()]
    assert all(cas_valid)

    # The stored MW must agree with the MW calculated from the formula
    for entry in db.CAS_index.values():
        atoms = nested_formula_parser(serialize_formula(entry.formula), check=False)
        assert_allclose(molecular_weight(atoms), entry.MW, atol=0.05)
def _process_mol_from_inchi(inchi):
    """Parse an InChI with MolFromInchi, adding the "InChI=1S/" prefix if it is missing."""
    prefix = "InChI=1S/"
    if not inchi.startswith(prefix):
        inchi = prefix + inchi
    return MolFromInchi(inchi)


def process(init_data, use_cache=True):
    '''Assemble one chemical metadata record from user-provided data, the
    common chemistry lookup and the PubChem lookup.

    The structure is taken from the first source that yields a molecule, in
    this order: the user's mol file, the user's SMILES, the user's InChI,
    the common chemistry SMILES/InChI, then the PubChem SMILES/InChI.
    Values given explicitly in `init_data` override the calculated ones in
    the returned record.

    Parameters
    ----------
    init_data : dict
        User-provided data; it is copied and the caller's dict is not
        modified. Keys read are 'CAS', 'CASRN', 'pubchem', 'mol' (path to a
        mol file; a value of None is treated as not provided), 'smiles',
        'inchi', 'inchikey', 'formula', 'MW', 'name' and 'synonyms'.
    use_cache : bool, optional
        Passed through to `find_pubchem_from_ids`.

    Returns
    -------
    dict
        Record with the keys 'cid', 'CAS', 'formula', 'MW', 'smiles',
        'inchi', 'inchikey', 'name' and 'synonyms'. 'cid' is -1 when no
        PubChem id is known. 'synonyms' is a list without duplicates and
        without the name, sorted by length and then case-insensitively.

    Raises
    ------
    ValueError
        If no structure could be built from any source, if no CAS is
        available, if the SMILES, InChI or InChI key is '*', or if no name
        is available.

    Examples
    --------
    >>> res = process({'CAS': '10170-69-1', 'synonyms': ['14267-36-8', 'NSC 22319'], 'name': 'Manganese, decacarbonyldi-, (Mn-Mn)'})
    >>> res['inchi'], res['smiles'], res['cid'], res['CAS']
    ('InChI=1S/10CO.2Mn/c10*1-2;;', '[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[Mn].[Mn]', 517769, '10170-69-1')
    '''
    init_data = init_data.copy()

    # --- Common chemistry lookup (only possible when a CAS was given) ---
    cc_CAS = cc_name = cc_inchi = cc_inchikey = cc_smiles = None
    cc_synonyms = cc_deprecated_CASs = None
    if 'CAS' in init_data:
        try:
            (cc_CAS, cc_name, cc_inchi, cc_inchikey, cc_smiles, cc_synonyms,
             cc_deprecated_CASs) = common_chemistry_data(init_data['CAS'])
        except ValueError:
            # Compound is not in common chemistry; this is OK
            pass

    # --- User mol file: parsed once, reused for the structure below ---
    user_mol = None
    if init_data.get('mol') is not None:
        user_mol = Chem.MolFromMolFile(init_data['mol'])
        if user_mol is not None:
            # Make the InChI and InChI key available for the PubChem lookup
            init_data['inchi'] = MolToInchi(user_mol)
            init_data['inchikey'] = InchiToInchiKey(init_data['inchi'])

    # --- PubChem lookup, using user values first and common chemistry next ---
    cid = iupac_name = p_MW = p_inchi = p_inchikey = p_smiles = None
    p_formula = p_synonyms = None
    pubchem_ids = {
        'pubchem': init_data.get('pubchem'),
        'CASRN': init_data.get('CASRN', cc_CAS),
        'inchi': init_data.get('inchi', cc_inchi),
        'inchikey': init_data.get('inchikey', cc_inchikey),
        'smiles': init_data.get('smiles', cc_smiles),
    }
    p = None
    if any(value is not None for value in pubchem_ids.values()):
        try:
            p = find_pubchem_from_ids(use_cache=use_cache, **pubchem_ids)
        except Exception as e:
            # Best effort: a failed lookup is reported and the record is
            # built from the remaining sources
            p = None
            print(e, 'exception')
    if p is not None:
        (cid, iupac_name, p_MW, p_inchi, p_inchikey, p_smiles, p_formula,
         p_synonyms) = p

    # --- Resolve the structure ---
    # Be aware some smiles descriptions are wrong
    # Start with user overriding
    mol = user_mol
    if mol is None and 'smiles' in init_data:
        mol = Chem.MolFromSmiles(init_data['smiles'])
    if mol is None and 'inchi' in init_data:
        mol = _process_mol_from_inchi(init_data['inchi'])
    # Trust common chemistry next
    if mol is None and cc_smiles is not None:
        mol = Chem.MolFromSmiles(cc_smiles)
    if mol is None and cc_inchi is not None:
        mol = _process_mol_from_inchi(cc_inchi)
    # Finally fall back to the structure from PubChem
    if mol is None and p_smiles is not None:
        mol = Chem.MolFromSmiles(p_smiles)
    if mol is None and p_inchi is not None:
        mol = _process_mol_from_inchi(p_inchi)
    if mol is None:
        raise ValueError("No structure found")

    # --- Values calculated from the structure ---
    smiles = Chem.MolToSmiles(mol, True)
    inchi = MolToInchi(mol)
    inchikey = InchiToInchiKey(inchi)
    formula = serialize_formula(CalcMolFormula(mol, True, True))
    MW = molecular_weight(nested_formula_parser(formula))

    # --- Output values ---
    if 'pubchem' in init_data:
        cid = init_data['pubchem']
    elif cid is None:
        cid = -1

    if cc_CAS is not None:
        CAS = cc_CAS
    elif 'CAS' in init_data:
        CAS = init_data['CAS']
    else:
        raise ValueError("CAS culd not be found")

    # Explicit user values override what was calculated from the structure
    formula = init_data.get('formula', formula)
    MW = init_data.get('MW', MW)
    smiles = init_data.get('smiles', smiles)
    inchi = init_data.get('inchi', inchi)
    inchikey = init_data.get('inchikey', inchikey)
    if inchikey == '*' or smiles == '*' or inchi == '*':
        raise ValueError("Failure in rdkit")

    # Name priority: user setting, common chemistry, then PubChem IUPAC name
    if 'name' in init_data:
        name = init_data['name']
    elif cc_name is not None:
        name = cc_name
    elif iupac_name is not None:
        name = iupac_name
    else:
        raise ValueError("There is no name for this compound")

    # Merge the synonyms of all sources, without duplicates and without the name
    unique_synonyms = set()
    for source in (cc_synonyms, cc_deprecated_CASs, p_synonyms,
                   init_data.get('synonyms')):
        if source is not None:
            unique_synonyms.update(source)
    unique_synonyms.discard(name)
    # Shortest first, then case-insensitive; the raw string is the final
    # tiebreaker so the order does not depend on set iteration order
    synonyms = sorted(unique_synonyms, key=lambda s: (len(s), s.lower(), s))

    return {
        'cid': cid,
        'CAS': CAS,
        'formula': formula,
        'MW': MW,
        'smiles': smiles,
        'inchi': inchi,
        'inchikey': inchikey,
        'name': name,
        'synonyms': synonyms,
    }