def test_get_cid(self):
    """Check single and batched CID lookups against known values.

    Each identifier string (CAS-registry-style, judging by the format) is
    first resolved on its own with ``get_cid``; the same identifiers are
    then resolved in one ``get_cids`` call, whose result is indexed by the
    identifier string.
    """
    expected_cids = {
        "64-17-5": 702,
        "141-78-6": 8857,
        "110-01-0": 1127,
    }

    # One-at-a-time lookups.
    for identifier, expected in expected_cids.items():
        self.assertEqual(get_cid(identifier), expected)

    # Batched lookup: the mapping returned must agree entry by entry.
    cids = get_cids(list(expected_cids))
    for identifier, expected in expected_cids.items():
        self.assertEqual(cids[identifier], expected)
import pyrfume
from pyrfume.odorants import get_cid, get_cids
from rdkit.Chem.rdinchi import InchiToInchiKey
from tqdm.auto import tqdm


def _as_inchikey(value):
    """Convert a full InChI string to its InChIKey; return anything else as-is."""
    text = str(value)
    if 'InChI=' in text and text != 'nan':
        return InchiToInchiKey(value)
    return value


df = pyrfume.load_data('arctander_1960/Arctander Master.xlsx')

# Some cells of the 'InChiKey' column hold a full InChI string instead of a
# key; normalise those, leave every other cell untouched.
df['InChiKey'] = df['InChiKey'].apply(_as_inchikey)

# Identifier columns in the order they are tried, paired with the `kind`
# handed to get_cid: the first two use their own column name, the rest are
# looked up as names.
_CID_LOOKUPS = (
    ('InChiKey', 'InChiKey'),
    ('SMILES', 'SMILES'),
    ('CAS', 'name'),
    ('ChemicalName', 'name'),
)

for index, row in tqdm(df.iterrows(), total=len(df)):
    # To resume an interrupted run, skip the rows already processed, e.g.:
    #     if index < 215:
    #         continue
    cid = 0
    for col, kind in _CID_LOOKUPS:
        if str(row[col]) == 'nan':
            # Empty cell: nothing to look up, try the next identifier.
            continue
        cid = get_cid(row[col], kind=kind)
        if cid:
            # First identifier that resolves wins.
            break
    df.loc[index, 'new_CID'] = cid

# Bare expressions below are notebook-style inspections; their values are
# discarded when this runs as a plain script.
df[df['new_CID'].isnull()]
df.join(df[[]])

pyrfume.save_data(df, "arctander_1960/arctander.csv")

# Rows that have a chemical name, and a name -> description lookup over them.
named_rows = df.dropna(subset=["ChemicalName"])
named_rows.shape
x = dict(named_rows.set_index("ChemicalName")["Description"])

# CID -> description lookup (inspection only; not assigned).
dict(df.set_index('CID')["Description"])
# Scrape the CAS string from the OdorDB page of every row whose CID is still 0.
for name, url_suffix in df[df['CID'] == 0]['url'].items():
    url = 'https://senselab.med.yale.edu/OdorDB/%s' % url_suffix
    # Use the response as a context manager so the connection is closed as
    # soon as the page has been read, instead of leaking one per row.
    with urlopen(url) as f:
        html = f.read()
    # NOTE(review): no parser is named, so bs4 picks whichever one is
    # installed; consider pinning one once the expected parse is confirmed.
    soup = bs4.BeautifulSoup(html)
    table = soup.find('table')
    # Takes the sixth row of the first table and the last <span> in it;
    # presumably that is where the page shows the CAS number -- layout-dependent.
    cas_row = table.find_all('tr')[5]
    cas_text = cas_row.find_all('span')[-1].text
    cas = cas_text.replace('\r\n', '').strip()
    df.loc[name, 'CAS'] = cas

# +
# Add CIDs obtained from searching the CAS string
for name, cas in df[df['CAS'].notnull()]['CAS'].items():
    # Skip empty strings left by pages without a CAS entry.
    if cas:
        cid = odorants.get_cid(cas, kind='name')
        df.loc[name, 'CID'] = cid

# Fill remaining missing CIDs with 0
df.loc[:, 'CID'] = df['CID'].fillna(0)
# -

# Manual fills
df.loc['2,4,5-TRIMETHYLTHIAZOLINE', 'CID'] = 263626
df.loc['METHYLSALICYLATE', 'CID'] = 4133
df.loc['PHENYLETHYL ALCOHOL (PEA)', 'CID'] = 6054
df.loc['Perillaalcohol', 'CID'] = 10819
df.loc['Perillaaldehyde', 'CID'] = 16441
#df[df['CID']==0]

# Destination path for the curated table inside the pyrfume data directory.
file_path = os.path.join(pyrfume.DATA, 'senselab.csv')
from rdkit.Chem import MolFromSmiles, MolToSmiles

# Attach the looked-up CIDs to the table (joined on the index).
df = df.join(pd.Series(smiles_cids, name='CID'))
df.head()

# Start from the index values as the SMILES column; rows without a CID are
# rewritten below.
df['SMILES'] = df.index

p = ProgressBar(len(smiles_cids))
for i, (old, cid) in enumerate(smiles_cids.items()):
    p.animate(i, status=old)
    if cid == 0:
        # No CID for the SMILES as written: re-serialise it through RDKit
        # and retry the lookup with the rewritten string.
        mol = MolFromSmiles(old)
        if mol is None:
            # Unparsable SMILES is recorded as a blank string.
            new = ''
        else:
            new = MolToSmiles(mol, isomericSmiles=True)
        if old != new:
            # NOTE(review): this also runs with new == '' for unparsable
            # input; confirm get_cid handles an empty query.
            cid = get_cid(new, kind='SMILES')
        df.loc[old, ['SMILES', 'CID']] = [new, cid]
# Use the item count rather than the loop variable so an empty `smiles_cids`
# does not raise NameError; for non-empty input this equals `i + 1`.
p.animate(len(smiles_cids), status='Done')

# Inspection only: rows whose SMILES could not be parsed.
df[df['SMILES'] == '']

# Manual fix for the row indexed by 'O=[O]=O'. The SMILES column must receive
# the SMILES string itself; previously the chained assignment bound the CID
# returned by get_cid to `ozone_smiles` as well.
ozone_smiles = '[O-][O+]=O'
ozone_cid = get_cid(ozone_smiles, kind='SMILES')
df.loc['O=[O]=O', ['SMILES', 'CID']] = [ozone_smiles, ozone_cid]

# Re-index by CID and drop the helper column.
df = df.set_index('CID').drop(['ez_smiles'], axis=1)
# Swap the 'author' and 'year' column labels (presumably they were
# mislabelled upstream -- confirm against the raw data).
df = df.rename(columns={'author': 'year', 'year': 'author'})
df.head()

pyrfume.save_data(df, 'thresholds/parsed_threshold_data_in_air_fixed.csv')
# Resolve every raw SMILES string to a CID in one batch.
results = odorants.get_cids(
    leffingwell_data_raw['smiles'],
    kind='SMILES',
    verbose=False,
)

# CID table joined with the raw records, which are re-indexed by SMILES.
raw_by_smiles = leffingwell_data_raw.set_index('smiles')
leffingwell_data = pd.Series(results, name='CID').to_frame().join(raw_by_smiles)
leffingwell_data.head()

# Retry the entries that came back with CID 0, this time with the SMILES
# re-serialised through RDKit, and report the outcome.
# NOTE(review): the retried CIDs are only printed; neither `results` nor the
# table is updated here. Confirm that this loop is meant to be diagnostic.
no_cid_mask = leffingwell_data['CID'] == 0
for smiles in leffingwell_data[no_cid_mask].index:
    name = leffingwell_data.loc[smiles, 'chemical_name']
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        print("Bad smiles: %s" % smiles)
        continue
    smiles = Chem.MolToSmiles(mol, isomericSmiles=True)
    cid = odorants.get_cid(smiles, kind='smiles', verbose=True)
    # Print the CID when one was found, otherwise the rewritten SMILES.
    print(name, cid if cid else smiles)

# Rebuild the table from `results` (unchanged since the first build).
leffingwell_data = pd.Series(results, name='CID').to_frame().join(raw_by_smiles)

# Bare expressions are notebook-style inspections; their values are
# discarded when this runs as a plain script.
leffingwell_data[leffingwell_data['CID'] == 0]
x = leffingwell_data.reset_index().set_index('chemical_name')
#x.loc['calcium alginate', 0]
x[x['CID'] == 0].head()

# Write the joined table into the pyrfume data directory.
file_path = os.path.join(pyrfume.DATA, 'westeros', 'westeros.csv')
leffingwell_data.to_csv(file_path)