def test_load_data(self):
    import os

    import pandas as pd

    from pyrfume import load_data, save_data
    from pyrfume.base import DEFAULT_DATA_PATH

    data = {'col1': [1, 2], 'col2': [3, 4]}
    file_path = DEFAULT_DATA_PATH / "data.pkl"
    path_not_exists = DEFAULT_DATA_PATH / "THIS_IS_AN_INVALID_PATH"

    # Saving to an invalid path should raise an exception
    self.assertRaises(Exception, save_data, data, path_not_exists)

    # Round-trip a dict through a pickle file
    save_data(data, file_path)
    data_gain = load_data(file_path)
    self.assertEqual(data_gain, data)
    os.remove(file_path)

    # Round-trip a DataFrame through a CSV file
    file_path = DEFAULT_DATA_PATH / "data.csv"
    df = pd.DataFrame(data)
    save_data(df, file_path)
    # with open(file_path, "w") as f:
    #     f.write("0,1,2,3\n0,1,2,3")
    data_gain = load_data(file_path)
    for index1 in range(len(data_gain.values)):
        for index2 in range(len(data_gain.values[index1])):
            self.assertEqual(data_gain.values[index1][index2], df.values[index1][index2])
    os.remove(file_path)
def get_snitz_dragon(use_original=True, regenerate=False):
    path = SNITZ_DIR / "snitz_dragon.csv"
    if not path.is_file() or regenerate:
        snitz_feature_names = get_snitz_weights().index
        # Use minmax scaling as in the Snitz paper
        minmax_scaled_dragon = features.load_dragon(suffix="-cleaned-minmaxed-imputed")
        df = minmax_scaled_dragon[snitz_feature_names]
        pyrfume.save_data(df, path)
    else:
        df = pyrfume.load_data(path)  # .set_index('PubChemID')
    return df
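
# Usage sketch (illustrative, not part of the module): with the Pyrfume data
# repository available locally, the Snitz-subset Dragon features can be read from
# the cache, or rebuilt from the full Dragon matrix:
#
#     snitz_features = get_snitz_dragon()                 # read the cached CSV if it exists
#     snitz_features = get_snitz_dragon(regenerate=True)  # rebuild from the full Dragon features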
def get_haddad_dragon(use_original=True, regenerate=False):
    path = HADDAD_DIR / "haddad_dragon.csv"
    if not path.is_file() or regenerate:
        haddad_feature_names = get_haddad_weights().index
        # Use standard scaling as in the Haddad paper
        standard_scaled_dragon = features.load_dragon(suffix="-cleaned-standardized-imputed")
        df = standard_scaled_dragon[haddad_feature_names]
        pyrfume.save_data(df, path)
    else:
        df = pyrfume.load_data(path)  # .set_index('PubChemID')
    return df
def save_dragon(dragon, suffix):
    path = "%s/%s" % (FEATURES_DIR, DRAGON_STEM % suffix)
    save_data(dragon, path)
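
# Usage sketch (illustrative only): `save_dragon` writes a Dragon feature matrix under
# FEATURES_DIR using the suffix naming scheme, so it can be read back later with
# `load_dragon` and the same suffix:
#
#     save_dragon(minmax_scaled_dragon, "-cleaned-minmaxed-imputed")
#     df = load_dragon(suffix="-cleaned-minmaxed-imputed")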
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.10.3
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# +
import pandas as pd

import pyrfume

# Read file sent by Emily Mayhew on Sept. 23, 2019
df = pd.read_csv('u19predictions.csv')
# -

# Split the 'SMILEstring' column into separate CID and SMILES columns
df['CID'] = df['SMILEstring'].apply(lambda x: x.split(': ')[0])
df['SMILES'] = df['SMILEstring'].apply(lambda x: x.split(': ')[1])

# Flag molecules predicted to be odorless
df['PredictedOdorless'] = df['Prediction'] == 'Odorless'

predicted_odorless = df.set_index('CID')['PredictedOdorless']

pyrfume.save_data(predicted_odorless.to_frame(), 'odorants/predicted_odorless.csv')
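
# Sanity-check sketch (illustrative, not in the original notebook; assumes each
# 'SMILEstring' entry has the "CID: SMILES" form that the split above relies on):
#
#     assert df['SMILEstring'].str.contains(': ').all()
#     predicted_odorless.head()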
        else:
            html += color(cid, statement, '#000000')
    return html

# Create the HTML file
html = make_html(all_statements)

# Save the HTML file
with open('../../pyrfume-data/pubchem/pubchem_scrape.html', 'w') as f:
    f.write(html)
# -

# Save a Python pickle file of all the statements in the Pyrfume data repository
path = 'pubchem/pubchem_scrape.pkl'
pyrfume.save_data(all_statements, path)

# +
# Create a dataframe to store the statements
df = pd.DataFrame(index=sorted(all_statements), columns=['Odor', 'Odorless', 'Statements'])
df.index.name = 'CID'

# Fill this dataframe with the assignment (odor, odorless, or (!!) both),
# and the corresponding statements supporting that assignment
for cid in sorted(all_statements):
    statements = all_statements[cid]
    odor = False
    odorless = False
    for statement in statements:
        statement = statement.lower()
df.head()

from rdkit.Chem import MolFromSmiles, MolToSmiles

# Canonicalize the SMILES strings and look up CIDs for molecules that do not have one yet
df['SMILES'] = df.index
p = ProgressBar(len(smiles_cids))
for i, (old, cid) in enumerate(smiles_cids.items()):
    p.animate(i, status=old)
    if cid == 0:
        mol = MolFromSmiles(old)
        if mol is None:
            new = ''
        else:
            new = MolToSmiles(mol, isomericSmiles=True)
        if old != new:
            cid = get_cid(new, kind='SMILES')
        df.loc[old, ['SMILES', 'CID']] = [new, cid]
p.animate(i + 1, status='Done')

# Rows whose SMILES could not be parsed
df[df['SMILES'] == '']

# Fix ozone by hand, using its correct SMILES
ozone_smiles = '[O-][O+]=O'
ozone_cid = get_cid(ozone_smiles, kind='SMILES')
df.loc['O=[O]=O', ['SMILES', 'CID']] = [ozone_smiles, ozone_cid]

df = df.set_index('CID').drop(['ez_smiles'], axis=1)

# Swap the 'author' and 'year' column labels, which were reversed
df = df.rename(columns={'author': 'year', 'year': 'author'})
df.head()

pyrfume.save_data(df, 'thresholds/parsed_threshold_data_in_air_fixed.csv')
# Load min-max scaled, imputed Dragon features (cached from previous work) for all Pyrfume odorants.
# (Alternatively, load raw Dragon features and apply `features.clean_dragon`.)
# Min-max scaling is used here, rather than standard scaling, because that is what the Snitz paper used.
minmax_scaled_dragon = features.load_dragon(suffix='-cleaned-minmaxed-imputed')

# Use the subset of features identified in Snitz and compute a cosine angle distance between each pair of odorants
distances['snitz'] = snitz.get_snitz_distances(minmax_scaled_dragon)

# Show the first 5 rows
distances['snitz'].head()

# Load standard-scaled, imputed Dragon features (cached from previous work) for all Pyrfume odorants.
# (Alternatively, load raw Dragon features and apply `features.clean_dragon`.)
standard_scaled_dragon = features.load_dragon(suffix='-cleaned-standardized-imputed')

# Use the subset of features identified in Haddad and compute a Euclidean distance between each pair of odorants
distances['haddad'] = haddad.get_haddad_distances(standard_scaled_dragon)

# Show the first 5 rows
distances['haddad'].head()

# +
# Cumulative distribution of the off-diagonal (pairwise) Haddad distances
nondiagonal = distances['haddad'].values[np.triu_indices(distances['haddad'].shape[0], 1)]
density, bins, _ = plt.hist(nondiagonal, bins=np.linspace(0, 25, 100), density=True, cumulative=True)
shift = (bins[1] - bins[0]) / 2
haddad_density = pd.DataFrame(density, columns=['Cumulative Probability'], index=bins[:-1] + shift)
pyrfume.save_data(haddad_density, 'haddad_2008/haddad_cumulative_probability.csv')

# Cumulative distribution of the off-diagonal (pairwise) Snitz distances
nondiagonal = distances['snitz'].values[np.triu_indices(distances['snitz'].shape[0], 1)]
density, bins, _ = plt.hist(nondiagonal, bins=np.linspace(0, 0.5, 100), density=True, cumulative=True)
shift = (bins[1] - bins[0]) / 2
snitz_density = pd.DataFrame(density, columns=['Cumulative Probability'], index=bins[:-1] + shift)
pyrfume.save_data(snitz_density, 'snitz_2013/snitz_cumulative_probability.csv')
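
# Look-up sketch (illustrative, not in the original notebook): each saved table maps a raw
# distance to the fraction of odorant pairs at or below that distance, so a distance can be
# turned into an approximate percentile by nearest-bin lookup, e.g. for an arbitrary example
# Snitz distance of 0.1:
#
#     bin_center = snitz_density.index[np.abs(snitz_density.index.values - 0.1).argmin()]
#     percentile = snitz_density.loc[bin_center, 'Cumulative Probability']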
#     language: python
#     name: python3
# ---

import pandas as pd

import pyrfume

original = pyrfume.load_data('physicochemical/AllDragon-20190730-mayhew.csv')
original.head()

new = pyrfume.load_data('physicochemical/ExtraEight.txt', delimiter='\t')
new = new.set_index('NAME').sort_index()
new.index.name = 'PubChemID'
new.index

from pyrfume import odorants

# Look up isomeric SMILES for the new PubChem IDs
infos = odorants.from_cids(new.index)
for info in infos:
    new.loc[info['CID'], 'SMILES'] = info['IsomericSMILES']

# Put SMILES first, followed by the remaining columns in the same order as `original`
new = new[['SMILES'] + [x for x in list(original) if x != 'SMILES']]
new.head()

assert list(original) == list(new)

df = pd.concat([original, new])
df = df.groupby(level=0).first()  # Drop duplicate PubChem IDs
df.shape

pyrfume.save_data(df, 'physicochemical/AllDragon.csv')
import pyrfume
from pyrfume.pubchem import get_ghs_classification, parse_ghs_classification_for_odor, GHS_CODES
from rickpy import ProgressBar

path = 'odorants/all_cids_properties.csv'
details = pyrfume.load_data(path, usecols=range(5))
details.head()

# ### Cramer Toxicity Class Predictions

tox = pyrfume.load_data('odorants/toxTree.csv')

# Join the Toxtree output onto the odorant details and convert the trailing Roman numeral
# (e.g. 'III)') of each 'Cramer Class' entry to an integer class by counting its characters
cramer = details.join(tox, on='SMILES')['Cramer Class']
cramer = cramer.apply(lambda x: len(x.split(' ')[-1][:-1]))
cramer.head()

pyrfume.save_data(cramer.to_frame(), 'odorants/cramer.csv')

# Load the cached UMAP embeddings of the Snitz and Haddad feature spaces
embedded_coords = {
    key: pyrfume.load_data('odorants/%s_umap.pkl' % key)
    for key in ('snitz', 'haddad')
}

# +
def plot_tox(space, ax):
    coords = embedded_coords[space].join(cramer)
    color_dict = {1: 'gray', 2: 'green', 3: 'red'}
    colors = [color_dict[n] for n in coords['Cramer Class']]
    ax.scatter(*coords[['X', 'Y']].values.T, color=colors, s=0.5, alpha=0.5)
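
# Plotting sketch (illustrative, not in the original notebook; assumes matplotlib is
# imported as `plt` elsewhere in this notebook): draw the Cramer-class maps side by side
# in the Snitz and Haddad UMAP embeddings.
#
#     fig, axes = plt.subplots(1, 2, figsize=(12, 6))
#     for ax, space in zip(axes, ('snitz', 'haddad')):
#         plot_tox(space, ax)
#         ax.set_title(space)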
#     language: python
#     name: python3
# ---

# ### SMILES strings for the first N PubChem IDs
# #### 2020/02/07: N = 100,000

# %load_ext autoreload
# %autoreload 2

import pandas as pd

import pyrfume
from pyrfume.odorants import from_cids

if 'results' not in locals():
    results = {}

n = int(1e5)
by = int(1e4)  # In case there are errors, we will only have to go back this far
for first in range(1, n + 1, by):
    if first not in results:
        last = first + by
        x = from_cids(range(first, last))
        results[first] = x

results.keys()

df = pd.concat([pd.DataFrame(results[x]).set_index('CID') for x in results])

pyrfume.save_data(df, 'odorants/cids-smiles-pubchem-100000.csv')
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import pyrfume
from pyrfume.odorants import cids_to_cas

cids = pyrfume.load_data('odorants/all_cids.csv').index

# Look up CAS numbers for every CID
cas = cids_to_cas(cids)

print("Out of %d molecules, %d have CAS numbers" % (len(cids), len([x for x in cas.values() if x])))

# Distribution of the number of distinct CAS values found per molecule
counts = pd.Series([len(x) for x in cas.values()]).value_counts()
counts.index.name = 'Number of unique CAS values'
counts.name = 'Number of molecules'
counts.to_frame()

to_save = pd.Series(cas)
to_save.index.name = 'CID'
to_save.name = 'CAS'
to_save.head()

pyrfume.save_data(to_save.to_frame(), 'odorants/cid_to_cas.csv')
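
# Read-back sketch (illustrative, not in the original notebook): the saved table can be
# reloaded to map CIDs to their CAS number(s), assuming `load_data` restores CID as the index:
#
#     cid_to_cas = pyrfume.load_data('odorants/cid_to_cas.csv')['CAS']
#     cid_to_cas.head()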
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.10.3
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

import pyrfume
from pyrfume import pubchem

# Scrape PubChem records filed under the "Optical Rotation" heading
results = pubchem.get_results("Optical+Rotation")

path = 'pubchem_optical_rotation/physics.pkl'
pyrfume.save_data(results, path)
# jupytext:
#   formats: ipynb,py
#   text_representation:
#     extension: .py
#     format_name: light
#     format_version: '1.5'
#   jupytext_version: 1.10.3
# kernelspec:
#   display_name: Python 3
#   language: python
#   name: python3
# ---

import pyrfume
from pyrfume import keller

raw = keller.load_raw_bmc_data(
    only_dream_subjects=False,     # Whether to only keep DREAM subjects
    only_dream_descriptors=False,  # Whether to only keep DREAM descriptors
    only_dream_molecules=False)    # Whether to only keep DREAM molecules
raw.head()

cooked = keller.format_bmc_data(raw)
cooked.head()

cooked.index = cooked.index.reorder_levels([1, 0, 2, 3])  # Put CID first
cooked = cooked.sort_index(level=0)  # Sort by CID ascending

pyrfume.save_data(cooked, 'keller_2016/data.csv')
            odorless = True
        elif re.findall('no fragrance', statement):
            odorless = True
        elif re.findall('odorless', statement):
            odorless = True
        elif re.findall('odourless', statement):
            odorless = True
        elif re.findall('odoratus', statement):
            pass
        elif re.findall('sense of smell', statement):
            odor = True
        elif re.findall('odor', statement):
            odor = True
        elif re.findall('odour', statement):
            odor = True
        elif re.findall('smell', statement):
            odor = True
        elif re.findall('fragrance', statement):
            odor = True
        elif re.findall('aroma ', statement):
            odor = True
        else:
            pass
    if odor and odorless:
        pass  # print(statements)
    df.loc[cid, :] = [odor, odorless, statements]

df.head()

pyrfume.save_data(df, 'pubchem_scrape_100000.csv')
from rdkit.Chem.rdinchi import InchiToInchiKey

# Convert any full InChI strings in the 'InChiKey' column into InChIKeys
df['InChiKey'] = df['InChiKey'].apply(lambda x: InchiToInchiKey(x) if 'InChI=' in str(x) and str(x) != 'nan' else x)

from tqdm.auto import tqdm

from pyrfume.odorants import get_cid, get_cids

# Look up a CID for each row, trying the available identifiers in order
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    # if index < 215:
    #     continue
    cid = 0
    for j, col in enumerate(['InChiKey', 'SMILES', 'CAS', 'ChemicalName']):
        if not str(row[col]) == 'nan':
            cid = get_cid(row[col], kind=(col if j < 2 else 'name'))
            if cid:
                break
    df.loc[index, 'new_CID'] = cid

# Rows for which no CID could be found
df[df['new_CID'].isnull()]

df.join(df[[]])

pyrfume.save_data(df, "arctander_1960/arctander.csv")

df.dropna(subset=["ChemicalName"]).shape

x = dict(df.dropna(subset=["ChemicalName"]).set_index("ChemicalName")["Description"])

dict(df.set_index('CID')["Description"])
# Many of these CAS numbers are for substances, not compounds, and so have SIDs instead (not yet supported)
cas_cids_dict = get_cids(df['CAS number'])
# -

# Add CIDs to the dataframe
for cas, cid in cas_cids_dict.items():
    df.loc[df['CAS number'] == cas, 'CID'] = cid

# Convert CIDs to integers
df.loc[:, 'CID'] = df.loc[:, 'CID'].astype(int)
df.head()

# Use CID as the index and sort
df = df.set_index('CID').sort_index()
df.head()

pyrfume.save_data(df, 'IFRA_FIG/ifra_fig.csv')

pyrfume.load_data('IFRA_FIG/ifra_fig.csv')

from pyrfume.cabinets import get_mainland

df_mainland = get_mainland()

# Overlap with the Mainland cabinet, by CAS number and by CID
len(set(df_mainland['CAS']).intersection(df['CAS number']))

len(df_mainland.index.intersection(df.index))

df[df.index.isin(df_mainland.index)]

x = df_mainland.join(df, how='inner')[[
    'CAS', 'CAS number', 'Primary descriptor', 'Descriptor 2', 'Descriptor 2'
]]