Esempio n. 1
0
    def test_load_data(self):
        """Round-trip data through `save_data` / `load_data`.

        Three things are checked:

        * saving to ``THIS_IS_AN_INVALID_PATH`` raises an exception;
        * a dict saved to ``data.pkl`` is loaded back equal to the original;
        * every value of a DataFrame loaded back from ``data.csv`` matches
          the value at the same position in the DataFrame that was saved.

        The temporary files are removed in ``finally`` blocks so that a
        failing assertion does not leave them behind in the data directory.
        """
        import os

        import pandas as pd

        from pyrfume import load_data, save_data
        from pyrfume.base import DEFAULT_DATA_PATH

        data = {'col1': [1, 2], 'col2': [3, 4]}
        path_not_exists = DEFAULT_DATA_PATH / "THIS_IS_AN_INVALID_PATH"
        self.assertRaises(Exception, save_data, data, path_not_exists)

        # Pickle round trip
        file_path = DEFAULT_DATA_PATH / "data.pkl"
        try:
            save_data(data, file_path)
            data_gain = load_data(file_path)
            self.assertEqual(data_gain, data)
        finally:
            # Only remove what was actually created, so that a failure in
            # `save_data` is not masked by a FileNotFoundError here.
            if os.path.exists(file_path):
                os.remove(file_path)

        # CSV round trip
        file_path = DEFAULT_DATA_PATH / "data.csv"
        df = pd.DataFrame(data)
        try:
            save_data(df, file_path)
            data_gain = load_data(file_path)
            # Compare cell by cell, driven by the shape of the loaded frame
            for row_index, row in enumerate(data_gain.values):
                for col_index, value in enumerate(row):
                    self.assertEqual(value, df.values[row_index][col_index])
        finally:
            if os.path.exists(file_path):
                os.remove(file_path)
Esempio n. 2
0
def get_snitz_dragon(use_original=True, regenerate=False):
    """Return the Dragon features restricted to the Snitz feature names.

    The table is cached as ``snitz_dragon.csv`` under ``SNITZ_DIR``.  When
    that file exists and `regenerate` is false it is read back with
    `pyrfume.load_data`; otherwise it is rebuilt from the min-max scaled,
    imputed Dragon features, saved to the cache file and returned.

    `use_original` is accepted but is not used by this function.
    """
    cache_file = SNITZ_DIR / "snitz_dragon.csv"
    if cache_file.is_file() and not regenerate:
        return pyrfume.load_data(cache_file)

    # Columns to keep are the index of the Snitz weights table
    wanted_features = get_snitz_weights().index
    # Min-max scaled features are used here, following the Snitz paper
    minmaxed = features.load_dragon(suffix="-cleaned-minmaxed-imputed")
    subset = minmaxed[wanted_features]
    pyrfume.save_data(subset, cache_file)
    return subset
Esempio n. 3
0
def get_haddad_dragon(use_original=True, regenerate=False):
    """Return the Dragon features restricted to the Haddad feature names.

    The table is cached as ``haddad_dragon.csv`` under ``HADDAD_DIR``.  When
    that file exists and `regenerate` is false it is read back with
    `pyrfume.load_data`; otherwise it is rebuilt from the standardized,
    imputed Dragon features, saved to the cache file and returned.

    `use_original` is accepted but is not used by this function.
    """
    cache_file = HADDAD_DIR / "haddad_dragon.csv"
    if cache_file.is_file() and not regenerate:
        return pyrfume.load_data(cache_file)

    # Columns to keep are the index of the Haddad weights table
    wanted_features = get_haddad_weights().index
    # Standard-scaled features are the input for this feature set
    standardized = features.load_dragon(suffix="-cleaned-standardized-imputed")
    subset = standardized[wanted_features]
    pyrfume.save_data(subset, cache_file)
    return subset
Esempio n. 4
0
def save_dragon(dragon, suffix):
    """Save `dragon` under ``FEATURES_DIR`` using the Dragon file stem.

    The file name is ``DRAGON_STEM`` formatted with `suffix`; the resulting
    ``<FEATURES_DIR>/<file name>`` string is handed to `save_data`.
    """
    file_name = str(DRAGON_STEM % suffix)
    target = "/".join((str(FEATURES_DIR), file_name))
    save_data(dragon, target)
Esempio n. 5
0
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.10.3
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# +
import pandas as pd
import pyrfume

# Predictions file sent by Emily Mayhew on Sept. 23, 2019
df = pd.read_csv('u19predictions.csv')
# -

# Each 'SMILEstring' entry is split on ': ' into a CID part and a SMILES part
smilestring_parts = df['SMILEstring'].apply(lambda entry: entry.split(': '))
df['CID'] = smilestring_parts.apply(lambda parts: parts[0])
df['SMILES'] = smilestring_parts.apply(lambda parts: parts[1])
# Boolean flag: True where the 'Prediction' column equals 'Odorless'
df['PredictedOdorless'] = df['Prediction'].eq('Odorless')
predicted_odorless = df.set_index('CID')['PredictedOdorless']

# Store the flag as a one-column table indexed by CID
pyrfume.save_data(
    predicted_odorless.to_frame(), 'odorants/predicted_odorless.csv'
)
Esempio n. 6
0
            else:
                html += color(cid, statement, '#000000')
    return html


# Create the HTML file
html = make_html(all_statements)

# Save the HTML file.  The encoding is given explicitly so that any non-ASCII
# characters in the statements are written the same way on every platform
# instead of depending on the locale's default encoding.
with open('../../pyrfume-data/pubchem/pubchem_scrape.html', 'w',
          encoding='utf-8') as f:
    f.write(html)
# -

# Save a Python pickle file of all the statements in the Pyrfume data repository
path = 'pubchem/pubchem_scrape.pkl'
pyrfume.save_data(all_statements, path)

# +
# Create a dataframe to store the statements: one row per key of
# `all_statements` (sorted), with initially empty 'Odor', 'Odorless' and
# 'Statements' columns
df = pd.DataFrame(index=sorted(all_statements),
                  columns=['Odor', 'Odorless', 'Statements'])
df.index.name = 'CID'

# Fill this dataframe with the assignment (odor, odorless, or (!!) both),
# and the corresponding statements supporting that assignment
for cid in sorted(all_statements):
    statements = all_statements[cid]
    odor = False
    odorless = False
    for statement in statements:
        statement = statement.lower()
Esempio n. 7
0
df.head()

from rdkit.Chem import MolFromSmiles, MolToSmiles
# Start from the index values as the SMILES column; entries without a CID
# are then rewritten below
df['SMILES'] = df.index
p = ProgressBar(len(smiles_cids))
for i, (old, cid) in enumerate(smiles_cids.items()):
    p.animate(i, status=old)
    if cid == 0:
        # No CID for the original string: re-generate the SMILES with RDKit
        # and, if that changes the string, look the CID up again
        mol = MolFromSmiles(old)
        if mol is None:
            # RDKit could not parse the string; mark it with an empty SMILES
            new = ''
        else:
            new = MolToSmiles(mol, isomericSmiles=True)
            if old != new:
                cid = get_cid(new, kind='SMILES')
        df.loc[old, ['SMILES', 'CID']] = [new, cid]
# Use the total count rather than `i + 1` so that an empty `smiles_cids`
# does not raise a NameError on the undefined loop variable
p.animate(len(smiles_cids), status='Done')

# Rows that RDKit could not parse
df[df['SMILES'] == '']

# Ozone is patched by hand.  The SMILES column must receive the SMILES
# string itself; only the CID column receives the looked-up CID.
ozone_smiles = '[O-][O+]=O'
ozone_cid = get_cid(ozone_smiles, kind='SMILES')
df.loc['O=[O]=O', ['SMILES', 'CID']] = [ozone_smiles, ozone_cid]

df = df.set_index('CID').drop(['ez_smiles'], axis=1)

# Swap the 'author' and 'year' column labels
df = df.rename(columns={'author': 'year', 'year': 'author'})

df.head()

pyrfume.save_data(df, 'thresholds/parsed_threshold_data_in_air_fixed.csv')
Esempio n. 8
0
# Load minmaxed, imputed Dragon features (cached from previous work) for all Pyrfume odorants
# (Alternatively, load raw Dragon features and apply `features.clean_dragon`.)
# Here we use minmax scaling instead of standard scaling because that is what the Snitz paper used.
minmax_scaled_dragon = features.load_dragon(suffix='-cleaned-minmaxed-imputed')
# Use the subset of features identified in Snitz and compute a cosine angle distance between each pair of odorants
# NOTE(review): this comment previously said "Haddad" although the call is `snitz.get_snitz_distances` -- confirm.
distances['snitz'] = snitz.get_snitz_distances(minmax_scaled_dragon)
# Show the first 5 rows
distances['snitz'].head()

# Load scaled, imputed Dragon features (cached from previous work) for all Pyrfume odorants;
# (Alternatively, load raw Dragon features and apply `features.clean_dragon`.)
standard_scaled_dragon = features.load_dragon(suffix='-cleaned-standardized-imputed')
# Use the subset of features identified in Haddad and compute a Euclidean distance between each pair of odorants
distances['haddad'] = haddad.get_haddad_distances(standard_scaled_dragon)
# Show the first 5 rows
distances['haddad'].head()

# +
# Haddad distances: keep only the entries strictly above the diagonal of the distance matrix
nondiagonal = distances['haddad'].values[np.triu_indices(distances['haddad'].shape[0], 1)]
# Normalized cumulative histogram over 99 equal-width bins spanning 0 to 25
density, bins, _ = plt.hist(nondiagonal, bins=np.linspace(0, 25, 100), density=True, cumulative=True)
# Half a bin width, used to index each value by its bin center rather than its left edge
shift = (bins[1]-bins[0])/2
haddad_density = pd.DataFrame(density, columns=['Cumulative Probability'], index=bins[:-1]+shift)
pyrfume.save_data(haddad_density, 'haddad_2008/haddad_cumulative_probability.csv')

# Snitz distances: same procedure, with bins spanning 0 to 0.5
nondiagonal = distances['snitz'].values[np.triu_indices(distances['snitz'].shape[0], 1)]
density, bins, _ = plt.hist(nondiagonal, bins=np.linspace(0, 0.5, 100), density=True, cumulative=True)
shift = (bins[1]-bins[0])/2
snitz_density = pd.DataFrame(density, columns=['Cumulative Probability'], index=bins[:-1]+shift)
pyrfume.save_data(snitz_density, 'snitz_2013/snitz_cumulative_probability.csv')
Esempio n. 9
0
#     language: python
#     name: python3
# ---

import pandas as pd
import pyrfume

# Existing Dragon feature table
original = pyrfume.load_data('physicochemical/AllDragon-20190730-mayhew.csv')
original.head()

# Additional rows from a tab-delimited file, re-indexed by their 'NAME'
# column (sorted) and with the index relabelled 'PubChemID'
new = pyrfume.load_data('physicochemical/ExtraEight.txt', delimiter='\t')
new = new.set_index('NAME').sort_index()
new.index.name = 'PubChemID'
new.index

from pyrfume import odorants

# Look up each new index value and record its isomeric SMILES
infos = odorants.from_cids(new.index)
for info in infos:
    new.loc[info['CID'], 'SMILES'] = info['IsomericSMILES']
# Reorder columns: 'SMILES' first, then the remaining columns in the same order as `original`
new = new[['SMILES'] + [x for x in list(original) if x != 'SMILES']]
new.head()

# Both tables must have identical column lists before they are stacked
assert list(original) == list(new)

df = pd.concat([original, new])
# NOTE(review): `groupby(...).first()` takes the first non-null value per
# column within each group, not necessarily one whole row -- confirm this
# is the intended way to collapse duplicates.
df = df.groupby(level=0).first()  # Drop duplicate PubChem IDs
df.shape

pyrfume.save_data(df, 'physicochemical/AllDragon.csv')
Esempio n. 10
0
import pyrfume
from pyrfume.pubchem import get_ghs_classification, parse_ghs_classification_for_odor, GHS_CODES
from rickpy import ProgressBar

# Load only the first five columns of the properties table
path = 'odorants/all_cids_properties.csv'
details = pyrfume.load_data(path, usecols=range(5))
details.head()

# ### Cramer Toxicity Class Predictions

# Attach the toxTree table to `details` via the 'SMILES' column and keep the 'Cramer Class' column
tox = pyrfume.load_data('odorants/toxTree.csv')
cramer = details.join(tox, on='SMILES')['Cramer Class']
# Convert each entry to an integer: take the last space-separated token,
# drop its final character and use the length of what remains.
# NOTE(review): presumably the token is a Roman numeral followed by ')' (e.g. 'III)' -> 3) -- confirm against the data.
cramer = cramer.apply(lambda x: len(x.split(' ')[-1][:-1]))
cramer.head()

pyrfume.save_data(cramer.to_frame(), 'odorants/cramer.csv')

# Embedded coordinates for each space, loaded from 'odorants/<key>_umap.pkl'
embedded_coords = {
    key: pyrfume.load_data('odorants/%s_umap.pkl' % key)
    for key in ('snitz', 'haddad')
}


# +
def plot_tox(space, ax):
    """Scatter the embedded coordinates of `space` on `ax`, colored by Cramer class.

    `space` is a key of the module-level `embedded_coords`; its table is
    joined with the module-level `cramer` series, and classes 1, 2 and 3 are
    drawn in gray, green and red respectively.
    """
    palette = {1: 'gray', 2: 'green', 3: 'red'}
    joined = embedded_coords[space].join(cramer)
    point_colors = list(map(palette.__getitem__, joined['Cramer Class']))
    xs, ys = joined[['X', 'Y']].values.T
    ax.scatter(xs, ys, color=point_colors, s=0.5, alpha=0.5)

Esempio n. 11
0
#     language: python
#     name: python3
# ---

# ### SMILES strings for the first N PubChem IDs
# #### 2020/02/07: N = 100,000

# %load_ext autoreload
# %autoreload 2

import pandas as pd
import pyrfume
from pyrfume.odorants import from_cids

# Keep results from a previous run of this cell so finished batches are not fetched again
if 'results' not in locals():
    results = {}
n = int(1e5)
by = int(1e4)  # In case there are errors, we will only have to go back this far
for first in range(1, n + 1, by):
    if first in results:
        continue  # This batch is already stored
    last = first + by
    x = results[first] = from_cids(range(first, last))

results.keys()

df = pd.concat([pd.DataFrame(batch).set_index('CID') for batch in results.values()])

pyrfume.save_data(df, 'odorants/cids-smiles-pubchem-100000.csv')
Esempio n. 12
0
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyrfume
from pyrfume.odorants import cids_to_cas

# The CIDs are the index of the all_cids table
cids = pyrfume.load_data('odorants/all_cids.csv').index

# Mapping from CID to its CAS values (consumed below through `.values()` and `len`)
cas = cids_to_cas(cids)

# Count the molecules whose CAS entry is non-empty
print("Out of %d molecules, %d have CAS numbers" %
      (len(cids), len([x for x in cas.values() if x])))

# Tabulate how many molecules have 0, 1, 2, ... CAS values
counts = pd.Series([len(x) for x in cas.values()]).value_counts()
counts.index.name = 'Number of unique CAS values'
counts.name = 'Number of molecules'
counts.to_frame()

# One row per CID, with its CAS entry in a column named 'CAS'
to_save = pd.Series(cas)
to_save.index.name = 'CID'
to_save.name = 'CAS'
to_save.head()

pyrfume.save_data(to_save.to_frame(), 'odorants/cid_to_cas.csv')
Esempio n. 13
0
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.10.3
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

import pyrfume
from pyrfume import pubchem

# Retrieve results for the string "Optical+Rotation" via `pubchem.get_results`
# (the exact query semantics are defined in pyrfume.pubchem)
results = pubchem.get_results("Optical+Rotation")

# Save the results under a .pkl path through pyrfume.save_data
path = 'pubchem_optical_rotation/physics.pkl'
pyrfume.save_data(results, path)
Esempio n. 14
0
#   jupytext:
#     formats: ipynb,py
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.10.3
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

import pyrfume
from pyrfume import keller

# Load the raw BMC data with all three DREAM-only filters switched off
raw = keller.load_raw_bmc_data(
    only_dream_subjects=False,  # Whether to only keep DREAM subjects
    only_dream_descriptors=False,  # Whether to only keep DREAM descriptors
    only_dream_molecules=False)  # Whether to only keep DREAM molecules
raw.head()

# Reformat the raw table with `keller.format_bmc_data`
cooked = keller.format_bmc_data(raw)
cooked.head()

# The index has four levels; swap the first two so that CID becomes level 0
cooked.index = cooked.index.reorder_levels([1, 0, 2, 3])  # Put CID first

cooked = cooked.sort_index(level=0)  # Sort by CID ascending

pyrfume.save_data(cooked, 'keller_2016/data.csv')
Esempio n. 15
0
            odorless = True
        elif re.findall('no fragrance', statement):
            odorless = True
        elif re.findall('odorless', statement):
            odorless = True
        elif re.findall('odourless', statement):
            odorless = True
        elif re.findall('odoratus', statement):
            pass
        elif re.findall('sense of smell', statement):
            odor = True
        elif re.findall('odor', statement):
            odor = True
        elif re.findall('odour', statement):
            odor = True
        elif re.findall('smell', statement):
            odor = True
        elif re.findall('fragrance', statement):
            odor = True
        elif re.findall('aroma ', statement):
            odor = True
        else:
            pass
    if odor and odorless:
        pass  #print(statements)
    df.loc[cid, :] = [odor, odorless, statements]

df.head()

# Save the dataframe of odor / odorless flags and their supporting statements
pyrfume.save_data(df, 'pubchem_scrape_100000.csv')
Esempio n. 16
0
# Entries of the 'InChiKey' column that contain 'InChI=' are converted with
# InchiToInchiKey; all other entries are left unchanged
from rdkit.Chem.rdinchi import InchiToInchiKey
df['InChiKey'] = df['InChiKey'].apply(lambda x: InchiToInchiKey(x) if 'InChI=' in str(x) and str(x)!='nan' else x)

from tqdm.auto import tqdm
from pyrfume.odorants import get_cid, get_cids
# For every row, try the identifier columns in order and stop at the first
# lookup that returns a truthy CID.  The first two columns are looked up by
# their own kind; 'CAS' and 'ChemicalName' are looked up with kind='name'.
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    #if index < 215:
    #    continue
    cid = 0
    for j, col in enumerate(['InChiKey', 'SMILES', 'CAS', 'ChemicalName']):
        # Skip identifiers whose string form is 'nan' (missing values)
        if not str(row[col]) == 'nan':
            cid = get_cid(row[col], kind=(col if j<2 else 'name'))
            if cid:
                break
    # Holds the last lookup result, or 0 when every identifier was missing
    df.loc[index, 'new_CID'] = cid

# NOTE(review): rows with no usable identifier are stored as 0 above, so
# this null filter will not list them -- confirm what get_cid returns on failure.
df[df['new_CID'].isnull()]

# NOTE(review): this joins `df` with a column-less selection of itself and
# the result is not assigned -- looks like leftover scratch work; confirm.
df.join(df[[]])

pyrfume.save_data(df, "arctander_1960/arctander.csv")

df.dropna(subset=["ChemicalName"]).shape

# Mapping from chemical name to description, for rows that have a name
x = dict(df.dropna(subset=["ChemicalName"]).set_index("ChemicalName")["Description"])

# Mapping from the 'CID' column (not 'new_CID') to description
dict(df.set_index('CID')["Description"])


Esempio n. 17
0
# Many of these CAS numbers are for substances, not compounds, and so have SIDs instead (not yet supported)
cas_cids_dict = get_cids(df['CAS number'])
# -

# Add CIDs to the dataframe
for cas, cid in cas_cids_dict.items():
    df.loc[df['CAS number'] == cas, 'CID'] = cid
# Convert CIDs to integers.  Plain column assignment replaces the column, so
# the integer dtype is kept; `df.loc[:, 'CID'] = ...` writes into the
# existing column in place on recent pandas and leaves its dtype unchanged.
df['CID'] = df['CID'].astype(int)
df.head()

# Use CID as the index and sort
df = df.set_index('CID').sort_index()
df.head()

pyrfume.save_data(df, 'IFRA_FIG/ifra_fig.csv')

# Read the file back to check what was written
pyrfume.load_data('IFRA_FIG/ifra_fig.csv')

from pyrfume.cabinets import get_mainland

# Overlap with the mainland cabinet, first by CAS number and then by CID index
df_mainland = get_mainland()
len(set(df_mainland['CAS']).intersection(df['CAS number']))

len(df_mainland.index.intersection(df.index))

# Rows of this table whose CID also appears in the mainland cabinet
df[df.index.isin(df_mainland.index)]

# Side-by-side view of the CAS columns and descriptors for the shared CIDs.
# NOTE(review): 'Descriptor 2' is selected twice; the second one was
# presumably meant to be a different descriptor column -- confirm against
# the table's column names.
x = df_mainland.join(df, how='inner')[[
    'CAS', 'CAS number', 'Primary descriptor', 'Descriptor 2', 'Descriptor 2'
]]