Example #1
0
    def test_get_cid(self):
        """Check identifier-to-CID lookups, one at a time and as a batch."""
        expected = {"64-17-5": 702, "141-78-6": 8857, "110-01-0": 1127}

        # Single lookups, in the order the identifiers are listed above.
        for identifier, cid in expected.items():
            self.assertEqual(get_cid(identifier), cid)

        # Batch lookup: the result is indexed by the identifiers passed in.
        cids = get_cids(list(expected))
        for identifier, cid in expected.items():
            self.assertEqual(cids[identifier], cid)
Example #2
0
    url = 'https://senselab.med.yale.edu/OdorDB/Browse?db=5&cl=1&page=%d' % page
    f = urlopen(url)
    html = f.read()
    soup = bs4.BeautifulSoup(html)
    table = soup.find('table')
    for span in table.find_all('span'):
        name = span.text.strip()
        link = span.find('a').get('href')
        info.append((name, link))

# Collect the scraped (name, link) pairs into a dataframe
df = pd.DataFrame.from_records(info, columns=['name', 'url'])
df.head()

# Get CIDs by searching the names
cids = odorants.get_cids(df['name'], kind='name')

# Index by name and add the CIDs as a 'CID' column
df = df.set_index('name').join(pd.Series(cids, name='CID'))

# Get CAS strings for compounds for which no CID was found based on the name
for name, url_suffix in df[df['CID'] == 0]['url'].items():
    url = 'https://senselab.med.yale.edu/OdorDB/%s' % url_suffix
    # Close the HTTP response as soon as the page has been read, so that
    # connections do not pile up across iterations.
    with urlopen(url) as f:
        html = f.read()
    soup = bs4.BeautifulSoup(html)
    table = soup.find('table')
    # The CAS text is read from the last <span> of the sixth table row.
    cas_row = table.find_all('tr')[5]
    cas_text = cas_row.find_all('span')[-1].text
    # Drop embedded CRLF line breaks and surrounding whitespace.
    cas = cas_text.replace('\r\n', '').strip()
    df.loc[name, 'CAS'] = cas
Example #3
0
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.10.3
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

import os
import pandas as pd
import pyrfume
from pyrfume import odorants

# Read the GRAS list: tab-separated, no header row, columns SMILES then CAS
file_path = os.path.join(pyrfume.DATA, 'GRAS.smi')
gras_data_raw = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=['SMILES', 'CAS'],
)

# Look up a CID for every SMILES string
results = odorants.get_cids(
    gras_data_raw['SMILES'],
    kind='SMILES',
    verbose=False,
)

# Put the CIDs in a one-column frame and attach the raw columns to it,
# matching rows on the SMILES string
cid_frame = pd.Series(results, name='CID').to_frame()
gras_data = cid_frame.join(gras_data_raw.set_index('SMILES'))
gras_data.head()

# Write the combined table next to the raw file
file_path = os.path.join(pyrfume.DATA, 'gras.csv')
gras_data.to_csv(file_path)
Example #4
0
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# # Add CIDs to parsed_threshold_data_in_air.csv

import pandas as pd
import pyrfume
from pyrfume.odorants import get_cid, get_cids
from rickpy import ProgressBar
# Load the parsed threshold table and key its rows by canonical SMILES
df = pyrfume.load_data(
    'thresholds/parsed_threshold_data_in_air.csv'
).set_index('canonical SMILES')

# Look up a CID for every SMILES string in the index
smiles_cids = get_cids(df.index, kind='SMILES')

# Attach the lookups as a 'CID' column, matched on the index
cid_column = pd.Series(smiles_cids, name='CID')
df = df.join(cid_column)
df.head()

from rdkit.Chem import MolFromSmiles, MolToSmiles
df['SMILES'] = df.index
p = ProgressBar(len(smiles_cids))
for i, (old, cid) in enumerate(smiles_cids.items()):
    p.animate(i, status=old)
    if cid == 0:
        mol = MolFromSmiles(old)
        if mol is None:
            new = ''
        else:
            new = MolToSmiles(mol, isomericSmiles=True)
Example #5
0
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# #### Basically, I just started from the mergedOdorants file that Joel sent me, and converted SMILES strings to CIDs

import os
import pandas as pd
import pyrfume
from pyrfume import odorants

# Load the merged odorants table; the first CSV column becomes the index
file_path = os.path.join(pyrfume.DATA, 'mergedOdorants.csv')
df = pd.read_csv(file_path, index_col=0)

# Get CIDs from PubChem
# NOTE(review): the 'NAME' column is queried with kind='smiles' -- confirm
# that this column really holds SMILES strings.
smiles_cids = odorants.get_cids(df['NAME'], kind='smiles')

# Build a 'PubChemID' frame from the lookups and attach the original
# columns to it, matching rows on 'NAME'
cid_frame = pd.Series(smiles_cids, name='PubChemID').to_frame()
df = cid_frame.join(df.set_index('NAME'))

# For each library, write the sorted, de-duplicated, non-zero CIDs to
# their own one-column file
for lib, name in (('goodscent', 'goodscents'), ('arc', 'arctander')):
    lib_cids = set(df.loc[df['lib'] == lib, 'PubChemID'])
    lib_cids.discard(0)
    cids = sorted(lib_cids)
    file_path = os.path.join(pyrfume.DATA, '%s_cids.txt' % name)
    pd.Series(cids, name='CID').to_csv(file_path, header=True, index=False)

# Save the full merged table, now including the CIDs
file_path = os.path.join(pyrfume.DATA, 'mergedOdorants_with_cids.csv')
df.to_csv(file_path)
Example #6
0
    # Remove extraneous hyphens
    name = re.sub('(?<![0-9\(])-(?![0-9])', ' ', name)
    # Add back hyphens after prefixes
    for x in [
            'alpha', 'beta', 'gamma', 'delta', 'tert', 'L', 'D', 'm', 'o', 'p',
            'cis', 'trans', 'sec'
    ]:
        name = name.replace('%s ' % x, '%s-' % x)
    # Move isomeric identifiers to the front of the name
    for x in ['(-)', '(+)']:
        if x in name:
            name = '%s-%s' % (x, name.replace(x, ''))
    new_names.append(name)
    #print(name)

# Look up a CID for every cleaned-up name
cids = get_cids(new_names)

# One row per looked-up name, with the original name alongside; the
# lookup keys move from the index into a 'Name' column
df = pd.Series(cids, name='CID').to_frame()
df['Old Name'] = old_names
df = df.rename_axis('Name').reset_index()
df.head()

# Show the rows whose CID is 0 (presumably the failed lookups -- confirm)
df[df['CID'] == 0]

# Fill in CIDs by hand, keyed by row label
manual_cids = {
    67: 19309,
    76: 11160,
    79: 8092,
    155: 28500,
    170: 251531,
}
for row_label, manual_cid in manual_cids.items():
    df.loc[row_label, 'CID'] = manual_cid
Example #7
0
# jupyter:
#   jupytext:
#     formats: ipynb,py
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.10.3
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

import os
import pandas as pd
import pyrfume
from pyrfume import snitz, odorants

# Read the raw Snitz table
file_path = os.path.join(pyrfume.DATA, 'snitz', 'Snitz144.csv')
snitz_data_raw = pd.read_csv(file_path)

# Look up a CID for each entry of the 'CAS' column (queried as a name)
cas_column = snitz_data_raw['CAS']
results = odorants.get_cids(cas_column, kind='name', verbose=False)

# Put the CIDs in a one-column frame and attach the raw columns to it,
# matching rows on 'CAS'
cid_frame = pd.Series(results, name='CID').to_frame()
snitz_data = cid_frame.join(snitz_data_raw.set_index('CAS'))
snitz_data.head()

# Write the combined table into the same 'snitz' data folder
file_path = os.path.join(pyrfume.DATA, 'snitz', 'snitz.csv')
snitz_data.to_csv(file_path)
Example #8
0
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

import os
import pandas as pd
import pyrfume
from pyrfume import sigma_ff, odorants

# Load raw Sigma FF data
descriptors, data = sigma_ff.get_data()

# Look up a CID for each key of the raw data (CAS strings queried as names)
cas_list = list(data.keys())
results = odorants.get_cids(cas_list, kind='name', verbose=False)

# Build a zero-filled table with one row per CAS and one column for the
# CID plus one per descriptor.
# Odorants without CIDs will have a CID of 0
all_columns = ['CID'] + descriptors
sigma = pd.DataFrame(data=0, index=cas_list, columns=all_columns).rename_axis('CAS')
for cas, desc in data.items():
    sigma.loc[cas, 'CID'] = results[cas]
    # Flag the descriptor column(s) recorded for this entry
    sigma.loc[cas, desc] = 1
sigma.head()

# Save the table as sigma/sigma.csv under the pyrfume data directory
file_path = os.path.join(pyrfume.DATA, 'sigma', 'sigma.csv')
sigma.to_csv(file_path)
Example #9
0
# Delete those overflow rows
df = df.loc[~df.index.isin(overflow_indices)]

# Fix problematic CAS numbers
# A usable CAS number must start with three hyphen-separated digit groups.
# The pattern is a raw string (so `\-` is not treated as a string escape)
# and is compiled once instead of being re-stated for every check.
cas_pattern = re.compile(r'[0-9]+\-[0-9]+\-[0-9]+')
for index, cas in df['CAS number'].items():
    if cas_pattern.match(cas):
        continue
    print("Fixing %s" % cas)
    # The only repair attempted is stripping parentheses.
    cas = cas.replace('(', '').replace(')', '')
    # Raise explicitly rather than `assert`, which is skipped under `-O`.
    if not cas_pattern.match(cas):
        raise ValueError("Could not repair CAS number %r" % cas)
    df.loc[index, 'CAS number'] = cas

# + jupyter={"outputs_hidden": true}
# Get CIDs for these CAS numbers
# Many of these CAS numbers are for substances, not compounds, and so have SIDs instead (not yet supported)
cas_cids_dict = get_cids(df['CAS number'])
# -

# Add CIDs to the dataframe
for cas, cid in cas_cids_dict.items():
    df.loc[df['CAS number'] == cas, 'CID'] = cid
# Convert CIDs to integers.  The column is replaced by plain assignment:
# on recent pandas `df.loc[:, 'CID'] = ...` writes into the existing column
# in place and keeps its old dtype, so the conversion would be lost.
df['CID'] = df['CID'].astype(int)
df.head()

# Use CID as the index and sort
df = df.set_index('CID').sort_index()
df.head()

pyrfume.save_data(df, 'IFRA_FIG/ifra_fig.csv')
Example #10
0
#     display_name: Python 3
#     language: python
#     name: python3
# ---

import os
import pandas as pd
import pyrfume
from pyrfume import odorants
from rdkit import Chem

# Read the tab-separated molecules table
file_path = os.path.join(pyrfume.DATA, 'westeros', 'molecules.csv')
leffingwell_data_raw = pd.read_csv(file_path, sep='\t')

# Look up a CID for every entry of the 'smiles' column
smiles_column = leffingwell_data_raw['smiles']
results = odorants.get_cids(smiles_column, kind='SMILES', verbose=False)

# Put the CIDs in a one-column frame and attach the raw columns to it,
# matching rows on the SMILES string
cid_frame = pd.Series(results, name='CID').to_frame()
leffingwell_data = cid_frame.join(leffingwell_data_raw.set_index('smiles'))
leffingwell_data.head()

for smiles in leffingwell_data[leffingwell_data['CID'] == 0].index:
    name = leffingwell_data.loc[smiles, 'chemical_name']
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        print("Bad smiles: %s" % smiles)
    else:
        smiles = Chem.MolToSmiles(mol, isomericSmiles=True)
    cid = odorants.get_cid(smiles, kind='smiles', verbose=True)
    if cid:
Example #11
0
# ---

# +
import bs4
import os
import pandas as pd
from urllib.request import urlopen

import pyrfume
from pyrfume import odorants
# -

# Download the Flavornet CAS page; the context manager closes the HTTP
# response once the body has been read.
url = 'http://www.flavornet.org/cas.html'
with urlopen(url) as f:
    html = f.read()
soup = bs4.BeautifulSoup(html)

# Take the text of the first cell of every row of the first table,
# skipping the first row (presumably a header -- confirm against the page).
rows = soup.find('table').find_all('tr')
cas_list = [row.find('td').text for row in rows[1:]]

# Look up a CID for each CAS string (queried as a name)
cids = odorants.get_cids(cas_list, kind='name')

df = pd.Series(cids, name='CID').to_frame()
df.head()

# Save the table of CIDs
file_path = os.path.join(pyrfume.DATA, 'flavornet.csv')
df.to_csv(file_path)
Example #12
0
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.10.3
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

import os
import pandas as pd
import pyrfume
from pyrfume import odorants

file_path = os.path.join(pyrfume.DATA, 'PrestwickChemLib.smi')
# The file is tab-separated with no header row.  Keep only its first column
# as a one-column DataFrame and label it 'SMILES' so that it can be selected
# by name below (selecting `[0]` alone yields a Series, on which
# `['SMILES']` raises KeyError).
prestwick_data = pd.read_csv(file_path, header=None, sep='\t')[[0]]
prestwick_data.columns = ['SMILES']
prestwick_data.head()

# Look up a CID for every SMILES string
results = odorants.get_cids(prestwick_data['SMILES'],
                            kind='SMILES',
                            verbose=False)

# Only the 'CID' column is kept in the output, so the lookups are turned
# into a one-column frame directly.
prestwick_data = pd.Series(results, name='CID').to_frame()
prestwick_data.head()

# Save the table of CIDs
file_path = os.path.join(pyrfume.DATA, 'prestwick.csv')
prestwick_data.to_csv(file_path)