Ejemplo n.º 1
0
    def test_load_data(self):
        """Round-trip data through `save_data` / `load_data` for pickle and CSV files.

        A dict is saved to and reloaded from ``data.pkl``; a DataFrame built
        from the same dict is saved to and reloaded from ``data.csv``.  Both
        temporary files are removed even when an assertion fails, so a failing
        run does not leave stray files in the data directory.
        """
        import os
        from pyrfume.base import DEFAULT_DATA_PATH
        from pyrfume import load_data, save_data
        import pandas as pd

        def remove_if_present(path):
            # Cleanup helper: the file may not exist if saving itself failed.
            if os.path.exists(path):
                os.remove(path)

        data = {'col1': [1, 2], 'col2': [3, 4]}
        path_not_exists = DEFAULT_DATA_PATH / "THIS_IS_AN_INVALID_PATH"

        # save_data is expected to reject this target path.
        self.assertRaises(Exception, save_data, data, path_not_exists)

        # Pickle round trip: the reloaded object must equal the original dict.
        file_path = DEFAULT_DATA_PATH / "data.pkl"
        try:
            save_data(data, file_path)
            data_gain = load_data(file_path)
            self.assertEqual(data_gain, data)
        finally:
            remove_if_present(file_path)

        # CSV round trip: every reloaded cell must match the source DataFrame.
        file_path = DEFAULT_DATA_PATH / "data.csv"
        df = pd.DataFrame(data)
        try:
            save_data(df, file_path)
            data_gain = load_data(file_path)
            for row_idx, row in enumerate(data_gain.values):
                for col_idx, value in enumerate(row):
                    self.assertEqual(value, df.values[row_idx][col_idx])
        finally:
            remove_if_present(file_path)
Ejemplo n.º 2
0
def get_snitz_weights(use_original=True):
    """Load the "Weight" column for the Dragon features used in Snitz.

    When `use_original` is true the column is read from the from-paper
    descriptor file and returned as-is.  Otherwise it is read from the
    derived-weights file and returned with its sign flipped.
    """
    if use_original:
        # Descriptors taken from the Snitz paper
        weights_file = "snitz-descriptors-from-paper-dragon-6.csv"
    else:
        # Weights derived by optimization, using Snitz-space projections
        # of each molecule's original unit vector
        weights_file = "snitz_dragon_weights.csv"

    weights = pyrfume.load_data(SNITZ_DIR / weights_file, index_col=0)["Weight"]
    if not use_original:
        weights = -weights
    return weights
Ejemplo n.º 3
0
def get_mainland(raw=False, vendors=None):
    """Return a dataframe containing odorants in Joel Mainland's cabinet.

    Args:
        raw: If true, return the table exactly as loaded (indexed by "CID")
            without any of the cleaning steps below.
        vendors: Optional vendor name or iterable of vendor names,
            e.g. ``['sigma']``.  Matching against the "SourcedFrom" column is
            case-insensitive.  When empty or None, all vendors are kept.

    Returns:
        A DataFrame indexed by CID.  Unless `raw` is true, "$/mol" is
        log10-transformed, "Price" is numeric, rows without a price or with a
        non-positive CID are dropped, and there is one row per CID, sorted by
        CID.
    """
    df = pyrfume.load_data("cabinets/mainland.csv").set_index("CID")
    if raw:
        return df

    if vendors:
        # Accept a bare string as a single vendor, and lowercase the request
        # so it is compared on the same footing as the lowercased column.
        if isinstance(vendors, str):
            vendors = [vendors]
        wanted = [str(vendor).lower() for vendor in vendors]
        df = df[df["SourcedFrom"].str.lower().isin(wanted)]

    # Work on an explicit copy so the column assignments below never write
    # into a view of the filtered frame.
    df = df.copy()

    # For odorants with no relative cost information
    # (probably discontinued), fill with a large number so they
    # don't get used.  Also use log10 $/mol
    df["$/mol"] = np.log10(df["$/mol"].fillna(1e15))

    # Strip the currency symbol so prices can be parsed as floats
    # (missing prices become the string "nan" and parse back to NaN).
    df["Price"] = (
        df["Price"].astype("str").str.replace("$", "", regex=False).astype("float")
    )

    # Remove odorants with no price
    df = df.dropna(subset=["Price"])

    # Sort by price and then keep one row per CID.
    # NOTE(review): GroupBy.first() takes the first non-null value of each
    # column independently, so a result row can combine values from several
    # source rows when the cheapest one has gaps -- confirm this is intended.
    df = df.sort_values("Price").groupby("CID").first()

    # Drop values with no CID
    df = df.loc[df.index > 0]

    # Sort CIDs from low to high
    return df.sort_index()
Ejemplo n.º 4
0
def load_data():
    """Load the Bushdid table S1 and return it indexed by CAS.

    Only the first four columns of the loaded table are kept; they are
    renamed to "Name", "CAS", "Dilution" and "Solvent".  Rows whose CAS is
    null are dropped.
    """
    path = BUSHDID_PATH / "Bushdid-tableS1.csv"
    df = pyrfume.load_data(path, encoding="latin1")
    # Copy the slice so renaming its columns cannot touch the loaded frame.
    df = df.iloc[:, :4].copy()
    df.columns = ["Name", "CAS", "Dilution", "Solvent"]
    df = df.set_index("CAS")
    return df[df.index.notnull()]
Ejemplo n.º 5
0
def load_mainland_data(extra_cols=None):
    """Load the Mainland intensity ratings restricted to a fixed set of columns.

    Args:
        extra_cols: Optional iterable of additional column names to keep
            alongside "Subject", "Odor", "CAS", "Concentration" and
            "IntensityRating".

    Returns:
        A DataFrame with the selected columns, "Subject" cast to int and
        "IntensityRating" divided by 100.
    """
    cols = ["Subject", "Odor", "CAS", "Concentration", "IntensityRating"]
    if extra_cols is not None:
        cols.extend(extra_cols)
    df = pyrfume.load_data(MAINLAND_INTENSITY_PATH /
                           "all data Supra clean.csv",
                           index_col=None)
    # Copy the column selection so the assignments below do not write into a
    # view of the loaded frame.
    df = df[cols].copy()
    df["Subject"] = df["Subject"].astype(int)
    df["IntensityRating"] /= 100
    return df
Ejemplo n.º 6
0
def load_dream_model():
    """Load the open source DREAM model created in `open-source-dream.ipynb`.

    Returns a 4-tuple ``(model, use_features, descriptors, imputer)``:
      - model: the model to use for prediction
      - use_features: the features used for each column
      - descriptors: the descriptors it will predict for each output
      - imputer: the missing-data imputer that should be used on any Morgan
        features before joining with other features and passing to the model
    """
    # NOTE(review): the ".pkl" suffix suggests this is unpickled; only load it
    # from a trusted data source.
    bundle = pyrfume.load_data("keller_2017/open-source-dream.pkl")
    # Unpack explicitly so a bundle of the wrong length fails here.
    model, use_features, descriptors, imputer = bundle
    return model, use_features, descriptors, imputer
Ejemplo n.º 7
0
def get_haddad_weights():
    """Load the "Weight" column for the Dragon features used in Haddad.

    The table is read from the from-paper CSV in HADDAD_DIR with its first
    column as the index.  (An earlier approach read the weights from
    'haddad-optimized_v6-symbols.xlsx' instead.)
    """
    weights_table = pyrfume.load_data(
        HADDAD_DIR / "haddad-weights-from-paper-dragon-6.csv", index_col=0
    )
    return weights_table["Weight"]
Ejemplo n.º 8
0
def embed_molecules(molecules: pd.DataFrame):
    """Highlight the given molecules on top of the precomputed embedding.

    All rows of the loaded embedding are drawn as faint black points; the
    rows selected by the 'IsomericSMILES' column of `molecules` are drawn
    over them as large red points.  Nothing is returned; the plot is drawn
    on a new 6x6 matplotlib figure.
    """
    import matplotlib.pyplot as plt

    plt.figure(figsize=(6, 6))
    axes = plt.gca()

    # Background layer: every point of the embedding
    all_points = load_data('embedding/pf_umap.pkl')
    axes = all_points.plot.scatter(x=0, y=1, alpha=0.05, c='k', ax=axes)

    # Foreground layer: only the requested molecules
    selected = all_points.loc[molecules['IsomericSMILES']]
    selected.plot.scatter(x=0, y=1, alpha=1, c='r', s=100, ax=axes)

    axes.set_xlabel('Dimension 1')
    axes.set_ylabel('Dimension 2')
Ejemplo n.º 9
0
def get_snitz_dragon(use_original=True, regenerate=False):
    """Return the Dragon features restricted to the Snitz feature set.

    The table is read from "snitz_dragon.csv" in SNITZ_DIR when that file
    exists and `regenerate` is false.  Otherwise it is rebuilt from the
    min-max scaled Dragon features, saved to that file and returned.

    NOTE(review): `use_original` is accepted but not used in this function.
    """
    cache_path = SNITZ_DIR / "snitz_dragon.csv"
    if cache_path.is_file() and not regenerate:
        return pyrfume.load_data(cache_path)  # .set_index('PubChemID')

    feature_names = get_snitz_weights().index
    # Use minmax scaling as in the Snitz paper
    scaled = features.load_dragon(suffix="-cleaned-minmaxed-imputed")
    subset = scaled[feature_names]
    pyrfume.save_data(subset, cache_path)
    return subset
Ejemplo n.º 10
0
def get_haddad_dragon(use_original=True, regenerate=False):
    """Return the Dragon features restricted to the Haddad feature set.

    The table is read from "haddad_dragon.csv" in HADDAD_DIR when that file
    exists and `regenerate` is false.  Otherwise it is rebuilt from the
    standardized Dragon features, saved to that file and returned.

    NOTE(review): `use_original` is accepted but not used in this function.
    """
    cache_path = HADDAD_DIR / "haddad_dragon.csv"
    if cache_path.is_file() and not regenerate:
        return pyrfume.load_data(cache_path)  # .set_index('PubChemID')

    feature_names = get_haddad_weights().index
    # Use standard scaling.  NOTE(review): the previous comment attributed
    # this to the Snitz paper; presumably the Haddad paper is meant -- confirm.
    scaled = features.load_dragon(suffix="-cleaned-standardized-imputed")
    subset = scaled[feature_names]
    pyrfume.save_data(subset, cache_path)
    return subset
Ejemplo n.º 11
0
def load_dragon(suffix=""):
    """Load the Dragon feature table, indexed by "PubChemID".

    `suffix` is substituted into DRAGON_STEM to select a precomputed
    cleaning of the data.
    """
    file_name = DRAGON_STEM % suffix
    table = load_data(f"{FEATURES_DIR}/{file_name}")
    return table.set_index("PubChemID")
Ejemplo n.º 12
0
def smiles_to_dragon(smiles, suffix="", features=None):
    """Return Dragon features for the given SMILES.

    Args:
        smiles: Row label(s) to select from the Dragon table.
        suffix: Inserted into the file name "physicochemical/AllDragon%s.csv"
            to pick a particular version of the table.
        features: Column label(s) to return; defaults to every column.

    Returns:
        The selection ``dragon.loc[smiles, features]``.
    """
    dragon = load_data("physicochemical/AllDragon%s.csv" % suffix)
    if features is None:
        features = list(dragon)
    # Select the requested columns; previously the frame itself was passed as
    # the column indexer, so `features` was ignored.
    return dragon.loc[smiles, features]
Ejemplo n.º 13
0
 def test_load_snitz_2013(self):
     """Smoke test: both Snitz 2013 tables load without raising."""
     for rel_path in ('snitz_2013/behavior.csv', 'snitz_2013/molecules.csv'):
         pyrfume.load_data(rel_path)
Ejemplo n.º 14
0
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.10.3
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# # Add CIDS to parsed_threshold_data_in_air.csv

import pandas as pd
import pyrfume
from pyrfume.odorants import get_cid, get_cids
from rickpy import ProgressBar
# Load the parsed threshold table and index it by canonical SMILES.
df = pyrfume.load_data('thresholds/parsed_threshold_data_in_air.csv')
df = df.set_index('canonical SMILES')

# Look up a CID for every SMILES in the index.  The result is used as a
# mapping from SMILES to CID (it is iterated with `.items()` further down).
smiles_cids = get_cids(df.index, kind='SMILES')

# Attach the CIDs as a new 'CID' column, aligned on the SMILES index.
df = df.join(pd.Series(smiles_cids, name='CID'))
df.head()

from rdkit.Chem import MolFromSmiles, MolToSmiles
# Keep a copy of the SMILES index as a regular column.
df['SMILES'] = df.index
# Progress bar sized to the number of SMILES/CID pairs.
p = ProgressBar(len(smiles_cids))
for i, (old, cid) in enumerate(smiles_cids.items()):
    p.animate(i, status=old)
    if cid == 0:
        mol = MolFromSmiles(old)
        if mol is None:
Ejemplo n.º 15
0
##### Initialize app #####

# Bootstrap stylesheet served from the CDN
bootstrap = 'https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css'
external_stylesheets = [bootstrap]
# The Dash app is mounted at the root URL of the Flask server
app = flask.Flask(__name__)
dapp = dash.Dash(
    __name__,
    server=app,
    url_base_pathname='/',
    external_stylesheets=external_stylesheets,
)

##### Load data #####

# Pyrfume-data-relative file path
file_path = 'odorants/all_cids_properties.csv'
# First 5 columns of all_cids file (name, MW, SMILES, etc.)
details = pyrfume.load_data(file_path, usecols=range(5), index_col=0)

# Dragon descriptor files with only the Snitz or Haddad features,
# rounded to 3 decimals
dragon = {
    'snitz': snitz.get_snitz_dragon().round(3),
    'haddad': haddad.get_haddad_dragon().round(3),
}

# Add a 'Weight' row holding each feature's weight, one space at a time
w = snitz.get_snitz_weights()
dragon['snitz'].loc['Weight', w.index] = w
w = haddad.get_haddad_weights()
dragon['haddad'].loc['Weight', w.index] = w

# Cumulative probability tables for each space
cumul = {
    'snitz': pyrfume.load_data('snitz_2013/snitz_cumulative_probability.csv'),
    'haddad': pyrfume.load_data('haddad_2008/haddad_cumulative_probability.csv'),
}

# Spaces to show
spaces = OrderedDict([
    ('snitz', 'Snitz Map'),
    ('haddad', 'Haddad Map'),
])
Ejemplo n.º 16
0
#       format_version: '1.5'
#       jupytext_version: 1.10.3
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyrfume
from pyrfume.odorants import cids_to_cas

# The CIDs are the index of the loaded table
cids = pyrfume.load_data('odorants/all_cids.csv').index

# Mapping from CID to its CAS values
cas = cids_to_cas(cids)

# Count the molecules that have at least one CAS value
n_with_cas = sum(1 for x in cas.values() if x)
print("Out of %d molecules, %d have CAS numbers" % (len(cids), n_with_cas))

# Distribution of how many CAS values each molecule has
counts = pd.Series(list(map(len, cas.values()))).value_counts()
counts.name = 'Number of molecules'
counts.index.name = 'Number of unique CAS values'
counts.to_frame()

# Series of CAS values indexed by CID, ready to be saved
to_save = pd.Series(cas, name='CAS')
to_save.index.name = 'CID'
to_save.head()
Ejemplo n.º 17
0
# jupyter:
#   jupytext:
#     formats: ipynb,py
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.10.3
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

import pyrfume
# Load the Arctander master spreadsheet.
df = pyrfume.load_data('arctander_1960/Arctander Master.xlsx')

from rdkit.Chem.rdinchi import InchiToInchiKey
# Entries in the 'InChiKey' column that contain 'InChI=' are full InChI
# strings; convert those to InChIKeys and leave every other entry unchanged.
df['InChiKey'] = df['InChiKey'].apply(lambda x: InchiToInchiKey(x) if 'InChI=' in str(x) and str(x)!='nan' else x)

from tqdm.auto import tqdm
from pyrfume.odorants import get_cid, get_cids
# For each row, try the identifier columns in order until one yields a CID.
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    #if index < 215:
    #    continue
    cid = 0  # stays 0 when no identifier resolves
    for j, col in enumerate(['InChiKey', 'SMILES', 'CAS', 'ChemicalName']):
        # Skip identifiers that are missing (they stringify to 'nan')
        if not str(row[col]) == 'nan':
            # The first two columns are looked up by their own kind;
            # 'CAS' and 'ChemicalName' are both looked up as a 'name'.
            cid = get_cid(row[col], kind=(col if j<2 else 'name'))
            if cid:
                break
Ejemplo n.º 18
0
 def test_load_morgan_skipped(self):
     """Loading Morgan similarity features for given CIDs yields one row per CID."""
     requested_cids = [129, 239]
     n_rows = pyrfume.load_data(
         'morgan/features_sim.csv', cids=requested_cids
     ).shape[0]
     self.assertEqual(n_rows, len(requested_cids))
Ejemplo n.º 19
0
 def test_load_manoel_2021(self):
     """Smoke test: the Manoel 2021 behavior table loads without raising."""
     pyrfume.load_data('manoel_2021/behavior.csv')
Ejemplo n.º 20
0
def all_sources():
    """Whether or not each odorant (by CID) is in each of the data sources.

    The table at ODORANT_SOURCES_PATH is returned sorted by its index.
    """
    return load_data(ODORANT_SOURCES_PATH).sort_index()
Ejemplo n.º 21
0
def all_odorants():
    """All CIDs, SMILES, Names, and Molecular Weights found in the
    file at ODORANTS_BASIC_INFO_PATH, sorted by index."""
    return load_data(ODORANTS_BASIC_INFO_PATH).sort_index()
Ejemplo n.º 22
0
#       jupytext_version: 1.10.3
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# %matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import pyrfume
from pyrfume.pubchem import get_ghs_classification, parse_ghs_classification_for_odor, GHS_CODES
from rickpy import ProgressBar

# First 5 columns of the all-CIDs properties table
path = 'odorants/all_cids_properties.csv'
details = pyrfume.load_data(path, usecols=range(5))
details.head()

# ### Cramer Toxicity Class Predictions

# Join the toxTree table onto the details via each molecule's SMILES
tox = pyrfume.load_data('odorants/toxTree.csv')
joined = details.join(tox, on='SMILES')
# Reduce each label to a number: take the text after the last space, drop its
# final character and count what is left.
# (presumably a Roman numeral followed by ")" -- confirm against the data)
cramer = joined['Cramer Class'].apply(lambda x: len(x.rsplit(' ', 1)[-1][:-1]))
cramer.head()

pyrfume.save_data(cramer.to_frame(), 'odorants/cramer.csv')

# Precomputed UMAP coordinates for each space
embedded_coords = {}
for key in ('snitz', 'haddad'):
    embedded_coords[key] = pyrfume.load_data('odorants/%s_umap.pkl' % key)
Ejemplo n.º 23
0
def cid_names():
    """Return the "name" column of the cids-names-smiles table, indexed by "CID".

    TODO: Fix this to use the larger file
    """
    table = load_data(FEATURES_DIR / "cids-names-smiles.csv")
    return table.set_index("CID")["name"]
Ejemplo n.º 24
0
 def test_load_ravia_2020(self):
     """Smoke test: the Ravia 2020 behavior table and manifest load without raising."""
     pyrfume.load_data('ravia_2020/behavior1.csv')
     pyrfume.load_manifest('ravia_2020')
Ejemplo n.º 25
0
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.10.3
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

import json
import pandas as pd
import pyrfume
import re

# Load the scraped PubChem statements.  The object is used below as a mapping
# from CID to a collection of statement strings.
# NOTE(review): a ".pkl" file is presumably unpickled -- only load trusted data.
all_statements = pyrfume.load_data('pubchem_scrape_100000.pkl')

import json
# Also dump the same mapping to a JSON file in the working directory.
with open('pubchem_100000.json', 'w') as f:
    json.dump(all_statements, f)

# Empty results table with one row per CID (sorted); the columns are filled in
# by the loop that follows.
df = pd.DataFrame(index=sorted(all_statements),
                  columns=['Odor', 'Odorless', 'Statements'])
df.index.name = 'CID'
for cid in sorted(all_statements):
    statements = all_statements[cid]
    odor = False
    odorless = False
    for statement in statements:
        statement = statement.lower()
        if re.findall('no odor', statement):
Ejemplo n.º 26
0
import pyrfume
from pyrfume.odorants import smiles_to_image, all_odorants, all_sources

##### Initialize app #####
# Bootstrap stylesheet served from the CDN
external_stylesheets = [
    'https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css'
]
# The Dash app is mounted at the root URL of the Flask server
app = flask.Flask(__name__)
dapp = dash.Dash(__name__,
                 server=app,
                 url_base_pathname='/',
                 external_stylesheets=external_stylesheets)

##### Load data #####
# Point pyrfume at the local 'data' directory before loading anything;
# both embeddings are then loaded with remote=False.
pyrfume.set_data_path('data')
gdb_umap = pyrfume.load_data('gdb_umap.pkl', remote=False)
pf_umap = pyrfume.load_data('pf_umap.pkl', remote=False)

# Module-level flag, initialised to 1 (its use is not visible in this excerpt)
hover_on = 1


def plot(big_umap, known_umap, skip=10):
    big_umap = big_umap.iloc[::skip]
    known_umap = known_umap.iloc[::skip]
    # The GDB scatter plot
    skip = 10
    big_scatter = go.Scatter(
        x=big_umap.loc[:, 0],
        y=big_umap.loc[:, 1],
        name='Possible Molecules',
        mode="markers",
Ejemplo n.º 27
0
def get_predicted_odorless():
    """Return the "PredictedOdorless" column of the predicted-odorless table.

    NOTE(review): the previous docstring described the Series as True for
    molecules predicted to have an odor and False for those predicted to be
    odorless, which seems at odds with the column name -- confirm the
    polarity against the data.
    """
    predictions = pyrfume.load_data("odorants/predicted_odorless.csv")
    return predictions["PredictedOdorless"]
Ejemplo n.º 28
0
def load_raw_bmc_data(nrows=None):
    """Load raw data from Keller and Vosshall, 2016 supplement.

    Args:
        nrows: Optional number of rows to read.  When None (the default) the
            whole sheet is loaded.  Previously this argument was accepted but
            ignored.

    Returns:
        The loaded table, read with ``header=2``.
    """
    read_kwargs = {"header": 2}
    if nrows is not None:
        # Forwarded only when requested, so the default call is unchanged.
        # NOTE(review): assumes pyrfume.load_data passes extra keyword
        # arguments through to the Excel reader, as it already does for
        # `header` -- confirm.
        read_kwargs["nrows"] = nrows
    return pyrfume.load_data("keller_2016/12868_2016_287_MOESM1_ESM.xlsx",
                             **read_kwargs)
Ejemplo n.º 29
0
def get_predicted_intensities():
    """Return the DREAM model predicted intensities using Mordred (not Dragon) features.

    This is the "Intensity" column of the Mordred predicted-intensities table.
    """
    predictions = pyrfume.load_data(
        "physicochemical/cids-names-smiles-mordredpredintensities.csv"
    )
    return predictions["Intensity"]
Ejemplo n.º 30
0
def _hover_labels(features, show_features):
    """Build one hover-text string per row of `features`.

    Each label lists the row's index value followed by the `show_features`
    columns, as "<name>: <value>" entries separated by "<br>".
    """
    index_name = features.index.name or 'Index'
    shown = features.loc[:, show_features].astype("str")
    labels = []
    for idx, row in zip(shown.index, shown.values):
        parts = ["%s: %s" % (index_name, idx)]
        parts.extend("%s: %s" % (col, val) for col, val in zip(shown.columns, row))
        labels.append('<br>'.join(parts))
    return labels


def plotly_embedding(embedding, features=None, show_features=None, colors=None, colorscale='rainbow'):
    """Build an interactive scatter plot of an embedding with a molecule image.

    Args:
        embedding: A dataframe wrapped around e.g. a fitted TSNE object, with
            an index of CIDs.  Its first two columns are used as x and y.
        features: A dataframe of features, e.g. names, SMILES strings, or
            physicochemical features, with an index of CIDs.  It must have a
            "SMILES" column for the hover image to update.  Defaults to the
            first 5 columns of the all-CIDs properties table.
        show_features: Columns of `features` to list in the hover text;
            defaults to all of them.
        colors: Optional marker colors; markers are black when None.
        colorscale: Plotly colorscale name used for the markers.

    Returns:
        A VBox containing the figure widget and the molecule image widget.
    """
    if features is None:
        # NOTE(review): confirm this hyphenated file name against the data
        # repository layout.
        features = pyrfume.load_data("odorants/all-cids-properties.csv", usecols=range(5))
    # Only retain those rows corresponding to odorants in the embedding
    features = features.loc[embedding.index]
    show_features = show_features or list(features)

    # The previous formatter called `.split` on a NumPy array, which always
    # raised and was silently swallowed, so the hover text never showed the
    # features.  Build the labels properly and fall back to the bare index
    # only when the requested columns cannot be selected.
    try:
        names = _hover_labels(features, show_features)
    except (KeyError, IndexError, AttributeError, TypeError, ValueError):
        names = features.index
    assert embedding.shape[0] == features.shape[0]

    # The scatter plot
    scatter = go.Scatter(
        x=embedding.iloc[:, 0],
        y=embedding.iloc[:, 1],
        text=names,
        mode="markers",
        hoverinfo="text",
        opacity=0.5,
        marker={
            "size": 5,
            "line": {"width": 0.5, "color": "white"},
            "color": colors if colors is not None else "black",
            "colorscale": colorscale,
        },
    )

    # The axes, etc.
    layout = go.Layout(
        xaxis={"type": "linear", "title": "", "showline": False, "showticklabels": False},
        yaxis={"type": "linear", "title": "", "showline": False, "showticklabels": False},
        margin={"l": 40, "b": 40, "t": 10, "r": 10},
        legend={"x": 0, "y": 1},
        hovermode="closest",
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        width=500,
        height=500,
    )

    fig = go.FigureWidget(data=[scatter], layout=layout)
    fig.layout.hovermode = 'closest'

    # The 2D drawing of the molecule (a placeholder until the first hover)
    image_widget = Image(
        value=smiles_to_image("CCCCO"), layout=Layout(height="300px", width="300px")
    )

    def hover_fn(trace, points, state):
        # Ignore hover events that carry no point
        if not points.point_inds:
            return
        ind = points.point_inds[0]
        smiles = features["SMILES"].iloc[ind]
        image_widget.value = smiles_to_image(smiles)

    # Hook the callback onto the trace owned by the widget, not the local copy
    scatter = fig.data[0]
    scatter.on_hover(hover_fn)
    canvas = VBox([fig, image_widget])
    return canvas