def test_load_data(self):
    """Round-trip data through ``save_data``/``load_data`` as pickle and as CSV.

    Checks that saving to an invalid path raises, that a dict saved as
    ``.pkl`` is loaded back equal, and that a DataFrame saved as ``.csv`` is
    loaded back with the same cell values. Temporary files are removed even
    when an assertion fails.
    """
    import os
    from pyrfume.base import DEFAULT_DATA_PATH
    from pyrfume import load_data, save_data
    import pandas as pd

    data = {'col1': [1, 2], 'col2': [3, 4]}

    # Saving to this path is expected to fail.
    path_not_exists = DEFAULT_DATA_PATH / "THIS_IS_AN_INVALID_PATH"
    self.assertRaises(Exception, save_data, data, path_not_exists)

    # Pickle round trip.
    file_path = DEFAULT_DATA_PATH / "data.pkl"
    try:
        save_data(data, file_path)
        data_gain = load_data(file_path)
        self.assertEqual(data_gain, data)
    finally:
        # Guarded so a failed save is not masked by a FileNotFoundError.
        if os.path.exists(file_path):
            os.remove(file_path)

    # CSV round trip.
    file_path = DEFAULT_DATA_PATH / "data.csv"
    df = pd.DataFrame(data)
    try:
        save_data(df, file_path)
        data_gain = load_data(file_path)
        expected = df.values
        for row_number, row in enumerate(data_gain.values):
            for col_number, value in enumerate(row):
                self.assertEqual(value, expected[row_number][col_number])
    finally:
        if os.path.exists(file_path):
            os.remove(file_path)
def get_snitz_weights(use_original=True):
    """Return the "Weight" column (a pandas Series) for the Snitz Dragon features.

    params:
        use_original: if True, read the descriptor file taken from the Snitz
            paper; otherwise read the locally derived weights file, whose
            values are negated before being returned.
    """
    if use_original:
        # Descriptor table as published with the Snitz paper
        paper_path = SNITZ_DIR / "snitz-descriptors-from-paper-dragon-6.csv"
        return pyrfume.load_data(paper_path, index_col=0)["Weight"]

    # Weights derived by optimization using Snitz-space projections of each
    # molecule's original unit vector; the sign is flipped on load.
    derived_path = SNITZ_DIR / "snitz_dragon_weights.csv"
    derived = pyrfume.load_data(derived_path, index_col=0)
    return -derived["Weight"]
def get_mainland(raw=False, vendors=None):
    """Return a dataframe containing odorants in Joel Mainland's cabinet.

    params:
        raw: if True, return the table exactly as loaded (indexed by "CID"),
            skipping all of the cleaning steps below.
        vendors: optional vendor name or list of vendor names (e.g.
            ['sigma']). Only rows whose "SourcedFrom" matches one of them,
            ignoring case, are kept.
    returns:
        A dataframe indexed by CID. Unless `raw` is set it holds one row per
        CID (the cheapest), with "$/mol" on a log10 scale and "Price" as float.
    """
    df = pyrfume.load_data("cabinets/mainland.csv").set_index("CID")
    if raw:
        return df

    if vendors:
        # A bare string would otherwise be treated as a sequence of characters
        if isinstance(vendors, str):
            vendors = [vendors]
        # The column is lower-cased for comparison, so the requested names
        # must be lower-cased too or mixed-case requests never match.
        wanted = [str(vendor).lower() for vendor in vendors]
        df = df[df["SourcedFrom"].str.lower().isin(wanted)]

    # For odorants with no relative cost information (probably
    # discontinued), fill with a large number so they don't get used.
    # Also use log10 $/mol
    df.loc[:, "$/mol"] = np.log10(df.loc[:, "$/mol"].fillna(1e15))

    # Strip the currency symbol so prices can be parsed as floats
    df["Price"] = (
        df["Price"].astype("str").str.replace("$", "", regex=False).astype("float")
    )
    # Remove odorants with no price
    df = df.dropna(subset=["Price"])
    # Sort by price and then take only the cheapest instance of each CID
    df = df.sort_values("Price").groupby("CID").first()
    # Drop values with no CID
    df = df.loc[df.index > 0]
    # Sort CIDs from low to high
    return df.sort_index()
def load_data():
    """Load the first four columns of Bushdid table S1, indexed by CAS.

    The columns are renamed to "Name", "CAS", "Dilution" and "Solvent", the
    table is indexed by "CAS", and rows with a null CAS are dropped.
    """
    path = BUSHDID_PATH / "Bushdid-tableS1.csv"
    df = pyrfume.load_data(path, encoding="latin1")
    df = df.iloc[:, :4]
    df.columns = ["Name", "CAS", "Dilution", "Solvent"]
    df = df.set_index("CAS")
    # Keep only rows that actually have a CAS value
    return df[df.index.notnull()]
def load_mainland_data(extra_cols=None):
    """Load selected columns of the Mainland intensity data.

    params:
        extra_cols: optional iterable of additional column names to keep,
            appended after the five standard columns.
    returns:
        A dataframe with "Subject" cast to int and "IntensityRating" divided
        by 100.
    """
    cols = ["Subject", "Odor", "CAS", "Concentration", "IntensityRating"]
    if extra_cols is not None:
        cols.extend(extra_cols)
    df = pyrfume.load_data(MAINLAND_INTENSITY_PATH / "all data Supra clean.csv", index_col=None)
    # Explicit copy so the column assignments below act on an independent frame
    df = df[cols].copy()
    df["Subject"] = df["Subject"].astype(int)
    df["IntensityRating"] /= 100
    return df
def load_dream_model():
    """Load the open source DREAM model created in `open-source-dream.ipynb`

    Returns a 4-tuple:
        model: the model to use for prediction
        use_features: the features used for each column
        descriptors: the descriptors it will predict for each output
        imputer: the missing data imputer that should be used on any Morgan
            features before joining with other features and passing them to
            the model
    """
    bundle = pyrfume.load_data("keller_2017/open-source-dream.pkl")
    # Unpacking enforces that the stored object has exactly four parts
    model, use_features, descriptors, imputer = bundle
    return model, use_features, descriptors, imputer
def get_haddad_weights():
    """Get a pandas Series of weights for Dragon features in Haddad

    The series is the "Weight" column of the weights CSV in HADDAD_DIR,
    loaded with the file's first column as the index.
    """
    weights_table = pyrfume.load_data(
        HADDAD_DIR / "haddad-weights-from-paper-dragon-6.csv", index_col=0
    )
    return weights_table["Weight"]
def embed_molecules(molecules: pd.DataFrame):
    """Plot the stored UMAP embedding and highlight the given molecules.

    All embedded points are drawn as faint black markers; the rows selected
    by the "IsomericSMILES" column of `molecules` are drawn on top as large
    red markers.
    """
    import matplotlib.pyplot as plt

    plt.figure(figsize=(6, 6))
    axes = plt.gca()
    all_points = load_data('embedding/pf_umap.pkl')

    # Background: every embedded molecule
    axes = all_points.plot.scatter(x=0, y=1, alpha=0.05, c='k', ax=axes)

    # Foreground: only the requested molecules, looked up by SMILES
    selected = all_points.loc[molecules['IsomericSMILES']]
    selected.plot.scatter(x=0, y=1, alpha=1, c='r', s=100, ax=axes)

    axes.set_xlabel('Dimension 1')
    axes.set_ylabel('Dimension 2')
def get_snitz_dragon(use_original=True, regenerate=False):
    """Return the Dragon features restricted to the Snitz feature set.

    params:
        use_original: accepted but not used by this function.
        regenerate: if True, rebuild the cached CSV even when it exists.
    """
    cache_path = SNITZ_DIR / "snitz_dragon.csv"
    if cache_path.is_file() and not regenerate:
        return pyrfume.load_data(cache_path)  # .set_index('PubChemID')

    feature_names = get_snitz_weights().index
    # Use minmax scaling as in the Snitz paper
    scaled = features.load_dragon(suffix="-cleaned-minmaxed-imputed")
    subset = scaled[feature_names]
    pyrfume.save_data(subset, cache_path)
    return subset
def get_haddad_dragon(use_original=True, regenerate=False):
    """Return the Dragon features restricted to the Haddad feature set.

    params:
        use_original: accepted but not used by this function.
        regenerate: if True, rebuild the cached CSV even when it exists.
    """
    cache_path = HADDAD_DIR / "haddad_dragon.csv"
    if cache_path.is_file() and not regenerate:
        return pyrfume.load_data(cache_path)  # .set_index('PubChemID')

    feature_names = get_haddad_weights().index
    # Use the standardized (standard-scaled) variant of the Dragon features
    scaled = features.load_dragon(suffix="-cleaned-standardized-imputed")
    subset = scaled[feature_names]
    pyrfume.save_data(subset, cache_path)
    return subset
def load_dragon(suffix=""):
    """Load the Dragon features table, indexed by "PubChemID".

    `suffix` is substituted into DRAGON_STEM to select a precomputed
    cleaning of this data.
    """
    file_name = DRAGON_STEM % suffix
    location = f"{FEATURES_DIR!s}/{file_name!s}"
    return load_data(location).set_index("PubChemID")
def smiles_to_dragon(smiles, suffix="", features=None):
    """Return Dragon features for the given SMILES strings.

    params:
        smiles: row labels to select (presumably the loaded table is indexed
            by SMILES; verify against the data file).
        suffix: inserted into the file name to pick a variant of the data.
        features: column names to return; all columns when None.
    """
    dragon = load_data("physicochemical/AllDragon%s.csv" % suffix)
    if features is None:
        features = list(dragon)
    # Select by the requested columns, not by the dataframe itself
    return dragon.loc[smiles, features]
def test_load_snitz_2013(self):
    """Smoke test: the Snitz 2013 behavior and molecules files load without error."""
    for relative_path in ('snitz_2013/behavior.csv', 'snitz_2013/molecules.csv'):
        pyrfume.load_data(relative_path)
# format_name: light # format_version: '1.5' # jupytext_version: 1.10.3 # kernelspec: # display_name: Python 3 # language: python # name: python3 # --- # # Add CIDS to parsed_threshold_data_in_air.csv import pandas as pd import pyrfume from pyrfume.odorants import get_cid, get_cids from rickpy import ProgressBar df = pyrfume.load_data('thresholds/parsed_threshold_data_in_air.csv') df = df.set_index('canonical SMILES') smiles_cids = get_cids(df.index, kind='SMILES') df = df.join(pd.Series(smiles_cids, name='CID')) df.head() from rdkit.Chem import MolFromSmiles, MolToSmiles df['SMILES'] = df.index p = ProgressBar(len(smiles_cids)) for i, (old, cid) in enumerate(smiles_cids.items()): p.animate(i, status=old) if cid == 0: mol = MolFromSmiles(old) if mol is None:
##### Initialize app #####
bootstrap = 'https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css'
external_stylesheets = [bootstrap]
# The Dash app is mounted at the root of the Flask server
app = flask.Flask(__name__)
dapp = dash.Dash(
    __name__,
    server=app,
    url_base_pathname='/',
    external_stylesheets=external_stylesheets,
)

##### Load data #####
# Pyrfume-data-relative file path
file_path = 'odorants/all_cids_properties.csv'
# First 5 columns of all_cids file (name, MW, SMILES, etc.)
details = pyrfume.load_data(file_path, usecols=range(5), index_col=0)

# Dragon descriptor files with only the Snitz or Haddad features
dragon = {
    'snitz': snitz.get_snitz_dragon().round(3),
    'haddad': haddad.get_haddad_dragon().round(3),
}
# Store each space's feature weights as an extra row labelled 'Weight'
w = snitz.get_snitz_weights()
dragon['snitz'].loc['Weight', w.index] = w
w = haddad.get_haddad_weights()
dragon['haddad'].loc['Weight', w.index] = w

cumul = {
    'snitz': pyrfume.load_data('snitz_2013/snitz_cumulative_probability.csv'),
    'haddad': pyrfume.load_data('haddad_2008/haddad_cumulative_probability.csv'),
}

# Spaces to show
spaces = OrderedDict([('snitz', 'Snitz Map'), ('haddad', 'Haddad Map')])
#       format_version: '1.5'
#       jupytext_version: 1.10.3
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyrfume
from pyrfume.odorants import cids_to_cas

# Map every known CID to its CAS number(s)
cids = pyrfume.load_data('odorants/all_cids.csv').index
cas = cids_to_cas(cids)

# Count the molecules that have at least one CAS number
n_with_cas = sum(1 for found in cas.values() if found)
print("Out of %d molecules, %d have CAS numbers" % (len(cids), n_with_cas))

# Distribution of the number of CAS values per molecule
counts = pd.Series(list(map(len, cas.values()))).value_counts()
counts.index.name = 'Number of unique CAS values'
counts.name = 'Number of molecules'
counts.to_frame()

# CID -> CAS mapping as a named series
to_save = pd.Series(cas)
to_save.index.name = 'CID'
to_save.name = 'CAS'
to_save.head()
# jupyter: # jupytext: # formats: ipynb,py # text_representation: # extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.10.3 # kernelspec: # display_name: Python 3 # language: python # name: python3 # --- import pyrfume df = pyrfume.load_data('arctander_1960/Arctander Master.xlsx') from rdkit.Chem.rdinchi import InchiToInchiKey df['InChiKey'] = df['InChiKey'].apply(lambda x: InchiToInchiKey(x) if 'InChI=' in str(x) and str(x)!='nan' else x) from tqdm.auto import tqdm from pyrfume.odorants import get_cid, get_cids for index, row in tqdm(df.iterrows(), total=df.shape[0]): #if index < 215: # continue cid = 0 for j, col in enumerate(['InChiKey', 'SMILES', 'CAS', 'ChemicalName']): if not str(row[col]) == 'nan': cid = get_cid(row[col], kind=(col if j<2 else 'name')) if cid: break
def test_load_morgan_skipped(self):
    """Loading Morgan similarity features for given CIDs yields one row per CID."""
    requested_cids = [129, 239]
    similarity = pyrfume.load_data('morgan/features_sim.csv', cids=requested_cids)
    n_rows = similarity.shape[0]
    self.assertEqual(n_rows, len(requested_cids))
def test_load_manoel_2021(self):
    """Smoke test: the Manoel 2021 behavior file loads without error."""
    pyrfume.load_data('manoel_2021/behavior.csv')
def all_sources():
    """Return the odorant-sources table, sorted by index.

    The table records whether or not each odorant (by CID) is in each of the
    data sources; it is read from ODORANT_SOURCES_PATH.
    """
    return load_data(ODORANT_SOURCES_PATH).sort_index()
def all_odorants():
    """Return the basic odorant info table, sorted by index.

    Holds all CIDs, SMILES, Names, and Molecular Weights found in the file at
    ODORANTS_BASIC_INFO_PATH.
    """
    return load_data(ODORANTS_BASIC_INFO_PATH).sort_index()
#       jupytext_version: 1.10.3
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# %matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import pyrfume
from pyrfume.pubchem import get_ghs_classification, parse_ghs_classification_for_odor, GHS_CODES
from rickpy import ProgressBar

# Only the first five columns of the properties file are needed
path = 'odorants/all_cids_properties.csv'
details = pyrfume.load_data(path, usecols=range(5))
details.head()

# ### Cramer Toxicity Class Predictions

# Attach the toxTree predictions to each molecule via its SMILES string
tox = pyrfume.load_data('odorants/toxTree.csv')
joined = details.join(tox, on='SMILES')
cramer = joined['Cramer Class']
# Reduce each label to a number: the length of its last space-separated
# token with the final character removed
cramer = cramer.apply(lambda label: len(label.rpartition(' ')[2][:-1]))
cramer.head()

pyrfume.save_data(cramer.to_frame(), 'odorants/cramer.csv')

# Precomputed UMAP coordinates for each odorant space
embedded_coords = {
    key: pyrfume.load_data('odorants/%s_umap.pkl' % key)
    for key in ('snitz', 'haddad')
}
def cid_names():
    """Return the "name" column of the CID/name/SMILES table, indexed by "CID".

    TODO: Fix this to use the larger file
    """
    table = load_data(FEATURES_DIR / "cids-names-smiles.csv")
    return table.set_index("CID")["name"]
def test_load_ravia_2020(self):
    """Smoke test: a Ravia 2020 behavior file and its manifest load without error."""
    pyrfume.load_data('ravia_2020/behavior1.csv')
    pyrfume.load_manifest('ravia_2020')
# extension: .py # format_name: light # format_version: '1.5' # jupytext_version: 1.10.3 # kernelspec: # display_name: Python 3 # language: python # name: python3 # --- import json import pandas as pd import pyrfume import re all_statements = pyrfume.load_data('pubchem_scrape_100000.pkl') import json with open('pubchem_100000.json', 'w') as f: json.dump(all_statements, f) df = pd.DataFrame(index=sorted(all_statements), columns=['Odor', 'Odorless', 'Statements']) df.index.name = 'CID' for cid in sorted(all_statements): statements = all_statements[cid] odor = False odorless = False for statement in statements: statement = statement.lower() if re.findall('no odor', statement):
import pyrfume from pyrfume.odorants import smiles_to_image, all_odorants, all_sources ##### Initialize app ##### external_stylesheets = [ 'https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css' ] app = flask.Flask(__name__) dapp = dash.Dash(__name__, server=app, url_base_pathname='/', external_stylesheets=external_stylesheets) ##### Load data ##### pyrfume.set_data_path('data') gdb_umap = pyrfume.load_data('gdb_umap.pkl', remote=False) pf_umap = pyrfume.load_data('pf_umap.pkl', remote=False) hover_on = 1 def plot(big_umap, known_umap, skip=10): big_umap = big_umap.iloc[::skip] known_umap = known_umap.iloc[::skip] # The GDB scatter plot skip = 10 big_scatter = go.Scatter( x=big_umap.loc[:, 0], y=big_umap.loc[:, 1], name='Possible Molecules', mode="markers",
def get_predicted_odorless():
    """Return the "PredictedOdorless" column of the predicted-odorless table.

    NOTE(review): going by the column name, True presumably marks molecules
    predicted to be odorless; confirm the polarity against the data file.
    """
    predictions = pyrfume.load_data("odorants/predicted_odorless.csv")
    return predictions["PredictedOdorless"]
def load_raw_bmc_data(nrows=None):
    """Load raw data from Keller and Vosshall, 2016 supplement.

    The spreadsheet is read with `header=2`. `nrows` is accepted but is not
    used by this function.
    """
    return pyrfume.load_data("keller_2016/12868_2016_287_MOESM1_ESM.xlsx", header=2)
def get_predicted_intensities():
    """Return the "Intensity" column of the DREAM-model predictions.

    These are the DREAM model predicted intensities using Mordred (not
    Dragon) features.
    """
    predictions = pyrfume.load_data(
        "physicochemical/cids-names-smiles-mordredpredintensities.csv"
    )
    return predictions["Intensity"]
def plotly_embedding(embedding, features=None, show_features=None, colors=None, colorscale='rainbow'):
    """Build an interactive scatter plot of an embedding with a molecule viewer.

    params:
        embedding: A dataframe wrapped around e.g. a fitted TSNE object,
                   with an index of CIDs
        features: A dataframe of features, e.g. names, SMILES strings, or
                  physicochemical features, with an index of CIDs. Loaded
                  from the odorant properties file when None. Must have a
                  "SMILES" column for the hover image to work.
        show_features: Columns of `features` to show in the hover text;
                       all columns when None or empty.
        colors: Marker colors; black markers when None.
        colorscale: Name of the plotly colorscale for the markers.
    returns:
        A VBox holding the scatter figure widget and an image widget that
        shows the molecule under the cursor.
    """
    if features is None:
        # NOTE(review): other code in this project loads
        # "odorants/all_cids_properties.csv" (underscores); confirm which
        # file name is the right one.
        features = pyrfume.load_data("odorants/all-cids-properties.csv", usecols=range(5))
    # Only retain those rows corresponding to odorants in the embedding
    features = features.loc[embedding.index]
    show_features = show_features or list(features)

    index_name = features.index.name or 'Index'

    def format_features(row):
        # One "label: value" line per field, separated by HTML line breaks
        return '<br>'.join("%s: %s" % (label, value) for label, value in row.items())

    try:
        names = (
            features.loc[:, show_features]
            .rename_axis(index_name)
            .reset_index()
            .astype("str")
            .apply(format_features, axis=1)
        )
    except Exception:
        # Best effort: fall back to the bare index if the text cannot be built
        names = features.index

    assert embedding.shape[0] == features.shape[0]

    # The scatter plot
    scatter = go.Scatter(
        x=embedding.iloc[:, 0],
        y=embedding.iloc[:, 1],
        text=names,
        mode="markers",
        hoverinfo="text",
        opacity=0.5,
        marker={
            "size": 5,
            "line": {"width": 0.5, "color": "white"},
            "color": colors if colors is not None else "black",
            "colorscale": colorscale,
        },
    )

    # The axes, etc.
    hidden_axis = {"type": "linear", "title": "", "showline": False, "showticklabels": False}
    layout = go.Layout(
        xaxis=dict(hidden_axis),
        yaxis=dict(hidden_axis),
        margin={"l": 40, "b": 40, "t": 10, "r": 10},
        legend={"x": 0, "y": 1},
        hovermode="closest",
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        width=500,
        height=500,
    )

    fig = go.FigureWidget(data=[scatter], layout=layout)

    # The 2D drawing of the molecule
    image_widget = Image(
        value=smiles_to_image("CCCCO"), layout=Layout(height="300px", width="300px")
    )

    def hover_fn(trace, points, state):
        # Nothing to draw when the event carries no points
        if not points.point_inds:
            return
        ind = points.point_inds[0]
        smiles = features["SMILES"].iloc[ind]
        image_widget.value = smiles_to_image(smiles)

    # Register on the figure's own trace rather than the `scatter` object
    # that was passed in when the figure was built
    fig.data[0].on_hover(hover_fn)

    return VBox([fig, image_widget])