Esempio n. 1
0
# Keep only the rows whose compound name has an entry in `name_to_formula`,
# and trim the lookup table to the names that actually occur in `df`.
# `.copy()` yields an independent frame so the column assignments below do not
# write into a filtered slice of the original frame.
df = df[df.components.isin(name_to_formula.index)].copy()
name_to_formula = name_to_formula[name_to_formula.index.isin(df.components)]
df["formula"] = df.components.apply(lambda chemical: name_to_formula[chemical])

# Define the atoms that will be in the compounds you search for (could
# obviously be expanded: maybe also include nitrogen, fluorine, boron, iodine,
# etc. at some point). The search is done on formula names, so no need for
# case-sensitivity.
heavy_atoms = ["C", "O"]
desired_atoms = ["H"] + heavy_atoms

# Get counts of atoms in each compound, computed from its formula string.
df["n_atoms"] = df.formula.apply(thermoml_lib.count_atoms)
df["n_heavy_atoms"] = df.formula.apply(
    lambda formula_string: thermoml_lib.count_atoms_in_set(
        formula_string, heavy_atoms))
df["n_desired_atoms"] = df.formula.apply(
    lambda formula_string: thermoml_lib.count_atoms_in_set(
        formula_string, desired_atoms))
# How many atoms outside `desired_atoms` are in each row?
df["n_other_atoms"] = df.n_atoms - df.n_desired_atoms

# Get rid of rows containing atoms other than those desired.
df = df[df.n_other_atoms == 0]

# Define the desired size range of compounds by the number of heavy atoms they
# contain (1..40), then drop columns that are empty for every remaining row.
# `dropna` returns a new frame, so the SMILES assignment below does not write
# into a slice.
df = df[(df.n_heavy_atoms > 0) & (df.n_heavy_atoms <= 40)]
df = df.dropna(axis=1, how="all")

# Take the compound names and convert to SMILES for more refined filtering
# (I used cirpy, this can also be done with OpenEye).
df["SMILES"] = df.components.apply(lambda x: resolve_cached(x, "smiles"))
# Split components into separate columns (to use name_to_formula).
# The "__"-separated `components` string is split and its first two parts
# become `x1` and `x2`.
df["x1"], df["x2"] = zip(*df["components"].str.split("__").tolist())
# Assign the result back instead of `df['x2'].replace(..., inplace=True)`:
# the in-place call on a column selected from the frame may act on a temporary
# object and leave `df` itself unchanged.
df["x2"] = df["x2"].replace("", np.nan)
df.dropna(subset=["x2"], inplace=True)

# Strip rows not in liquid phase. `.copy()` makes the filtered frame
# independent so the column assignments below do not write into a slice.
df = df[df["phase"] == "Liquid"].copy()

# Look up the formula of each of the two components.
df["formula1"] = df.x1.apply(lambda chemical: name_to_formula[chemical])
df["formula2"] = df.x2.apply(lambda chemical: name_to_formula[chemical])

heavy_atoms = ["C", "O", "F", "N", "S", "B", "P", "Cl", "Br", "I"]
desired_atoms = ["H"] + heavy_atoms

# Atom counts for component 1, then component 2 (same column order as before:
# n_atoms, n_heavy_atoms, n_desired_atoms, n_other_atoms for each suffix).
for suffix in ("1", "2"):
    formulas = df["formula" + suffix]
    df["n_atoms" + suffix] = formulas.apply(thermoml_lib.count_atoms)
    df["n_heavy_atoms" + suffix] = formulas.apply(
        lambda formula_string: thermoml_lib.count_atoms_in_set(
            formula_string, heavy_atoms))
    df["n_desired_atoms" + suffix] = formulas.apply(
        lambda formula_string: thermoml_lib.count_atoms_in_set(
            formula_string, desired_atoms))
    # Atoms that are not in `desired_atoms`.
    df["n_other_atoms" + suffix] = (
        df["n_atoms" + suffix] - df["n_desired_atoms" + suffix])

# Both components must consist solely of desired atoms ...
df = df[(df.n_other_atoms1 == 0) & (df.n_other_atoms2 == 0)]

# ... and each must contain at least one heavy atom. Columns that are empty
# for every remaining row are dropped; `dropna` returns a new frame, so the
# assignment below does not write into a slice.
df = df[(df.n_heavy_atoms1 > 0) & (df.n_heavy_atoms2 > 0)]
df = df.dropna(axis=1, how="all")

df["SMILES1"] = df.x1.apply(lambda x: resolve_cached(x, "smiles"))  # This should be cached via sklearn.
Esempio n. 3
0
# NOTE(review): absolute, user-specific path; left unchanged here.
name_to_formula = pd.read_hdf("/home/bmanubay/.thermoml/compound_name_to_formula.h5", 'data')
name_to_formula = name_to_formula.dropna()

# Extract rows with a single component (the name splits into exactly one part
# on "__"), then drop columns that are empty for every remaining row.
df["n_components"] = df.components.apply(lambda x: len(x.split("__")))
df = df[df.n_components == 1]
df = df.dropna(axis=1, how="all")

df["formula"] = df.components.apply(lambda chemical: name_to_formula[chemical])

heavy_atoms = ["C", "O"]
desired_atoms = ["H"] + heavy_atoms

# Atom counts computed from each formula string.
df["n_atoms"] = df.formula.apply(thermoml_lib.count_atoms)
df["n_heavy_atoms"] = df.formula.apply(
    lambda formula_string: thermoml_lib.count_atoms_in_set(formula_string, heavy_atoms))
df["n_desired_atoms"] = df.formula.apply(
    lambda formula_string: thermoml_lib.count_atoms_in_set(formula_string, desired_atoms))
df["n_other_atoms"] = df.n_atoms - df.n_desired_atoms

# Keep rows made solely of desired atoms.
df = df[df.n_other_atoms == 0]

# Keep rows with 1..10 heavy atoms; `dropna` returns a new frame, so the
# `smiles` assignment below does not write into a slice.
df = df[(df.n_heavy_atoms > 0) & (df.n_heavy_atoms <= 10)]
df = df.dropna(axis=1, how="all")

df["smiles"] = df.components.apply(lambda x: resolve_cached(x, "smiles"))  # This should be cached via sklearn.
# `df.smiles != None` compares elementwise and is True for every row, so it
# never removed anything; `notna()` drops the rows without a SMILES string.
df = df[df.smiles.notna()]
# Getting rid of data sets with C=O and C=C occurrences (and any "#").
# The fragments are matched literally; `na=True` keeps the original outcome of
# discarding any row whose value cannot be searched.
for fragment in ("=O", "#", "O=", "=C"):
    df = df[~df["smiles"].str.contains(fragment, regex=False, na=True)]
Esempio n. 4
0
def test_count_atoms_in_set():
    """Check that `formula` holds 8 atoms in total among C and H.

    `formula` is defined outside this snippet.
    """
    elements = ["C", "H"]
    assert thermoml_lib.count_atoms_in_set(formula, elements) == 8
Esempio n. 5
0
def test_count_atoms_in_set():
    """Check that `formula` holds 8 atoms in total among C and H.

    `formula` is defined outside this snippet.
    """
    elements = ["C", "H"]
    assert thermoml_lib.count_atoms_in_set(formula, elements) == 8
Esempio n. 6
0
name_to_formula = name_to_formula.dropna()

# Keep rows with a single component (the name splits into exactly one part on
# "__"), then drop columns that are empty for every remaining row.
X["n_components"] = X.components.apply(lambda x: len(x.split("__")))
X = X[X.n_components == 1]
X = X.dropna(axis=1, how="all")

# Per-stage non-null counts of the `experiments` columns, keyed by stage name.
counts_data = {}
counts_data["0.  Single Component"] = X.count()[experiments]

X["formula"] = X.components.apply(lambda chemical: name_to_formula[chemical])

heavy_atoms = ["N", "C", "O", "S", "Cl", "Br", "F"]
desired_atoms = ["H"] + heavy_atoms

# Atom counts computed from each formula string.
X["n_atoms"] = X.formula.apply(thermoml_lib.count_atoms)
X["n_heavy_atoms"] = X.formula.apply(
    lambda formula_string: thermoml_lib.count_atoms_in_set(formula_string, heavy_atoms))
X["n_desired_atoms"] = X.formula.apply(
    lambda formula_string: thermoml_lib.count_atoms_in_set(formula_string, desired_atoms))
X["n_other_atoms"] = X.n_atoms - X.n_desired_atoms

# Keep rows made solely of desired atoms.
X = X[X.n_other_atoms == 0]

counts_data["1.  Druglike Elements"] = X.count()[experiments]

# Keep rows with 1..10 heavy atoms; `dropna` returns a new frame, so the
# `smiles` assignment below does not write into a slice.
X = X[(X.n_heavy_atoms > 0) & (X.n_heavy_atoms <= 10)]
X = X.dropna(axis=1, how="all")

counts_data["2.  Heavy Atoms"] = X.count()[experiments]

X["smiles"] = X.components.apply(lambda x: cirpy.resolve(x, "smiles"))  # This should be cached via sklearn.
# `X.smiles != None` compares elementwise and is True for every row, so rows
# without a resolved SMILES were never removed; `notna()` drops them.
X = X[X.smiles.notna()]