def test_truncate_n_molecules():
    """Check that molecule counts are only truncated when explicitly requested.

    A dilute NaCl solution is requested with 1000 molecules. Without
    truncation the request is expected to raise a ``ValueError``; with
    truncation the expected counts are 3 / 3 / 994.
    """
    ion_fraction = 0.00267

    substance = Substance()

    for ion_smiles in ("[Na+]", "[Cl-]"):
        substance.add_component(
            component=Component(smiles=ion_smiles),
            amount=MoleFraction(ion_fraction),
        )

    substance.add_component(
        component=Component(smiles="O"),
        amount=MoleFraction(1.0 - 2.0 * ion_fraction),
    )

    # Without truncation the request must be rejected.
    with pytest.raises(ValueError):
        substance.get_molecules_per_component(1000, truncate_n_molecules=False)

    # With truncation the counts are returned.
    truncated_counts = substance.get_molecules_per_component(
        1000, truncate_n_molecules=True
    )
    expected_counts = {"[Na+]{solv}": 3, "[Cl-]{solv}": 3, "O{solv}": 994}

    assert truncated_counts == expected_counts
def test_solvate_existing_structure_protocol():
    """Tests solvating a single methanol molecule in water."""
    import mdtraj

    methanol = Component("CO")

    solute = Substance()
    solute.add_component(methanol, ExactAmount(1))

    solvent = Substance()
    solvent.add_component(Component("O"), MoleFraction(1.0))

    with tempfile.TemporaryDirectory() as working_directory:

        # Build the lone methanol molecule first.
        build_solute = BuildCoordinatesPackmol("build_methanol")
        build_solute.max_molecules = 1
        build_solute.substance = solute
        build_solute.execute(working_directory, ComputeResources())

        solute_residue_name = build_solute.assigned_residue_names[
            methanol.identifier
        ]

        # Then add nine waters around the existing structure.
        solvate = SolvateExistingStructure("solvate_methanol")
        solvate.max_molecules = 9
        solvate.substance = solvent
        solvate.solute_coordinate_file = build_solute.coordinate_file_path
        solvate.execute(working_directory, ComputeResources())

        solvated = mdtraj.load_pdb(solvate.coordinate_file_path)

        assert solvated.n_residues == 10
        assert solvated.top.residue(0).name == solute_residue_name
def test_multiple_amounts():
    """Check components that carry both a mole fraction and an exact amount."""
    sodium = Component("[Na+]")
    chloride = Component("[Cl-]")

    substance = Substance()

    for component, fraction in ((sodium, 0.75), (chloride, 0.25)):
        substance.add_component(component, MoleFraction(fraction))
        substance.add_component(component, ExactAmount(1))

    assert substance.number_of_components == 2

    # Each component should retain both kinds of amount.
    assert len(substance.get_amounts(sodium)) == 2
    assert len(substance.get_amounts(chloride)) == 2

    counts = substance.get_molecules_per_component(6)

    assert len(counts) == 2
    assert counts[sodium.identifier] == 4
    assert counts[chloride.identifier] == 2
def dummy_complex() -> Substance:
    """Return a substance with one ``C`` ligand and one ``CO`` receptor molecule."""
    members = (
        ("C", Component.Role.Ligand),
        ("CO", Component.Role.Receptor),
    )

    complex_substance = Substance()

    for smiles, role in members:
        complex_substance.add_component(
            Component(smiles=smiles, role=role), ExactAmount(1)
        )

    return complex_substance
def create_dummy_substance(number_of_components, elements=None):
    """Creates a substance with a given number of equimolar components.

    The SMILES pattern of the n-th component (counting from one) is the
    ``elements`` list repeated n times and joined into a single string.

    Parameters
    ----------
    number_of_components : int
        The number of components to add to the substance.
    elements : list of str
        The elements that each component should contain. Defaults
        to ``["C"]`` when not provided.

    Returns
    -------
    Substance
        The created substance.
    """
    repeat_unit = ["C"] if elements is None else elements

    dummy_substance = Substance()
    equal_fraction = 1.0 / number_of_components

    for n_repeats in range(1, number_of_components + 1):
        pattern = "".join(repeat_unit * n_repeats)
        dummy_substance.add_component(Component(pattern), MoleFraction(equal_fraction))

    return dummy_substance
def from_components(cls, *components):
    """Build a new substance in which every component has the same mole fraction.

    Parameters
    ----------
    components: Component or str
        The components to add to the substance. Strings are treated as
        SMILES patterns and wrapped in a ``Component``; any other object
        is passed on unchanged.

    Returns
    -------
    Substance
        The substance containing the requested components in equal amounts.

    Raises
    ------
    ValueError
        If no components are provided.
    """
    if not components:
        raise ValueError("At least one component must be specified")

    equal_fraction = 1.0 / len(components)
    new_substance = cls()

    for entry in components:
        resolved = Component(smiles=entry) if isinstance(entry, str) else entry
        new_substance.add_component(resolved, MoleFraction(equal_fraction))

    return new_substance
def test_build_docked_coordinates_protocol():
    """Tests docking a methanol molecule into alpha-Cyclodextrin."""
    if not has_openeye():
        pytest.skip("The `BuildDockedCoordinates` protocol requires OpenEye.")

    ligand = Component("CO", role=Component.Role.Ligand)

    ligand_substance = Substance()
    ligand_substance.add_component(ligand, ExactAmount(1))

    # TODO: This test could likely be made substantially faster
    #       by storing the binary prepared receptor. Would this
    #       be in breach of any oe license terms?
    receptor_path = get_data_filename("test/molecules/acd.mol2")

    with tempfile.TemporaryDirectory() as working_directory:

        dock = BuildDockedCoordinates("build_methanol")
        dock.ligand_substance = ligand_substance
        dock.number_of_ligand_conformers = 5
        dock.receptor_coordinate_file = receptor_path
        dock.execute(working_directory, ComputeResources())

        docked_complex = PDBFile(dock.docked_complex_coordinate_path)

        # One residue for the ligand and one for the receptor.
        assert docked_complex.topology.getNumResidues() == 2
def test_add_mole_fractions():
    """Check that adding the same component twice sums its mole fractions."""
    substance = Substance()

    for _ in range(2):
        substance.add_component(Component("C"), MoleFraction(0.5))

    assert substance.number_of_components == 1

    merged_amounts = substance.get_amounts(substance.components[0])
    assert len(merged_amounts) == 1

    merged_amount = next(iter(merged_amounts))

    assert isinstance(merged_amount, MoleFraction)
    assert np.isclose(merged_amount.value, 1.0)
def test_build_coordinates_packmol_exact(count_exact_amount):
    """Tests that the build coordinate protocol behaves correctly for
    substances with exact amounts.

    The molecule limit is chosen so that 11 residues are expected
    whichever value ``count_exact_amount`` takes.
    """
    import mdtraj

    substance = Substance()
    substance.add_component(Component("O"), MoleFraction(1.0))
    substance.add_component(Component("C"), ExactAmount(1))

    if count_exact_amount:
        molecule_limit = 11
    else:
        molecule_limit = 10

    protocol = BuildCoordinatesPackmol("build_coordinates")
    protocol.max_molecules = molecule_limit
    protocol.count_exact_amount = count_exact_amount
    protocol.substance = substance

    with tempfile.TemporaryDirectory() as directory:
        protocol.execute(directory)
        built_system = mdtraj.load_pdb(protocol.coordinate_file_path)

    assert built_system.n_residues == 11
def data_frame() -> pandas.DataFrame:
    """Build a data frame of dummy density and enthalpy of mixing entries.

    Entries are generated for every combination of temperature, pressure,
    property type and composition; each composition is paired with every
    SMILES tuple that has the same number of components.
    """
    temperatures = [298.15, 318.15]
    pressures = [101.325, 101.0]

    property_types = [Density, EnthalpyOfMixing]

    compositions = [(1.0,), (1.0,), (0.25, 0.75), (0.75, 0.25)]
    # SMILES tuples keyed by the number of components in the substance.
    smiles_by_size = {
        1: [("C(F)(Cl)(Br)",), ("C",)],
        2: [("CO", "C"), ("C", "CO")],
    }

    entries = []

    for temperature in temperatures:
        for pressure in pressures:
            for property_type in property_types:
                for composition in compositions:

                    for smiles_tuple in smiles_by_size[len(composition)]:

                        substance = Substance()

                        for pattern, fraction in zip(smiles_tuple, composition):
                            substance.add_component(
                                Component(pattern), MoleFraction(fraction)
                            )

                        state = ThermodynamicState(
                            temperature=temperature * unit.kelvin,
                            pressure=pressure * unit.kilopascal,
                        )

                        entries.append(
                            property_type(
                                thermodynamic_state=state,
                                phase=PropertyPhase.Liquid,
                                value=1.0 * property_type.default_unit(),
                                uncertainty=1.0 * property_type.default_unit(),
                                source=MeasurementSource(doi=" "),
                                substance=substance,
                            )
                        )

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*entries)

    return data_set.to_pandas()
def create_substance():
    """Return a substance with one component for each of the solute, ligand,
    receptor and solvent roles."""
    contents = (
        ("C", Component.Role.Solute, ExactAmount(1)),
        ("CC", Component.Role.Ligand, ExactAmount(1)),
        ("CCC", Component.Role.Receptor, ExactAmount(1)),
        ("O", Component.Role.Solvent, MoleFraction(1.0)),
    )

    test_substance = Substance()

    for smiles, role, amount in contents:
        test_substance.add_component(Component(smiles, role=role), amount)

    return test_substance
def _build_input_output_substances():
    """Builds sets of input and expected substances for the
    `test_build_coordinate_composition` test.

    Returns
    -------
    list of tuple of Substance and Substance
        A list of input and expected substances.
    """

    def binary_mixture(o_fraction, c_fraction):
        # A two component substance of "O" and "C" with the given fractions.
        mixture = Substance()
        mixture.add_component(Component("O"), MoleFraction(o_fraction))
        mixture.add_component(Component("C"), MoleFraction(c_fraction))
        return mixture

    # Start with some easy cases where the expected substance
    # has the same composition as the input.
    substances = [
        (Substance.from_components(*smiles), Substance.from_components(*smiles))
        for smiles in (("O",), ("O", "C"), ("O", "C", "CO"))
    ]

    # Handle some cases where rounding will need to occur.
    rounding_cases = (
        ((0.41, 0.59), (0.4, 0.6)),
        ((0.59, 0.41), (0.6, 0.4)),
    )

    for input_fractions, expected_fractions in rounding_cases:
        substances.append(
            (binary_mixture(*input_fractions), binary_mixture(*expected_fractions))
        )

    return substances
def _build_substance(
    guest_smiles: Optional[str],
    host_smiles: str,
    ionic_strength: Optional[unit.Quantity],
    negative_buffer_ion: str = "[Cl-]",
    positive_buffer_ion: str = "[Na+]",
):
    """Builds a substance containing a ligand and receptor solvated in an
    aqueous solution with a given ionic strength.

    Parameters
    ----------
    guest_smiles
        The SMILES descriptor of the guest. If ``None``, no guest
        is added to the substance.
    host_smiles
        The SMILES descriptor of the host.
    ionic_strength
        The ionic strength of the aqueous solvent. If ``None``, no
        buffer ions are added and the solvent is pure water.
    negative_buffer_ion
        The SMILES descriptor of the negative buffer ion, which is also
        used as the counter ion for a positively charged host / guest.
    positive_buffer_ion
        The SMILES descriptor of the positive buffer ion, which is also
        used as the counter ion for a negatively charged host / guest.

    Returns
    -------
        The built substance.
    """
    from openff.toolkit.topology import Molecule
    from simtk import unit as simtk_unit

    substance = Substance()

    if guest_smiles is not None:
        guest = Component(smiles=guest_smiles, role=Component.Role.Ligand)
        substance.add_component(component=guest, amount=ExactAmount(1))

    host = Component(smiles=host_smiles, role=Component.Role.Receptor)
    substance.add_component(component=host, amount=ExactAmount(1))

    water = Component(smiles="O", role=Component.Role.Solvent)
    positive_ion = Component(smiles=positive_buffer_ion, role=Component.Role.Solvent)
    negative_ion = Component(smiles=negative_buffer_ion, role=Component.Role.Solvent)

    water_mole_fraction = 1.0

    if ionic_strength is not None:

        salt_mole_fraction = Substance.calculate_aqueous_ionic_mole_fraction(
            ionic_strength
        )

        if isinstance(salt_mole_fraction, unit.Quantity):
            # noinspection PyUnresolvedReferences
            salt_mole_fraction = salt_mole_fraction.magnitude

        # Each buffer ion takes up the same fraction, the rest is water.
        water_mole_fraction = 1.0 - salt_mole_fraction * 2

        substance.add_component(
            component=positive_ion,
            amount=MoleFraction(salt_mole_fraction),
        )
        substance.add_component(
            component=negative_ion,
            amount=MoleFraction(salt_mole_fraction),
        )

    substance.add_component(
        component=water, amount=MoleFraction(water_mole_fraction)
    )

    # Neutralise the net charge of the host (and guest) with counter ions.
    host_molecule_charge = Molecule.from_smiles(host_smiles).total_charge
    guest_molecule_charge = (
        0.0 * simtk_unit.elementary_charge
        if guest_smiles is None
        else Molecule.from_smiles(guest_smiles).total_charge
    )

    net_charge = (host_molecule_charge + guest_molecule_charge).value_in_unit(
        simtk_unit.elementary_charge
    )

    # Round to the nearest integer rather than truncating towards zero. The
    # branches below accept charges which are within 1e-4 of an integer, and
    # truncating such a value (e.g. -0.99995) would yield one counter ion too
    # few - including an exact amount of zero.
    n_counter_ions = abs(int(round(net_charge)))

    if net_charge <= -0.9999:
        substance.add_component(positive_ion, ExactAmount(n_counter_ions))
    elif net_charge >= 0.9999:
        substance.add_component(negative_ion, ExactAmount(n_counter_ions))

    return substance
def _apply(
    cls,
    data_frame: pandas.DataFrame,
    schema: ImportFreeSolvSchema,
    n_processes,
) -> pandas.DataFrame:
    """Append the FreeSolv solvation free energies to ``data_frame``.

    Each row of the frame returned by ``cls._download_free_solv`` is turned
    into a ``SolvationFreeEnergy`` of a single solute molecule in water at
    298.15 K and 101.325 kPa. The entries are converted to a data frame
    which is concatenated onto the end of ``data_frame``.

    Parameters
    ----------
    data_frame
        The data frame to append the FreeSolv entries to.
    schema
        The schema of this component. It is not read by this method.
    n_processes
        Not used by this method.

    Returns
    -------
        The concatenated data frame.
    """
    from openff.evaluator import properties, substances, unit

    value_column = "experimental value (kcal/mol)"
    uncertainty_column = "experimental uncertainty (kcal/mol)"
    reference_column = (
        "experimental reference (original or paper this value was taken from)"
    )

    # Convert the data frame into data rows.
    raw_frame = cls._download_free_solv()

    entries = []

    for _, row in raw_frame.iterrows():

        # Extract and standardize the SMILES pattern of the solute.
        solute_smiles = substances.Component(row["SMILES"].strip()).smiles

        # Build the substance: one solute molecule in water.
        solute = Component(smiles=solute_smiles, role=Component.Role.Solute)

        substance = Substance()
        substance.add_component(Component(smiles="O"), MoleFraction(1.0))
        substance.add_component(solute, ExactAmount(1))

        # Extract the value and uncertainty.
        value = float(row[value_column]) * unit.kilocalorie / unit.mole
        std_error = float(row[uncertainty_column]) * unit.kilocalorie / unit.mole

        # Attempt to extract a DOI.
        doi = cls._validate_doi(row[reference_column])

        state = ThermodynamicState(
            temperature=298.15 * unit.kelvin,
            pressure=101.325 * unit.kilopascal,
        )

        entries.append(
            SolvationFreeEnergy(
                thermodynamic_state=state,
                phase=PropertyPhase.Liquid,
                substance=substance,
                value=value.to(properties.SolvationFreeEnergy.default_unit()),
                uncertainty=std_error.to(
                    properties.SolvationFreeEnergy.default_unit()
                ),
                source=MeasurementSource(doi=doi),
            )
        )

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*entries)

    return pandas.concat(
        [data_frame, data_set.to_pandas()], ignore_index=True, sort=False
    )
def main():
    """Convert the legacy ``propertyestimator`` JSON data sets found in
    ``raw_data`` into ``PhysicalPropertyDataSet`` objects, and store each one
    in ``raw_data_v2`` as both a JSON and a CSV file."""
    data_set_type = "propertyestimator.datasets.datasets.PhysicalPropertyDataSet"

    mole_fraction_type = "propertyestimator.substances.Substance->MoleFraction"
    exact_amount_type = "propertyestimator.substances.Substance->ExactAmount"

    calculation_source_type = (
        "propertyestimator.properties.properties.CalculationSource"
    )
    measurement_source_type = (
        "propertyestimator.properties.properties.MeasurementSource"
    )

    data_set_names = (
        "curated_data_set",
        "gaff 1.81",
        "gaff 2.11",
        "parsley 1.0.0",
        "smirnoff99frosst 1.1.0",
    )

    os.makedirs("raw_data_v2", exist_ok=True)

    for data_set_name in data_set_names:

        with open(os.path.join("raw_data", f"{data_set_name}.json")) as file:
            raw_data_set = json.load(file)

        assert raw_data_set["@type"] == data_set_type

        physical_properties = []

        for raw_entries in raw_data_set["properties"].values():

            for raw_entry in raw_entries:

                # Extract the substance this entry was measured for.
                substance = Substance()

                for raw_component in raw_entry["substance"]["components"]:

                    component = Component(
                        smiles=raw_component["smiles"],
                        role=Component.Role[raw_component["role"]["value"]],
                    )

                    # The amounts are keyed by the SMILES of the component.
                    raw_amounts = raw_entry["substance"]["amounts"][
                        raw_component["smiles"]
                    ]

                    for raw_amount in raw_amounts["value"]:

                        if raw_amount["@type"] == mole_fraction_type:
                            amount = MoleFraction(raw_amount["value"])
                        elif raw_amount["@type"] == exact_amount_type:
                            amount = ExactAmount(raw_amount["value"])
                        else:
                            raise NotImplementedError()

                        substance.add_component(component, amount)

                # Extract the source of the property.
                raw_source = raw_entry["source"]

                if raw_source["@type"] == calculation_source_type:
                    source = CalculationSource(fidelity=raw_source["fidelity"])
                elif raw_source["@type"] == measurement_source_type:
                    source = MeasurementSource(
                        doi=correct_doi(raw_source["reference"])
                    )
                else:
                    raise NotImplementedError()

                # Generate the new property object.
                property_class = getattr(
                    properties, raw_entry["@type"].split(".")[-1]
                )

                raw_temperature = raw_entry["thermodynamic_state"]["temperature"]
                temperature = raw_temperature["value"] * unit.Unit(
                    raw_temperature["unit"]
                )

                raw_pressure = raw_entry["thermodynamic_state"]["pressure"]
                pressure = raw_pressure["value"] * unit.Unit(raw_pressure["unit"])

                thermodynamic_state = ThermodynamicState(
                    temperature=temperature, pressure=pressure
                )
                phase = PropertyPhase(raw_entry["phase"])

                value = raw_entry["value"]["value"] * unit.Unit(
                    raw_entry["value"]["unit"]
                )

                # Uncertainties are only carried over for calculated values.
                if isinstance(source, MeasurementSource):
                    uncertainty = None
                else:
                    uncertainty = raw_entry["uncertainty"]["value"] * unit.Unit(
                        raw_entry["uncertainty"]["unit"]
                    )

                physical_property = property_class(
                    thermodynamic_state=thermodynamic_state,
                    phase=phase,
                    substance=substance,
                    value=value,
                    uncertainty=uncertainty,
                    source=source,
                )
                physical_property.id = raw_entry["id"]

                physical_properties.append(physical_property)

        data_set = PhysicalPropertyDataSet()
        data_set.add_properties(*physical_properties)

        data_set.json(
            os.path.join("raw_data_v2", f"{data_set_name}.json"), format=True
        )
        data_set.to_pandas().to_csv(
            os.path.join("raw_data_v2", f"{data_set_name}.csv")
        )
def test_solvation_yank_protocol(solvent_smiles):
    """Set up (without running) a YANK solvation free energy calculation of
    methanol between the given solvent and vacuum."""

    def methanol_solute():
        return Component(smiles="CO", role=Component.Role.Solute)

    def solvent():
        return Component(smiles=solvent_smiles, role=Component.Role.Solvent)

    full_substance = Substance()
    full_substance.add_component(methanol_solute(), ExactAmount(1))
    full_substance.add_component(solvent(), MoleFraction(1.0))

    solvent_substance = Substance()
    solvent_substance.add_component(solvent(), MoleFraction(1.0))

    solute_substance = Substance()
    solute_substance.add_component(methanol_solute(), ExactAmount(1))

    thermodynamic_state = ThermodynamicState(
        temperature=298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
    )

    with tempfile.TemporaryDirectory() as directory, temporarily_change_directory(
        directory
    ):

        force_field_path = "ff.json"

        with open(force_field_path, "w") as file:
            file.write(build_tip3p_smirnoff_force_field().json())

        solvated_coordinate_path, solvated_system = _setup_dummy_system(
            "full", full_substance, 2, force_field_path
        )
        vacuum_coordinate_path, vacuum_system = _setup_dummy_system(
            "vacuum", solute_substance, 1, force_field_path
        )

        protocol = SolvationYankProtocol("yank")

        # The system being transferred, and the two environments.
        protocol.solute = solute_substance
        protocol.solvent_1 = solvent_substance
        protocol.solvent_2 = Substance()
        protocol.thermodynamic_state = thermodynamic_state

        # Keep the run as short as possible.
        protocol.number_of_iterations = 1
        protocol.steps_per_iteration = 1
        protocol.checkpoint_interval = 1
        protocol.verbose = True
        protocol.setup_only = True

        protocol.solution_1_coordinates = solvated_coordinate_path
        protocol.solution_1_system = solvated_system
        protocol.solution_2_coordinates = vacuum_coordinate_path
        protocol.solution_2_system = vacuum_system

        protocol.electrostatic_lambdas_1 = [1.00]
        protocol.steric_lambdas_1 = [1.00]
        protocol.electrostatic_lambdas_2 = [1.00]
        protocol.steric_lambdas_2 = [1.00]

        protocol.execute("", ComputeResources())
def test_ligand_receptor_yank_protocol():
    """Set up (without running) a YANK ligand-receptor calculation of methane
    binding to benzene in water."""

    def ligand():
        return Component(smiles="C", role=Component.Role.Ligand)

    def water():
        return Component(smiles="O", role=Component.Role.Solvent)

    receptor = Component(smiles="c1ccccc1", role=Component.Role.Receptor)

    full_substance = Substance()
    full_substance.add_component(receptor, ExactAmount(1))
    full_substance.add_component(ligand(), ExactAmount(1))
    full_substance.add_component(water(), MoleFraction(1.0))

    solute_substance = Substance()
    solute_substance.add_component(ligand(), ExactAmount(1))
    solute_substance.add_component(water(), MoleFraction(1.0))

    thermodynamic_state = ThermodynamicState(
        temperature=298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
    )

    with tempfile.TemporaryDirectory() as directory, temporarily_change_directory(
        directory
    ):

        force_field_path = "ff.json"

        with open(force_field_path, "w") as file:
            file.write(build_tip3p_smirnoff_force_field().json())

        complex_coordinate_path, complex_system = _setup_dummy_system(
            "full", full_substance, 3, force_field_path
        )
        ligand_coordinate_path, ligand_system = _setup_dummy_system(
            "ligand", solute_substance, 2, force_field_path
        )

        protocol = LigandReceptorYankProtocol("yank")

        protocol.substance = full_substance
        protocol.thermodynamic_state = thermodynamic_state

        # Keep the run as short as possible.
        protocol.number_of_iterations = 1
        protocol.steps_per_iteration = 1
        protocol.checkpoint_interval = 1
        protocol.verbose = True
        protocol.setup_only = True

        protocol.ligand_residue_name = "TMP"
        protocol.receptor_residue_name = "TMP"

        protocol.solvated_ligand_coordinates = ligand_coordinate_path
        protocol.solvated_ligand_system = ligand_system
        protocol.solvated_complex_coordinates = complex_coordinate_path
        protocol.solvated_complex_system = complex_system

        protocol.force_field_path = force_field_path

        protocol.execute("", ComputeResources())
def _generate_residue_name(residue, smiles):
    """Assigns a name to a residue which corresponds to a particular
    SMILES pattern.

    Amino acids, water and single atom molecules receive a fixed name;
    any other molecule is given a random three letter name which does not
    clash with a set of reserved residue names. The atoms of water and of
    randomly named residues are renamed as well.

    Parameters
    ----------
    residue: mdtraj.core.topology.Residue
        The residue to assign the name to. It is modified in place.
    smiles: str
        The SMILES pattern to generate a residue name for.
    """
    from mdtraj.core import residue_names
    from openff.toolkit.topology import Molecule

    # Define the set of residue names which should be discarded
    # if randomly generated as they have a reserved meaning.
    # noinspection PyProtectedMember
    reserved_names = {
        *residue_names._AMINO_ACID_CODES,
        *residue_names._SOLVENT_TYPES,
        *residue_names._WATER_RESIDUES,
        "ADE",
        "CYT",
        "CYX",
        "DAD",
        "DGU",
        "FOR",
        "GUA",
        "HID",
        "HIE",
        "HIH",
        "HSD",
        "HSH",
        "HSP",
        "NMA",
        "THY",
        "URA",
    }

    # Standardized amino acid SMILES -> three letter residue code.
    amino_acid_codes = {
        "C[C@H](N)C(=O)O": "ALA",
        "N=C(N)NCCC[C@H](N)C(=O)O": "ARG",
        "NC(=O)C[C@H](N)C(=O)O": "ASN",
        "N[C@@H](CC(=O)O)C(=O)O": "ASP",
        "N[C@@H](CS)C(=O)O": "CYS",
        "N[C@@H](CCC(=O)O)C(=O)O": "GLU",
        "NC(=O)CC[C@H](N)C(=O)O": "GLN",
        "NCC(=O)O": "GLY",
        "N[C@@H](Cc1c[nH]cn1)C(=O)O": "HIS",
        "CC[C@H](C)[C@H](N)C(=O)O": "ILE",
        "CC(C)C[C@H](N)C(=O)O": "LEU",
        "NCCCC[C@H](N)C(=O)O": "LYS",
        "CSCC[C@H](N)C(=O)O": "MET",
        "N[C@@H](Cc1ccccc1)C(=O)O": "PHE",
        "O=C(O)[C@@H]1CCCN1": "PRO",
        "N[C@@H](CO)C(=O)O": "SER",
        "C[C@@H](O)[C@H](N)C(=O)O": "THR",
        "N[C@@H](Cc1c[nH]c2ccccc12)C(=O)O": "TRP",
        "N[C@@H](Cc1ccc(O)cc1)C(=O)O": "TYR",
        "CC(C)[C@H](N)C(=O)O": "VAL",
    }

    standardized_smiles = Component(smiles=smiles).smiles

    # Amino acids only have their residue renamed.
    if standardized_smiles in amino_acid_codes:
        residue.name = amino_acid_codes[standardized_smiles]
        return

    # Water.
    if standardized_smiles == "O":

        residue.name = "HOH"

        # Re-assign the water atom names. These need to be set to get
        # correct CONECT statements.
        n_hydrogens = 0

        for atom in residue.atoms:

            if atom.element.symbol == "O":
                atom.name = "O1"
                continue

            n_hydrogens += 1
            atom.name = f"H{n_hydrogens}"

        return

    # Single atom molecules (ions) - the atom shares the residue name.
    openff_molecule = Molecule.from_smiles(smiles, allow_undefined_stereo=True)

    if openff_molecule.n_atoms == 1:
        residue.name = _ion_residue_name(openff_molecule)
        residue.atom(0).name = residue.name
        return

    # Anything else: draw random three letter names until one is found
    # which does not have a reserved meaning.
    while True:

        candidate_name = "".join(
            [random.choice(string.ascii_uppercase) for _ in range(3)]
        )

        if candidate_name not in reserved_names:
            break

    residue.name = candidate_name

    # Assign unique atom names by numbering the atoms of each element.
    atoms_per_element = defaultdict(int)

    for atom in residue.atoms:
        symbol = atom.element.symbol
        atoms_per_element[symbol] += 1
        atom.name = f"{symbol}{atoms_per_element[symbol]}"
def complete_evaluator_data_set():
    """Create a more comprehensive `PhysicalPropertyDataSet` which contains one
    measurement for each of:

        * pure density
        * binary density
        * pure enthalpy of vaporization
        * binary enthalpy of mixing
        * binary excess molar volume
        * hydration free energy

    The properties are given the ids "1" to "6", in the order listed above.

    Returns
    -------
    PhysicalPropertyDataSet
    """
    state = ThermodynamicState(298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere)
    source = MeasurementSource(doi="10.1000/xyz123")

    # A single propanol molecule solvated in water.
    solvation_substance = Substance()
    solvation_substance.add_component(Component("O"), MoleFraction(1.0))
    solvation_substance.add_component(Component("CCCO"), ExactAmount(1))

    pure_density = Density(
        thermodynamic_state=state,
        phase=PropertyPhase.Liquid,
        substance=Substance.from_components("O"),
        value=1.0 * unit.kilogram / unit.meter**3,
        uncertainty=0.1 * unit.kilogram / unit.meter**3,
        source=source,
    )
    binary_density = Density(
        thermodynamic_state=state,
        phase=PropertyPhase.Liquid,
        substance=Substance.from_components("O", "CC=O"),
        value=1.0 * unit.kilogram / unit.meter**3,
        uncertainty=0.1 * unit.kilogram / unit.meter**3,
        source=source,
    )
    enthalpy_of_vaporization = EnthalpyOfVaporization(
        thermodynamic_state=state,
        phase=PropertyPhase(PropertyPhase.Liquid | PropertyPhase.Gas),
        substance=Substance.from_components("CCO"),
        value=1.0 * EnthalpyOfVaporization.default_unit(),
        uncertainty=0.1 * EnthalpyOfVaporization.default_unit(),
        source=source,
    )
    enthalpy_of_mixing = EnthalpyOfMixing(
        thermodynamic_state=state,
        phase=PropertyPhase.Liquid,
        substance=Substance.from_components("CCCCO", "CC(C=O)C"),
        value=1.0 * EnthalpyOfMixing.default_unit(),
        uncertainty=0.1 * EnthalpyOfMixing.default_unit(),
        source=source,
    )
    excess_molar_volume = ExcessMolarVolume(
        thermodynamic_state=state,
        phase=PropertyPhase.Liquid,
        substance=Substance.from_components("C(=O)CCCO", "CCCCCC"),
        value=1.0 * ExcessMolarVolume.default_unit(),
        uncertainty=0.1 * ExcessMolarVolume.default_unit(),
        source=source,
    )
    solvation_free_energy = SolvationFreeEnergy(
        thermodynamic_state=state,
        phase=PropertyPhase.Liquid,
        substance=solvation_substance,
        value=1.0 * SolvationFreeEnergy.default_unit(),
        uncertainty=0.1 * SolvationFreeEnergy.default_unit(),
        source=source,
    )

    evaluator_properties = [
        pure_density,
        binary_density,
        enthalpy_of_vaporization,
        enthalpy_of_mixing,
        excess_molar_volume,
        solvation_free_energy,
    ]

    # Give each property a predictable, one-based id.
    for position, evaluator_property in enumerate(evaluator_properties, start=1):
        evaluator_property.id = str(position)

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*evaluator_properties)

    return data_set
def define_data_set(reweighting: bool) -> PhysicalPropertyDataSet:
    """Define the set of properties to estimate.

    Excess molar volumes, enthalpies of mixing, densities, enthalpies of
    vaporization and dielectric constants are added at three temperatures.
    Solvation free energies (two hand-built entries plus the FreeSolv
    entries which pass the ``("O", "CO")`` substance filter) are only added
    when ``reweighting`` is false.

    Parameters
    ----------
    reweighting
        Whether the data set will be estimated by reweighting, in which
        case no solvation free energies are included.

    Returns
    -------
        The defined data set.
    """
    # Define the common states to compute estimates at.
    states = [
        ThermodynamicState(
            temperature=temperature * unit.kelvin, pressure=1.0 * unit.atmosphere
        )
        for temperature in (296.15, 298.15, 300.15)
    ]

    data_set = PhysicalPropertyDataSet()

    # Solvation free energies.
    if not reweighting:

        solvation_properties = []

        # (solvent, solute) pairs: ethanal in ethanol and ethanol in ethanal.
        for solvent_smiles, solute_smiles in (("CCO", "CC=O"), ("CC=O", "CCO")):

            solution = Substance.from_components(solvent_smiles)
            solution.add_component(
                Component(solute_smiles, Component.Role.Solute), ExactAmount(1)
            )

            solvation_properties.append(
                SolvationFreeEnergy(
                    thermodynamic_state=states[1],
                    phase=PropertyPhase.Liquid,
                    substance=solution,
                    value=0.0 * SolvationFreeEnergy.default_unit(),
                )
            )

        free_solv_properties = CurationWorkflow.apply(
            PhysicalPropertyDataSet(),
            CurationWorkflowSchema(
                component_schemas=[
                    ImportFreeSolvSchema(),
                    FilterBySubstancesSchema(substances_to_include=[("O", "CO")]),
                ]
            ),
        )

        data_set.add_properties(*solvation_properties, *free_solv_properties)

    liquid_and_gas = PropertyPhase(PropertyPhase.Liquid | PropertyPhase.Gas)

    for state in states:

        # Excess properties.
        data_set.add_properties(
            *(
                property_type(
                    thermodynamic_state=state,
                    phase=PropertyPhase.Liquid,
                    substance=Substance.from_components("CC=O", "CCO"),
                    value=0.0 * property_type.default_unit(),
                )
                for property_type in (ExcessMolarVolume, EnthalpyOfMixing)
            )
        )

        # Pure properties.
        data_set.add_properties(
            *(
                property_type(
                    thermodynamic_state=state,
                    phase=phase,
                    substance=Substance.from_components("CCO"),
                    value=0.0 * property_type.default_unit(),
                )
                for property_type, phase in (
                    (Density, PropertyPhase.Liquid),
                    (EnthalpyOfVaporization, liquid_and_gas),
                    (DielectricConstant, PropertyPhase.Liquid),
                )
            )
        )

    return data_set
def from_pandas(cls, data_frame: pandas.DataFrame) -> "PhysicalPropertyDataSet":
    """Constructs a data set object from a pandas ``DataFrame`` object.

    Notes
    -----
    * All physical properties are given a ``MeasurementSource``, i.e. they
      are assumed to be sourced from experimental measurements.
    * Currently this method only supports data frames containing properties
      which are built-in to the framework (e.g. Density).
    * This method assumes the data frame has a structure identical to that
      produced by the ``PhysicalPropertyDataSet.to_pandas`` function.

    Parameters
    ----------
    data_frame
        The data frame to construct the data set from.

    Returns
    -------
        The constructed data set.
    """
    from openff.evaluator import properties

    # The patterns a source string must match to be stored as a DOI.
    doi_patterns = [
        r"^10.\d{4,9}/[-._;()/:A-Z0-9]+$",
        r"^10.1002/[^\s]+$",
        r"^10.\d{4}/\d+-\d+X?(\d+)\d+<[\d\w]+:[\d\w]*>\d+.\d+.\w+;\d$",
        r"^10.1021/\w\w\d+$",
        r"^10.1207/[\w\d]+\&\d+_\d+$",
    ]

    header_matches = {
        re.match(r"^([a-zA-Z]+) Value \(([a-zA-Z0-9+-/\s]*)\)$", header)
        for header in data_frame
        if header.find(" Value ") >= 0
    }

    # Validate that the headers have the correct format, specify a
    # built-in property type, and specify correctly the properties
    # units. Maps each value header onto its (type, unit) pair.
    property_headers = {}

    for header_match in header_matches:

        assert header_match

        type_name, unit_name = header_match.groups()

        assert hasattr(properties, type_name)
        property_type = getattr(properties, type_name)

        property_unit = unit.Unit(unit_name)
        assert property_unit is not None

        assert (
            property_unit.dimensionality
            == property_type.default_unit().dimensionality
        )

        property_headers[header_match.group(0)] = (property_type, property_unit)

    # Convert the data rows to property objects.
    physical_properties = []

    for _, row in data_frame.iterrows():

        # Columns without a value for this row are treated as absent.
        row = row.dropna()

        # Extract the state at which the measurement was made.
        state = ThermodynamicState(
            temperature=row["Temperature (K)"] * unit.kelvin,
            pressure=row["Pressure (kPa)"] * unit.kilopascal,
        )
        phase = PropertyPhase.from_string(row["Phase"])

        # Extract the substance the measurement was made for.
        substance = Substance()

        for index in range(row["N Components"]):

            # The component columns are numbered from one.
            position = index + 1

            component = Component(
                smiles=row[f"Component {position}"],
                role=Component.Role[row.get(f"Role {position}", "Solvent")],
            )

            mole_fraction = row.get(f"Mole Fraction {position}", 0.0)
            exact_amount = row.get(f"Exact Amount {position}", 0)

            if not numpy.isclose(mole_fraction, 0.0):
                substance.add_component(component, MoleFraction(mole_fraction))
            if not numpy.isclose(exact_amount, 0.0):
                substance.add_component(component, ExactAmount(exact_amount))

        for value_header, (property_type, property_unit) in property_headers.items():

            # Check to see whether the row contains a value for this
            # type of property.
            if value_header not in row:
                continue

            uncertainty_header = value_header.replace("Value", "Uncertainty")

            # A source is only a DOI if each of its " + " separated
            # parts matches one of the DOI patterns.
            source_string = row["Source"]

            is_doi = all(
                any(re.match(pattern, fragment, re.I) for pattern in doi_patterns)
                for fragment in source_string.split(" + ")
            )

            value = row[value_header] * property_unit

            if uncertainty_header in row:
                uncertainty = row[uncertainty_header] * property_unit
            else:
                uncertainty = None

            if is_doi:
                source = MeasurementSource(doi=source_string, reference="")
            else:
                source = MeasurementSource(doi="", reference=source_string)

            physical_property = property_type(
                thermodynamic_state=state,
                phase=phase,
                value=value,
                uncertainty=uncertainty,
                substance=substance,
                source=source,
            )

            identifier = row.get("Id", None)

            if identifier:
                physical_property.id = identifier

            physical_properties.append(physical_property)

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*physical_properties)

    return data_set
def test_component_standardization(smiles, expected):
    """Check that a component exposes the expected standardized SMILES pattern."""
    standardized_smiles = Component(smiles=smiles).smiles

    assert standardized_smiles == expected