def test_filter_ionic_liquid():
    """Ensure ionic liquids are filtered.

    Two liquid densities are built, one for ``[Na+].[Cl-]`` and one for
    ``C``; after applying ``FilterByIonicLiquid`` a single row must remain.
    """
    state = ThermodynamicState(
        temperature=298.15 * unit.kelvin,
        pressure=101.325 * unit.kilopascal,
    )

    def _liquid_density(smiles):
        # All entries share ``state`` and differ only in their substance.
        return Density(
            thermodynamic_state=state,
            phase=PropertyPhase.Liquid,
            value=1.0 * Density.default_unit(),
            uncertainty=1.0 * Density.default_unit(),
            source=MeasurementSource(doi=" "),
            substance=Substance.from_components(smiles),
        )

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(
        _liquid_density("[Na+].[Cl-]"),
        _liquid_density("C"),
    )

    filtered_frame = FilterByIonicLiquid.apply(
        data_set.to_pandas(),
        FilterByIonicLiquidSchema(),
    )

    assert len(filtered_frame) == 1
def create_filterable_data_set():
    """Build a dummy data set whose entries differ in every filterable
    attribute, namely:

    - a liquid density measured at 298 K and 0.5 atm with 1 component
      containing only carbon.
    - a gaseous dielectric measured at 288 K and 1 atm with 2 components
      containing only nitrogen.
    - a solid EoM measured at 308 K and 1.5 atm with 3 components
      containing only oxygen.

    Returns
    -------
    PhysicalPropertyDataSet
        The created data set.
    """
    source = CalculationSource("Dummy", {})

    # Liquid density of the single component carbon substance.
    carbon_substance = create_dummy_substance(number_of_components=1, elements=["C"])
    carbon_state = ThermodynamicState(
        temperature=298 * unit.kelvin, pressure=0.5 * unit.atmosphere
    )
    density_property = Density(
        thermodynamic_state=carbon_state,
        phase=PropertyPhase.Liquid,
        substance=carbon_substance,
        value=1 * unit.gram / unit.milliliter,
        uncertainty=0.11 * unit.gram / unit.milliliter,
        source=source,
    )

    # Gaseous dielectric constant of the two component nitrogen substance.
    nitrogen_substance = create_dummy_substance(number_of_components=2, elements=["N"])
    nitrogen_state = ThermodynamicState(
        temperature=288 * unit.kelvin, pressure=1 * unit.atmosphere
    )
    dielectric_property = DielectricConstant(
        thermodynamic_state=nitrogen_state,
        phase=PropertyPhase.Gas,
        substance=nitrogen_substance,
        value=1 * unit.dimensionless,
        uncertainty=0.11 * unit.dimensionless,
        source=source,
    )

    # Solid enthalpy of mixing of the three component oxygen substance.
    oxygen_substance = create_dummy_substance(number_of_components=3, elements=["O"])
    oxygen_state = ThermodynamicState(
        temperature=308 * unit.kelvin, pressure=1.5 * unit.atmosphere
    )
    enthalpy_property = EnthalpyOfMixing(
        thermodynamic_state=oxygen_state,
        phase=PropertyPhase.Solid,
        substance=oxygen_substance,
        value=1 * unit.kilojoules / unit.mole,
        uncertainty=0.11 * unit.kilojoules / unit.mole,
        source=source,
    )

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(density_property, dielectric_property, enthalpy_property)

    return data_set
def estimated_reference_sets():
    """Create an estimated data set and the reference data set it pairs with.

    Both sets hold a density (id 1) and an enthalpy of mixing (id 2) for an
    ``O`` / ``CC=O`` mixture. The estimated values are expressed in different
    units to the bare reference numbers (kg / m^3 and kcal / mol versus
    ``0.001`` and ``4.184``).

    Returns
    -------
    tuple
        The estimated ``PhysicalPropertyDataSet`` followed by the reference
        ``DataSet``.
    """

    def _state():
        return ThermodynamicState(
            298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
        )

    def _mixture():
        return Substance.from_components("O", "CC=O")

    def _reference_components():
        # A fresh list of components is created for each reference entry.
        return [
            Component(smiles="O", mole_fraction=0.5),
            Component(smiles="CC=O", mole_fraction=0.5),
        ]

    # The estimated (evaluator) properties.
    estimated_density = Density(
        thermodynamic_state=_state(),
        phase=PropertyPhase.Liquid,
        substance=_mixture(),
        value=1.0 * unit.kilogram / unit.meter**3,
        uncertainty=0.1 * unit.kilogram / unit.meter**3,
    )
    estimated_density.id = "1"

    estimated_enthalpy = EnthalpyOfMixing(
        thermodynamic_state=_state(),
        phase=PropertyPhase.Liquid,
        substance=_mixture(),
        value=1.0 * unit.kilocalorie / unit.mole,
        uncertainty=0.1 * unit.kilojoule / unit.mole,
    )
    estimated_enthalpy.id = "2"

    estimated_data_set = PhysicalPropertyDataSet()
    estimated_data_set.add_properties(estimated_density, estimated_enthalpy)

    # The reference entries which share ids with the estimated properties.
    reference_density = DataSetEntry(
        id=1,
        property_type="Density",
        temperature=298.15,
        pressure=101.325,
        value=0.001,
        std_error=0.0001,
        doi=" ",
        components=_reference_components(),
    )
    reference_enthalpy = DataSetEntry(
        id=2,
        property_type="EnthalpyOfMixing",
        temperature=298.15,
        pressure=101.325,
        value=4.184,
        std_error=0.1,
        doi=" ",
        components=_reference_components(),
    )

    reference_data_set = DataSet(
        id="ref",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=[reference_density, reference_enthalpy],
    )

    return estimated_data_set, reference_data_set
def test_sources_substances():
    """Check that a data set exposes the source and substance of the single
    property it contains."""
    physical_property = create_dummy_property(Density)

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(physical_property)

    # Only one property was added, so the first item is the one of interest.
    first_source = next(iter(data_set.sources))
    assert first_source == physical_property.source

    first_substance = next(iter(data_set.substances))
    assert first_substance == physical_property.substance
def test_from_pandas():
    """A test to ensure that data sets may be created from pandas objects."""
    state = ThermodynamicState(
        temperature=298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
    )

    # Only the density carries an uncertainty and a DOI; the other two
    # properties are identified by a plain reference string instead.
    density = Density(
        thermodynamic_state=state,
        phase=PropertyPhase.Liquid,
        substance=Substance.from_components("CO", "O"),
        value=1.0 * unit.kilogram / unit.meter**3,
        uncertainty=1.0 * unit.kilogram / unit.meter**3,
        source=MeasurementSource(doi="10.5281/zenodo.596537"),
    )
    vaporization = EnthalpyOfVaporization(
        thermodynamic_state=state,
        phase=PropertyPhase.from_string("Liquid + Gas"),
        substance=Substance.from_components("C"),
        value=2.0 * unit.kilojoule / unit.mole,
        source=MeasurementSource(reference="2"),
    )
    dielectric = DielectricConstant(
        thermodynamic_state=state,
        phase=PropertyPhase.Liquid,
        substance=Substance.from_components("C"),
        value=3.0 * unit.dimensionless,
        source=MeasurementSource(reference="3"),
    )

    original_data_set = PhysicalPropertyDataSet()
    original_data_set.add_properties(density, vaporization, dielectric)

    # Round trip the data set through a pandas data frame.
    recreated_data_set = PhysicalPropertyDataSet.from_pandas(
        original_data_set.to_pandas()
    )

    assert len(original_data_set) == len(recreated_data_set)

    for expected in original_data_set:
        # Pair the properties up by their ids.
        actual = next(x for x in recreated_data_set if x.id == expected.id)

        assert expected.thermodynamic_state == actual.thermodynamic_state
        assert expected.phase == actual.phase
        assert expected.substance == actual.substance

        assert numpy.isclose(expected.value, actual.value)

        if expected.uncertainty == UNDEFINED:
            # An undefined uncertainty must still be undefined afterwards.
            assert expected.uncertainty == actual.uncertainty
        else:
            assert numpy.isclose(expected.uncertainty, actual.uncertainty)

        assert expected.source.doi == actual.source.doi
        assert expected.source.reference == actual.source.reference
def to_evaluator(self) -> "PhysicalPropertyDataSet":
    """Convert this object into an evaluator ``PhysicalPropertyDataSet``.

    Each item in ``self.entries`` is converted through its own
    ``to_evaluator`` method and added to a newly created data set.

    Returns
    -------
    PhysicalPropertyDataSet
        The data set holding the converted entries.
    """
    # Imported locally rather than at module level, as in the original code.
    from openff.evaluator.datasets import PhysicalPropertyDataSet

    evaluator_set = PhysicalPropertyDataSet()
    evaluator_set.add_properties(*(entry.to_evaluator() for entry in self.entries))

    return evaluator_set
def test_reindex_data_set_no_mole_fraction():
    """Tests that the ``reindex_data_set`` function behaves as expected
    when exact amounts are present."""
    setup_timestamp_logging(logging.INFO)

    # A substance whose solute is specified by an exact amount rather than
    # by a mole fraction.
    solvent = substances.Component(smiles="O")
    solute = substances.Component(smiles="CO", role=substances.Component.Role.Solute)

    substance = substances.Substance()
    substance.add_component(solvent, amount=substances.MoleFraction(1.0))
    substance.add_component(solute, amount=substances.ExactAmount(1))

    solvation_property = SolvationFreeEnergy(
        thermodynamic_state=ThermodynamicState(
            temperature=298.15 * unit.kelvin,
            pressure=1.0 * unit.atmosphere,
        ),
        phase=PropertyPhase.Liquid,
        substance=substance,
        value=1.0 * SolvationFreeEnergy.default_unit(),
        uncertainty=1.0 * SolvationFreeEnergy.default_unit(),
    )

    evaluator_data_set = PhysicalPropertyDataSet()
    evaluator_data_set.add_properties(solvation_property)

    # The reference entry whose id the evaluator property should take on.
    reference_entry = DataSetEntry(
        id=1,
        property_type="SolvationFreeEnergy",
        temperature=298.15,
        pressure=101.325,
        value=1.0,
        std_error=1.0,
        doi=" ",
        components=[
            Component(smiles="O", mole_fraction=1.0),
            Component(smiles="CO", mole_fraction=0.0, exact_amount=1, role="Solute"),
        ],
    )
    data_set = DataSet(
        id="data-set",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=[reference_entry],
    )

    reindex_data_set(evaluator_data_set, data_set)

    assert evaluator_data_set.properties[0].id == "1"
def test_benchmark_analysis(caplog, monkeypatch, dummy_conda_env):
    """Run ``BenchmarkAnalysisFactory.analyze`` on a result set with one
    estimated and one unsuccessful property, then check the emitted warning
    and the files written to the ``analysis`` directory."""
    from openff.evaluator.client import RequestResult
    from openff.evaluator.datasets import PhysicalPropertyDataSet

    benchmark = create_benchmark(
        "project-1", "study-1", "benchmark-1", ["data-set-1"], "optimization-1", None
    )

    # Create a reference data set with two entries, the second being a
    # copy of the first which is given its own id.
    reference_data_set = create_data_set("data-set-1")
    reference_data_set.entries.append(reference_data_set.entries[0].copy())
    reference_data_set.entries[0].id = 1
    reference_data_set.entries[1].id = 2

    first_entry, second_entry = (
        reference_data_set.entries[0],
        reference_data_set.entries[1],
    )

    # Create a set of evaluator results in which the first entry was
    # estimated while the second was not.
    estimated_data_set = PhysicalPropertyDataSet()
    estimated_data_set.add_properties(first_entry.to_evaluator())

    unsuccessful_properties = PhysicalPropertyDataSet()
    unsuccessful_properties.add_properties(second_entry.to_evaluator())

    results = RequestResult()
    results.estimated_properties = estimated_data_set
    results.unsuccessful_properties = unsuccessful_properties

    results_path = os.path.join("analysis", "benchmark-results.json")

    with temporary_cd(os.path.dirname(dummy_conda_env)):

        # Save the expected input files.
        with open("benchmark.json", "w") as file:
            file.write(benchmark.json())

        with open("test-set-collection.json", "w") as file:
            file.write(DataSetCollection(data_sets=[reference_data_set]).json())

        results.json("results.json")

        with caplog.at_level(logging.WARNING):
            BenchmarkAnalysisFactory.analyze(True)

        expected_warning = (
            "1 properties could not be estimated and so were not analyzed"
        )
        assert expected_warning in caplog.text

        assert os.path.isdir("analysis")
        assert os.path.isfile(results_path)

        results_object = BenchmarkResult.parse_file(results_path)

        assert len(results_object.calculation_environment) > 0
        assert len(results_object.analysis_environment) > 0
def test_serialization():
    """A test to ensure that data sets are JSON serializable."""
    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(create_dummy_property(Density))

    # Serialize, parse, then serialize again; the two JSON strings and the
    # two data set lengths must agree.
    original_json = data_set.json()
    parsed_data_set = PhysicalPropertyDataSet.parse_json(original_json)

    assert len(data_set) == len(parsed_data_set)

    round_tripped_json = parsed_data_set.json()
    assert round_tripped_json == original_json
def test_protocol_replacement(force_field_source, expected_protocol_type):
    """Check that the default request options built for ``force_field_source``
    mention ``expected_protocol_type`` and contain no ``BaseBuildSystem"``
    text once serialized to JSON."""
    data_set = PhysicalPropertyDataSet()

    # Add one dummy property per entry of the module level ``property_types``.
    for property_type in property_types:
        data_set.add_properties(create_dummy_property(property_type))

    options = EvaluatorClient.default_request_options(data_set, force_field_source)
    options_json = options.json(format=True)

    assert 'BaseBuildSystem"' not in options_json
    assert expected_protocol_type in options_json
def data_frame() -> pandas.DataFrame:
    """Build a data frame covering every combination of temperature,
    pressure, property type and composition defined below.

    Returns
    -------
    pandas.DataFrame
        The result of calling ``to_pandas`` on the generated data set.
    """
    temperatures = [298.15, 318.15]
    pressures = [101.325, 101.0]
    properties = [Density, EnthalpyOfMixing]

    # The pure composition is deliberately listed twice.
    mole_fractions = [(1.0,), (1.0,), (0.25, 0.75), (0.75, 0.25)]
    # The SMILES tuples to use, keyed by the number of components.
    smiles = {1: [("C(F)(Cl)(Br)",), ("C",)], 2: [("CO", "C"), ("C", "CO")]}

    data_entries = []

    for temperature in temperatures:
        for pressure in pressures:
            for property_type in properties:
                for mole_fraction in mole_fractions:

                    for smiles_tuple in smiles[len(mole_fraction)]:

                        substance = Substance()

                        for smiles_pattern, x in zip(smiles_tuple, mole_fraction):
                            substance.add_component(
                                Component(smiles_pattern), MoleFraction(x)
                            )

                        state = ThermodynamicState(
                            temperature=temperature * unit.kelvin,
                            pressure=pressure * unit.kilopascal,
                        )
                        entry = property_type(
                            thermodynamic_state=state,
                            phase=PropertyPhase.Liquid,
                            value=1.0 * property_type.default_unit(),
                            uncertainty=1.0 * property_type.default_unit(),
                            source=MeasurementSource(doi=" "),
                            substance=substance,
                        )
                        data_entries.append(entry)

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*data_entries)

    return data_set.to_pandas()
def data_frame() -> pandas.DataFrame:
    """Build a data frame of paired measurements, one at a nominal
    temperature and one at a slightly shifted temperature.

    For each of two temperatures and two property types an entry is created
    at the nominal temperature alongside a second entry offset by a random
    amount. The offset is positive for the first property type and negative
    for the second.

    Returns
    -------
    pandas.DataFrame
        The result of calling ``to_pandas`` on the generated data set.
    """

    def _temperature_noise():
        # ``rand`` draws from [0, 1) so the noise falls within [0.051, 0.101).
        return (numpy.random.rand() / 2.0 + 0.51) / 10.0

    def _entry(property_type, temperature):
        return property_type(
            thermodynamic_state=ThermodynamicState(
                temperature=temperature * unit.kelvin,
                pressure=101.325 * unit.kilopascal,
            ),
            phase=PropertyPhase.Liquid,
            value=1.0 * property_type.default_unit(),
            uncertainty=1.0 * property_type.default_unit(),
            source=MeasurementSource(doi=" "),
            substance=Substance.from_components("C"),
        )

    data_set_entries = []

    for temperature in [303.15, 298.15]:
        for index, property_type in enumerate([Density, EnthalpyOfVaporization]):

            sign = 1 if index == 0 else -1
            noise = _temperature_noise() * sign

            data_set_entries.append(_entry(property_type, temperature))
            data_set_entries.append(_entry(property_type, temperature + noise))

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*data_set_entries)

    return data_set.to_pandas()
def test_properties_by_type():
    """Check that ``properties_by_type`` only yields the properties of the
    requested type."""
    density = create_dummy_property(Density)
    dielectric = create_dummy_property(DielectricConstant)

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(density, dielectric)

    densities = list(data_set.properties_by_type("Density"))

    assert len(densities) == 1
    assert densities[0] == density

    dielectrics = list(data_set.properties_by_type("DielectricConstant"))

    assert len(dielectrics) == 1
    assert dielectrics[0] == dielectric
def test_same_component_batching():
    """Check that ``_batch_by_same_component`` splits four properties,
    measured for two different substances, into two batches of two."""
    state = ThermodynamicState(
        temperature=1.0 * unit.kelvin, pressure=1.0 * unit.atmosphere
    )

    # A density and an enthalpy of vaporization for each of two substances.
    properties_to_batch = []

    for smiles in [("O", "C"), ("O", "CO")]:
        properties_to_batch.append(
            Density(
                thermodynamic_state=state,
                substance=Substance.from_components(*smiles),
                value=0.0 * unit.kilogram / unit.meter**3,
            )
        )
        properties_to_batch.append(
            EnthalpyOfVaporization(
                thermodynamic_state=state,
                substance=Substance.from_components(*smiles),
                value=0.0 * unit.kilojoule / unit.mole,
            )
        )

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*properties_to_batch)

    submission = EvaluatorClient._Submission()
    submission.dataset = data_set
    submission.options = RequestOptions()

    with DaskLocalCluster() as calculation_backend:

        server = EvaluatorServer(calculation_backend)
        batches = server._batch_by_same_component(submission, "")

    assert len(batches) == 2

    assert len(batches[0].queued_properties) == 2
    assert len(batches[1].queued_properties) == 2
def test_default_options():
    """Test creating the default estimation options."""
    force_field_source = SmirnoffForceFieldSource.from_path(
        "smirnoff99Frosst-1.1.0.offxml"
    )

    # Add one dummy property per entry of the module level ``property_types``.
    data_set = PhysicalPropertyDataSet()

    for property_type in property_types:
        data_set.add_properties(create_dummy_property(property_type))

    options = EvaluatorClient.default_request_options(data_set, force_field_source)
    options.validate()

    n_layers = len(options.calculation_layers)
    assert n_layers == 2

    # One group of schemas is expected per property type, and each group
    # should hold as many schemas as there are calculation layers.
    assert len(options.calculation_schemas) == len(property_types)
    assert all(
        len(x) == len(options.calculation_layers)
        for x in options.calculation_schemas.values()
    )
def data_frame() -> pandas.DataFrame:
    """Build a data frame of three liquid densities of ``C`` which differ
    only in their temperature or pressure.

    Returns
    -------
    pandas.DataFrame
        The result of calling ``to_pandas`` on the generated data set.
    """

    def _density(temperature, pressure):
        # ``temperature`` is given in kelvin and ``pressure`` in kilopascal.
        return Density(
            thermodynamic_state=ThermodynamicState(
                temperature=temperature * unit.kelvin,
                pressure=pressure * unit.kilopascal,
            ),
            phase=PropertyPhase.Liquid,
            value=1.0 * Density.default_unit(),
            uncertainty=1.0 * Density.default_unit(),
            source=MeasurementSource(doi=" "),
            substance=Substance.from_components("C"),
        )

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(
        _density(298.15, 101.325),
        # Same pressure, different temperature.
        _density(305.15, 101.325),
        # Same temperature, different pressure.
        _density(298.15, 105.325),
    )

    return data_set.to_pandas()
def test_validate_data_set():
    """Check that properties are validated both when they are added to a
    data set and when ``validate`` is called on the set directly."""

    def _water_density(temperature):
        # Arguments are passed positionally, exactly as a caller would.
        return Density(
            ThermodynamicState(temperature * unit.kelvin, 1 * unit.atmosphere),
            PropertyPhase.Liquid,
            Substance.from_components("O"),
            0.0 * unit.gram / unit.milliliter,
            0.0 * unit.gram / unit.milliliter,
        )

    # A valid property should be accepted without complaint.
    valid_property = _water_density(298)

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(valid_property)

    data_set.validate()

    # A property at a negative temperature should be rejected on addition...
    invalid_property = _water_density(-1)

    with pytest.raises(AssertionError):
        data_set.add_properties(invalid_property)

    # ...unless validation is skipped, in which case validating the set
    # itself should fail instead.
    data_set.add_properties(invalid_property, validate=False)

    with pytest.raises(AssertionError):
        data_set.validate()
def test_launch_batch():
    """Launch a batch of two dummy densities on a local server and check
    that one ends up estimated and the other unsuccessful."""

    # Set up a dummy data set
    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(
        create_dummy_property(Density), create_dummy_property(Density)
    )

    # Only request the quick calculation layer for densities.
    options = RequestOptions()
    options.calculation_layers = ["QuickCalculationLayer"]
    options.calculation_schemas = {
        "Density": {"QuickCalculationLayer": CalculationLayerSchema()}
    }

    batch = Batch()
    batch.force_field_id = ""
    batch.options = options
    batch.parameter_gradient_keys = []
    batch.queued_properties = [*data_set]
    batch.validate()

    with tempfile.TemporaryDirectory() as directory, temporarily_change_directory(
        directory
    ), DaskLocalCluster() as calculation_backend:

        server = EvaluatorServer(
            calculation_backend=calculation_backend,
            working_directory=directory,
        )

        server._queued_batches[batch.id] = batch
        server._launch_batch(batch)

        # Poll until the queue of properties has been emptied.
        # NOTE(review): there is no timeout here, so the test would hang if
        # the queue were never emptied - consider adding one.
        while len(batch.queued_properties) > 0:
            sleep(0.01)

        assert len(batch.estimated_properties) == 1
        assert len(batch.unsuccessful_properties) == 1
def simple_evaluator_data_set():
    """Create a simple evaluator `PhysicalPropertyDataSet` which contains
    a single binary density measurement with an id of ``"1"``.

    Returns
    -------
    PhysicalPropertyDataSet
    """
    state = ThermodynamicState(298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere)

    evaluator_density = Density(
        thermodynamic_state=state,
        phase=PropertyPhase.Liquid,
        substance=Substance.from_components("O", "CC=O"),
        value=1.0 * unit.kilogram / unit.meter**3,
        uncertainty=0.1 * unit.kilogram / unit.meter**3,
        source=MeasurementSource(doi="10.1000/xyz123"),
    )
    # Replace whichever id the property was given with a known value.
    evaluator_density.id = "1"

    evaluator_data_set = PhysicalPropertyDataSet()
    evaluator_data_set.add_properties(evaluator_density)

    return evaluator_data_set
def _apply(
    cls,
    data_frame: pandas.DataFrame,
    schema: ImportFreeSolvSchema,
    n_processes,
) -> pandas.DataFrame:
    """Append the FreeSolv data, converted into solvation free energy
    entries, to ``data_frame``.

    Parameters
    ----------
    data_frame
        The data frame to extend; it is concatenated with the new rows
        rather than modified in place.
    schema
        Not used by this implementation.
    n_processes
        Not used by this implementation.

    Returns
    -------
    pandas.DataFrame
        ``data_frame`` followed by one row per FreeSolv entry.
    """
    from openff.evaluator import properties, substances, unit

    value_column = "experimental value (kcal/mol)"
    error_column = "experimental uncertainty (kcal/mol)"
    source_column = (
        "experimental reference (original or paper this value was taken from)"
    )

    # Convert the data frame into data rows.
    free_solv_data_frame = cls._download_free_solv()

    solvation_entries = []

    for _, row in free_solv_data_frame.iterrows():

        # Extract and standardize the SMILES pattern of the solute by
        # passing it through an evaluator ``Component``.
        raw_smiles = row["SMILES"].strip()
        solute_smiles = substances.Component(raw_smiles).smiles

        # Build the substance: a solvent at a mole fraction of one plus
        # exactly one solute molecule.
        solvent = Component(smiles="O")
        solute = Component(smiles=solute_smiles, role=Component.Role.Solute)

        substance = Substance()
        substance.add_component(solvent, MoleFraction(1.0))
        substance.add_component(solute, ExactAmount(1))

        # Extract the value and uncertainty
        value = float(row[value_column]) * unit.kilocalorie / unit.mole
        std_error = float(row[error_column]) * unit.kilocalorie / unit.mole

        # Attempt to extract a DOI
        # NOTE(review): the exact behaviour of `_validate_doi` is not
        # visible from here - confirm what it returns for unknown sources.
        doi = cls._validate_doi(row[source_column])

        state = ThermodynamicState(
            temperature=298.15 * unit.kelvin,
            pressure=101.325 * unit.kilopascal,
        )

        solvation_entries.append(
            SolvationFreeEnergy(
                thermodynamic_state=state,
                phase=PropertyPhase.Liquid,
                substance=substance,
                value=value.to(properties.SolvationFreeEnergy.default_unit()),
                uncertainty=std_error.to(
                    properties.SolvationFreeEnergy.default_unit()
                ),
                source=MeasurementSource(doi=doi),
            )
        )

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*solvation_entries)

    free_solv_data_frame = data_set.to_pandas()

    return pandas.concat(
        [data_frame, free_solv_data_frame], ignore_index=True, sort=False
    )
def test_to_pandas():
    """A test to ensure that data sets are convertible to pandas objects.

    Twelve properties are added in total: a density and a dielectric
    constant of a pure substance, plus an enthalpy of mixing and an excess
    molar volume of a binary substance, each at three temperatures.
    """
    source = CalculationSource("Dummy", {})

    pure_substance = Substance.from_components("C")
    binary_substance = Substance.from_components("C", "O")

    data_set = PhysicalPropertyDataSet()

    # Add the pure substance properties.
    for temperature in [298 * unit.kelvin, 300 * unit.kelvin, 302 * unit.kelvin]:

        thermodynamic_state = ThermodynamicState(
            temperature=temperature, pressure=1.0 * unit.atmosphere
        )

        density_property = Density(
            thermodynamic_state=thermodynamic_state,
            phase=PropertyPhase.Liquid,
            substance=pure_substance,
            value=1 * unit.gram / unit.milliliter,
            uncertainty=0.11 * unit.gram / unit.milliliter,
            source=source,
        )
        dielectric_property = DielectricConstant(
            thermodynamic_state=thermodynamic_state,
            phase=PropertyPhase.Liquid,
            substance=pure_substance,
            value=1 * unit.dimensionless,
            uncertainty=0.11 * unit.dimensionless,
            source=source,
        )

        data_set.add_properties(density_property)
        data_set.add_properties(dielectric_property)

    # Add the binary substance properties.
    for temperature in [298 * unit.kelvin, 300 * unit.kelvin, 302 * unit.kelvin]:

        thermodynamic_state = ThermodynamicState(
            temperature=temperature, pressure=1.0 * unit.atmosphere
        )

        enthalpy_property = EnthalpyOfMixing(
            thermodynamic_state=thermodynamic_state,
            phase=PropertyPhase.Liquid,
            substance=binary_substance,
            value=1 * unit.kilojoules / unit.mole,
            uncertainty=0.11 * unit.kilojoules / unit.mole,
            source=source,
        )
        excess_property = ExcessMolarVolume(
            thermodynamic_state=thermodynamic_state,
            phase=PropertyPhase.Liquid,
            substance=binary_substance,
            value=1 * unit.meter**3 / unit.mole,
            uncertainty=0.11 * unit.meter**3 / unit.mole,
            source=source,
        )

        data_set.add_properties(enthalpy_property)
        data_set.add_properties(excess_property)

    data_set_pandas = data_set.to_pandas()

    # Check that a frame was produced *before* it is inspected. Previously
    # this assertion came after the column check, which would already have
    # raised a ``TypeError`` on ``None``, so it could never fail.
    assert data_set_pandas is not None

    required_columns = [
        "Id",
        "Temperature (K)",
        "Pressure (kPa)",
        "Phase",
        "N Components",
        "Source",
        "Component 1",
        "Role 1",
        "Mole Fraction 1",
        "Exact Amount 1",
        "Component 2",
        "Role 2",
        "Mole Fraction 2",
        "Exact Amount 2",
    ]

    # Membership on a data frame is tested against its column labels.
    assert all(x in data_set_pandas for x in required_columns)

    assert data_set_pandas.shape == (12, 22)

    # Two of the columns are expected to be completely empty.
    data_set_without_na = data_set_pandas.dropna(axis=1, how="all")
    assert data_set_without_na.shape == (12, 20)
def test_reindex_data_set():
    """Tests that the ``reindex_data_set`` function behaves as expected.

    The evaluator set holds a pure density and a binary density which both
    have a matching reference entry, plus a binary density at a different
    temperature which has none and so must keep its original id.
    """
    setup_timestamp_logging(logging.INFO)

    def _density(temperature, *smiles):
        return Density(
            thermodynamic_state=ThermodynamicState(
                temperature=temperature * unit.kelvin,
                pressure=1.0 * unit.atmosphere,
            ),
            phase=PropertyPhase.Liquid,
            substance=substances.Substance.from_components(*smiles),
            value=1.0 * Density.default_unit(),
            uncertainty=1.0 * Density.default_unit(),
        )

    def _authors():
        return [Author(name=" ", email="*****@*****.**", institute=" ")]

    def _binary_entry(entry_id):
        return DataSetEntry(
            id=entry_id,
            property_type="Density",
            temperature=298.15,
            pressure=101.325,
            value=1.0,
            std_error=1.0,
            doi=" ",
            components=[
                Component(smiles="O", mole_fraction=0.5),
                Component(smiles="C", mole_fraction=0.5),
            ],
        )

    def _pure_entry(entry_id):
        return DataSetEntry(
            id=entry_id,
            property_type="Density",
            temperature=298.15,
            pressure=101.325,
            value=1.0,
            std_error=1.0,
            doi=" ",
            components=[Component(smiles="O", mole_fraction=1.0)],
        )

    evaluator_data_set = PhysicalPropertyDataSet()
    evaluator_data_set.add_properties(
        _density(298.15, "O"),
        _density(298.15, "C", "O"),
        _density(300.0, "C", "O"),
    )

    # Re-index against a single data set.
    data_set = DataSet(
        id="data-set",
        description=" ",
        authors=_authors(),
        entries=[_binary_entry(1), _pure_entry(2)],
    )

    un_indexed_id = evaluator_data_set.properties[2].id

    reindex_data_set(evaluator_data_set, data_set)

    assert evaluator_data_set.properties[0].id == "2"
    assert evaluator_data_set.properties[1].id == "1"
    assert evaluator_data_set.properties[2].id == un_indexed_id

    # Re-index against a collection whose entries are split over two sets.
    data_set_collection = DataSetCollection(
        data_sets=[
            DataSet(
                id="0",
                description=" ",
                authors=_authors(),
                entries=[_binary_entry(3)],
            ),
            DataSet(
                id="1",
                description=" ",
                authors=_authors(),
                entries=[_pure_entry(4)],
            ),
        ]
    )

    reindex_data_set(evaluator_data_set, data_set_collection)

    assert evaluator_data_set.properties[0].id == "4"
    assert evaluator_data_set.properties[1].id == "3"
    assert evaluator_data_set.properties[2].id == un_indexed_id
def test_analysed_result_from_evaluator():
    """Tests the `AnalysedResult.from_evaluator` function.

    The estimated values are drawn from a normal distribution centred on the
    reference value, so the RMSE over the full set is expected to be close
    to the standard deviation of that distribution.
    """
    expected_mean = 0.0
    expected_std = numpy.random.rand() + 1.0

    values = numpy.random.normal(expected_mean, expected_std, 1000)

    estimated_properties = []
    reference_entries = []

    # The ids start from one and are shared by each estimated / reference pair.
    for property_id, value in enumerate(values, start=1):

        estimated_density = Density(
            thermodynamic_state=ThermodynamicState(
                298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
            ),
            phase=PropertyPhase.Liquid,
            substance=Substance.from_components("O"),
            value=value * Density.default_unit(),
            uncertainty=0.0 * Density.default_unit(),
        )
        estimated_density.id = str(property_id)

        estimated_properties.append(estimated_density)

        reference_entries.append(
            DataSetEntry(
                id=property_id,
                property_type="Density",
                temperature=298.15,
                pressure=101.325,
                value=expected_mean,
                std_error=None,
                doi=" ",
                components=[Component(smiles="O", mole_fraction=1.0)],
            )
        )

    estimated_data_set = PhysicalPropertyDataSet()
    estimated_data_set.add_properties(*estimated_properties)

    reference_data_set = DataSet(
        id="ref",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=reference_entries,
    )

    analysed_results = DataSetResult.from_evaluator(
        reference_data_set=reference_data_set,
        estimated_data_set=estimated_data_set,
        analysis_environments=[ChemicalEnvironment.Aqueous],
        statistic_types=[StatisticType.RMSE],
        bootstrap_iterations=1000,
    )

    assert len(analysed_results.result_entries) == len(estimated_properties)

    # Select the first statistic which has no category set.
    full_statistics = next(
        x for x in analysed_results.statistic_entries if x.category is None
    )

    assert full_statistics.property_type == "Density"
    assert full_statistics.n_components == 1
    assert full_statistics.statistic_type == StatisticType.RMSE
    assert numpy.isclose(full_statistics.value, expected_std, rtol=0.10)
def main():
    """Convert the serialized ``propertyestimator`` data sets found in the
    ``raw_data`` directory into ``PhysicalPropertyDataSet`` objects.

    Each converted set is written to the ``raw_data_v2`` directory both as
    a JSON file and as a CSV file.

    Raises
    ------
    NotImplementedError
        If an amount or a source of an unrecognised type is encountered.
    """
    output_directory = "raw_data_v2"
    os.makedirs(output_directory, exist_ok=True)

    mole_fraction_type = "propertyestimator.substances.Substance->MoleFraction"
    exact_amount_type = "propertyestimator.substances.Substance->ExactAmount"

    calculation_type = "propertyestimator.properties.properties.CalculationSource"
    measurement_type = "propertyestimator.properties.properties.MeasurementSource"

    def _to_quantity(raw_quantity):
        # The magnitude and the unit of a quantity are stored separately.
        return raw_quantity["value"] * unit.Unit(raw_quantity["unit"])

    data_set_names = (
        "curated_data_set",
        "gaff 1.81",
        "gaff 2.11",
        "parsley 1.0.0",
        "smirnoff99frosst 1.1.0",
    )

    for data_set_name in data_set_names:

        with open(os.path.join("raw_data", f"{data_set_name}.json")) as file:
            raw_data_set = json.load(file)

        assert (
            raw_data_set["@type"]
            == "propertyestimator.datasets.datasets.PhysicalPropertyDataSet"
        )

        physical_properties = []

        for raw_entries in raw_data_set["properties"].values():
            for raw_entry in raw_entries:

                # Extract the substance this entry was measured for.
                substance = Substance()

                for raw_component in raw_entry["substance"]["components"]:

                    component = Component(
                        smiles=raw_component["smiles"],
                        role=Component.Role[raw_component["role"]["value"]],
                    )

                    # The amounts are keyed by the SMILES of the component.
                    raw_amounts = raw_entry["substance"]["amounts"][
                        raw_component["smiles"]
                    ]

                    for raw_amount in raw_amounts["value"]:

                        amount_type = raw_amount["@type"]

                        if amount_type == mole_fraction_type:
                            amount = MoleFraction(raw_amount["value"])
                        elif amount_type == exact_amount_type:
                            amount = ExactAmount(raw_amount["value"])
                        else:
                            raise NotImplementedError()

                        substance.add_component(component, amount)

                # Extract the source of the property
                raw_source = raw_entry["source"]
                source_type = raw_source["@type"]

                if source_type == calculation_type:
                    source = CalculationSource(fidelity=raw_source["fidelity"])
                elif source_type == measurement_type:
                    # NOTE(review): `correct_doi` is defined elsewhere -
                    # presumably it turns the raw reference into a DOI.
                    source = MeasurementSource(
                        doi=correct_doi(raw_entry["source"]["reference"])
                    )
                else:
                    raise NotImplementedError()

                # Generate the new property object, looking the class up by
                # the last part of the serialized type name.
                property_class = getattr(
                    properties, raw_entry["@type"].split(".")[-1]
                )

                physical_property = property_class(
                    thermodynamic_state=ThermodynamicState(
                        temperature=_to_quantity(
                            raw_entry["thermodynamic_state"]["temperature"]
                        ),
                        pressure=_to_quantity(
                            raw_entry["thermodynamic_state"]["pressure"]
                        ),
                    ),
                    phase=PropertyPhase(raw_entry["phase"]),
                    substance=substance,
                    value=_to_quantity(raw_entry["value"]),
                    # Uncertainties are only carried over for calculated data.
                    uncertainty=(
                        None
                        if isinstance(source, MeasurementSource)
                        else _to_quantity(raw_entry["uncertainty"])
                    ),
                    source=source,
                )
                physical_property.id = raw_entry["id"]

                physical_properties.append(physical_property)

        data_set = PhysicalPropertyDataSet()
        data_set.add_properties(*physical_properties)

        data_set.json(
            os.path.join(output_directory, f"{data_set_name}.json"), format=True
        )
        data_set.to_pandas().to_csv(
            os.path.join(output_directory, f"{data_set_name}.csv")
        )
def complete_evaluator_data_set():
    """Build a ``PhysicalPropertyDataSet`` with one entry per property type.

    The properties are added in this order, and are given the string ids
    ``"1"`` to ``"6"`` accordingly:

    * density of a single component substance
    * density of a two component substance
    * enthalpy of vaporization of a single component substance
    * enthalpy of mixing of a two component substance
    * excess molar volume of a two component substance
    * solvation free energy of a solute in water

    Every property shares the same thermodynamic state and measurement
    source, and has a value of 1.0 and an uncertainty of 0.1.

    Returns
    -------
    PhysicalPropertyDataSet
    """
    state = ThermodynamicState(
        298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
    )
    reference = MeasurementSource(doi="10.1000/xyz123")

    # Water (as a mole fraction) plus exactly one molecule of the solute.
    solute_in_water = Substance()
    solute_in_water.add_component(Component("O"), MoleFraction(1.0))
    solute_in_water.add_component(Component("CCCO"), ExactAmount(1))

    def make(property_type, substance, property_unit, phase=PropertyPhase.Liquid):
        # All entries differ only in type, substance, unit and phase.
        return property_type(
            thermodynamic_state=state,
            phase=phase,
            substance=substance,
            value=1.0 * property_unit,
            uncertainty=0.1 * property_unit,
            source=reference,
        )

    density_unit = unit.kilogram / unit.meter**3

    entries = [
        make(Density, Substance.from_components("O"), density_unit),
        make(Density, Substance.from_components("O", "CC=O"), density_unit),
        make(
            EnthalpyOfVaporization,
            Substance.from_components("CCO"),
            EnthalpyOfVaporization.default_unit(),
            phase=PropertyPhase(PropertyPhase.Liquid | PropertyPhase.Gas),
        ),
        make(
            EnthalpyOfMixing,
            Substance.from_components("CCCCO", "CC(C=O)C"),
            EnthalpyOfMixing.default_unit(),
        ),
        make(
            ExcessMolarVolume,
            Substance.from_components("C(=O)CCCO", "CCCCCC"),
            ExcessMolarVolume.default_unit(),
        ),
        make(
            SolvationFreeEnergy,
            solute_in_water,
            SolvationFreeEnergy.default_unit(),
        ),
    ]

    # Ids are one-based positions within the list, stored as strings.
    for position, entry in enumerate(entries, start=1):
        entry.id = str(position)

    result = PhysicalPropertyDataSet()
    result.add_properties(*entries)

    return result
def define_data_set(reweighting: bool) -> PhysicalPropertyDataSet:
    """Assemble the data set of properties to estimate.

    Parameters
    ----------
    reweighting
        When ``False``, solvation free energies are included as well: ethanal
        in ethanol, ethanol in ethanal, and whatever the FreeSolv import
        returns after filtering for the ``("O", "CO")`` substance. When
        ``True`` these are left out.

    Returns
    -------
    PhysicalPropertyDataSet
        The solvation free energies (if requested) followed by, for each of
        the three states (296.15 K, 298.15 K and 300.15 K at 1 atm), the
        excess molar volume and enthalpy of mixing of the ``CC=O`` / ``CCO``
        mixture and the density, enthalpy of vaporization and dielectric
        constant of pure ``CCO``. All explicitly built values are zero.
    """
    # Define the common states to compute estimates at.
    states = [
        ThermodynamicState(
            temperature=kelvin * unit.kelvin, pressure=1.0 * unit.atmosphere
        )
        for kelvin in (296.15, 298.15, 300.15)
    ]

    data_set = PhysicalPropertyDataSet()

    # Solvation free energies.
    if not reweighting:
        solute_role = Component.Role.Solute

        ethanal_in_ethanol = Substance.from_components("CCO")
        ethanal_in_ethanol.add_component(
            Component("CC=O", solute_role), ExactAmount(1)
        )

        ethanol_in_ethanal = Substance.from_components("CC=O")
        ethanol_in_ethanal.add_component(
            Component("CCO", solute_role), ExactAmount(1)
        )

        solvation_properties = [
            SolvationFreeEnergy(
                thermodynamic_state=states[1],
                phase=PropertyPhase.Liquid,
                substance=solvation_substance,
                value=0.0 * SolvationFreeEnergy.default_unit(),
            )
            for solvation_substance in (ethanal_in_ethanol, ethanol_in_ethanal)
        ]

        free_solv_schema = CurationWorkflowSchema(
            component_schemas=[
                ImportFreeSolvSchema(),
                FilterBySubstancesSchema(substances_to_include=[("O", "CO")]),
            ]
        )
        free_solv_data = CurationWorkflow.apply(
            PhysicalPropertyDataSet(), free_solv_schema
        )

        data_set.add_properties(*solvation_properties, *free_solv_data)

    liquid = PropertyPhase.Liquid

    for state in states:
        # Excess properties.
        excess_properties = [
            ExcessMolarVolume(
                thermodynamic_state=state,
                phase=liquid,
                substance=Substance.from_components("CC=O", "CCO"),
                value=0.0 * ExcessMolarVolume.default_unit(),
            ),
            EnthalpyOfMixing(
                thermodynamic_state=state,
                phase=liquid,
                substance=Substance.from_components("CC=O", "CCO"),
                value=0.0 * EnthalpyOfMixing.default_unit(),
            ),
        ]
        data_set.add_properties(*excess_properties)

        # Pure properties.
        pure_properties = [
            Density(
                thermodynamic_state=state,
                phase=liquid,
                substance=Substance.from_components("CCO"),
                value=0.0 * Density.default_unit(),
            ),
            EnthalpyOfVaporization(
                thermodynamic_state=state,
                phase=PropertyPhase(PropertyPhase.Liquid | PropertyPhase.Gas),
                substance=Substance.from_components("CCO"),
                value=0.0 * EnthalpyOfVaporization.default_unit(),
            ),
            DielectricConstant(
                thermodynamic_state=state,
                phase=liquid,
                substance=Substance.from_components("CCO"),
                value=0.0 * DielectricConstant.default_unit(),
            ),
        ]
        data_set.add_properties(*pure_properties)

    return data_set
def test_filter_by_environment_list():
    """Check ``FilterByEnvironments`` when configured through the
    ``environments`` schema option."""

    entries = [
        _build_entry("O"),
        _build_entry("C"),
        _build_entry("C", "O"),
        _build_entry("O", "CC(=O)CC=O"),
        _build_entry("CC(=O)CC=O", "O"),
    ]

    full_set = PhysicalPropertyDataSet()
    full_set.add_properties(*entries)

    frame = full_set.to_pandas()

    def run_filter(**schema_options):
        # Every case filters the same frame; only the schema differs.
        return FilterByEnvironments.apply(
            frame, FilterByEnvironmentsSchema(**schema_options)
        )

    def split_by_size(filtered):
        # Separate single component rows from two component rows.
        sizes = filtered["N Components"]
        return filtered[sizes == 1], filtered[sizes == 2]

    # Retain only aqueous functionality.
    aqueous_only = run_filter(
        environments=[ChemicalEnvironment.Aqueous],
        at_least_one_environment=True,
    )

    assert len(aqueous_only) == 1
    assert aqueous_only["N Components"].max() == 1
    assert set(aqueous_only["Component 1"].unique()) == {"O"}

    # Retain both aqueous and aldehyde functionality, but not strictly.
    lenient = run_filter(
        environments=[ChemicalEnvironment.Aqueous, ChemicalEnvironment.Aldehyde],
        at_least_one_environment=True,
    )

    assert len(lenient) == 3
    assert lenient["N Components"].min() == 1
    assert lenient["N Components"].max() == 2

    pure_rows, binary_rows = split_by_size(lenient)

    assert len(pure_rows) == 1
    assert set(pure_rows["Component 1"].unique()) == {"O"}

    assert len(binary_rows) == 2
    binary_components = set(binary_rows["Component 1"].unique()) | set(
        binary_rows["Component 2"].unique()
    )
    assert binary_components == {"CC(=O)CC=O", "O"}

    # With strict matching, the molecule carrying both aldehyde and ketone
    # functionality must be dropped when only aldehyde and aqueous are allowed.
    strict = run_filter(
        environments=[ChemicalEnvironment.Aqueous, ChemicalEnvironment.Aldehyde],
        at_least_one_environment=False,
        strictly_specified_environments=True,
    )

    assert len(strict) == 1
    assert strict["N Components"].max() == 1
    assert set(strict["Component 1"].unique()) == {"O"}
def test_filter_by_environment_per_component():
    """Check ``FilterByEnvironments`` when configured through the
    ``per_component_environments`` schema option."""

    entries = [
        _build_entry("O"),
        _build_entry("C"),
        _build_entry("C", "O"),
        _build_entry("O", "CC(=O)CC=O"),
        _build_entry("CC(=O)CC=O", "O"),
    ]

    full_set = PhysicalPropertyDataSet()
    full_set.add_properties(*entries)

    frame = full_set.to_pandas()

    def run_filter(**schema_options):
        # Every case filters the same frame; only the schema differs.
        return FilterByEnvironments.apply(
            frame, FilterByEnvironmentsSchema(**schema_options)
        )

    def split_by_size(filtered):
        # Separate single component rows from two component rows.
        sizes = filtered["N Components"]
        return filtered[sizes == 1], filtered[sizes == 2]

    # Retain only aqueous functionality.
    aqueous_only = run_filter(
        per_component_environments={
            1: [[ChemicalEnvironment.Aqueous]],
            2: [[ChemicalEnvironment.Aqueous], [ChemicalEnvironment.Aqueous]],
        },
        at_least_one_environment=True,
    )

    assert len(aqueous_only) == 1
    assert aqueous_only["N Components"].max() == 1
    assert set(aqueous_only["Component 1"].unique()) == {"O"}

    # Retain any pure component data, and only aqueous aldehyde mixture data.
    lenient = run_filter(
        per_component_environments={
            2: [[ChemicalEnvironment.Aldehyde], [ChemicalEnvironment.Aqueous]]
        },
        at_least_one_environment=True,
    )

    assert len(lenient) == 4
    assert lenient["N Components"].min() == 1
    assert lenient["N Components"].max() == 2

    pure_rows, binary_rows = split_by_size(lenient)

    assert len(pure_rows) == 2
    assert set(pure_rows["Component 1"].unique()) == {"O", "C"}

    assert len(binary_rows) == 2
    binary_components = set(binary_rows["Component 1"].unique()) | set(
        binary_rows["Component 2"].unique()
    )
    assert binary_components == {"CC(=O)CC=O", "O"}

    # Repeat the last test but this time make the filtering strict.
    strict = run_filter(
        per_component_environments={
            2: [[ChemicalEnvironment.Aldehyde], [ChemicalEnvironment.Aqueous]]
        },
        at_least_one_environment=False,
        strictly_specified_environments=True,
    )

    assert len(strict) == 2
    assert strict["N Components"].max() == 1
    assert set(strict["Component 1"].unique()) == {"O", "C"}

    # Strict filtering again, now also permitting ketone and carbonyl
    # functionality on the first component of a mixture.
    strict_extended = run_filter(
        per_component_environments={
            2: [
                [
                    ChemicalEnvironment.Aldehyde,
                    ChemicalEnvironment.Ketone,
                    ChemicalEnvironment.Carbonyl,
                ],
                [ChemicalEnvironment.Aqueous],
            ]
        },
        at_least_one_environment=False,
        strictly_specified_environments=True,
    )

    assert len(strict_extended) == 4
    assert strict_extended["N Components"].min() == 1
    assert strict_extended["N Components"].max() == 2

    pure_rows, binary_rows = split_by_size(strict_extended)

    assert len(pure_rows) == 2
    assert set(pure_rows["Component 1"].unique()) == {"O", "C"}

    assert len(binary_rows) == 2