def test_pandas_round_trip(evaluator_data_set):
    """Check that ``DataSet.from_pandas`` and ``DataSet.to_pandas`` compose.

    The evaluator data set is converted to a data frame and rebuilt into a
    ``DataSet`` twice in a row. Every entry of the final ``DataSet`` is then
    compared against the evaluator property carrying the same id.
    """

    def _rebuild(frame):
        # Both conversions use exactly the same metadata.
        return DataSet.from_pandas(
            frame,
            "id",
            description="Lorem Ipsum",
            authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        )

    first_pass = _rebuild(evaluator_data_set.to_pandas())
    second_pass = _rebuild(first_pass.to_pandas())

    originals_by_id = {original.id: original for original in evaluator_data_set}

    for entry in second_pass.entries:
        # Entry ids are converted with ``str`` before the lookup.
        compare_properties(originals_by_id[str(entry.id)], entry)
def test_author_validation():
    """Check that ``Author`` rejects an email address without an ``@``."""
    # Constructing this author is expected to succeed without raising.
    Author(name="SB", email="*****@*****.**", institute="Inst")

    # The same author, but with an email that lacks the "@" separator.
    invalid_author_kwargs = {
        "name": "SB",
        "email": "fakeemail.com",
        "institute": "Inst",
    }

    with pytest.raises(ValidationError):
        Author(**invalid_author_kwargs)
def test_project_validation(valid_optimization_kwargs, valid_benchmark_kwargs):
    """Check that ``Project`` validates the studies it is constructed with.

    A project holding one consistent study must be accepted. A project given
    the same study twice, or a study / optimization / benchmark whose
    ``project_id`` is ``"a"`` rather than the project's id, must raise a
    ``ValidationError``.
    """
    project_id = "project-1"
    study_id = "study-1"

    # The baseline project fields shared by every construction attempt below.
    project_kwargs = {
        "id": project_id,
        "name": " ",
        "description": " ",
        "authors": [Author(name=" ", email="*****@*****.**", institute=" ")],
    }

    # The ids which tie the optimization and the benchmark to their parents.
    parent_ids = {"project_id": project_id, "study_id": study_id}

    valid_study = Study(
        id=study_id,
        project_id=project_id,
        name=" ",
        description=" ",
        optimizations=[Optimization(**{**valid_optimization_kwargs, **parent_ids})],
        benchmarks=[Benchmark(**{**valid_benchmark_kwargs, **parent_ids})],
    )

    # A single, consistent study must be accepted.
    Project(**project_kwargs, studies=[valid_study])

    # Two studies sharing the same id must be rejected.
    with pytest.raises(ValidationError):
        Project(**project_kwargs, studies=[valid_study, valid_study])

    # A study which points at a different project must be rejected.
    bad_study = Study(**{**valid_study.dict(), "project_id": "a"})

    with pytest.raises(ValidationError):
        Project(**project_kwargs, studies=[bad_study])

    # The same holds when only a child of the study points at another project.
    for child_attribute in ("optimizations", "benchmarks"):
        bad_study = Study(**valid_study.dict())
        getattr(bad_study, child_attribute)[0].project_id = "a"

        with pytest.raises(ValidationError):
            Project(**project_kwargs, studies=[bad_study])
def estimated_reference_sets():
    """Build a paired estimated and reference data set.

    Returns
    -------
    tuple
        A ``PhysicalPropertyDataSet`` containing a ``Density`` with id ``"1"``
        and an ``EnthalpyOfMixing`` with id ``"2"``, followed by a ``DataSet``
        with id ``"ref"`` whose two entries have ids ``1`` and ``2`` and the
        property types ``"Density"`` and ``"EnthalpyOfMixing"``.
    """

    def _common_property_kwargs():
        # Fresh state and substance objects are created for every property.
        return {
            "thermodynamic_state": ThermodynamicState(
                298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
            ),
            "phase": PropertyPhase.Liquid,
            "substance": Substance.from_components("O", "CC=O"),
        }

    def _equimolar_components():
        # Fresh component objects are created for every reference entry.
        return [
            Component(smiles="O", mole_fraction=0.5),
            Component(smiles="CC=O", mole_fraction=0.5),
        ]

    estimated_density = Density(
        **_common_property_kwargs(),
        value=1.0 * unit.kilogram / unit.meter**3,
        uncertainty=0.1 * unit.kilogram / unit.meter**3,
    )
    estimated_density.id = "1"

    estimated_enthalpy = EnthalpyOfMixing(
        **_common_property_kwargs(),
        value=1.0 * unit.kilocalorie / unit.mole,
        uncertainty=0.1 * unit.kilojoule / unit.mole,
    )
    estimated_enthalpy.id = "2"

    estimated_data_set = PhysicalPropertyDataSet()
    estimated_data_set.add_properties(estimated_density, estimated_enthalpy)

    reference_density = DataSetEntry(
        id=1,
        property_type="Density",
        temperature=298.15,
        pressure=101.325,
        value=0.001,
        std_error=0.0001,
        doi=" ",
        components=_equimolar_components(),
    )
    reference_enthalpy = DataSetEntry(
        id=2,
        property_type="EnthalpyOfMixing",
        temperature=298.15,
        pressure=101.325,
        value=4.184,
        std_error=0.1,
        doi=" ",
        components=_equimolar_components(),
    )

    reference_data_set = DataSet(
        id="ref",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=[reference_density, reference_enthalpy],
    )

    return estimated_data_set, reference_data_set
def test_reindex_data_set_no_mole_fraction():
    """Check ``reindex_data_set`` when a component has an exact amount.

    The evaluator property is built from a substance with one mole-fraction
    component and one solute given as an exact amount. After re-indexing
    against a ``DataSet`` whose single entry has id ``1``, the property is
    expected to carry the id ``"1"``.
    """
    setup_timestamp_logging(logging.INFO)

    # One component specified by mole fraction, and a solute specified by an
    # exact amount.
    substance = substances.Substance()
    substance.add_component(
        substances.Component(smiles="O"),
        amount=substances.MoleFraction(1.0),
    )
    substance.add_component(
        substances.Component(smiles="CO", role=substances.Component.Role.Solute),
        amount=substances.ExactAmount(1),
    )

    solvation_free_energy = SolvationFreeEnergy(
        thermodynamic_state=ThermodynamicState(
            temperature=298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
        ),
        phase=PropertyPhase.Liquid,
        substance=substance,
        value=1.0 * SolvationFreeEnergy.default_unit(),
        uncertainty=1.0 * SolvationFreeEnergy.default_unit(),
    )

    evaluator_data_set = PhysicalPropertyDataSet()
    evaluator_data_set.add_properties(solvation_free_energy)

    reference_entry = DataSetEntry(
        id=1,
        property_type="SolvationFreeEnergy",
        temperature=298.15,
        pressure=101.325,
        value=1.0,
        std_error=1.0,
        doi=" ",
        components=[
            Component(smiles="O", mole_fraction=1.0),
            Component(smiles="CO", mole_fraction=0.0, exact_amount=1, role="Solute"),
        ],
    )

    data_set = DataSet(
        id="data-set",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=[reference_entry],
    )

    reindex_data_set(evaluator_data_set, data_set)

    assert evaluator_data_set.properties[0].id == "1"
def create_author():
    """Create a placeholder author object.

    The author is created with

    * name="Fake Name"
    * email="*****@*****.**"
    * institute="None"

    Returns
    -------
    Author
        The created author.
    """
    author_fields = {
        "name": "Fake Name",
        "email": "*****@*****.**",
        "institute": "None",
    }

    return Author(**author_fields)
def test_evaluator_round_trip(evaluator_data_set):
    """Check that ``DataSet.from_pandas`` and ``DataSet.to_evaluator`` compose.

    The evaluator data set is converted to a data frame, rebuilt as a
    ``DataSet`` and converted back to an evaluator data set. The result must
    have the same length as the input, and each recreated property is compared
    against the original property with the same id.
    """
    data_set = DataSet.from_pandas(
        evaluator_data_set.to_pandas(),
        "id",
        description="Lorem Ipsum",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
    )

    recreated_data_set = data_set.to_evaluator()
    assert len(recreated_data_set) == len(evaluator_data_set)

    # Index the original properties so they can be looked up by id.
    originals_by_id = {}

    for original_property in evaluator_data_set:
        originals_by_id[original_property.id] = original_property

    for recreated_property in recreated_data_set:
        compare_evaluator_properties(
            originals_by_id[recreated_property.id], recreated_property
        )
def test_collection_to_evaluator(evaluator_data_set):
    """Check that ``DataSetCollection.to_evaluator`` recreates each property.

    A ``DataSet`` built from the evaluator data set is wrapped in a
    single-member ``DataSetCollection``. The collection is converted back to
    an evaluator data set, which must have the same length as the input and
    whose properties are compared with the originals sharing the same id.
    """
    author = Author(name=" ", email="*****@*****.**", institute=" ")

    data_set = DataSet.from_pandas(
        evaluator_data_set.to_pandas(),
        "id",
        description="Lorem Ipsum",
        authors=[author],
    )

    collection = DataSetCollection(data_sets=[data_set])
    recreated_data_set = collection.to_evaluator()

    assert len(recreated_data_set) == len(evaluator_data_set)

    originals = list(evaluator_data_set)
    originals_by_id = dict(zip((x.id for x in originals), originals))

    for recreated_property in recreated_data_set:
        original_property = originals_by_id[recreated_property.id]
        compare_evaluator_properties(original_property, recreated_property)
def test_from_pandas(evaluator_data_set):
    """Check the ``DataSet`` produced by ``DataSet.from_pandas``.

    The identifier, description and author count passed in must be reflected
    on the created ``DataSet``. It must contain one entry per evaluator
    property, and each entry is compared with the property sharing its id.
    """
    expected_id = "id"
    expected_description = "Lorem Ipsum"

    data_set = DataSet.from_pandas(
        evaluator_data_set.to_pandas(),
        expected_id,
        description=expected_description,
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
    )

    # The metadata must be carried over unchanged.
    assert data_set.id == "id"
    assert data_set.description == "Lorem Ipsum"
    assert len(data_set.authors) == 1

    # There must be exactly one entry per evaluator property.
    assert len(data_set.entries) == len(evaluator_data_set)

    originals_by_id = {original.id: original for original in evaluator_data_set}

    for entry in data_set.entries:
        # Entry ids are converted with ``str`` before the lookup.
        compare_properties(originals_by_id[str(entry.id)], entry)
def test_analysed_result_from_evaluator():
    """Check the statistics produced by ``DataSetResult.from_evaluator``.

    1000 values are drawn from a normal distribution with a mean of zero and
    a randomly chosen standard deviation. Each value becomes an estimated
    density which is paired with a reference entry whose value is the mean.
    The RMSE statistic with no category is then expected to match the chosen
    standard deviation to within a relative tolerance of 10%.
    """
    expected_mean = 0.0
    expected_std = numpy.random.rand() + 1.0

    sampled_values = numpy.random.normal(expected_mean, expected_std, 1000)

    estimated_properties = []
    reference_entries = []

    # Ids start from one, and are shared by each estimated / reference pair.
    for property_id, sampled_value in enumerate(sampled_values, start=1):
        estimated_density = Density(
            thermodynamic_state=ThermodynamicState(
                298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
            ),
            phase=PropertyPhase.Liquid,
            substance=Substance.from_components("O"),
            value=sampled_value * Density.default_unit(),
            uncertainty=0.0 * Density.default_unit(),
        )
        estimated_density.id = str(property_id)
        estimated_properties.append(estimated_density)

        reference_entries.append(
            DataSetEntry(
                id=property_id,
                property_type="Density",
                temperature=298.15,
                pressure=101.325,
                value=expected_mean,
                std_error=None,
                doi=" ",
                components=[Component(smiles="O", mole_fraction=1.0)],
            )
        )

    estimated_data_set = PhysicalPropertyDataSet()
    estimated_data_set.add_properties(*estimated_properties)

    reference_data_set = DataSet(
        id="ref",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=reference_entries,
    )

    analysed_results = DataSetResult.from_evaluator(
        reference_data_set=reference_data_set,
        estimated_data_set=estimated_data_set,
        analysis_environments=[ChemicalEnvironment.Aqueous],
        statistic_types=[StatisticType.RMSE],
        bootstrap_iterations=1000,
    )

    assert len(analysed_results.result_entries) == len(estimated_properties)

    # Select the first statistic which is not tied to a category.
    full_statistics = next(
        statistic
        for statistic in analysed_results.statistic_entries
        if statistic.category is None
    )

    assert full_statistics.property_type == "Density"
    assert full_statistics.n_components == 1
    assert full_statistics.statistic_type == StatisticType.RMSE
    assert numpy.isclose(full_statistics.value, expected_std, rtol=0.10)
def analyze(
    cls,
    optimization: Optimization,
    target: EvaluatorTarget,
    target_directory: str,
    result_directory: str,
    reindex: bool = False,
) -> Optional[EvaluatorTargetResult]:
    """Analyse the results stored for an evaluator target.

    Parameters
    ----------
    optimization
        The optimization whose ``analysis_environments`` are used when
        generating the statistics.
    target
        The target whose ``weight`` scales the objective function.
    target_directory
        The directory which contains the ``training-set.json`` file.
    result_directory
        The directory which contains the ``results.json`` file, and which is
        passed to ``cls._read_objective_function``.
    reindex
        Whether to pass the loaded results through ``reindex_results``. This
        is forced to ``True`` when the training set ids had to be replaced.

    Returns
    -------
        The weighted objective function and RMSE statistic entries, or
        ``None`` if ``result_directory`` contains no ``results.json`` file.
    """
    from openff.evaluator.client import RequestResult
    from openff.evaluator.datasets import PhysicalPropertyDataSet

    results_path = os.path.join(result_directory, "results.json")

    # Without a results file there is nothing to analyse.
    if not os.path.isfile(results_path):
        return None

    # Load the reference data set in its evaluator form.
    training_set_path = os.path.join(target_directory, "training-set.json")

    evaluator_reference_set: PhysicalPropertyDataSet = (
        PhysicalPropertyDataSet.from_json(training_set_path)
    )

    # Check whether every id can be cast to an integer. If any cannot, replace
    # all of the ids with consecutive integers (as strings) starting from one.
    try:

        for physical_property in evaluator_reference_set.properties:
            int(physical_property.id)

    except (TypeError, ValueError):

        _logger.warning(
            "The reference data set contains properties with ids that cannot be "
            "cast to integers - attempting to fix. Note this in general is not "
            "recommended and in future it is suggested to use integer ids in "
            "physical property data sets."
        )

        for new_id, physical_property in enumerate(evaluator_reference_set, start=1):
            physical_property.id = str(new_id)

        # The ids in the results file no longer line up with the reference ids.
        reindex = True

    # Convert the reference data into a ``DataSet`` with placeholder metadata.
    reference_data_set: DataSet = DataSet.from_pandas(
        evaluator_reference_set.to_pandas(),
        identifier="empty",
        description="empty",
        authors=[Author(name="empty", email="*****@*****.**", institute="empty")],
    )

    results = RequestResult.from_json(results_path)

    if reindex:
        results = reindex_results(results, reference_data_set)

    # Generate statistics about each iteration.
    data_set_result = DataSetResult.from_evaluator(
        reference_data_set=reference_data_set,
        estimated_data_set=results.estimated_properties,
        analysis_environments=optimization.analysis_environments,
        statistic_types=[StatisticType.RMSE],
    )

    objective_function = cls._read_objective_function(result_directory)
    weighted_objective = target.weight * objective_function

    return EvaluatorTargetResult(
        objective_function=weighted_objective,
        statistic_entries=data_set_result.statistic_entries,
    )
def create(db: Session, author: authors.Author) -> models.Author:
    """Build a database author from ``author`` via ``models.Author.unique``.

    The fields of ``author`` are unpacked into a new ``models.Author``, which
    is handed to ``models.Author.unique`` together with ``db``. Whatever that
    call returns is returned unchanged.
    """
    return models.Author.unique(db, models.Author(**author.dict()))
def test_reindex_data_set():
    """Check ``reindex_data_set`` against a data set and a collection.

    Three evaluator densities are built. The first two are expected to take
    their ids from the reference entries supplied, first from a single
    ``DataSet`` and then from a ``DataSetCollection``. The third, created at a
    different temperature, is expected to keep its original id both times.
    """
    setup_timestamp_logging(logging.INFO)

    def _density(temperature, *smiles):
        # Every density shares the same pressure, phase, value and uncertainty.
        return Density(
            thermodynamic_state=ThermodynamicState(
                temperature=temperature * unit.kelvin,
                pressure=1.0 * unit.atmosphere,
            ),
            phase=PropertyPhase.Liquid,
            substance=substances.Substance.from_components(*smiles),
            value=1.0 * Density.default_unit(),
            uncertainty=1.0 * Density.default_unit(),
        )

    def _entry(entry_id, *mole_fractions):
        # ``mole_fractions`` is a sequence of (smiles, mole fraction) pairs.
        return DataSetEntry(
            id=entry_id,
            property_type="Density",
            temperature=298.15,
            pressure=101.325,
            value=1.0,
            std_error=1.0,
            doi=" ",
            components=[
                Component(smiles=smiles, mole_fraction=mole_fraction)
                for smiles, mole_fraction in mole_fractions
            ],
        )

    def _data_set(data_set_id, *entries):
        return DataSet(
            id=data_set_id,
            description=" ",
            authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
            entries=list(entries),
        )

    evaluator_data_set = PhysicalPropertyDataSet()
    evaluator_data_set.add_properties(
        _density(298.15, "O"),
        _density(298.15, "C", "O"),
        _density(300.0, "C", "O"),
    )

    def _current_ids():
        properties = evaluator_data_set.properties
        return [properties[index].id for index in range(3)]

    data_set = _data_set(
        "data-set",
        _entry(1, ("O", 0.5), ("C", 0.5)),
        _entry(2, ("O", 1.0)),
    )

    # Remember the id of the property which should never be re-indexed.
    un_indexed_id = evaluator_data_set.properties[2].id

    reindex_data_set(evaluator_data_set, data_set)

    assert _current_ids() == ["2", "1", un_indexed_id]

    # Repeat using a collection whose entries are split over two data sets.
    data_set_collection = DataSetCollection(
        data_sets=[
            _data_set("0", _entry(3, ("O", 0.5), ("C", 0.5))),
            _data_set("1", _entry(4, ("O", 1.0))),
        ]
    )

    reindex_data_set(evaluator_data_set, data_set_collection)

    assert _current_ids() == ["4", "3", un_indexed_id]