def test_pandas_round_trip(evaluator_data_set):
    """Check that ``DataSet.from_pandas`` and ``DataSet.to_pandas`` compose.

    The evaluator data set is converted to a data frame and loaded into a
    ``DataSet``; that ``DataSet`` is then dumped to a data frame and loaded
    once more. Every entry of the final ``DataSet`` is compared against the
    original evaluator property which carries the same id.
    """

    def rebuild(frame):
        # A new ``Author`` is created for each conversion.
        return DataSet.from_pandas(
            frame,
            "id",
            description="Lorem Ipsum",
            authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        )

    first_pass = rebuild(evaluator_data_set.to_pandas())
    second_pass = rebuild(first_pass.to_pandas())

    originals_by_id = {}

    for original in evaluator_data_set:
        originals_by_id[original.id] = original

    for entry in second_pass.entries:
        # Entry ids are converted to strings before the lookup.
        compare_properties(originals_by_id[str(entry.id)], entry)
def test_retrieve(self, requests_mock, runner, as_pandas):
    """Exercise the ``retrieve`` command of ``dataset_cli``.

    A single data set is served through a mocked endpoint, the command is
    invoked through ``runner`` and the file it wrote is checked: the number
    of rows when ``as_pandas`` is set, otherwise the JSON content.
    """
    expected = create_data_set("data-set-1")
    expected.entries[0].id = 1

    mock_get_data_set(requests_mock, expected)

    if as_pandas:
        output_path = "dataset.csv"
    else:
        output_path = "dataset.json"

    cli_arguments = ["retrieve", "--id", expected.id, "--output", output_path]

    if as_pandas:
        cli_arguments.append("--pandas")

    result = runner.invoke(dataset_cli, cli_arguments)

    # Re-raise the captured exception so that the failure cause is visible.
    if result.exit_code != 0:
        raise result.exception

    if not as_pandas:
        retrieved = DataSet.parse_file(output_path)
        assert retrieved.json().replace("\n", "") == expected.json()
        return

    retrieved_frame = pandas.read_csv(output_path)
    assert len(retrieved_frame) == len(expected.entries)
def estimated_reference_sets():
    """Build a matching pair of estimated and reference data sets.

    Both sets hold one density and one enthalpy of mixing for the two
    component ``O`` / ``CC=O`` system. The estimated properties are given
    the ids "1" and "2" while the reference entries use the ids 1 and 2.

    Returns
    -------
    tuple
        The estimated ``PhysicalPropertyDataSet`` followed by the reference
        ``DataSet``.
    """

    def liquid_mixture():
        # Built on every call so that the two properties share no objects.
        return dict(
            thermodynamic_state=ThermodynamicState(
                298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
            ),
            phase=PropertyPhase.Liquid,
            substance=Substance.from_components("O", "CC=O"),
        )

    def equimolar_components():
        return [
            Component(smiles="O", mole_fraction=0.5),
            Component(smiles="CC=O", mole_fraction=0.5),
        ]

    estimated_density = Density(
        **liquid_mixture(),
        value=1.0 * unit.kilogram / unit.meter**3,
        uncertainty=0.1 * unit.kilogram / unit.meter**3,
    )
    estimated_density.id = "1"

    estimated_enthalpy = EnthalpyOfMixing(
        **liquid_mixture(),
        value=1.0 * unit.kilocalorie / unit.mole,
        uncertainty=0.1 * unit.kilojoule / unit.mole,
    )
    estimated_enthalpy.id = "2"

    estimated_data_set = PhysicalPropertyDataSet()
    estimated_data_set.add_properties(estimated_density, estimated_enthalpy)

    # Settings common to both reference entries.
    entry_settings = dict(temperature=298.15, pressure=101.325, doi=" ")

    # NOTE(review): the reference values look like the estimated ones
    # expressed in other units (presumably g / ml and kJ / mol) - confirm.
    reference_density = DataSetEntry(
        id=1,
        property_type="Density",
        value=0.001,
        std_error=0.0001,
        components=equimolar_components(),
        **entry_settings,
    )
    reference_enthalpy = DataSetEntry(
        id=2,
        property_type="EnthalpyOfMixing",
        value=4.184,
        std_error=0.1,
        components=equimolar_components(),
        **entry_settings,
    )

    reference_data_set = DataSet(
        id="ref",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=[reference_density, reference_enthalpy],
    )

    return estimated_data_set, reference_data_set
def retrieve(data_set_id, return_pandas, output_path):
    """Fetch a data set with ``DataSet.from_rest`` and store it on disk.

    Parameters
    ----------
    data_set_id
        The id passed to ``DataSet.from_rest``.
    return_pandas
        When truthy the data set is converted with ``to_pandas`` and written
        as a CSV file without the index column, otherwise the output of
        ``DataSet.json`` is written.
    output_path
        The path of the file to write.
    """
    retrieved = DataSet.from_rest(data_set_id=data_set_id)

    if not return_pandas:

        with open(output_path, "w") as file:
            file.write(retrieved.json())

        return

    data_frame = retrieved.to_pandas()
    data_frame.to_csv(output_path, index=False)
def test_reindex_data_set_no_mole_fraction():
    """Tests ``reindex_data_set`` for a substance which contains a component
    specified by an exact amount rather than by a mole fraction.

    A single solvation free energy is re-indexed against a data set holding
    one entry with id 1, after which the property is expected to carry the
    id "1".
    """
    setup_timestamp_logging(logging.INFO)

    # One component given by a mole fraction and one solute given by an
    # exact amount.
    substance = substances.Substance()

    solvent = substances.Component(smiles="O")
    substance.add_component(solvent, amount=substances.MoleFraction(1.0))

    solute = substances.Component(
        smiles="CO", role=substances.Component.Role.Solute
    )
    substance.add_component(solute, amount=substances.ExactAmount(1))

    evaluator_data_set = PhysicalPropertyDataSet()

    state = ThermodynamicState(
        temperature=298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
    )
    free_energy = SolvationFreeEnergy(
        thermodynamic_state=state,
        phase=PropertyPhase.Liquid,
        substance=substance,
        value=1.0 * SolvationFreeEnergy.default_unit(),
        uncertainty=1.0 * SolvationFreeEnergy.default_unit(),
    )
    evaluator_data_set.add_properties(free_energy)

    reference_components = [
        Component(smiles="O", mole_fraction=1.0),
        Component(smiles="CO", mole_fraction=0.0, exact_amount=1, role="Solute"),
    ]
    reference_entry = DataSetEntry(
        id=1,
        property_type="SolvationFreeEnergy",
        temperature=298.15,
        pressure=101.325,
        value=1.0,
        std_error=1.0,
        doi=" ",
        components=reference_components,
    )
    data_set = DataSet(
        id="data-set",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=[reference_entry],
    )

    reindex_data_set(evaluator_data_set, data_set)

    (reindexed_property,) = evaluator_data_set.properties[:1]
    assert reindexed_property.id == "1"
def create_data_set(data_set_id: str, entry_id: Optional[int] = None):
    """Build a minimal data set with one author and one density entry.

    The entry holds two components: ``O`` with the "Solvent" role and a mole
    fraction of one, and ``CO`` with the "Solute" role and an exact amount
    of one.

    Parameters
    ----------
    data_set_id: str
        The id to assign to the data set.
    entry_id: int, optional
        The id to assign to the single data entry.

    Returns
    -------
    DataSet
        The newly created data set.
    """
    author = create_author()

    solvent = Component(
        smiles="O", mole_fraction=1.0, exact_amount=0, role="Solvent"
    )
    solute = Component(
        smiles="CO", mole_fraction=0.0, exact_amount=1, role="Solute"
    )

    density_entry = DataSetEntry(
        id=entry_id,
        property_type="Density",
        temperature=298.15,
        pressure=101.325,
        value=1.0,
        std_error=0.1,
        doi=" ",
        components=[solvent, solute],
    )

    return DataSet(
        id=data_set_id,
        description=" ",
        authors=[author],
        entries=[density_entry],
    )
def test_evaluator_round_trip(evaluator_data_set):
    """Check that ``DataSet.from_pandas`` followed by ``DataSet.to_evaluator``
    reproduces the original evaluator data set.

    The recreated set must have the same length as the original, and each
    recreated property is compared with the original property of the same id.
    """
    converted = DataSet.from_pandas(
        evaluator_data_set.to_pandas(),
        "id",
        description="Lorem Ipsum",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
    )

    round_tripped = converted.to_evaluator()
    assert len(round_tripped) == len(evaluator_data_set)

    originals_by_id = {}

    for original in evaluator_data_set:
        originals_by_id[original.id] = original

    for recreated in round_tripped:
        compare_evaluator_properties(originals_by_id[recreated.id], recreated)
def test_collection_to_evaluator(evaluator_data_set):
    """Check that ``DataSetCollection.to_evaluator`` reproduces the evaluator
    data set which its single member data set was built from.

    The recreated set must have the same length as the original, and each
    recreated property is compared with the original property of the same id.
    """
    member = DataSet.from_pandas(
        evaluator_data_set.to_pandas(),
        "id",
        description="Lorem Ipsum",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
    )

    collection = DataSetCollection(data_sets=[member])
    recreated_set = collection.to_evaluator()

    assert len(recreated_set) == len(evaluator_data_set)

    originals_by_id = {}

    for original in evaluator_data_set:
        originals_by_id[original.id] = original

    for recreated in recreated_set:
        compare_evaluator_properties(originals_by_id[recreated.id], recreated)
def test_from_pandas(evaluator_data_set):
    """Check the meta data and the entries created by ``DataSet.from_pandas``.

    The id, the description, the number of authors and the number of entries
    are verified first; each entry is then compared with the evaluator
    property which carries the same id.
    """
    author = Author(name=" ", email="*****@*****.**", institute=" ")

    converted = DataSet.from_pandas(
        evaluator_data_set.to_pandas(),
        "id",
        description="Lorem Ipsum",
        authors=[author],
    )

    assert converted.id == "id"
    assert converted.description == "Lorem Ipsum"
    assert len(converted.authors) == 1
    assert len(converted.entries) == len(evaluator_data_set)

    originals_by_id = {}

    for original in evaluator_data_set:
        originals_by_id[original.id] = original

    for entry in converted.entries:
        # Entry ids are converted to strings before the lookup.
        compare_properties(originals_by_id[str(entry.id)], entry)
def test_analysed_result_from_evaluator():
    """Tests the `DataSetResult.from_evaluator` function.

    One thousand estimated densities are drawn from a normal distribution
    centred on the reference value, so the RMSE computed over all of the
    entries is expected to lie within 10 % of the standard deviation of
    that distribution.
    """
    expected_mean = 0.0
    expected_std = numpy.random.rand() + 1.0

    sampled_values = numpy.random.normal(expected_mean, expected_std, 1000)

    estimated_properties = []
    reference_entries = []

    # Ids start from one and are shared by each estimated / reference pair.
    for property_id, sampled_value in enumerate(sampled_values, start=1):

        estimated_density = Density(
            thermodynamic_state=ThermodynamicState(
                298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
            ),
            phase=PropertyPhase.Liquid,
            substance=Substance.from_components("O"),
            value=sampled_value * Density.default_unit(),
            uncertainty=0.0 * Density.default_unit(),
        )
        estimated_density.id = str(property_id)
        estimated_properties.append(estimated_density)

        reference_entries.append(
            DataSetEntry(
                id=property_id,
                property_type="Density",
                temperature=298.15,
                pressure=101.325,
                value=expected_mean,
                std_error=None,
                doi=" ",
                components=[Component(smiles="O", mole_fraction=1.0)],
            )
        )

    estimated_data_set = PhysicalPropertyDataSet()
    estimated_data_set.add_properties(*estimated_properties)

    reference_data_set = DataSet(
        id="ref",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=reference_entries,
    )

    analysed_results = DataSetResult.from_evaluator(
        reference_data_set=reference_data_set,
        estimated_data_set=estimated_data_set,
        analysis_environments=[ChemicalEnvironment.Aqueous],
        statistic_types=[StatisticType.RMSE],
        bootstrap_iterations=1000,
    )

    assert len(analysed_results.result_entries) == len(estimated_properties)

    # Select the first statistic which was not computed for a category.
    full_statistics = next(
        statistic
        for statistic in analysed_results.statistic_entries
        if statistic.category is None
    )

    assert full_statistics.property_type == "Density"
    assert full_statistics.n_components == 1
    assert full_statistics.statistic_type == StatisticType.RMSE
    assert numpy.isclose(full_statistics.value, expected_std, rtol=0.10)
def analyze(
    cls,
    optimization: Optimization,
    target: EvaluatorTarget,
    target_directory: str,
    result_directory: str,
    reindex: bool = False,
) -> Optional[EvaluatorTargetResult]:
    """Compute RMSE statistics for the results found in a result directory.

    Parameters
    ----------
    optimization
        The optimization whose ``analysis_environments`` are used when
        generating the statistics.
    target
        The target whose ``weight`` scales the objective function.
    target_directory
        The directory which contains the ``training-set.json`` file.
    result_directory
        The directory which contains the ``results.json`` file.
    reindex
        Whether to pass the results through ``reindex_results``. This is
        forced on when the reference ids have to be regenerated.

    Returns
    -------
    EvaluatorTargetResult, optional
        The weighted objective function and the statistic entries, or
        ``None`` if ``results.json`` does not exist.
    """
    from openff.evaluator.client import RequestResult
    from openff.evaluator.datasets import PhysicalPropertyDataSet

    results_path = os.path.join(result_directory, "results.json")

    if not os.path.isfile(results_path):
        return None

    # Load the reference data set in its evaluator form.
    training_set_path = os.path.join(target_directory, "training-set.json")

    evaluator_reference_set: PhysicalPropertyDataSet = (
        PhysicalPropertyDataSet.from_json(training_set_path)
    )

    # Ids which cannot be cast to integers are replaced by one-based
    # positions, which in turn requires the results to be re-indexed.
    try:

        for physical_property in evaluator_reference_set.properties:
            int(physical_property.id)

    except (TypeError, ValueError):

        _logger.warning(
            "The reference data set contains properties with ids that cannot be "
            "cast to integers - attempting to fix. Note this in general is not "
            "recommended and in future it is suggested to use integer ids in "
            "physical property data sets."
        )

        for position, physical_property in enumerate(
            evaluator_reference_set, start=1
        ):
            physical_property.id = str(position)

        reindex = True

    reference_data_set: DataSet = DataSet.from_pandas(
        evaluator_reference_set.to_pandas(),
        identifier="empty",
        description="empty",
        authors=[Author(name="empty", email="*****@*****.**", institute="empty")],
    )

    results = RequestResult.from_json(results_path)

    if reindex:
        results = reindex_results(results, reference_data_set)

    # Generate statistics about each iteration.
    data_set_result = DataSetResult.from_evaluator(
        reference_data_set=reference_data_set,
        estimated_data_set=results.estimated_properties,
        analysis_environments=optimization.analysis_environments,
        statistic_types=[StatisticType.RMSE],
    )

    objective_function = cls._read_objective_function(result_directory)

    return EvaluatorTargetResult(
        objective_function=target.weight * objective_function,
        statistic_entries=data_set_result.statistic_entries,
    )
def mock_get_data_set(requests_mock, data_set: DataSet):
    """Register a mocked GET response which serves ``data_set`` as JSON.

    The URL is built by ``DataSet._get_endpoint`` from the id of the data
    set, and the response text is the output of ``data_set.json()``.
    """
    endpoint = DataSet._get_endpoint(data_set_id=data_set.id)
    payload = data_set.json()

    requests_mock.get(endpoint, text=payload)
def test_reindex_data_set():
    """Tests that the ``reindex_data_set`` function behaves as expected.

    Three densities are re-indexed, first against a single ``DataSet`` and
    then against a ``DataSetCollection``. The first two properties are
    expected to take on the ids of the reference entries, while the third,
    the only one defined at 300 K, is expected to keep its original id.
    """
    setup_timestamp_logging(logging.INFO)

    def liquid_density(temperature, *smiles):
        # A liquid density at 1 atm whose value and uncertainty are one.
        return Density(
            thermodynamic_state=ThermodynamicState(
                temperature=temperature * unit.kelvin,
                pressure=1.0 * unit.atmosphere,
            ),
            phase=PropertyPhase.Liquid,
            substance=substances.Substance.from_components(*smiles),
            value=1.0 * Density.default_unit(),
            uncertainty=1.0 * Density.default_unit(),
        )

    def density_entry(entry_id, composition):
        # ``composition`` is a sequence of (smiles, mole fraction) pairs.
        return DataSetEntry(
            id=entry_id,
            property_type="Density",
            temperature=298.15,
            pressure=101.325,
            value=1.0,
            std_error=1.0,
            doi=" ",
            components=[
                Component(smiles=smiles, mole_fraction=mole_fraction)
                for smiles, mole_fraction in composition
            ],
        )

    def single_author_set(set_id, *entries):
        return DataSet(
            id=set_id,
            description=" ",
            authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
            entries=list(entries),
        )

    def assert_property_ids(*expected_ids):
        for index, expected_id in enumerate(expected_ids):
            assert evaluator_data_set.properties[index].id == expected_id

    binary_mixture = (("O", 0.5), ("C", 0.5))
    pure_water = (("O", 1.0),)

    evaluator_data_set = PhysicalPropertyDataSet()
    evaluator_data_set.add_properties(
        liquid_density(298.15, "O"),
        liquid_density(298.15, "C", "O"),
        liquid_density(300.0, "C", "O"),
    )

    data_set = single_author_set(
        "data-set",
        density_entry(1, binary_mixture),
        density_entry(2, pure_water),
    )

    # Stored before re-indexing so that it can be checked to be unchanged.
    un_indexed_id = evaluator_data_set.properties[2].id

    reindex_data_set(evaluator_data_set, data_set)
    assert_property_ids("2", "1", un_indexed_id)

    # Repeat with the reference entries spread over a collection.
    data_set_collection = DataSetCollection(
        data_sets=[
            single_author_set("0", density_entry(3, binary_mixture)),
            single_author_set("1", density_entry(4, pure_water)),
        ]
    )

    reindex_data_set(evaluator_data_set, data_set_collection)
    assert_property_ids("4", "3", un_indexed_id)