def results_to_pandas(force_fields: List[str]) -> pandas.DataFrame:
    """Imports the experimental and estimated data sets and stores them
    in a pandas data frame.

    Parameters
    ----------
    force_fields
        The names of the force fields whose estimated results (stored as
        ``raw_data_v2/<force_field>.json``) should be loaded.

    Returns
    -------
        A data frame with one row per (property, force field) pair,
        containing the experimental and estimated values (and the
        estimated uncertainty) converted to the property's default unit.
    """

    # Load in the experimental data set.
    training_set = {
        physical_property.id: physical_property
        for physical_property in PhysicalPropertyDataSet.from_json(
            os.path.join("raw_data_v2", "curated_data_set.json"))
    }

    # Load in the results.
    estimated_results = {
        force_field: {
            physical_property.id: physical_property
            for physical_property in PhysicalPropertyDataSet.from_json(
                os.path.join("raw_data_v2", f"{force_field}.json"))
        }
        for force_field in force_fields
    }

    # Refactor the experimental and estimated data into a single data frame.
    data_rows = []

    for property_id, experimental_property in training_set.items():

        estimated_properties = {
            force_field: estimated_results[force_field].get(property_id)
            for force_field in force_fields
        }

        # Skip properties which were not estimated by every force field, as
        # well as any which were flagged as outliers.
        if (any(estimated_property is None
                for estimated_property in estimated_properties.values())
                or property_id in OUTLIERS):

            print(f"Skipping property {property_id}")
            continue

        # Hoist the conversion target out of the per-force-field rows - it
        # only depends on the experimental property.
        default_unit = experimental_property.default_unit()

        # NOTE(review): assumes every estimated property defines an
        # uncertainty - confirm for reweighted results.
        data_rows.extend({
            "Id": property_id,
            "Type": (f"{experimental_property.__class__.__name__}_"
                     f"{len(experimental_property.substance)}"),
            "Force Field": force_field,
            "NIST ThermoML": experimental_property.value.to(
                default_unit).magnitude,
            "Estimated": estimated_properties[force_field].value.to(
                default_unit).magnitude,
            "Estimated Uncertainty": estimated_properties[
                force_field].uncertainty.to(default_unit).magnitude,
        } for force_field in force_fields)

    return pandas.DataFrame(data_rows)
def main():
    """Requests estimates of the pure and binary data sets, once per
    calculation layer, and stores each set of results to disk."""

    setup_timestamp_logging()

    # Load in the force field
    force_field_source = SmirnoffForceFieldSource.from_path(
        "smirnoff99Frosst-1.1.0.offxml")

    # Load in the data set containing the pure and binary properties.
    data_set = PhysicalPropertyDataSet.from_json("pure_data_set.json")
    data_set.merge(PhysicalPropertyDataSet.from_json("binary_data_set.json"))

    # Set up a server object to run the calculations using.
    calculation_server = setup_server(
        backend_type=BackendType.LocalGPU, max_number_of_workers=1, port=8001)

    with calculation_server:

        # Request the estimates.
        client = EvaluatorClient(ConnectionOptions(server_port=8001))

        for calculation_layer in ["SimulationLayer", "ReweightingLayer"]:

            layer_options = RequestOptions()
            layer_options.calculation_layers = [calculation_layer]

            gradient_keys = [
                ParameterGradientKey(
                    tag="vdW", smirks="[#6X4:1]", attribute=attribute)
                for attribute in ("epsilon", "rmin_half")
            ]

            request, _ = client.request_estimate(
                property_set=data_set,
                force_field_source=force_field_source,
                options=layer_options,
                parameter_gradient_keys=gradient_keys,
            )

            # Wait for the results.
            results, _ = request.results(True, 5)

            # e.g. "ReweightingLayer" -> "reweighting_layer"
            layer_name = re.sub(
                r"(?<!^)(?=[A-Z])", "_", calculation_layer).lower()
            results.json(f"pure_binary_{layer_name}.json", True)
def test_analyze_non_integer_ids(mock_target, caplog):
    """Checks that analysis still succeeds (with a warning) when the
    reference data set contains property ids which cannot be cast to
    integers."""

    optimization, target, directory = mock_target

    training_set_path = os.path.join(directory, "training-set.json")

    data_set: PhysicalPropertyDataSet = PhysicalPropertyDataSet.from_json(
        training_set_path)
    # Sanity check in case this changes in future.
    assert len(data_set) == 1

    # Replace the property id with one that cannot be cast to an integer.
    data_set.properties[0].id = "a"
    data_set.json(training_set_path)

    request_result = RequestResult()
    request_result.estimated_properties = data_set
    request_result.json(os.path.join(directory, "results.json"))

    with caplog.at_level(logging.WARNING):

        target_result = EvaluatorAnalysisFactory.analyze(
            optimization=optimization,
            target=target,
            target_directory=directory,
            result_directory=directory,
            reindex=False,
        )

    assert ("The reference data set contains properties "
            "with ids that cannot be cast to integers" in caplog.text)

    assert numpy.isclose(target_result.objective_function, 1.0)
    assert len(target_result.statistic_entries) == 1
def test_generate_evaluator_target(self, requests_mock):
    """Checks that generating an evaluator target writes out both the
    training set and the options file, and that the stored training set
    matches the original data set."""

    data_set = create_data_set("data-set-1")
    mock_get_data_set(requests_mock, data_set)

    evaluator_target = create_evaluator_target(
        "evaluator-target-1", [data_set.id])

    with temporary_cd():

        OptimizationInputFactory._generate_evaluator_target(
            evaluator_target, 8000, None)

        assert os.path.isfile("training-set.json")

        stored_data_set = PhysicalPropertyDataSet.from_json(
            "training-set.json")
        assert stored_data_set.json() == data_set.to_evaluator().json()

        assert os.path.isfile("options.json")
def main():
    """Requests estimates of a data set of solvation free energies using
    a fixed-lambda simulation schema, then stores the results to disk."""

    setup_timestamp_logging()

    # Load in the force field
    force_field_source = SmirnoffForceFieldSource.from_path(
        "smirnoff99Frosst-1.1.0.offxml")

    # Create a data set containing three solvation free energies.
    data_set = PhysicalPropertyDataSet.from_json("hydration_data_set.json")
    data_set.json("hydration_data_set.json", format=True)

    # Set up a server object to run the calculations using.
    calculation_server = setup_server(
        backend_type=BackendType.LocalGPU, max_number_of_workers=1, port=8002)

    with calculation_server:

        # Request the estimates.
        client = EvaluatorClient(ConnectionOptions(server_port=8002))

        estimation_options = RequestOptions()
        estimation_options.calculation_layers = ["SimulationLayer"]
        estimation_options.add_schema(
            "SimulationLayer", "SolvationFreeEnergy",
            _get_fixed_lambda_schema())

        request, _ = client.request_estimate(
            property_set=data_set,
            force_field_source=force_field_source,
            options=estimation_options,
        )

        # Wait for the results.
        results, _ = request.results(True, 60)

        # Save the result to file.
        results.json("results.json", True)
def _initialize(self):
    """Initializes the evaluator target from an input json file.

    1. Reads the user specified input file.
    2. Creates a `evaluator` client object.
    3. Loads in a reference experimental data set.
    4. Assigns and normalises weights for each property.

    Raises
    ------
    ValueError
        If the loaded reference data set is empty.
    """

    # Load in the options from a user provided JSON file.
    options_file_path = os.path.join(self.tgtdir, self.evaluator_input)
    self._options = self.OptionsFile.from_json(options_file_path)

    # The denominator units double as the default display / comparison
    # unit for each property type.
    for property_type, denominator in self._options.denominators.items():
        self._default_units[property_type] = denominator.units

    # Attempt to create an evaluator client object using the specified
    # connection options.
    self._client = EvaluatorClient(self._options.connection_options)

    # Load in the experimental data set.
    data_set_path = os.path.join(self.tgtdir, self._options.data_set_path)
    self._reference_data_set = PhysicalPropertyDataSet.from_json(data_set_path)

    if len(self._reference_data_set) == 0:
        raise ValueError(
            "The physical property data set to optimise against is empty."
        )

    # Print the reference data, and count the number of instances of
    # each property type.
    printcool("Loaded experimental data.")

    property_types = self._reference_data_set.property_types

    number_of_properties = {
        x: sum(1 for y in self._reference_data_set.properties_by_type(x))
        for x in property_types
    }

    for substance in self._reference_data_set.substances:

        dict_for_print = {}

        for physical_property in self._reference_data_set.properties_by_substance(
            substance
        ):

            property_type = physical_property.__class__.__name__

            value = physical_property.value.to(self._default_units[property_type])

            # Properties without a defined uncertainty are shown as NaN.
            uncertainty = np.nan

            if physical_property.uncertainty != UNDEFINED:
                uncertainty = physical_property.uncertainty.to(
                    self._default_units[property_type]
                )

            tuple_key = (
                property_type,
                physical_property.thermodynamic_state.temperature,
                physical_property.thermodynamic_state.pressure,
            )

            dict_for_print["%s %s-%s" % tuple_key] = "%s+/-%s" % (
                value,
                uncertainty,
            )

        printcool_dictionary(
            dict_for_print,
            title="Reference %s data" % substance.identifier,
        )

    # Assign and normalize weights for each phase point (average for now)
    self._normalised_weights = {}

    for property_type in property_types:
        self._normalised_weights[property_type] = (
            self._options.weights[property_type]
            / number_of_properties[property_type]
        )
def analyze(
    cls,
    optimization: Optimization,
    target: EvaluatorTarget,
    target_directory: str,
    result_directory: str,
    reindex: bool = False,
) -> Optional[EvaluatorTargetResult]:
    """Analyzes the evaluator results found in ``result_directory`` against
    the reference training set stored in ``target_directory``.

    Returns ``None`` when no ``results.json`` file is present, otherwise an
    ``EvaluatorTargetResult`` containing the weighted objective function and
    the RMSE statistics of the estimated vs reference properties.
    """

    from openff.evaluator.client import RequestResult
    from openff.evaluator.datasets import PhysicalPropertyDataSet

    results_path = os.path.join(result_directory, "results.json")

    # A missing results file means this iteration produced nothing to analyze.
    if not os.path.isfile(results_path):
        return None

    # Load the reference data set
    reference_data_set: PhysicalPropertyDataSet = PhysicalPropertyDataSet.from_json(
        os.path.join(target_directory, "training-set.json")
    )

    # Check to see if any of the ids were set to strings that can't be cast to
    # integers, and if so, apply slight re-indexing
    try:
        {int(entry.id) for entry in reference_data_set.properties}
    except (TypeError, ValueError):
        _logger.warning(
            "The reference data set contains properties with ids that cannot be "
            "cast to integers - attempting to fix. Note this in general is not "
            "recommended and in future it is suggested to use integer ids in "
            "physical property data sets."
        )

        # Re-number the reference properties with 1-based integer ids and
        # force the estimated results to be re-indexed to match below.
        for i, physical_property in enumerate(reference_data_set):
            physical_property.id = str(i + 1)

        reindex = True

    # NOTE(review): the name is deliberately re-bound here to a different
    # type - from here on ``reference_data_set`` is a ``DataSet``, not a
    # ``PhysicalPropertyDataSet``. The placeholder metadata ("empty") is
    # discarded by the downstream analysis.
    reference_data_set: DataSet = DataSet.from_pandas(
        reference_data_set.to_pandas(),
        identifier="empty",
        description="empty",
        authors=[Author(name="empty", email="*****@*****.**", institute="empty")],
    )

    results = RequestResult.from_json(results_path)

    # Align the estimated property ids with the (possibly re-numbered)
    # reference ids before computing statistics.
    if reindex:
        results = reindex_results(results, reference_data_set)

    estimated_data_set = results.estimated_properties

    # Generate statistics about each iteration.
    data_set_result = DataSetResult.from_evaluator(
        reference_data_set=reference_data_set,
        estimated_data_set=estimated_data_set,
        analysis_environments=optimization.analysis_environments,
        statistic_types=[StatisticType.RMSE],
    )

    objective_function = cls._read_objective_function(result_directory)

    return EvaluatorTargetResult(
        objective_function=target.weight * objective_function,
        statistic_entries=data_set_result.statistic_entries,
    )