def create_filterable_data_set():
    """Creates a dummy data set with a diverse set of properties to
    be filtered, namely:

    - a liquid density measured at 298 K and 0.5 atm with 1 component containing only carbon.
    - a gaseous dielectric measured at 288 K and 1 atm with 2 components containing only nitrogen.
    - a solid enthalpy of mixing measured at 308 K and 1.5 atm with 3 components containing only oxygen.

    Returns
    -------
    PhysicalPropertyDataSet
        The created data set.
    """
    source = CalculationSource("Dummy", {})

    carbon_substance = create_dummy_substance(number_of_components=1, elements=["C"])

    density_property = Density(
        thermodynamic_state=ThermodynamicState(
            temperature=298 * unit.kelvin, pressure=0.5 * unit.atmosphere
        ),
        phase=PropertyPhase.Liquid,
        substance=carbon_substance,
        value=1 * unit.gram / unit.milliliter,
        uncertainty=0.11 * unit.gram / unit.milliliter,
        source=source,
    )

    nitrogen_substance = create_dummy_substance(number_of_components=2, elements=["N"])

    dielectric_property = DielectricConstant(
        thermodynamic_state=ThermodynamicState(
            temperature=288 * unit.kelvin, pressure=1 * unit.atmosphere
        ),
        phase=PropertyPhase.Gas,
        substance=nitrogen_substance,
        value=1 * unit.dimensionless,
        uncertainty=0.11 * unit.dimensionless,
        source=source,
    )

    oxygen_substance = create_dummy_substance(number_of_components=3, elements=["O"])

    enthalpy_property = EnthalpyOfMixing(
        thermodynamic_state=ThermodynamicState(
            temperature=308 * unit.kelvin, pressure=1.5 * unit.atmosphere
        ),
        phase=PropertyPhase.Solid,
        substance=oxygen_substance,
        value=1 * unit.kilojoules / unit.mole,
        uncertainty=0.11 * unit.kilojoules / unit.mole,
        source=source,
    )

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(density_property, dielectric_property, enthalpy_property)

    return data_set
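def test_create_filterable_data_set():
    # A minimal sketch (not part of the original suite) showing why the dummy
    # set above is deliberately diverse: each property sits at a distinct
    # phase, state, and composition, so a single in-place filter such as
    # `filter_by_property_types` (used in the training-set scripts below)
    # should retain exactly one property.
    data_set = create_filterable_data_set()
    assert len(data_set) == 3

    data_set.filter_by_property_types("Density")
    assert len(data_set) == 1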
def test_sources_substances():

    physical_property = create_dummy_property(Density)

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(physical_property)

    assert next(iter(data_set.sources)) == physical_property.source
    assert next(iter(data_set.substances)) == physical_property.substance
def test_protocol_replacement(force_field_source, expected_protocol_type):

    data_set = PhysicalPropertyDataSet()

    for property_type in property_types:
        physical_property = create_dummy_property(property_type)
        data_set.add_properties(physical_property)

    options = EvaluatorClient.default_request_options(data_set, force_field_source)
    options_json = options.json(format=True)

    assert options_json.find('BaseBuildSystem"') < 0
    assert options_json.find(expected_protocol_type) >= 0
def main(): setup_timestamp_logging() # Load in the force field force_field_path = "smirnoff99Frosst-1.1.0.offxml" force_field_source = SmirnoffForceFieldSource.from_path(force_field_path) # Load in the data set containing the pure and binary properties. data_set = PhysicalPropertyDataSet.from_json("pure_data_set.json") data_set.merge(PhysicalPropertyDataSet.from_json("binary_data_set.json")) # Set up a server object to run the calculations using. server = setup_server(backend_type=BackendType.LocalGPU, max_number_of_workers=1, port=8001) with server: # Request the estimates. property_estimator = EvaluatorClient( ConnectionOptions(server_port=8001)) for calculation_layer in ["SimulationLayer", "ReweightingLayer"]: options = RequestOptions() options.calculation_layers = [calculation_layer] parameter_gradient_keys = [ ParameterGradientKey(tag="vdW", smirks="[#6X4:1]", attribute="epsilon"), ParameterGradientKey(tag="vdW", smirks="[#6X4:1]", attribute="rmin_half"), ] request, _ = property_estimator.request_estimate( property_set=data_set, force_field_source=force_field_source, options=options, parameter_gradient_keys=parameter_gradient_keys, ) # Wait for the results. results, _ = request.results(True, 5) layer_name = re.sub(r"(?<!^)(?=[A-Z])", "_", calculation_layer).lower() results.json(f"pure_binary_{layer_name}.json", True)
def find_training_smiles():
    """Returns the smiles of all of the substances which appeared in
    the training set.

    Returns
    -------
    set of str
        The smiles patterns of the training substances.
    """

    # Find those alcohols and esters which were included in the training set.
    training_set = PhysicalPropertyDataSet.from_json(
        os.path.join(
            "..",
            "..",
            "..",
            "pure_mixture_optimisation",
            "force_balance",
            "alcohol_ester",
            "h_mix_rho_x_rho_pure_h_vap",
            "targets",
            "mixture_data",
            "training_set.json",
        )
    ).to_pandas()

    training_smiles = data_frame_to_smiles_tuples(training_set)
    # Flatten the per-substance tuples into a single set of smiles patterns.
    training_smiles = set(x for y in training_smiles for x in y)

    return training_smiles
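def _filter_out_training_smiles(test_set, training_smiles):
    # A hypothetical helper (not part of the original script) sketching how
    # the flattened smiles set returned above might be consumed: drop any
    # property whose substance contains a training compound, using the same
    # in-place `filter_by_function` API exercised elsewhere in this
    # repository.
    test_set.filter_by_function(
        lambda physical_property: all(
            component.smiles not in training_smiles
            for component in physical_property.substance.components
        )
    )
    return test_set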
def test_submission():

    with tempfile.TemporaryDirectory() as directory:

        with temporarily_change_directory(directory):

            with DaskLocalCluster() as calculation_backend:

                # Spin up a server instance.
                server = EvaluatorServer(
                    calculation_backend=calculation_backend,
                    working_directory=directory,
                )

                with server:

                    # Connect a client.
                    client = EvaluatorClient()

                    # Submit an empty data set.
                    force_field_path = "smirnoff99Frosst-1.1.0.offxml"
                    force_field_source = SmirnoffForceFieldSource.from_path(
                        force_field_path
                    )

                    request, error = client.request_estimate(
                        PhysicalPropertyDataSet(), force_field_source
                    )
                    assert error is None
                    assert isinstance(request, Request)

                    result, error = request.results(polling_interval=0.01)
                    assert error is None
                    assert isinstance(result, RequestResult)
def main(input_data_set_path, server_port):

    # Create the options which propertyestimator should use.
    estimator_options = RequestOptions()

    # Choose which calculation layers to make available.
    estimator_options.calculation_layers = ["SimulationLayer"]

    # Load in the training data set and create schemas for each of the types
    # of property to be calculated.
    training_set = PhysicalPropertyDataSet.from_json(input_data_set_path)

    # Zero out any undefined uncertainties due to a bug in ForceBalance.
    for physical_property in training_set:
        physical_property.uncertainty = 0.0 * physical_property.default_unit()

    data_set_path = "training_set.json"
    training_set.json(data_set_path, format=True)

    # Create the force balance options.
    target_options = Evaluator_SMIRNOFF.OptionsFile()
    target_options.connection_options = ConnectionOptions(
        server_address="localhost", server_port=server_port
    )
    target_options.estimation_options = estimator_options

    target_options.data_set_path = data_set_path

    # Set the property weights and denominators.
    target_options.weights = {x: 1.0 for x in training_set.property_types}
    target_options.denominators = calculate_denominators(training_set)

    # Save the options to file.
    with open("options.json", "w") as file:
        file.write(target_options.to_json())
def main(): setup_timestamp_logging() # Load in the force field force_field_path = "openff-1.0.0-refit.offxml" force_field_source = SmirnoffForceFieldSource.from_path(force_field_path) # Load in the test set. data_set = PhysicalPropertyDataSet.from_json("full_set.json") # Set up a server object to run the calculations using. working_directory = "working_directory" # Set up a backend to run the calculations on. This assume running # on a HPC resources with the LSF queue system installed. queue_resources = QueueWorkerResources( number_of_threads=1, number_of_gpus=1, preferred_gpu_toolkit=QueueWorkerResources.GPUToolkit.CUDA, per_thread_memory_limit=5 * unit.gigabyte, wallclock_time_limit="05:59", ) worker_script_commands = [ "conda activate forcebalance", "module load cuda/10.1" ] calculation_backend = DaskLSFBackend( minimum_number_of_workers=1, maximum_number_of_workers=50, resources_per_worker=queue_resources, queue_name="gpuqueue", setup_script_commands=worker_script_commands, adaptive_interval="1000ms", ) with calculation_backend: server = EvaluatorServer( calculation_backend=calculation_backend, working_directory=working_directory, port=8002, ) with server: # Request the estimates. client = EvaluatorClient(ConnectionOptions(server_port=8002)) request, _ = client.request_estimate( property_set=data_set, force_field_source=force_field_source, ) # Wait for the results. results, _ = request.results(True, 5) results.json(f"results.json")
def test_same_component_batching():

    thermodynamic_state = ThermodynamicState(
        temperature=1.0 * unit.kelvin, pressure=1.0 * unit.atmosphere
    )

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(
        Density(
            thermodynamic_state=thermodynamic_state,
            substance=Substance.from_components("O", "C"),
            value=0.0 * unit.kilogram / unit.meter**3,
        ),
        EnthalpyOfVaporization(
            thermodynamic_state=thermodynamic_state,
            substance=Substance.from_components("O", "C"),
            value=0.0 * unit.kilojoule / unit.mole,
        ),
        Density(
            thermodynamic_state=thermodynamic_state,
            substance=Substance.from_components("O", "CO"),
            value=0.0 * unit.kilogram / unit.meter**3,
        ),
        EnthalpyOfVaporization(
            thermodynamic_state=thermodynamic_state,
            substance=Substance.from_components("O", "CO"),
            value=0.0 * unit.kilojoule / unit.mole,
        ),
    )

    options = RequestOptions()

    submission = EvaluatorClient._Submission()
    submission.dataset = data_set
    submission.options = options

    with DaskLocalCluster() as calculation_backend:

        server = EvaluatorServer(calculation_backend)
        batches = server._batch_by_same_component(submission, "")

    # The two substances ("O" + "C" and "O" + "CO") should yield two batches,
    # each containing that substance's density and enthalpy of vaporization.
    assert len(batches) == 2

    assert len(batches[0].queued_properties) == 2
    assert len(batches[1].queued_properties) == 2
def test_default_options():
    """Test creating the default estimation options."""

    data_set = PhysicalPropertyDataSet()
    force_field_source = SmirnoffForceFieldSource.from_path(
        "smirnoff99Frosst-1.1.0.offxml"
    )

    for property_type in property_types:
        physical_property = create_dummy_property(property_type)
        data_set.add_properties(physical_property)

    options = EvaluatorClient.default_request_options(data_set, force_field_source)
    options.validate()

    assert len(options.calculation_layers) == 2
    assert len(options.calculation_schemas) == len(property_types)
    assert all(
        len(x) == len(options.calculation_layers)
        for x in options.calculation_schemas.values()
    )
def test_launch_batch():

    # Set up a dummy data set
    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(
        create_dummy_property(Density), create_dummy_property(Density)
    )

    batch = Batch()
    batch.force_field_id = ""
    batch.options = RequestOptions()
    batch.options.calculation_layers = ["QuickCalculationLayer"]
    batch.options.calculation_schemas = {
        "Density": {"QuickCalculationLayer": CalculationLayerSchema()}
    }
    batch.parameter_gradient_keys = []
    batch.queued_properties = [*data_set]
    batch.validate()

    with tempfile.TemporaryDirectory() as directory:

        with temporarily_change_directory(directory):

            with DaskLocalCluster() as calculation_backend:

                server = EvaluatorServer(
                    calculation_backend=calculation_backend,
                    working_directory=directory,
                )

                server._queued_batches[batch.id] = batch
                server._launch_batch(batch)

                while len(batch.queued_properties) > 0:
                    sleep(0.01)

                assert len(batch.estimated_properties) == 1
                assert len(batch.unsuccessful_properties) == 1
class RequestResult(AttributeClass):
    """The current results of an estimation request - these results
    may be partial if the server hasn't yet completed the request.
    """

    queued_properties = Attribute(
        docstring="The set of properties which have yet to be, or "
        "are currently being estimated.",
        type_hint=PhysicalPropertyDataSet,
        default_value=PhysicalPropertyDataSet(),
    )

    estimated_properties = Attribute(
        docstring="The set of properties which have been successfully estimated.",
        type_hint=PhysicalPropertyDataSet,
        default_value=PhysicalPropertyDataSet(),
    )
    unsuccessful_properties = Attribute(
        docstring="The set of properties which could not be successfully estimated.",
        type_hint=PhysicalPropertyDataSet,
        default_value=PhysicalPropertyDataSet(),
    )

    exceptions = Attribute(
        docstring="The exceptions which were raised while estimating the "
        "properties in this request.",
        type_hint=list,
        default_value=[],
    )

    def validate(self, attribute_type=None):

        super(RequestResult, self).validate(attribute_type)
        assert all(isinstance(x, EvaluatorException) for x in self.exceptions)
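def _summarize_request_result(result):
    # A hypothetical helper (not part of the source) sketching how the
    # attributes defined above are typically consumed once
    # `request.results()` returns a `RequestResult`, as in the submission
    # tests elsewhere in this section.
    print(f"queued:       {len(result.queued_properties)}")
    print(f"estimated:    {len(result.estimated_properties)}")
    print(f"unsuccessful: {len(result.unsuccessful_properties)}")

    # `validate` guarantees these are all `EvaluatorException` instances.
    for exception in result.exceptions:
        print(exception)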
def test_serialization():
    """A test to ensure that data sets are JSON serializable."""

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(create_dummy_property(Density))

    data_set_json = data_set.json()

    parsed_data_set = PhysicalPropertyDataSet.parse_json(data_set_json)
    assert len(data_set) == len(parsed_data_set)

    parsed_data_set_json = parsed_data_set.json()
    assert parsed_data_set_json == data_set_json
def _estimate_required_simulations(properties_of_interest, data_set):
    """Attempt to estimate how many simulations the evaluator framework
    will try to run to estimate the given data set of properties.

    Parameters
    ----------
    properties_of_interest: list of tuple of type and SubstanceType
        A list of the property types which are of interest to optimise against.
    data_set: PhysicalPropertyDataSet
        The data set containing the properties of interest.

    Returns
    -------
    int
        The estimated number of simulations required.
    """
    data_set = PhysicalPropertyDataSet.parse_json(data_set.json())

    options = RequestOptions()

    calculation_layer = "SimulationLayer"

    for property_type, _ in properties_of_interest:

        default_schema = property_type.default_simulation_schema()
        options.add_schema(calculation_layer, property_type.__name__, default_schema)

    workflow_graph, _ = SimulationLayer._build_workflow_graph(
        "", LocalFileStorage(), data_set.properties, "", [], options
    )

    # Count each conditional group in the workflow graph as one simulation.
    number_of_simulations = 0

    for protocol in workflow_graph.protocols.values():

        if not isinstance(protocol, ConditionalGroup):
            continue

        number_of_simulations += 1

    return number_of_simulations
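def _example_estimate_required_simulations(data_set):
    # A minimal usage sketch (the `SubstanceType` member names are assumed,
    # not taken from the source): pair each property class with a substance
    # type, mirroring the property tuples which `generate_report` passes to
    # `_estimate_required_simulations`.
    properties_of_interest = [
        (Density, SubstanceType.Pure),
        (EnthalpyOfMixing, SubstanceType.Binary),
    ]

    return _estimate_required_simulations(properties_of_interest, data_set)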
def main(): setup_timestamp_logging() # Load in the force field force_field_path = "smirnoff99Frosst-1.1.0.offxml" force_field_source = SmirnoffForceFieldSource.from_path(force_field_path) # Create a data set containing three solvation free energies. data_set = PhysicalPropertyDataSet.from_json("hydration_data_set.json") data_set.json("hydration_data_set.json", format=True) # Set up a server object to run the calculations using. server = setup_server(backend_type=BackendType.LocalGPU, max_number_of_workers=1, port=8002) with server: # Request the estimates. property_estimator = EvaluatorClient( ConnectionOptions(server_port=8002)) options = RequestOptions() options.calculation_layers = ["SimulationLayer"] options.add_schema("SimulationLayer", "SolvationFreeEnergy", _get_fixed_lambda_schema()) request, _ = property_estimator.request_estimate( property_set=data_set, force_field_source=force_field_source, options=options, ) # Wait for the results. results, _ = request.results(True, 60) # Save the result to file. results.json(f"results.json", True)
def test_properties_by_type():

    density = create_dummy_property(Density)
    dielectric = create_dummy_property(DielectricConstant)

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(density, dielectric)

    densities = [x for x in data_set.properties_by_type("Density")]
    assert len(densities) == 1
    assert densities[0] == density

    dielectrics = [x for x in data_set.properties_by_type("DielectricConstant")]
    assert len(dielectrics) == 1
    assert dielectrics[0] == dielectric
def test_filter_by_smiles():
    """A test to ensure that data sets may be filtered by which smiles their
    measured properties contain."""

    methanol_substance = Substance()
    methanol_substance.add_component(Component("CO"), MoleFraction(1.0))

    ethanol_substance = Substance()
    ethanol_substance.add_component(Component("CCO"), MoleFraction(1.0))

    property_a = create_dummy_property(Density)
    property_a.substance = methanol_substance

    property_b = create_dummy_property(Density)
    property_b.substance = ethanol_substance

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(property_a, property_b)

    data_set.filter_by_smiles("CO")

    assert len(data_set) == 1
    assert methanol_substance in data_set.substances
    assert ethanol_substance not in data_set.substances
def main(): # Set up logging logging.basicConfig(level=logging.INFO) output_directory = "training_sets" os.makedirs(output_directory, exist_ok=True) rho_pure_h_vap = PhysicalPropertyDataSet.from_json( "../../../pure_optimisation/data_set_generation/expanded_set/training_set.json" ) rho_pure = PhysicalPropertyDataSet.from_json( "../../../pure_optimisation/data_set_generation/expanded_set/training_set.json" ) rho_pure.filter_by_property_types("Density") h_mix_rho_x = PhysicalPropertyDataSet.from_json( "../../../mixture_optimisation/data_set_generation/" "expanded_set/training_sets/h_mix_rho_x_training_set.json") h_mix_rho_x_rho_pure = PhysicalPropertyDataSet() h_mix_rho_x_rho_pure.merge(rho_pure) h_mix_rho_x_rho_pure.merge(h_mix_rho_x) h_mix_rho_x_rho_pure.json( os.path.join(output_directory, "h_mix_rho_x_rho_pure.json")) h_mix_rho_x_rho_pure.to_pandas().to_csv( os.path.join(output_directory, "h_mix_rho_x_rho_pure.csv")) h_mix_rho_x_rho_pure_h_vap = PhysicalPropertyDataSet() h_mix_rho_x_rho_pure_h_vap.merge(rho_pure_h_vap) h_mix_rho_x_rho_pure_h_vap.merge(h_mix_rho_x) h_mix_rho_x_rho_pure_h_vap.json( os.path.join(output_directory, "h_mix_rho_x_rho_pure_h_vap.json")) h_mix_rho_x_rho_pure_h_vap.to_pandas().to_csv( os.path.join(output_directory, "h_mix_rho_x_rho_pure_h_vap.csv"))
def test_validate_data_set():

    valid_property = Density(
        ThermodynamicState(298 * unit.kelvin, 1 * unit.atmosphere),
        PropertyPhase.Liquid,
        Substance.from_components("O"),
        0.0 * unit.gram / unit.milliliter,
        0.0 * unit.gram / unit.milliliter,
    )

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(valid_property)
    data_set.validate()

    invalid_property = Density(
        ThermodynamicState(-1 * unit.kelvin, 1 * unit.atmosphere),
        PropertyPhase.Liquid,
        Substance.from_components("O"),
        0.0 * unit.gram / unit.milliliter,
        0.0 * unit.gram / unit.milliliter,
    )

    with pytest.raises(AssertionError):
        data_set.add_properties(invalid_property)

    data_set.add_properties(invalid_property, validate=False)

    with pytest.raises(AssertionError):
        data_set.validate()
def test_to_pandas():
    """A test to ensure that data sets are convertible to pandas objects."""

    source = CalculationSource("Dummy", {})

    pure_substance = Substance.from_components("C")
    binary_substance = Substance.from_components("C", "O")

    data_set = PhysicalPropertyDataSet()

    for temperature in [298 * unit.kelvin, 300 * unit.kelvin, 302 * unit.kelvin]:

        thermodynamic_state = ThermodynamicState(
            temperature=temperature, pressure=1.0 * unit.atmosphere
        )

        density_property = Density(
            thermodynamic_state=thermodynamic_state,
            phase=PropertyPhase.Liquid,
            substance=pure_substance,
            value=1 * unit.gram / unit.milliliter,
            uncertainty=0.11 * unit.gram / unit.milliliter,
            source=source,
        )
        dielectric_property = DielectricConstant(
            thermodynamic_state=thermodynamic_state,
            phase=PropertyPhase.Liquid,
            substance=pure_substance,
            value=1 * unit.dimensionless,
            uncertainty=0.11 * unit.dimensionless,
            source=source,
        )

        data_set.add_properties(density_property)
        data_set.add_properties(dielectric_property)

    for temperature in [298 * unit.kelvin, 300 * unit.kelvin, 302 * unit.kelvin]:

        thermodynamic_state = ThermodynamicState(
            temperature=temperature, pressure=1.0 * unit.atmosphere
        )

        enthalpy_property = EnthalpyOfMixing(
            thermodynamic_state=thermodynamic_state,
            phase=PropertyPhase.Liquid,
            substance=binary_substance,
            value=1 * unit.kilojoules / unit.mole,
            uncertainty=0.11 * unit.kilojoules / unit.mole,
            source=source,
        )
        excess_property = ExcessMolarVolume(
            thermodynamic_state=thermodynamic_state,
            phase=PropertyPhase.Liquid,
            substance=binary_substance,
            value=1 * unit.meter**3 / unit.mole,
            uncertainty=0.11 * unit.meter**3 / unit.mole,
            source=source,
        )

        data_set.add_properties(enthalpy_property)
        data_set.add_properties(excess_property)

    data_set_pandas = data_set.to_pandas()

    required_columns = [
        "Temperature (K)",
        "Pressure (kPa)",
        "Phase",
        "N Components",
        "Source",
        "Component 1",
        "Role 1",
        "Mole Fraction 1",
        "Exact Amount 1",
        "Component 2",
        "Role 2",
        "Mole Fraction 2",
        "Exact Amount 2",
    ]

    assert all(x in data_set_pandas for x in required_columns)

    # 12 rows are expected: 3 temperatures x 2 pure properties plus
    # 3 temperatures x 2 binary properties.
    assert data_set_pandas is not None
    assert data_set_pandas.shape == (12, 21)

    # The two unused 'Exact Amount' columns should be entirely empty.
    data_set_without_na = data_set_pandas.dropna(axis=1, how="all")
    assert data_set_without_na.shape == (12, 19)
def data_set_from_data_frame(data_frame):
    """Converts a `pandas.DataFrame` to a `PhysicalPropertyDataSet` object.
    See the `PhysicalPropertyDataSet.to_pandas()` function for information
    on the required columns.

    Parameters
    ----------
    data_frame: pandas.DataFrame
        The data frame to convert.

    Returns
    -------
    PhysicalPropertyDataSet
        The converted data set.
    """

    return_value = PhysicalPropertyDataSet()

    if len(data_frame) == 0:
        return return_value

    # Make sure the base columns are present.
    required_base_columns = [
        "Temperature (K)",
        "Pressure (kPa)",
        "Phase",
        "N Components",
        "Source",
    ]

    assert all(x in data_frame for x in required_base_columns)

    # Make sure the substance columns are present.
    max_components = max(int(x) for x in data_frame["N Components"])
    assert max_components > 0

    required_components_columns = [
        x
        for i in range(max_components)
        for x in [
            f"Component {i + 1}",
            f"Role {i + 1}",
            f"Mole Fraction {i + 1}",
            f"Exact Amount {i + 1}",
        ]
    ]

    assert all(x in data_frame for x in required_components_columns)

    # Determine which property types the frame contains from the value columns.
    property_types = []

    for column_name in data_frame:

        if " Value" not in column_name:
            continue

        column_name_split = column_name.split(" ")

        assert len(column_name_split) >= 2

        property_type = getattr(evaluator.properties, column_name_split[0])
        property_types.append(property_type)

    assert len(property_types) > 0

    # Make sure we don't have duplicate property columns.
    assert len(set(property_types)) == len(property_types)

    properties = []

    for _, row in data_frame.iterrows():

        # Create the substance from the component columns
        number_of_components = row["N Components"]

        substance = Substance()

        for component_index in range(number_of_components):

            smiles = row[f"Component {component_index + 1}"]
            role = Component.Role[row[f"Role {component_index + 1}"]]
            mole_fraction = row[f"Mole Fraction {component_index + 1}"]
            exact_amount = row[f"Exact Amount {component_index + 1}"]

            assert not numpy.isnan(mole_fraction) or not numpy.isnan(exact_amount)

            component = Component(smiles, role)

            if not numpy.isnan(mole_fraction):
                substance.add_component(component, MoleFraction(mole_fraction))
            if not numpy.isnan(exact_amount):
                substance.add_component(component, ExactAmount(exact_amount))

        # Extract the state
        pressure = row["Pressure (kPa)"] * unit.kilopascal
        temperature = row["Temperature (K)"] * unit.kelvin

        thermodynamic_state = ThermodynamicState(temperature, pressure)

        phase = PropertyPhase.from_string(row["Phase"])

        source = MeasurementSource(reference=row["Source"])

        for property_type in property_types:

            default_unit = property_type.default_unit()
            value_header = f"{property_type.__name__} Value ({default_unit:~})"

            if numpy.isnan(row[value_header]):
                continue

            value = row[value_header] * default_unit
            uncertainty = 0.0 * default_unit

            physical_property = property_type(
                thermodynamic_state=thermodynamic_state,
                phase=phase,
                substance=substance,
                value=value,
                uncertainty=uncertainty,
                source=source,
            )

            properties.append(physical_property)

    return_value.add_properties(*properties)
    return return_value
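def test_data_frame_round_trip():
    # A minimal round-trip sketch (not part of the original module): a data
    # set written out with `to_pandas()` should be recoverable by
    # `data_set_from_data_frame`. Uncertainties are zeroed on the way back,
    # so only the size is compared here.
    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(
        Density(
            thermodynamic_state=ThermodynamicState(
                temperature=298 * unit.kelvin, pressure=1.0 * unit.atmosphere
            ),
            phase=PropertyPhase.Liquid,
            substance=Substance.from_components("O"),
            value=1.0 * unit.gram / unit.milliliter,
            uncertainty=0.1 * unit.gram / unit.milliliter,
            source=MeasurementSource(reference="dummy_reference"),
        )
    )

    round_tripped = data_set_from_data_frame(data_set.to_pandas())
    assert len(round_tripped) == len(data_set)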
def main(): # Set up logging logging.basicConfig(level=logging.INFO) output_directory = "training_sets" os.makedirs(output_directory, exist_ok=True) pure_density_h_vap = PhysicalPropertyDataSet.from_json( "../../pure_optimisation/data_set_generation/training_set.json" ) pure_density = PhysicalPropertyDataSet.from_json( "../../pure_optimisation/data_set_generation/training_set.json" ) pure_density.filter_by_property_types("Density") h_mix_v_excess = PhysicalPropertyDataSet.from_json( "../../mixture_optimisation/data_set_generation/" "training_sets/h_mix_v_excess_training_set.json" ) h_mix_binary_density = PhysicalPropertyDataSet.from_json( "../../mixture_optimisation/data_set_generation/" "training_sets/h_mix_density_training_set.json" ) h_mix_binary_density_pure_density = PhysicalPropertyDataSet() h_mix_binary_density_pure_density.merge(pure_density) h_mix_binary_density_pure_density.merge(h_mix_binary_density) h_mix_binary_density_pure_density.json( os.path.join(output_directory, "h_mix_binary_density_pure_density.json") ) h_mix_binary_density_pure_density.to_pandas().to_csv( os.path.join(output_directory, "h_mix_binary_density_pure_density.csv") ) h_mix_v_excess_pure_density = PhysicalPropertyDataSet() h_mix_v_excess_pure_density.merge(pure_density) h_mix_v_excess_pure_density.merge(h_mix_v_excess) h_mix_v_excess_pure_density.json( os.path.join(output_directory, "h_mix_v_excess_pure_density.json") ) h_mix_v_excess_pure_density.to_pandas().to_csv( os.path.join(output_directory, "h_mix_v_excess_pure_density.csv") ) h_mix_binary_density_pure_density_h_vap = PhysicalPropertyDataSet() h_mix_binary_density_pure_density_h_vap.merge(pure_density_h_vap) h_mix_binary_density_pure_density_h_vap.merge(h_mix_binary_density) h_mix_binary_density_pure_density_h_vap.json( os.path.join(output_directory, "h_mix_binary_density_pure_density_h_vap.json") ) h_mix_binary_density_pure_density_h_vap.to_pandas().to_csv( os.path.join(output_directory, "h_mix_binary_density_pure_density_h_vap.csv") ) h_mix_v_excess_pure_density_h_vap = PhysicalPropertyDataSet() h_mix_v_excess_pure_density_h_vap.merge(pure_density_h_vap) h_mix_v_excess_pure_density_h_vap.merge(h_mix_v_excess) h_mix_v_excess_pure_density_h_vap.json( os.path.join(output_directory, "h_mix_v_excess_pure_density_h_vap.json") ) h_mix_v_excess_pure_density_h_vap.to_pandas().to_csv( os.path.join(output_directory, "h_mix_v_excess_pure_density_h_vap.csv") )
def _parse_thermoml_archives(file_paths, retain_values, retain_uncertainties, **_):
    """Loads a number of ThermoML data xml files (making sure to
    catch errors raised by individual files), and concatenates them
    into data sets containing a single type of property.

    Parameters
    ----------
    file_paths: list of str
        The file paths of the ThermoML xml files to load.
    retain_values: bool
        If False, all values for the measured properties will
        be stripped from the final data set.
    retain_uncertainties: bool
        If False, all uncertainties in measured property values will
        be stripped from the final data set.

    Returns
    -------
    dict of str and pandas.DataFrame
        The parsed data.
    """

    properties_by_type = defaultdict(list)

    try:

        # We make sure to wrap each of the 'error prone' calls in this method
        # in try-catch blocks to stop workers from being killed.
        for file_path in file_paths:

            try:
                data_set = ThermoMLDataSet.from_file(file_path)
            except Exception:
                logger.exception(f"An exception was raised when loading {file_path}")
                continue

            # A data set will be none if no 'valid' properties were found
            # in the archive file.
            if data_set is None:
                continue

            for physical_property in data_set:

                if not retain_values:
                    physical_property.value = UNDEFINED
                if not retain_uncertainties:
                    physical_property.uncertainty = UNDEFINED

                property_type = physical_property.__class__.__name__
                properties_by_type[property_type].append(physical_property)

    except Exception:

        logger.exception("An uncaught exception was raised.")
        properties_by_type = {}

    data_frames = {}

    for property_type in properties_by_type:

        if len(properties_by_type[property_type]) == 0:
            continue

        data_set = PhysicalPropertyDataSet()
        data_set.add_properties(*properties_by_type[property_type])

        data_frames[property_type] = data_set.to_pandas()

    return data_frames
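def _parse_archive_directory(directory):
    # A hypothetical wrapper (not in the original module) sketching the
    # intended use of `_parse_thermoml_archives`: glob the extracted archive
    # files and write one CSV per recovered property type.
    import glob

    file_paths = glob.glob(os.path.join(directory, "*.xml"))

    data_frames = _parse_thermoml_archives(
        file_paths, retain_values=True, retain_uncertainties=True
    )

    for property_type, data_frame in data_frames.items():
        data_frame.to_csv(f"{property_type.lower()}.csv", index=False)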
def main():

    training_set_smiles = [
        "CCO",
        "CC(=O)O",
        "COC=O",
        "CC(C)(C)O",
        "CC(C)O",
        "CO",
        "CCOC(C)=O",
        "CCOC(=O)CC(=O)OCC",
        "CC(C)CO",
        "CCCCO",
        "CCCCOC(C)=O",
        "CCCOC(C)=O",
    ]

    # Ensure the smiles patterns are standardized.
    smiles = [Component(x).smiles for x in training_set_smiles]

    # Load in the Hvap data.
    h_vap_data_frame = pandas.read_csv(
        os.path.join(
            "..",
            "..",
            "..",
            "data_availability",
            "sourced_h_vap_data",
            "enthalpy_of_vaporization_pure.csv",
        )
    )
    h_vap_data_frame = filter_by_smiles(
        h_vap_data_frame, smiles_to_include=smiles, smiles_to_exclude=None
    )
    h_vap_data_set = data_set_from_data_frame(h_vap_data_frame)

    # Load in the density data.
    density_data_frame = pandas.read_csv(
        os.path.join(
            "..",
            "..",
            "..",
            "data_availability",
            "data_by_environments",
            "alcohol_ester",
            "all_data",
            "density_pure.csv",
        )
    )
    density_data_frame = filter_by_smiles(
        density_data_frame, smiles_to_include=smiles, smiles_to_exclude=None
    )
    density_data_set = data_set_from_data_frame(density_data_frame)

    # Retain the density measurements which were made closest to 298.15 K
    # and 1 atm.
    target_state_point = StatePoint(
        temperature=298.15 * unit.kelvin,
        pressure=1.0 * unit.atmosphere,
        mole_fractions=(1.0,),
    )

    final_data_set = PhysicalPropertyDataSet()

    for substance in density_data_set.substances:

        properties_per_state = defaultdict(list)

        # Refactor the properties into more convenient data structures.
        for physical_property in density_data_set.properties_by_substance(substance):

            state_point = StatePoint.from_physical_property(physical_property)
            properties_per_state[state_point].append(physical_property)

        # Sort the state points based on their distance to the target state.
        sorted_states_points = list(
            sorted(
                properties_per_state.keys(),
                key=functools.partial(
                    StatePoint.individual_distances, target_state_point
                ),
            )
        )

        final_data_set.add_properties(properties_per_state[sorted_states_points[0]][0])

    final_data_set.merge(h_vap_data_set)

    final_data_set.json("training_set.json", format=True)
    final_data_set.to_pandas().to_csv("training_set.csv", index=False)
def _write_smiles_section(
    smiles_tuple, exercised_vdw_smirks_patterns, full_data_set, property_tuples
):

    smiles_header = " + ".join(
        [_sanitize_identifier(smiles_pattern) for smiles_pattern in smiles_tuple]
    )

    row_template = [
        r"\newpage",
        "",
        r"\hrulefill",
        "",
        r"\vspace{.3cm}",
        r"\begin{center}",
        f"    \\large{{\\textbf{{{smiles_header}}}}}",
        r"\end{center}" r"\vspace{.3cm}",
        "",
    ]

    for smiles_pattern in smiles_tuple:

        exercised_smirks = [
            smirks
            for smirks in exercised_vdw_smirks_patterns
            if smiles_pattern in exercised_vdw_smirks_patterns[smirks]
        ]

        exercised_smirks_strings = [
            f"\\item {{{_sanitize_identifier(smirks)}}}" for smirks in exercised_smirks
        ]

        image_file_name = smiles_pattern.replace("/", "").replace("\\", "")

        row_template.extend(
            [
                r"\begin{tabular}{ m{5cm} m{9cm} }",
                "    {Structure} & {SMIRKS Exercised} \\\\",
                f'    {{\\catcode`\\#=12 \\includegraphics{{{"./images/" + image_file_name + ".png"}}}}} & '
                f'\\begin{{itemize}} {" ".join(exercised_smirks_strings)} \\end{{itemize}} \\\\',
                r"\end{tabular}",
            ]
        )

    for property_type, substance_type in property_tuples:

        def filter_by_substance_type(property_to_filter):
            return substance_type_to_int[substance_type] == len(
                property_to_filter.substance.components
            )

        def filter_by_smiles_tuple(property_to_filter):

            smiles_list = list(smiles_tuple)

            for component in property_to_filter.substance.components:

                if component.smiles not in smiles_list:
                    return False

                smiles_list.remove(component.smiles)

            return len(smiles_list) == 0

        data_set = PhysicalPropertyDataSet.parse_json(full_data_set.json())

        data_set.filter_by_property_types(property_type)
        data_set.filter_by_function(filter_by_substance_type)
        data_set.filter_by_function(filter_by_smiles_tuple)

        # Where no DOI is available, fall back to the file name of the source.
        for physical_property in data_set:

            if len(physical_property.source.doi) > 0:
                continue

            physical_property.source = MeasurementSource(
                reference=os.path.basename(physical_property.source.reference)
            )

        pandas_data_frame = data_set.to_pandas()

        if pandas_data_frame.shape[0] == 0:
            continue

        headers_to_keep = ["Temperature (K)", "Pressure (kPa)"]
        header_to_sort = ["Pressure (kPa)", "Temperature (K)"]

        mole_fraction_index = 0

        while f"Mole Fraction {mole_fraction_index + 1}" in pandas_data_frame:
            headers_to_keep.append(f"Mole Fraction {mole_fraction_index + 1}")
            header_to_sort.append(f"Mole Fraction {mole_fraction_index + 1}")
            mole_fraction_index += 1

        headers_to_keep.append("Source")

        pandas_data_frame = pandas_data_frame[headers_to_keep]
        pandas_data_frame = pandas_data_frame.sort_values(header_to_sort)

        # Split the camel-case property name into separate title-case words.
        property_name = " ".join(
            re.sub(
                "([A-Z][a-z]+)",
                r" \1",
                re.sub("([A-Z]+)", r" \1", property_type.__name__),
            ).split()
        )

        row_template.append(
            f"\n{str(substance_type.value).title()} {property_name.title()} Data\n"
        )
        row_template.append("\\vspace{.3cm}\n")
        row_template.append(
            tabulate(
                pandas_data_frame,
                headers="keys",
                tablefmt="latex",
                showindex=False,
            )
        )
        row_template.append("\\vspace{.3cm}\n")

    return "\n\n".join(row_template) + "\n"
def select_data_points(data_directory, chosen_substances, target_state_points):
    """This method attempts to find a set of data points for each property
    which are clustered around the set of conditions specified in the
    `target_state_points` input array.

    The points will be chosen so as to try to maximise the number of
    properties measured at the same condition (e.g. ideally we would have a
    data point for each property at T=298.15 and p=1atm) as this will
    maximise the chances that we can extract all properties from a single
    simulation.

    Parameters
    ----------
    data_directory: str
        The directory which contains the processed pandas data sets.
    chosen_substances: list of tuple of str, optional
        The substances to choose data points for. If None, no filtering
        of substances will be performed by this function.
    target_state_points: dict of tuple of type and SubstanceType and list of StatePoint
        A list of the state points for which we would ideally have data
        points. The value tuple should be of the form
        (temperature, pressure, (mole fraction 0, ..., mole fraction N)).

    Returns
    -------
    PhysicalPropertyDataSet
        A data set which contains the chosen data points.
    """

    # Load the full data set from the processed data files
    data_frames = []

    for property_type, substance_type in target_state_points:

        data_frame = load_processed_data_set(
            data_directory, property_type, substance_type
        )
        data_frames.append(data_frame)

    full_data_frame = pandas.concat(data_frames, ignore_index=True, sort=False)
    data_set = data_set_from_data_frame(full_data_frame)

    properties_by_substance = defaultdict(list)

    # Partition the properties by their substance components,
    # filtering out any not chosen substances.
    for substance in data_set.substances:

        substance_tuple = tuple(
            sorted([component.smiles for component in substance.components])
        )

        if chosen_substances is not None and substance_tuple not in chosen_substances:
            continue

        properties_by_substance[substance_tuple].extend(
            data_set.properties_by_substance(substance)
        )

    # Start to choose the state points.
    return_data_set = PhysicalPropertyDataSet()

    for substance_tuple in properties_by_substance:

        # Cluster the data points around the closest states of interest.
        clustered_properties = _cluster_properties_around_states(
            properties_by_substance[substance_tuple], target_state_points
        )

        # For each cluster, we try to find the state points for which we have
        # measured the most types of properties (i.e. prioritise states
        # for which we have a density, dielectric and enthalpy measurement
        # over those for which we only have a density measurement).
        for target_state_point, physical_properties in clustered_properties.items():

            properties_per_state = defaultdict(list)
            property_types_per_state = defaultdict(set)

            # Refactor the properties into more convenient data structures.
            for physical_property in physical_properties:

                state_point = StatePoint.from_physical_property(physical_property)
                property_tuple = property_to_type_tuple(physical_property)

                properties_per_state[state_point].append(physical_property)
                property_types_per_state[state_point].add(property_tuple)

            # Sort the state points based on their distance to the target state.
            sorted_states_points = list(
                sorted(
                    properties_per_state.keys(),
                    key=functools.partial(
                        StatePoint.individual_distances, target_state_point
                    ),
                )
            )

            # Keep track of the properties which we need to choose a state
            # point for...
            properties_to_cover = set(
                property_tuple for property_tuple in target_state_points
            )
            # ...as well as the chosen state points.
            chosen_state_points = set()

            # Iteratively consider state points which have all data points, down
            # to state points for which we only have single property measurements.
            for target_number_of_properties in reversed(
                range(1, len(target_state_points) + 1)
            ):

                for state_point in sorted_states_points:

                    property_types_at_state = property_types_per_state[state_point]

                    if len(property_types_at_state) != target_number_of_properties:
                        continue

                    if (
                        len(properties_to_cover.intersection(property_types_at_state))
                        == 0
                    ):
                        continue

                    chosen_state_points.add(state_point)

                    properties_to_cover = properties_to_cover.symmetric_difference(
                        properties_to_cover.intersection(property_types_at_state)
                    )

            # Add the properties which were measured at the chosen state points
            # to the returned data set.
            for state_point in chosen_state_points:

                if len(properties_per_state[state_point]) == 0:
                    continue

                return_data_set.add_properties(*properties_per_state[state_point])

    return return_data_set
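def _example_select_data_points():
    # A hypothetical example (the values and `SubstanceType` member names are
    # assumed, not taken from the source) of the `target_state_points`
    # structure described in the docstring above: one target state per
    # (property type, substance type) pair.
    target_state_points = {
        (Density, SubstanceType.Binary): [
            StatePoint(
                temperature=298.15 * unit.kelvin,
                pressure=1.0 * unit.atmosphere,
                mole_fractions=(0.5, 0.5),
            )
        ],
        (EnthalpyOfMixing, SubstanceType.Binary): [
            StatePoint(
                temperature=298.15 * unit.kelvin,
                pressure=1.0 * unit.atmosphere,
                mole_fractions=(0.5, 0.5),
            )
        ],
    }

    return select_data_points(
        data_directory="processed_data",
        chosen_substances=None,
        target_state_points=target_state_points,
    )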
def generate_report(
    data_set_path="curated_data_set.json",
    report_name="report",
    vdw_smirks_of_interest=None,
):
    """A helper utility which will take as input a PhysicalPropertyDataSet
    and generate a report of its contents and coverage.

    Parameters
    ----------
    data_set_path: str
        The path to the data set.
    report_name: str
        The name of the report files to generate.
    vdw_smirks_of_interest: list of str, optional
        The vdW smirks patterns which should be included in the summary
        table. If `None`, all vdW smirks will be included.
    """

    with open(data_set_path) as file:
        data_set = PhysicalPropertyDataSet.parse_json(file.read())

    all_substances = set()

    all_smiles = set()
    all_smiles_tuples = set()

    all_property_types = set()

    data_count_per_substance = defaultdict(lambda: defaultdict(int))
    data_per_substance = defaultdict(lambda: defaultdict(list))

    for physical_property in data_set:

        substance_type = int_to_substance_type[
            physical_property.substance.number_of_components
        ]
        property_type_tuple = (type(physical_property), substance_type)

        all_property_types.add(property_type_tuple)
        all_substances.add(physical_property.substance)

        for component in physical_property.substance.components:
            all_smiles.add(component.smiles)

        all_smiles_tuples.add(
            tuple(
                sorted(
                    [
                        component.smiles
                        for component in physical_property.substance.components
                    ]
                )
            )
        )

        data_count_per_substance[physical_property.substance][
            property_type_tuple
        ] += 1
        data_per_substance[physical_property.substance][property_type_tuple].append(
            physical_property
        )

    # Determine the number of unique molecules
    number_of_substances = len(all_smiles)

    # Determine the list of all exercised vdW smirks patterns.
    all_vdw_smirks_patterns = vdw_smirks_of_interest

    if all_vdw_smirks_patterns is None:
        all_vdw_smirks_patterns = [
            smirks for smirks in find_parameter_smirks_matches("vdW").keys()
        ]

    exercised_vdw_smirks_patterns = find_parameter_smirks_matches("vdW", *all_smiles)

    # Invert the exercised_vdw_smirks_patterns dictionary.
    vdw_smirks_patterns_by_smiles = invert_dict_of_list(exercised_vdw_smirks_patterns)

    # Count the number of data points per smirks pattern.
    data_points_per_vdw_smirks = defaultdict(lambda: defaultdict(int))

    for substance in data_count_per_substance:

        exercised_smirks = set()

        for component in substance.components:
            exercised_smirks.update(vdw_smirks_patterns_by_smiles[component.smiles])

        for smirks in exercised_smirks:

            if smirks not in all_vdw_smirks_patterns:
                continue

            for data_tuple in data_count_per_substance[substance]:
                data_points_per_vdw_smirks[smirks][data_tuple] += 1

    number_of_simulations = _estimate_required_simulations(
        all_property_types, data_set
    )

    _create_molecule_images(all_smiles, "images")

    smiles_sections = "\n".join(
        [
            _write_smiles_section(
                smiles_tuple,
                exercised_vdw_smirks_patterns,
                data_set,
                all_property_types,
            )
            for smiles_tuple in all_smiles_tuples
        ]
    )

    latex_document = "\n\n".join(
        [
            _write_header(),
            _write_title(
                number_of_substances,
                len(data_set),
                number_of_simulations,
            ),
            _write_smirks_exercised_table(
                all_property_types,
                all_vdw_smirks_patterns,
                data_points_per_vdw_smirks,
            ),
            _write_unique_substances_per_property_table(
                all_property_types, data_count_per_substance
            ),
            _write_substances_per_data_type_sections(
                all_property_types, data_count_per_substance
            ),
            r"\pagebreak",
            smiles_sections,
            r"\end{document}",
        ]
    )

    report_path = report_name + ".tex"

    with open(report_path, "w") as file:
        file.write(latex_document)

    # Compile the report to PDF if pdflatex is available.
    if shutil.which("pdflatex") is not None:

        subprocess.call(
            ["pdflatex", "-synctex=1", "-interaction=nonstopmode", report_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )