Esempio n. 1
0
def create_filterable_data_set():
    """Build a small dummy data set whose entries differ in every property
    that a filter could select on, namely:

        - a liquid density measured at 298 K and 0.5 atm with 1 component containing only carbon.
        - a gaseous dielectric measured at 288 K and 1 atm with 2 components containing only nitrogen.
        - a solid EoM measured at 308 K and 1.5 atm with 3 components containing only oxygen.

    Returns
    -------
    PhysicalPropertyDataSet
        The data set holding the three properties, added in the order
        listed above.
    """

    # All three properties share the same dummy provenance.
    dummy_source = CalculationSource("Dummy", {})

    # --- Liquid density of the single component carbon substance.
    carbon = create_dummy_substance(number_of_components=1, elements=["C"])
    carbon_state = ThermodynamicState(
        temperature=298 * unit.kelvin, pressure=0.5 * unit.atmosphere
    )

    density = Density(
        thermodynamic_state=carbon_state,
        phase=PropertyPhase.Liquid,
        substance=carbon,
        value=1 * unit.gram / unit.milliliter,
        uncertainty=0.11 * unit.gram / unit.milliliter,
        source=dummy_source,
    )

    # --- Gaseous dielectric constant of the two component nitrogen substance.
    nitrogen = create_dummy_substance(number_of_components=2, elements=["N"])
    nitrogen_state = ThermodynamicState(
        temperature=288 * unit.kelvin, pressure=1 * unit.atmosphere
    )

    dielectric = DielectricConstant(
        thermodynamic_state=nitrogen_state,
        phase=PropertyPhase.Gas,
        substance=nitrogen,
        value=1 * unit.dimensionless,
        uncertainty=0.11 * unit.dimensionless,
        source=dummy_source,
    )

    # --- Solid enthalpy of mixing of the three component oxygen substance.
    oxygen = create_dummy_substance(number_of_components=3, elements=["O"])
    oxygen_state = ThermodynamicState(
        temperature=308 * unit.kelvin, pressure=1.5 * unit.atmosphere
    )

    enthalpy = EnthalpyOfMixing(
        thermodynamic_state=oxygen_state,
        phase=PropertyPhase.Solid,
        substance=oxygen,
        value=1 * unit.kilojoules / unit.mole,
        uncertainty=0.11 * unit.kilojoules / unit.mole,
        source=dummy_source,
    )

    filterable_set = PhysicalPropertyDataSet()
    filterable_set.add_properties(density, dielectric, enthalpy)

    return filterable_set
Esempio n. 2
0
def test_sources_substances():
    """Check that a data set containing a single property exposes that
    property's source and substance through its `sources` and
    `substances` collections."""

    dummy_density = create_dummy_property(Density)

    single_entry_set = PhysicalPropertyDataSet()
    single_entry_set.add_properties(dummy_density)

    # Only one property was added, so the first element of each
    # collection must belong to it.
    first_source = next(iter(single_entry_set.sources))
    assert first_source == dummy_density.source

    first_substance = next(iter(single_entry_set.substances))
    assert first_substance == dummy_density.substance
Esempio n. 3
0
def test_protocol_replacement(force_field_source, expected_protocol_type):
    """Check that the default request options reference the expected
    protocol type and no longer contain the `BaseBuildSystem` placeholder.

    Parameters
    ----------
    force_field_source
        The force field source handed to
        `EvaluatorClient.default_request_options`.
    expected_protocol_type: str
        Text which must appear in the JSON form of the default options.
    """

    full_set = PhysicalPropertyDataSet()

    # Add one dummy property of every type listed in `property_types`.
    for kind in property_types:
        full_set.add_properties(create_dummy_property(kind))

    default_options = EvaluatorClient.default_request_options(
        full_set, force_field_source
    )
    serialized_options = default_options.json(format=True)

    # The trailing quote is part of the search text.
    assert 'BaseBuildSystem"' not in serialized_options
    assert expected_protocol_type in serialized_options
Esempio n. 4
0
def main():
    """Estimate the merged pure and binary data sets once per calculation
    layer, requesting gradients with respect to two vdW parameters, and
    store each set of results in its own JSON file."""

    setup_timestamp_logging()

    # Load in the force field
    force_field_source = SmirnoffForceFieldSource.from_path(
        "smirnoff99Frosst-1.1.0.offxml"
    )

    # Load the pure properties and fold the binary ones into the same set.
    data_set = PhysicalPropertyDataSet.from_json("pure_data_set.json")
    binary_data_set = PhysicalPropertyDataSet.from_json("binary_data_set.json")
    data_set.merge(binary_data_set)

    # Set up a server object to run the calculations using.
    server = setup_server(
        backend_type=BackendType.LocalGPU, max_number_of_workers=1, port=8001
    )

    with server:

        # The client talks to the server started above.
        client = EvaluatorClient(ConnectionOptions(server_port=8001))

        for layer in ("SimulationLayer", "ReweightingLayer"):

            # Restrict this request to the single layer being exercised.
            request_options = RequestOptions()
            request_options.calculation_layers = [layer]

            # Gradients are requested for both attributes of the same
            # vdW parameter.
            gradient_keys = [
                ParameterGradientKey(
                    tag="vdW", smirks="[#6X4:1]", attribute=attribute
                )
                for attribute in ("epsilon", "rmin_half")
            ]

            request, _ = client.request_estimate(
                property_set=data_set,
                force_field_source=force_field_source,
                options=request_options,
                parameter_gradient_keys=gradient_keys,
            )

            # Wait for the results.
            results, _ = request.results(True, 5)

            # Insert an underscore before every capital letter except the
            # first, then lower-case, e.g. SimulationLayer -> simulation_layer.
            snake_case_layer = re.sub(r"(?<!^)(?=[A-Z])", "_", layer).lower()
            results.json(f"pure_binary_{snake_case_layer}.json", True)
def find_training_smiles():
    """Collect the smiles which appear in the substances of the
    training set.

    Returns
    -------
    set
        The de-duplicated elements of every entry produced by
        `data_frame_to_smiles_tuples` for the training set
        (presumably smiles strings - confirm against that helper).
    """

    # Location of the training set relative to the working directory.
    training_set_path = os.path.join(
        "..",
        "..",
        "..",
        "pure_mixture_optimisation",
        "force_balance",
        "alcohol_ester",
        "h_mix_rho_x_rho_pure_h_vap",
        "targets",
        "mixture_data",
        "training_set.json",
    )

    training_frame = PhysicalPropertyDataSet.from_json(
        training_set_path
    ).to_pandas()

    smiles_tuples = data_frame_to_smiles_tuples(training_frame)

    # Flatten the per-substance entries into a single set.
    return {smiles for smiles_tuple in smiles_tuples for smiles in smiles_tuple}
Esempio n. 6
0
def test_submission():
    """Submit an empty data set to a server running on a local dask
    cluster and check that both the request and its results come back
    without an error."""

    # Run everything from inside a throw-away directory.
    with tempfile.TemporaryDirectory() as directory, temporarily_change_directory(
        directory
    ):

        with DaskLocalCluster() as calculation_backend:

            # Spin up a server instance.
            server = EvaluatorServer(
                calculation_backend=calculation_backend,
                working_directory=directory,
            )

            with server:

                # Connect a client.
                client = EvaluatorClient()

                # Submit an empty data set.
                force_field_source = SmirnoffForceFieldSource.from_path(
                    "smirnoff99Frosst-1.1.0.offxml"
                )

                request, submission_error = client.request_estimate(
                    PhysicalPropertyDataSet(), force_field_source
                )
                assert submission_error is None
                assert isinstance(request, Request)

                result, results_error = request.results(polling_interval=0.01)
                assert results_error is None
                assert isinstance(result, RequestResult)
Esempio n. 7
0
def main(input_data_set_path, server_port):
    """Write the `training_set.json` and `options.json` files used by
    the ForceBalance evaluator target.

    Parameters
    ----------
    input_data_set_path: str
        The path to the JSON data set to use as the training set.
    server_port: int
        The port stored in the connection options, pointing at a server
        on `localhost`.
    """

    # Only the simulation layer is made available to the estimator.
    request_options = RequestOptions()
    request_options.calculation_layers = ["SimulationLayer"]

    # Load in the training data set.
    training_set = PhysicalPropertyDataSet.from_json(input_data_set_path)

    # Work around a bug in ForceBalance by giving every property a zero
    # uncertainty. NOTE(review): this overwrites *all* uncertainties, not
    # only the undefined ones.
    for training_property in training_set:
        zero_uncertainty = 0.0 * training_property.default_unit()
        training_property.uncertainty = zero_uncertainty

    output_data_set_path = "training_set.json"
    training_set.json(output_data_set_path, format=True)

    # Create the force balance options
    target_options = Evaluator_SMIRNOFF.OptionsFile()
    target_options.connection_options = ConnectionOptions(
        server_address="localhost", server_port=server_port
    )
    target_options.estimation_options = request_options
    target_options.data_set_path = output_data_set_path

    # Every property type gets a weight of one; the denominators are
    # derived from the training set itself.
    target_options.weights = dict.fromkeys(training_set.property_types, 1.0)
    target_options.denominators = calculate_denominators(training_set)

    # Save the options to file.
    with open("options.json", "w") as options_file:
        options_file.write(target_options.to_json())
Esempio n. 8
0
def main():
    """Estimate the full test set with the refit force field on a dask
    LSF backend and store the results in `results.json`."""

    setup_timestamp_logging()

    # Load in the force field
    force_field_source = SmirnoffForceFieldSource.from_path(
        "openff-1.0.0-refit.offxml"
    )

    # Load in the test set.
    data_set = PhysicalPropertyDataSet.from_json("full_set.json")

    # The directory handed to the server as its working directory.
    working_directory = "working_directory"

    # Describe what each queue worker should request: one thread and one
    # CUDA GPU, capped at 5 GB per thread and a wall clock limit of "05:59".
    worker_resources = QueueWorkerResources(
        number_of_threads=1,
        number_of_gpus=1,
        preferred_gpu_toolkit=QueueWorkerResources.GPUToolkit.CUDA,
        per_thread_memory_limit=5 * unit.gigabyte,
        wallclock_time_limit="05:59",
    )

    # Commands passed to the backend as its setup script.
    setup_commands = ["conda activate forcebalance", "module load cuda/10.1"]

    # Between 1 and 50 workers are requested from the "gpuqueue" LSF queue.
    lsf_backend = DaskLSFBackend(
        minimum_number_of_workers=1,
        maximum_number_of_workers=50,
        resources_per_worker=worker_resources,
        queue_name="gpuqueue",
        setup_script_commands=setup_commands,
        adaptive_interval="1000ms",
    )

    with lsf_backend:

        server = EvaluatorServer(
            calculation_backend=lsf_backend,
            working_directory=working_directory,
            port=8002,
        )

        with server:

            # Request the estimates.
            client = EvaluatorClient(ConnectionOptions(server_port=8002))

            request, _ = client.request_estimate(
                property_set=data_set, force_field_source=force_field_source
            )

            # Wait for the results.
            results, _ = request.results(True, 5)
            results.json("results.json")
def test_same_component_batching():
    """Check that batching by component splits four properties measured
    for two different binary substances into two batches of two."""

    state = ThermodynamicState(
        temperature=1.0 * unit.kelvin, pressure=1.0 * unit.atmosphere
    )

    # For each substance (O + C, then O + CO) create a density followed by
    # an enthalpy of vaporization. Every property gets its own substance
    # object.
    dummy_properties = []

    for second_component in ("C", "CO"):

        dummy_properties.append(
            Density(
                thermodynamic_state=state,
                substance=Substance.from_components("O", second_component),
                value=0.0 * unit.kilogram / unit.meter**3,
            )
        )
        dummy_properties.append(
            EnthalpyOfVaporization(
                thermodynamic_state=state,
                substance=Substance.from_components("O", second_component),
                value=0.0 * unit.kilojoule / unit.mole,
            )
        )

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*dummy_properties)

    submission = EvaluatorClient._Submission()
    submission.dataset = data_set
    submission.options = RequestOptions()

    with DaskLocalCluster() as calculation_backend:

        server = EvaluatorServer(calculation_backend)
        batches = server._batch_by_same_component(submission, "")

    assert len(batches) == 2

    first_batch, second_batch = batches[0], batches[1]

    assert len(first_batch.queued_properties) == 2
    assert len(second_batch.queued_properties) == 2
Esempio n. 10
0
def test_default_options():
    """Check that the default request options validate, and that they
    provide one schema per calculation layer for every property type."""

    full_set = PhysicalPropertyDataSet()
    force_field_source = SmirnoffForceFieldSource.from_path(
        "smirnoff99Frosst-1.1.0.offxml"
    )

    # Add one dummy property of every type listed in `property_types`.
    for kind in property_types:
        full_set.add_properties(create_dummy_property(kind))

    default_options = EvaluatorClient.default_request_options(
        full_set, force_field_source
    )
    default_options.validate()

    assert len(default_options.calculation_layers) == 2
    assert len(default_options.calculation_schemas) == len(property_types)

    # Every property type must have a schema for each of the layers.
    for schemas_by_layer in default_options.calculation_schemas.values():
        assert len(schemas_by_layer) == len(default_options.calculation_layers)
def test_launch_batch():
    """Launch a batch of two dummy densities through the
    `QuickCalculationLayer` and check that, once the queue has drained,
    one property is reported as estimated and one as unsuccessful."""

    # Set up a dummy data set
    dummy_set = PhysicalPropertyDataSet()
    dummy_set.add_properties(
        create_dummy_property(Density), create_dummy_property(Density)
    )

    # Only the quick layer is enabled, with a default schema for densities.
    batch_options = RequestOptions()
    batch_options.calculation_layers = ["QuickCalculationLayer"]
    batch_options.calculation_schemas = {
        "Density": {"QuickCalculationLayer": CalculationLayerSchema()}
    }

    batch = Batch()
    batch.force_field_id = ""
    batch.options = batch_options
    batch.parameter_gradient_keys = []
    batch.queued_properties = list(dummy_set)
    batch.validate()

    with tempfile.TemporaryDirectory() as directory, temporarily_change_directory(
        directory
    ):

        with DaskLocalCluster() as calculation_backend:

            server = EvaluatorServer(
                calculation_backend=calculation_backend,
                working_directory=directory,
            )

            # Register the batch with the server before launching it.
            server._queued_batches[batch.id] = batch
            server._launch_batch(batch)

            # Poll until no properties are left in the queue.
            while len(batch.queued_properties) != 0:
                sleep(0.01)

            assert len(batch.estimated_properties) == 1
            assert len(batch.unsuccessful_properties) == 1
Esempio n. 12
0
class RequestResult(AttributeClass):
    """The current results of an estimation request - these
    results may be partial if the server hasn't yet completed
    the request.
    """

    # Properties which are still waiting to be, or are being, estimated.
    queued_properties = Attribute(
        docstring="The set of properties which have yet to be, or "
        "are currently being estimated.",
        type_hint=PhysicalPropertyDataSet,
        default_value=PhysicalPropertyDataSet(),
    )

    # Properties for which an estimate is available.
    estimated_properties = Attribute(
        docstring=
        "The set of properties which have been successfully estimated.",
        type_hint=PhysicalPropertyDataSet,
        default_value=PhysicalPropertyDataSet(),
    )
    # Properties for which no estimate could be produced.
    unsuccessful_properties = Attribute(
        docstring=
        "The set of properties which could not be successfully estimated.",
        type_hint=PhysicalPropertyDataSet,
        default_value=PhysicalPropertyDataSet(),
    )

    # The docstring previously duplicated the one of `queued_properties`;
    # this attribute holds exceptions (see `validate`), not properties.
    exceptions = Attribute(
        docstring="The set of exceptions (`EvaluatorException` objects) "
        "which were recorded for this request.",
        type_hint=list,
        default_value=[],
    )

    def validate(self, attribute_type=None):
        """Validate the attributes of this object, and additionally make
        sure that every entry of `exceptions` is an `EvaluatorException`.

        Parameters
        ----------
        attribute_type
            Forwarded unchanged to the parent class' `validate` method.

        Raises
        ------
        AssertionError
            If `exceptions` contains an object which is not an
            `EvaluatorException`.
        """
        super().validate(attribute_type)
        assert all(isinstance(x, EvaluatorException) for x in self.exceptions)
Esempio n. 13
0
def test_serialization():
    """Check that a data set survives a JSON round trip: the parsed copy
    has the same length and serializes to the identical JSON."""

    original_set = PhysicalPropertyDataSet()
    original_set.add_properties(create_dummy_property(Density))

    original_json = original_set.json()

    # Parse the JSON back into a data set and compare against the original.
    round_tripped_set = PhysicalPropertyDataSet.parse_json(original_json)
    assert len(original_set) == len(round_tripped_set)

    round_tripped_json = round_tripped_set.json()
    assert round_tripped_json == original_json
def _estimate_required_simulations(properties_of_interest, data_set):
    """Estimate how many simulations the evaluator framework would try
    to run in order to estimate the given data set of properties.

    Parameters
    ----------
    properties_of_interest: list of tuple of type and SubstanceType
        A list of the property types which are of interest to optimise against.
        Only the type of each entry is used here.
    data_set: PhysicalPropertyDataSet
        The data set containing the data set of properties of interest.

    Returns
    -------
    int
        The number of `ConditionalGroup` protocols found in the workflow
        graph built for the data set, used as the estimate.
    """

    # Work on a copy produced by a JSON round trip rather than on the
    # object which was passed in.
    data_set_copy = PhysicalPropertyDataSet.parse_json(data_set.json())

    layer_name = "SimulationLayer"
    request_options = RequestOptions()

    # Register the default simulation schema of each property type.
    for property_type, _ in properties_of_interest:

        request_options.add_schema(
            layer_name,
            property_type.__name__,
            property_type.default_simulation_schema(),
        )

    workflow_graph, _ = SimulationLayer._build_workflow_graph(
        "", LocalFileStorage(), data_set_copy.properties, "", [], request_options
    )

    # Each conditional group in the graph is counted as one simulation.
    return sum(
        isinstance(protocol, ConditionalGroup)
        for _, protocol in workflow_graph.protocols.items()
    )
Esempio n. 15
0
def main():
    """Estimate the hydration data set using the simulation layer with a
    fixed lambda schema, and store the results in `results.json`."""

    setup_timestamp_logging()

    # Load in the force field
    force_field_source = SmirnoffForceFieldSource.from_path(
        "smirnoff99Frosst-1.1.0.offxml"
    )

    # Load the hydration data set and write it straight back to the same
    # file in formatted form.
    hydration_set_path = "hydration_data_set.json"

    data_set = PhysicalPropertyDataSet.from_json(hydration_set_path)
    data_set.json(hydration_set_path, format=True)

    # Set up a server object to run the calculations using.
    server = setup_server(
        backend_type=BackendType.LocalGPU, max_number_of_workers=1, port=8002
    )

    with server:

        # The client talks to the server started above.
        client = EvaluatorClient(ConnectionOptions(server_port=8002))

        # Only simulations are allowed, using the schema returned by
        # `_get_fixed_lambda_schema` for solvation free energies.
        request_options = RequestOptions()
        request_options.calculation_layers = ["SimulationLayer"]
        request_options.add_schema(
            "SimulationLayer", "SolvationFreeEnergy", _get_fixed_lambda_schema()
        )

        request, _ = client.request_estimate(
            property_set=data_set,
            force_field_source=force_field_source,
            options=request_options,
        )

        # Wait for the results.
        results, _ = request.results(True, 60)

        # Save the result to file.
        results.json("results.json", True)
Esempio n. 16
0
def test_properties_by_type():
    """Check that `properties_by_type` returns exactly the property of
    the requested type from a data set holding one density and one
    dielectric constant."""

    dummy_density = create_dummy_property(Density)
    dummy_dielectric = create_dummy_property(DielectricConstant)

    mixed_set = PhysicalPropertyDataSet()
    mixed_set.add_properties(dummy_density, dummy_dielectric)

    found_densities = list(mixed_set.properties_by_type("Density"))
    assert len(found_densities) == 1
    assert found_densities[0] == dummy_density

    found_dielectrics = list(mixed_set.properties_by_type("DielectricConstant"))
    assert len(found_dielectrics) == 1
    assert found_dielectrics[0] == dummy_dielectric
Esempio n. 17
0
def test_filter_by_smiles():
    """Check that filtering a data set by smiles keeps only the
    properties measured for the requested substance."""

    # Two pure substances: methanol ("CO") and ethanol ("CCO").
    methanol = Substance()
    methanol.add_component(Component("CO"), MoleFraction(1.0))

    ethanol = Substance()
    ethanol.add_component(Component("CCO"), MoleFraction(1.0))

    methanol_density = create_dummy_property(Density)
    methanol_density.substance = methanol

    ethanol_density = create_dummy_property(Density)
    ethanol_density.substance = ethanol

    filtered_set = PhysicalPropertyDataSet()
    filtered_set.add_properties(methanol_density, ethanol_density)

    # Only the methanol property should survive the filter.
    filtered_set.filter_by_smiles("CO")

    assert len(filtered_set) == 1

    remaining_substances = filtered_set.substances
    assert methanol in remaining_substances
    assert ethanol not in remaining_substances
def main():
    """Combine the expanded pure and mixture training sets into the two
    merged training sets, storing each as both a JSON and a CSV file in
    the `training_sets` directory."""

    # Set up logging
    logging.basicConfig(level=logging.INFO)

    output_directory = "training_sets"
    os.makedirs(output_directory, exist_ok=True)

    def save_merged(name, *data_sets):
        """Merge `data_sets` in order into a new set and write it to
        `<name>.json` and `<name>.csv` in the output directory."""
        merged_set = PhysicalPropertyDataSet()

        for data_set in data_sets:
            merged_set.merge(data_set)

        base_path = os.path.join(output_directory, name)

        merged_set.json(base_path + ".json")
        merged_set.to_pandas().to_csv(base_path + ".csv")

    pure_set_path = (
        "../../../pure_optimisation/data_set_generation/expanded_set/training_set.json"
    )

    # The pure training set is loaded twice: once as is, and once
    # restricted to the density entries.
    rho_pure_h_vap = PhysicalPropertyDataSet.from_json(pure_set_path)

    rho_pure = PhysicalPropertyDataSet.from_json(pure_set_path)
    rho_pure.filter_by_property_types("Density")

    h_mix_rho_x = PhysicalPropertyDataSet.from_json(
        "../../../mixture_optimisation/data_set_generation/"
        "expanded_set/training_sets/h_mix_rho_x_training_set.json"
    )

    save_merged("h_mix_rho_x_rho_pure", rho_pure, h_mix_rho_x)
    save_merged("h_mix_rho_x_rho_pure_h_vap", rho_pure_h_vap, h_mix_rho_x)
Esempio n. 19
0
def test_validate_data_set():
    """Check that a data set validates properties both when they are
    added and when `validate` is called, using a density with a negative
    temperature as the invalid case."""

    def water_density_at(temperature):
        """Create a liquid water density of zero at 1 atm and the given
        temperature."""
        return Density(
            ThermodynamicState(temperature, 1 * unit.atmosphere),
            PropertyPhase.Liquid,
            Substance.from_components("O"),
            0.0 * unit.gram / unit.milliliter,
            0.0 * unit.gram / unit.milliliter,
        )

    # A physically sensible property is accepted and validates.
    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(water_density_at(298 * unit.kelvin))

    data_set.validate()

    invalid_property = water_density_at(-1 * unit.kelvin)

    # By default the invalid property is rejected as it is added...
    with pytest.raises(AssertionError):
        data_set.add_properties(invalid_property)

    # ...but it can be forced in by skipping validation, after which
    # validating the whole set fails.
    data_set.add_properties(invalid_property, validate=False)

    with pytest.raises(AssertionError):
        data_set.validate()
Esempio n. 20
0
def test_to_pandas():
    """Check that a data set can be converted to a pandas object with
    the expected columns and shape."""

    source = CalculationSource("Dummy", {})

    pure_substance = Substance.from_components("C")
    binary_substance = Substance.from_components("C", "O")

    data_set = PhysicalPropertyDataSet()

    # Pure properties: a density and a dielectric constant at each of
    # three temperatures.
    for kelvin in (298, 300, 302):

        pure_arguments = dict(
            thermodynamic_state=ThermodynamicState(
                temperature=kelvin * unit.kelvin, pressure=1.0 * unit.atmosphere
            ),
            phase=PropertyPhase.Liquid,
            substance=pure_substance,
            source=source,
        )

        density = Density(
            value=1 * unit.gram / unit.milliliter,
            uncertainty=0.11 * unit.gram / unit.milliliter,
            **pure_arguments,
        )
        dielectric = DielectricConstant(
            value=1 * unit.dimensionless,
            uncertainty=0.11 * unit.dimensionless,
            **pure_arguments,
        )

        data_set.add_properties(density)
        data_set.add_properties(dielectric)

    # Binary properties: an enthalpy of mixing and an excess molar volume
    # at the same three temperatures.
    for kelvin in (298, 300, 302):

        binary_arguments = dict(
            thermodynamic_state=ThermodynamicState(
                temperature=kelvin * unit.kelvin, pressure=1.0 * unit.atmosphere
            ),
            phase=PropertyPhase.Liquid,
            substance=binary_substance,
            source=source,
        )

        enthalpy = EnthalpyOfMixing(
            value=1 * unit.kilojoules / unit.mole,
            uncertainty=0.11 * unit.kilojoules / unit.mole,
            **binary_arguments,
        )
        excess_volume = ExcessMolarVolume(
            value=1 * unit.meter**3 / unit.mole,
            uncertainty=0.11 * unit.meter**3 / unit.mole,
            **binary_arguments,
        )

        data_set.add_properties(enthalpy)
        data_set.add_properties(excess_volume)

    data_frame = data_set.to_pandas()

    # The state, phase and source columns plus four columns for each of
    # the (at most two) components.
    required_columns = ["Temperature (K)", "Pressure (kPa)", "Phase"]
    required_columns += ["N Components", "Source"]

    for component_number in (1, 2):
        required_columns += [
            f"Component {component_number}",
            f"Role {component_number}",
            f"Mole Fraction {component_number}",
            f"Exact Amount {component_number}",
        ]

    assert all(column in data_frame for column in required_columns)

    assert data_frame is not None
    assert data_frame.shape == (12, 21)

    # Dropping the columns which contain no values at all removes two
    # of them.
    populated_frame = data_frame.dropna(axis=1, how="all")
    assert populated_frame.shape == (12, 19)
Esempio n. 21
0
def data_set_from_data_frame(data_frame):
    """Converts a `pandas.DataFrame` to a `PhysicalPropertyDataSet` object.
    See the `PhysicalPropertyDataSet.to_pandas()` function for information
    on the required columns.

    Every row yields one property for each property value column which
    holds a value (i.e. is not NaN) in that row. All properties are
    created with an uncertainty of zero and a `MeasurementSource` whose
    reference is taken from the `Source` column.

    Parameters
    ----------
    data_frame: pandas.DataFrame
        The data frame to convert.

    Returns
    -------
    PhysicalPropertyDataSet
        The converted data set. An empty data set is returned for an
        empty data frame.

    Raises
    ------
    AssertionError
        If a required column is missing, if no property value column (or
        more than one for the same property type) is present, or if a
        component defines neither a mole fraction nor an exact amount.
    """

    return_value = PhysicalPropertyDataSet()

    # `len` is used as the truth value of a data frame is ambiguous.
    if len(data_frame) == 0:
        return return_value

    # Make sure the base columns are present.
    required_base_columns = [
        "Temperature (K)",
        "Pressure (kPa)",
        "Phase",
        "N Components",
        "Source",
    ]

    assert all(x in data_frame for x in required_base_columns)

    # Make sure the substance columns are present.
    max_components = max(int(x) for x in data_frame["N Components"])
    assert max_components > 0

    required_components_columns = [
        x for i in range(max_components) for x in [
            f"Component {i + 1}",
            f"Role {i + 1}",
            f"Mole Fraction {i + 1}",
            f"Exact Amount {i + 1}",
        ]
    ]

    assert all(x in data_frame for x in required_components_columns)

    # Find the property types from the "<PropertyType> Value (...)" columns.
    property_types = []

    for column_name in data_frame:

        if " Value" not in column_name:
            continue

        column_name_split = column_name.split(" ")

        assert len(column_name_split) >= 2

        # The first word of the header is the name of the property class.
        property_type = getattr(evaluator.properties, column_name_split[0])
        property_types.append(property_type)

    assert len(property_types) > 0

    # Make sure we don't have duplicate property columns.
    assert len(set(property_types)) == len(property_types)

    properties = []

    for _, row in data_frame.iterrows():

        # Create the substance from the component columns. The count is
        # cast to an int (as is done when computing `max_components`) since
        # a row may carry it as a float, which `range` does not accept.
        number_of_components = int(row["N Components"])

        substance = Substance()

        for component_index in range(number_of_components):

            smiles = row[f"Component {component_index + 1}"]
            role = Component.Role[row[f"Role {component_index + 1}"]]
            mole_fraction = row[f"Mole Fraction {component_index + 1}"]
            exact_amount = row[f"Exact Amount {component_index + 1}"]

            # At least one of the two amounts must be defined.
            assert not numpy.isnan(mole_fraction) or not numpy.isnan(
                exact_amount)

            component = Component(smiles, role)

            # A component is added once for each amount which is defined.
            if not numpy.isnan(mole_fraction):
                substance.add_component(component, MoleFraction(mole_fraction))
            if not numpy.isnan(exact_amount):
                substance.add_component(component, ExactAmount(exact_amount))

        # Extract the state
        pressure = row["Pressure (kPa)"] * unit.kilopascal
        temperature = row["Temperature (K)"] * unit.kelvin

        thermodynamic_state = ThermodynamicState(temperature, pressure)

        phase = PropertyPhase.from_string(row["Phase"])

        source = MeasurementSource(reference=row["Source"])

        for property_type in property_types:

            # The value column is looked up using the default unit of the
            # property type, formatted in its abbreviated form.
            default_unit = property_type.default_unit()
            value_header = f"{property_type.__name__} Value ({default_unit:~})"

            # Rows without a value for this property type are skipped.
            if numpy.isnan(row[value_header]):
                continue

            value = row[value_header] * default_unit
            uncertainty = 0.0 * default_unit

            physical_property = property_type(
                thermodynamic_state=thermodynamic_state,
                phase=phase,
                substance=substance,
                value=value,
                uncertainty=uncertainty,
                source=source,
            )

            properties.append(physical_property)

    return_value.add_properties(*properties)
    return return_value
Esempio n. 22
0
def main():
    """Combine the pure and mixture training sets into the four merged
    training sets, storing each as both a JSON and a CSV file in the
    `training_sets` directory."""

    # Set up logging
    logging.basicConfig(level=logging.INFO)

    output_directory = "training_sets"
    os.makedirs(output_directory, exist_ok=True)

    def save_merged(name, *data_sets):
        """Merge `data_sets` in order into a new set and write it to
        `<name>.json` and `<name>.csv` in the output directory."""
        merged_set = PhysicalPropertyDataSet()

        for data_set in data_sets:
            merged_set.merge(data_set)

        base_path = os.path.join(output_directory, name)

        merged_set.json(base_path + ".json")
        merged_set.to_pandas().to_csv(base_path + ".csv")

    pure_set_path = "../../pure_optimisation/data_set_generation/training_set.json"

    # The pure training set is loaded twice: once as is, and once
    # restricted to the density entries.
    pure_density_h_vap = PhysicalPropertyDataSet.from_json(pure_set_path)

    pure_density = PhysicalPropertyDataSet.from_json(pure_set_path)
    pure_density.filter_by_property_types("Density")

    h_mix_v_excess = PhysicalPropertyDataSet.from_json(
        "../../mixture_optimisation/data_set_generation/"
        "training_sets/h_mix_v_excess_training_set.json"
    )
    h_mix_binary_density = PhysicalPropertyDataSet.from_json(
        "../../mixture_optimisation/data_set_generation/"
        "training_sets/h_mix_density_training_set.json"
    )

    # Each output pairs one pure set with one mixture set.
    save_merged(
        "h_mix_binary_density_pure_density", pure_density, h_mix_binary_density
    )
    save_merged("h_mix_v_excess_pure_density", pure_density, h_mix_v_excess)
    save_merged(
        "h_mix_binary_density_pure_density_h_vap",
        pure_density_h_vap,
        h_mix_binary_density,
    )
    save_merged(
        "h_mix_v_excess_pure_density_h_vap", pure_density_h_vap, h_mix_v_excess
    )
Esempio n. 23
0
def _parse_thermoml_archives(file_paths, retain_values, retain_uncertainties, **_):
    """Loads a number of ThermoML data xml files (making sure to
    catch errors raised by individual files), and concatenates
    them into data sets containing a single type of property.

    Parameters
    ----------
    file_paths: list of str
        The file paths of the ThermoML xml files to load.
    retain_values: bool
        If False, all values for the measured properties will
        be stripped from the final data set.
    retain_uncertainties: bool
        If False, all uncertainties in measured property values will
        be stripped from the final data set.

    Returns
    -------
    dict of str and pandas.DataFrame
        The parsed data.
    """

    properties_by_type = defaultdict(list)

    try:

        # We make sure to wrap each of the 'error prone' calls in this method
        # in try-catch blocks to stop workers from being killed.
        for file_path in file_paths:

            try:
                data_set = ThermoMLDataSet.from_file(file_path)

            except Exception:

                logger.exception(f"An exception was raised when loading {file_path}")
                continue

            # A data set will be none if no 'valid' properties were found
            # in the archive file.
            if data_set is None:
                continue

            for physical_property in data_set:

                if not retain_values:
                    physical_property.value = UNDEFINED
                if not retain_uncertainties:
                    physical_property.uncertainty = UNDEFINED

                property_type = physical_property.__class__.__name__
                properties_by_type[property_type].append(physical_property)

    except Exception:

        logger.exception(f"An uncaught exception was raised.")
        properties_by_type = {}

    data_frames = {}

    for property_type in properties_by_type:

        if len(properties_by_type[property_type]) == 0:
            continue

        data_set = PhysicalPropertyDataSet()
        data_set.add_properties(*properties_by_type[property_type])

        data_frames[property_type] = data_set.to_pandas()

    return data_frames
Esempio n. 24
0
def main():
    """Build a training set from pure enthalpy of vaporization and pure
    density data for a fixed list of molecules.

    All of the enthalpy of vaporization data for the molecules is retained,
    while for each substance only the first density measurement made at the
    state point closest to 298.15 K and 1 atm is kept. The combined set is
    written to ``training_set.json`` and ``training_set.csv`` in the current
    working directory.
    """

    # Round-trip each pattern through `Component` so that the smiles
    # patterns are standardized.
    smiles = [
        Component(pattern).smiles
        for pattern in (
            "CCO",
            "CC(=O)O",
            "COC=O",
            "CC(C)(C)O",
            "CC(C)O",
            "CO",
            "CCOC(C)=O",
            "CCOC(=O)CC(=O)OCC",
            "CC(C)CO",
            "CCCCO",
            "CCCCOC(C)=O",
            "CCCOC(C)=O",
        )
    ]

    data_root = os.path.join("..", "..", "..", "data_availability")

    # Load in the Hvap data.
    h_vap_path = os.path.join(
        data_root, "sourced_h_vap_data", "enthalpy_of_vaporization_pure.csv"
    )
    h_vap_data_set = data_set_from_data_frame(
        filter_by_smiles(
            pandas.read_csv(h_vap_path),
            smiles_to_include=smiles,
            smiles_to_exclude=None,
        )
    )

    # Load in the density data.
    density_path = os.path.join(
        data_root,
        "data_by_environments",
        "alcohol_ester",
        "all_data",
        "density_pure.csv",
    )
    density_data_set = data_set_from_data_frame(
        filter_by_smiles(
            pandas.read_csv(density_path),
            smiles_to_include=smiles,
            smiles_to_exclude=None,
        )
    )

    # Only the density measurements made closest to this state are retained.
    target_state_point = StatePoint(
        temperature=298.15 * unit.kelvin,
        pressure=1.0 * unit.atmosphere,
        mole_fractions=(1.0, ),
    )
    distance_to_target = functools.partial(
        StatePoint.individual_distances, target_state_point
    )

    final_data_set = PhysicalPropertyDataSet()

    for substance in density_data_set.substances:

        # Group the measurements of this substance by their state point.
        measurements_by_state = defaultdict(list)

        for measurement in density_data_set.properties_by_substance(substance):
            state = StatePoint.from_physical_property(measurement)
            measurements_by_state[state].append(measurement)

        # Order the state points by their distance to the target state and
        # keep the first measurement made at the closest one.
        ordered_states = sorted(measurements_by_state, key=distance_to_target)
        closest_measurements = measurements_by_state[ordered_states[0]]

        final_data_set.add_properties(closest_measurements[0])

    final_data_set.merge(h_vap_data_set)

    final_data_set.json("training_set.json", format=True)
    final_data_set.to_pandas().to_csv("training_set.csv", index=False)
def _write_smiles_section(smiles_tuple, exercised_vdw_smirks_patterns,
                          full_data_set, property_tuples):

    smiles_header = " + ".join([
        _sanitize_identifier(smiles_pattern) for smiles_pattern in smiles_tuple
    ])

    row_template = [
        r"\newpage",
        "",
        r"\hrulefill",
        "",
        r"\vspace{.3cm}",
        r"\begin{center}",
        f"    \\large{{\\textbf{{{smiles_header}}}}}",
        r"\end{center}"
        r"\vspace{.3cm}",
        "",
    ]

    for smiles_pattern in smiles_tuple:

        exercised_smirks = [
            smirks for smirks in exercised_vdw_smirks_patterns
            if smiles_pattern in exercised_vdw_smirks_patterns[smirks]
        ]

        exercised_smirks_strings = [
            f"\\item {{{_sanitize_identifier(smirks)}}}"
            for smirks in exercised_smirks
        ]

        image_file_name = smiles_pattern.replace("/", "").replace("\\", "")

        row_template.extend([
            r"\begin{tabular}{ m{5cm} m{9cm} }",
            "    {Structure} & {SMIRKS Exercised} \\\\",
            f'    {{\\catcode`\\#=12 \\includegraphics{{{"./images/" + image_file_name + ".png"}}}}} & '
            f'\\begin{{itemize}} {" ".join(exercised_smirks_strings)} \\end{{itemize}} \\\\',
            r"\end{tabular}",
        ])

    for property_type, substance_type in property_tuples:

        def filter_by_substance_type(property_to_filter):
            return substance_type_to_int[substance_type] == len(
                property_to_filter.substance.components)

        def filter_by_smiles_tuple(property_to_filter):

            smiles_list = list(smiles_tuple)

            for component in property_to_filter.substance.components:

                if component.smiles not in smiles_list:
                    return False

                smiles_list.remove(component.smiles)

            return len(smiles_list) == 0

        data_set = PhysicalPropertyDataSet.parse_json(full_data_set.json())
        data_set.filter_by_property_types(property_type)
        data_set.filter_by_function(filter_by_substance_type)
        data_set.filter_by_function(filter_by_smiles_tuple)

        for physical_property in data_set:

            if len(physical_property.source.doi) > 0:
                continue

            physical_property.source = MeasurementSource(
                reference=os.path.basename(physical_property.source.reference))

        pandas_data_frame = data_set.to_pandas()

        if pandas_data_frame.shape[0] == 0:
            continue

        headers_to_keep = ["Temperature (K)", "Pressure (kPa)"]
        header_to_sort = ["Pressure (kPa)", "Temperature (K)"]

        mole_fraction_index = 0

        while f"Mole Fraction {mole_fraction_index + 1}" in pandas_data_frame:

            headers_to_keep.append(f"Mole Fraction {mole_fraction_index + 1}")
            header_to_sort.append(f"Mole Fraction {mole_fraction_index + 1}")
            mole_fraction_index += 1

        headers_to_keep.append("Source")

        pandas_data_frame = pandas_data_frame[headers_to_keep]
        pandas_data_frame = pandas_data_frame.sort_values(header_to_sort)

        property_name = " ".join(
            re.sub(
                "([A-Z][a-z]+)",
                r" \1",
                re.sub("([A-Z]+)", r" \1", property_type.__name__),
            ).split())

        row_template.append(
            f"\n{str(substance_type.value).title()} {property_name.title()} Data\n"
        )
        row_template.append("\\vspace{.3cm}\n")
        row_template.append(
            tabulate(pandas_data_frame,
                     headers="keys",
                     tablefmt="latex",
                     showindex=False))
        row_template.append("\\vspace{.3cm}\n")

    return "\n\n".join(row_template) + "\n"
def select_data_points(data_directory, chosen_substances, target_state_points):
    """Choose, for each substance, a set of data points which are
    clustered around the conditions given in `target_state_points`.

    State points at which many different types of property were measured
    are preferred over those with fewer (e.g. ideally there would be a
    data point for each property at T=298.15 and p=1atm), as this
    maximises the chances that all of the properties can be extracted
    from a single simulation.

    Parameters
    ----------
    data_directory: str
        The directory which contains the processed pandas
        data sets
    chosen_substances: list of tuple of str, optional
        The substances to choose data points for, each given as a sorted
        tuple of smiles patterns. If None, no filtering of substances
        will be performed by this function.
    target_state_points: dict of tuple of type and SubstanceType and list of StatePoint
        The state points for which we would ideally have data points,
        keyed by (property type, substance type). The value tuple should
        be of the form
        (temperature, pressure, (mole fraction 0, ..., mole fraction N))

    Returns
    -------
    PhysicalPropertyDataSet
        A data set which contains the chosen data points.
    """

    # Load the processed data of every requested type into one data set.
    full_data_frame = pandas.concat(
        [
            load_processed_data_set(data_directory, property_type,
                                    substance_type)
            for property_type, substance_type in target_state_points
        ],
        ignore_index=True,
        sort=False,
    )
    data_set = data_set_from_data_frame(full_data_frame)

    # Group the properties by the sorted smiles of their substance,
    # dropping any substances which were not chosen.
    grouped_properties = defaultdict(list)

    for substance in data_set.substances:

        smiles_key = tuple(
            sorted(component.smiles for component in substance.components))

        if chosen_substances is None or smiles_key in chosen_substances:
            grouped_properties[smiles_key].extend(
                data_set.properties_by_substance(substance))

    return_data_set = PhysicalPropertyDataSet()

    for substance_properties in grouped_properties.values():

        # Cluster the data points around the closest states of interest.
        clusters = _cluster_properties_around_states(substance_properties,
                                                     target_state_points)

        for target_state_point, cluster in clusters.items():

            # Index the measurements, and the types of property measured,
            # by the state point they were recorded at.
            measurements_by_state = defaultdict(list)
            types_by_state = defaultdict(set)

            for measurement in cluster:

                state = StatePoint.from_physical_property(measurement)
                type_tuple = property_to_type_tuple(measurement)

                measurements_by_state[state].append(measurement)
                types_by_state[state].add(type_tuple)

            # Order the state points by their distance to the target state.
            ordered_states = sorted(
                measurements_by_state,
                key=functools.partial(StatePoint.individual_distances,
                                      target_state_point),
            )

            # The types of property which still need a state point...
            uncovered_types = set(target_state_points)
            # ...and the state points selected so far.
            selected_states = set()

            # Work down from state points which have every type of
            # property to those which have only a single measurement, so
            # that states covering more types are always preferred.
            for required_count in range(len(target_state_points), 0, -1):

                for state in ordered_states:

                    types_here = types_by_state[state]

                    if len(types_here) != required_count:
                        continue

                    newly_covered = uncovered_types.intersection(types_here)

                    if not newly_covered:
                        continue

                    selected_states.add(state)
                    uncovered_types = uncovered_types - newly_covered

            # Keep everything which was measured at a selected state point.
            for state in selected_states:

                measurements = measurements_by_state[state]

                if measurements:
                    return_data_set.add_properties(*measurements)

    return return_data_set
def generate_report(
    data_set_path="curated_data_set.json",
    report_name="report",
    vdw_smirks_of_interest=None,
):
    """A helper utility which will take as input a PhysicalPropertyDataSet
    and generate a report of its contents and coverage.

    The report is written as LaTeX source to ``<report_name>.tex`` and
    images of the molecules are created in an ``images`` directory. If
    a ``pdflatex`` executable can be found, it is then invoked on the
    generated source.

    Parameters
    ----------
    data_set_path: str
        The path to the data set.
    report_name: str
        The name of the report files to generate.
    vdw_smirks_of_interest: list of str, optional
        The vdW smirks patterns which should be included in the
        summary table. If `None`, all vdW smirks will be included.
    """

    with open(data_set_path) as file:
        data_set = PhysicalPropertyDataSet.parse_json(file.read())

    # NOTE(review): `all_substances` and `data_per_substance` are populated
    # below but not read again in the visible body - confirm before removing.
    all_substances = set()

    all_smiles = set()
    all_smiles_tuples = set()

    all_property_types = set()

    data_count_per_substance = defaultdict(lambda: defaultdict(int))
    data_per_substance = defaultdict(lambda: defaultdict(list))

    for physical_property in data_set:

        substance_type = int_to_substance_type[
            physical_property.substance.number_of_components]
        property_type_tuple = (type(physical_property), substance_type)

        all_property_types.add(property_type_tuple)
        all_substances.add(physical_property.substance)

        component_smiles = [
            component.smiles
            for component in physical_property.substance.components
        ]

        all_smiles.update(component_smiles)
        # Substances are identified by the sorted smiles of their components.
        all_smiles_tuples.add(tuple(sorted(component_smiles)))

        data_count_per_substance[
            physical_property.substance][property_type_tuple] += 1
        data_per_substance[physical_property.substance][
            property_type_tuple].append(physical_property)

    # Determine the number of unique molecules
    number_of_substances = len(all_smiles)

    # Determine the list of vdW smirks patterns to include in the summary.
    all_vdw_smirks_patterns = vdw_smirks_of_interest

    if all_vdw_smirks_patterns is None:
        all_vdw_smirks_patterns = list(
            find_parameter_smirks_matches("vdW").keys())

    exercised_vdw_smirks_patterns = find_parameter_smirks_matches(
        "vdW", *all_smiles)

    # Invert the exercised_vdw_smirks_patterns dictionary.
    vdw_smirks_patterns_by_smiles = invert_dict_of_list(
        exercised_vdw_smirks_patterns)

    # Count, for each smirks pattern and type of data, the number of
    # substances which exercise the pattern and have data of that type.
    data_points_per_vdw_smirks = defaultdict(lambda: defaultdict(int))

    for substance in data_count_per_substance:

        exercised_smirks = set()

        for component in substance.components:
            exercised_smirks.update(
                vdw_smirks_patterns_by_smiles[component.smiles])

        for smirks in exercised_smirks:

            if smirks not in all_vdw_smirks_patterns:
                continue

            for data_tuple in data_count_per_substance[substance]:
                data_points_per_vdw_smirks[smirks][data_tuple] += 1

    number_of_simulations = _estimate_required_simulations(
        all_property_types, data_set)

    _create_molecule_images(all_smiles, "images")

    smiles_sections = "\n".join([
        _write_smiles_section(
            smiles_tuple,
            exercised_vdw_smirks_patterns,
            data_set,
            all_property_types,
        ) for smiles_tuple in all_smiles_tuples
    ])

    latex_document = "\n\n".join([
        _write_header(),
        _write_title(
            number_of_substances,
            len(data_set),
            number_of_simulations,
        ),
        _write_smirks_exercised_table(all_property_types,
                                      all_vdw_smirks_patterns,
                                      data_points_per_vdw_smirks),
        _write_unique_substances_per_property_table(all_property_types,
                                                    data_count_per_substance),
        _write_substances_per_data_type_sections(all_property_types,
                                                 data_count_per_substance),
        r"\pagebreak",
        smiles_sections,
        r"\end{document}",
    ])

    report_path = report_name + ".tex"

    with open(report_path, "w") as file:
        file.write(latex_document)

    if shutil.which("pdflatex") is not None:

        # The output of the compiler is discarded rather than piped: nothing
        # would ever read from the pipes, so a verbose `pdflatex` run could
        # fill the OS pipe buffer and block indefinitely.
        subprocess.call(
            [
                "pdflatex", "-synctex=1", "-interaction=nonstopmode",
                report_path
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )