Ejemplo n.º 1
0
def test_filter_ionic_liquid():
    """Ensure ionic liquids are filtered.

    Two liquid densities measured at the same state are stored, one for the
    ``[Na+].[Cl-]`` substance and one for ``C``; after applying
    ``FilterByIonicLiquid`` exactly one row is expected to remain.
    """
    state = ThermodynamicState(
        temperature=298.15 * unit.kelvin,
        pressure=101.325 * unit.kilopascal,
    )

    def _liquid_density(smiles):
        # Both entries differ only in the substance they were measured for.
        return Density(
            thermodynamic_state=state,
            phase=PropertyPhase.Liquid,
            value=1.0 * Density.default_unit(),
            uncertainty=1.0 * Density.default_unit(),
            source=MeasurementSource(doi=" "),
            substance=Substance.from_components(smiles),
        )

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(
        _liquid_density("[Na+].[Cl-]"),
        _liquid_density("C"),
    )

    filtered_frame = FilterByIonicLiquid.apply(
        data_set.to_pandas(),
        FilterByIonicLiquidSchema(),
    )

    assert len(filtered_frame) == 1
def results_to_pandas(force_fields: List[str]) -> pandas.DataFrame:
    """Collect the experimental and estimated data sets into one data frame.

    The experimental data is read from ``raw_data_v2/curated_data_set.json``
    and the estimates of each force field from ``raw_data_v2/<name>.json``.
    A property is skipped (and a message is printed) when any force field has
    no estimate for it, or when its id appears in ``OUTLIERS``.

    Parameters
    ----------
    force_fields
        The names of the force fields whose results should be loaded.

    Returns
    -------
        A data frame with one row per retained property and force field,
        holding the experimental value, the estimated value and the
        estimated uncertainty expressed in the property's default unit.
    """

    def _load_by_id(file_name):
        # Index the properties stored in a JSON data set by their ids.
        return {
            physical_property.id: physical_property
            for physical_property in PhysicalPropertyDataSet.from_json(
                os.path.join("raw_data_v2", file_name))
        }

    # Load in the experimental data set.
    experimental_by_id = _load_by_id("curated_data_set.json")

    # Load in the results.
    estimated_by_force_field = {
        force_field: _load_by_id(f"{force_field}.json")
        for force_field in force_fields
    }

    # Refactor the experimental and estimated data into a single data frame.
    data_rows = []

    for property_id, experimental_property in experimental_by_id.items():

        estimates = {
            force_field: estimated_by_force_field[force_field].get(property_id)
            for force_field in force_fields
        }

        has_all_estimates = all(estimate is not None
                                for estimate in estimates.values())

        if not has_all_estimates or property_id in OUTLIERS:
            print(f"Skipping property {property_id}")
            continue

        for force_field in force_fields:

            estimate = estimates[force_field]

            # The label combines the property class and the substance size.
            type_label = (f"{experimental_property.__class__.__name__}_"
                          f"{len(experimental_property.substance)}")

            data_rows.append({
                "Id": property_id,
                "Type": type_label,
                "Force Field": force_field,
                "NIST ThermoML": experimental_property.value.to(
                    experimental_property.default_unit()).magnitude,
                "Estimated": estimate.value.to(
                    experimental_property.default_unit()).magnitude,
                "Estimated Uncertainty": estimate.uncertainty.to(
                    experimental_property.default_unit()).magnitude,
            })

    return pandas.DataFrame(data_rows)
Ejemplo n.º 3
0
def create_filterable_data_set():
    """Creates a dummy data with a diverse set of properties to
    be filtered, namely:

        - a liquid density measured at 298 K and 0.5 atm with 1 component containing only carbon.
        - a gaseous dielectric measured at 288 K and 1 atm with 2 components containing only nitrogen.
        - a solid EoM measured at 308 K and 1.5 atm with 3 components containing only oxygen.

    Every property shares the same dummy ``CalculationSource``.

    Returns
    -------
    PhysicalPropertyDataSet
        The created data set.
    """

    dummy_source = CalculationSource("Dummy", {})

    # Liquid density of a single carbon-only component.
    carbon = create_dummy_substance(number_of_components=1, elements=["C"])
    liquid_state = ThermodynamicState(temperature=298 * unit.kelvin,
                                      pressure=0.5 * unit.atmosphere)
    density = Density(
        thermodynamic_state=liquid_state,
        phase=PropertyPhase.Liquid,
        substance=carbon,
        value=1 * unit.gram / unit.milliliter,
        uncertainty=0.11 * unit.gram / unit.milliliter,
        source=dummy_source,
    )

    # Gaseous dielectric constant of two nitrogen-only components.
    nitrogen = create_dummy_substance(number_of_components=2, elements=["N"])
    gas_state = ThermodynamicState(temperature=288 * unit.kelvin,
                                   pressure=1 * unit.atmosphere)
    dielectric = DielectricConstant(
        thermodynamic_state=gas_state,
        phase=PropertyPhase.Gas,
        substance=nitrogen,
        value=1 * unit.dimensionless,
        uncertainty=0.11 * unit.dimensionless,
        source=dummy_source,
    )

    # Solid enthalpy of mixing of three oxygen-only components.
    oxygen = create_dummy_substance(number_of_components=3, elements=["O"])
    solid_state = ThermodynamicState(temperature=308 * unit.kelvin,
                                     pressure=1.5 * unit.atmosphere)
    enthalpy = EnthalpyOfMixing(
        thermodynamic_state=solid_state,
        phase=PropertyPhase.Solid,
        substance=oxygen,
        value=1 * unit.kilojoules / unit.mole,
        uncertainty=0.11 * unit.kilojoules / unit.mole,
        source=dummy_source,
    )

    filterable_set = PhysicalPropertyDataSet()
    filterable_set.add_properties(density, dielectric, enthalpy)

    return filterable_set
Ejemplo n.º 4
0
def estimated_reference_sets():
    """Build a pair of estimated and reference data sets.

    Both sets contain a density (id 1) and an enthalpy of mixing (id 2) for
    the ``O`` / ``CC=O`` substance at 298.15 K. The estimated set is an
    evaluator ``PhysicalPropertyDataSet``; the reference set is a ``DataSet``
    with the id ``"ref"``.

    Returns
    -------
    tuple
        The estimated data set followed by the reference data set.
    """

    def _state():
        return ThermodynamicState(298.15 * unit.kelvin,
                                  pressure=1.0 * unit.atmosphere)

    def _mixture():
        return Substance.from_components("O", "CC=O")

    def _reference_components():
        # A fresh list per entry so the entries never share component objects.
        return [
            Component(smiles="O", mole_fraction=0.5),
            Component(smiles="CC=O", mole_fraction=0.5),
        ]

    # The estimated properties, stored in the evaluator's own data model.
    density = Density(
        thermodynamic_state=_state(),
        phase=PropertyPhase.Liquid,
        substance=_mixture(),
        value=1.0 * unit.kilogram / unit.meter**3,
        uncertainty=0.1 * unit.kilogram / unit.meter**3,
    )
    density.id = "1"

    enthalpy = EnthalpyOfMixing(
        thermodynamic_state=_state(),
        phase=PropertyPhase.Liquid,
        substance=_mixture(),
        value=1.0 * unit.kilocalorie / unit.mole,
        uncertainty=0.1 * unit.kilojoule / unit.mole,
    )
    enthalpy.id = "2"

    estimated_data_set = PhysicalPropertyDataSet()
    estimated_data_set.add_properties(density, enthalpy)

    # The reference entries carrying the matching integer ids.
    reference_entries = [
        DataSetEntry(
            id=1,
            property_type="Density",
            temperature=298.15,
            pressure=101.325,
            value=0.001,
            std_error=0.0001,
            doi=" ",
            components=_reference_components(),
        ),
        DataSetEntry(
            id=2,
            property_type="EnthalpyOfMixing",
            temperature=298.15,
            pressure=101.325,
            value=4.184,
            std_error=0.1,
            doi=" ",
            components=_reference_components(),
        ),
    ]

    reference_data_set = DataSet(
        id="ref",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=reference_entries,
    )

    return estimated_data_set, reference_data_set
Ejemplo n.º 5
0
def test_sources_substances():
    """Check that a data set exposes the source and substance of its property.

    A single dummy density is added, so the first item yielded by both
    ``sources`` and ``substances`` must match that property.
    """
    dummy_density = create_dummy_property(Density)

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(dummy_density)

    first_source = next(iter(data_set.sources))
    assert first_source == dummy_density.source

    first_substance = next(iter(data_set.substances))
    assert first_substance == dummy_density.substance
Ejemplo n.º 6
0
    def to_evaluator(self) -> "PhysicalPropertyDataSet":
        """Convert this data set into an evaluator ``PhysicalPropertyDataSet``.

        Each item of ``self.entries`` is converted through its own
        ``to_evaluator`` method and the results are added to a newly created
        data set.

        Returns
        -------
        PhysicalPropertyDataSet
            The data set holding the converted entries.
        """
        # Imported at call time rather than at module level.
        from openff.evaluator.datasets import PhysicalPropertyDataSet

        converted_entries = (entry.to_evaluator() for entry in self.entries)

        evaluator_set = PhysicalPropertyDataSet()
        evaluator_set.add_properties(*converted_entries)
        return evaluator_set
Ejemplo n.º 7
0
def test_reindex_data_set_no_mole_fraction():
    """Tests that the ``reindex_data_set`` function behaves as expected
    when exact amounts are present.

    The evaluator property and the reference entry both describe a ``CO``
    solute (exact amount of one) in ``O``; after re-indexing the evaluator
    property is expected to carry the id ``"1"``.
    """

    setup_timestamp_logging(logging.INFO)

    # The evaluator side: water as solvent plus a single solute molecule.
    solvent = substances.Component(smiles="O")
    solute = substances.Component(smiles="CO",
                                  role=substances.Component.Role.Solute)

    substance = substances.Substance()
    substance.add_component(solvent, amount=substances.MoleFraction(1.0))
    substance.add_component(solute, amount=substances.ExactAmount(1))

    solvation_free_energy = SolvationFreeEnergy(
        thermodynamic_state=ThermodynamicState(
            temperature=298.15 * unit.kelvin,
            pressure=1.0 * unit.atmosphere),
        phase=PropertyPhase.Liquid,
        substance=substance,
        value=1.0 * SolvationFreeEnergy.default_unit(),
        uncertainty=1.0 * SolvationFreeEnergy.default_unit(),
    )

    evaluator_data_set = PhysicalPropertyDataSet()
    evaluator_data_set.add_properties(solvation_free_energy)

    # The reference side: the same measurement stored as a data set entry.
    reference_entry = DataSetEntry(
        id=1,
        property_type="SolvationFreeEnergy",
        temperature=298.15,
        pressure=101.325,
        value=1.0,
        std_error=1.0,
        doi=" ",
        components=[
            Component(smiles="O", mole_fraction=1.0),
            Component(smiles="CO",
                      mole_fraction=0.0,
                      exact_amount=1,
                      role="Solute"),
        ],
    )

    data_set = DataSet(
        id="data-set",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=[reference_entry],
    )

    reindex_data_set(evaluator_data_set, data_set)

    reindexed_property = evaluator_data_set.properties[0]
    assert reindexed_property.id == "1"
Ejemplo n.º 8
0
def data_frame() -> pandas.DataFrame:
    """Build a data frame of pure and binary density / enthalpy of mixing data.

    An entry is created for every combination of two temperatures, two
    pressures, two property types, four sets of mole fractions and the two
    SMILES tuples matching the number of components of each mole fraction
    set.

    Returns
    -------
        The pandas representation of the generated data set.
    """

    conditions = [(temperature, pressure)
                  for temperature in [298.15, 318.15]
                  for pressure in [101.325, 101.0]]

    smiles_by_size = {
        1: [("C(F)(Cl)(Br)", ), ("C", )],
        2: [("CO", "C"), ("C", "CO")],
    }

    # Pair each set of mole fractions with every SMILES tuple of equal size.
    compositions = [(smiles_tuple, fractions)
                    for fractions in [(1.0, ), (1.0, ), (0.25, 0.75),
                                      (0.75, 0.25)]
                    for smiles_tuple in smiles_by_size[len(fractions)]]

    def _build_substance(smiles_tuple, fractions):
        mixture = Substance()

        for smiles_pattern, fraction in zip(smiles_tuple, fractions):
            mixture.add_component(Component(smiles_pattern),
                                  MoleFraction(fraction))

        return mixture

    data_entries = []

    for temperature, pressure in conditions:
        for property_type in [Density, EnthalpyOfMixing]:
            for smiles_tuple, fractions in compositions:

                substance = _build_substance(smiles_tuple, fractions)

                data_entries.append(
                    property_type(
                        thermodynamic_state=ThermodynamicState(
                            temperature=temperature * unit.kelvin,
                            pressure=pressure * unit.kilopascal,
                        ),
                        phase=PropertyPhase.Liquid,
                        value=1.0 * property_type.default_unit(),
                        uncertainty=1.0 * property_type.default_unit(),
                        source=MeasurementSource(doi=" "),
                        substance=substance,
                    ))

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*data_entries)

    return data_set.to_pandas()
Ejemplo n.º 9
0
def main():
    """Estimate the pure and binary data sets with each calculation layer.

    The ``smirnoff99Frosst-1.1.0.offxml`` force field and the merged
    ``pure_data_set.json`` / ``binary_data_set.json`` data sets are submitted
    to a locally started server, once using only the ``SimulationLayer`` and
    once using only the ``ReweightingLayer``. Gradients are requested with
    respect to the vdW ``epsilon`` and ``rmin_half`` of ``[#6X4:1]``. The
    results of each request are written to ``pure_binary_<layer>.json``,
    where ``<layer>`` is the snake case name of the calculation layer.

    Raises
    ------
    RuntimeError
        If a request could not be submitted or its results could not be
        retrieved.
    """

    setup_timestamp_logging()

    # The server and the client must agree on the port.
    server_port = 8001

    # Load in the force field
    force_field_path = "smirnoff99Frosst-1.1.0.offxml"
    force_field_source = SmirnoffForceFieldSource.from_path(force_field_path)

    # Load in the data set containing the pure and binary properties.
    data_set = PhysicalPropertyDataSet.from_json("pure_data_set.json")
    data_set.merge(PhysicalPropertyDataSet.from_json("binary_data_set.json"))

    # The same gradients are requested for every calculation layer.
    parameter_gradient_keys = [
        ParameterGradientKey(tag="vdW",
                             smirks="[#6X4:1]",
                             attribute="epsilon"),
        ParameterGradientKey(tag="vdW",
                             smirks="[#6X4:1]",
                             attribute="rmin_half"),
    ]

    # Set up a server object to run the calculations using.
    server = setup_server(backend_type=BackendType.LocalGPU,
                          max_number_of_workers=1,
                          port=server_port)

    with server:

        # Request the estimates.
        property_estimator = EvaluatorClient(
            ConnectionOptions(server_port=server_port))

        for calculation_layer in ["SimulationLayer", "ReweightingLayer"]:

            options = RequestOptions()
            options.calculation_layers = [calculation_layer]

            request, error = property_estimator.request_estimate(
                property_set=data_set,
                force_field_source=force_field_source,
                options=options,
                parameter_gradient_keys=parameter_gradient_keys,
            )

            # Fail loudly instead of hitting an attribute error on ``None``.
            if error is not None:
                raise RuntimeError(
                    f"The {calculation_layer} request could not be "
                    f"submitted: {error}")

            # Wait for the results.
            results, error = request.results(True, 5)

            if error is not None:
                raise RuntimeError(
                    f"The {calculation_layer} results could not be "
                    f"retrieved: {error}")

            # Convert the CamelCase layer name to snake_case for the file name.
            layer_name = re.sub(r"(?<!^)(?=[A-Z])", "_",
                                calculation_layer).lower()
            results.json(f"pure_binary_{layer_name}.json", True)
Ejemplo n.º 10
0
def test_protocol_replacement(force_field_source, expected_protocol_type):
    """Check which protocols end up in the default request options.

    The serialized default options for a data set holding one dummy property
    of each type in ``property_types`` must not mention ``BaseBuildSystem"``
    and must mention ``expected_protocol_type``.
    """
    data_set = PhysicalPropertyDataSet()

    for property_type in property_types:
        data_set.add_properties(create_dummy_property(property_type))

    default_options = EvaluatorClient.default_request_options(
        data_set, force_field_source)
    serialized_options = default_options.json(format=True)

    assert 'BaseBuildSystem"' not in serialized_options
    assert expected_protocol_type in serialized_options
Ejemplo n.º 11
0
def test_analyze_non_integer_ids(mock_target, caplog):
    """Check the analysis of a training set whose ids are not integers.

    The id of the only training property is replaced by ``"a"`` and the same
    data set is stored as the estimated results. The analysis is then
    expected to log a warning about the ids and still produce a single
    statistic entry with an objective function close to one.
    """
    optimization, target, directory = mock_target

    training_set_path = os.path.join(directory, "training-set.json")

    reference_data_set: PhysicalPropertyDataSet = (
        PhysicalPropertyDataSet.from_json(training_set_path))

    # Sanity check in case this changes in future.
    assert len(reference_data_set) == 1

    reference_data_set.properties[0].id = "a"
    reference_data_set.json(training_set_path)

    # Re-use the training set as the estimated results.
    results = RequestResult()
    results.estimated_properties = reference_data_set
    results.json(os.path.join(directory, "results.json"))

    with caplog.at_level(logging.WARNING):

        target_result = EvaluatorAnalysisFactory.analyze(
            optimization=optimization,
            target=target,
            target_directory=directory,
            result_directory=directory,
            reindex=False,
        )

    expected_warning = ("The reference data set contains properties "
                        "with ids that cannot be cast to integers")
    assert expected_warning in caplog.text

    assert numpy.isclose(target_result.objective_function, 1.0)
    assert len(target_result.statistic_entries) == 1
Ejemplo n.º 12
0
def test_submission():
    """Submit an empty data set to a local server and retrieve the result.

    Neither the submission nor the retrieval may report an error, and the
    returned objects must be a ``Request`` and a ``RequestResult``.
    """
    with tempfile.TemporaryDirectory() as directory, \
            temporarily_change_directory(directory), \
            DaskLocalCluster() as calculation_backend:

        # Spin up a server instance.
        server = EvaluatorServer(
            calculation_backend=calculation_backend,
            working_directory=directory,
        )

        with server:

            # Connect a client.
            client = EvaluatorClient()

            # Submit an empty data set.
            force_field_source = SmirnoffForceFieldSource.from_path(
                "smirnoff99Frosst-1.1.0.offxml")

            request, submission_error = client.request_estimate(
                PhysicalPropertyDataSet(), force_field_source)

            assert submission_error is None
            assert isinstance(request, Request)

            result, retrieval_error = request.results(polling_interval=0.01)

            assert retrieval_error is None
            assert isinstance(result, RequestResult)
Ejemplo n.º 13
0
def data_frame() -> pandas.DataFrame:
    """Build a data frame of measurements at nearly identical temperatures.

    For each of two temperatures and each of two property types, one entry is
    created at the exact temperature and a second at the temperature shifted
    by a small random offset. The offset is positive for the first property
    type and negative for the second.

    Returns
    -------
        The pandas representation of the generated data set.
    """

    def _entry(property_type, temperature):
        # All entries share everything except the type and the temperature.
        return property_type(
            thermodynamic_state=ThermodynamicState(
                temperature=temperature * unit.kelvin,
                pressure=101.325 * unit.kilopascal,
            ),
            phase=PropertyPhase.Liquid,
            value=1.0 * property_type.default_unit(),
            uncertainty=1.0 * property_type.default_unit(),
            source=MeasurementSource(doi=" "),
            substance=Substance.from_components("C"),
        )

    data_set_entries = []

    for temperature in [303.15, 298.15]:

        for index, property_type in enumerate(
                [Density, EnthalpyOfVaporization]):

            # A random magnitude between 0.051 and 0.101, signed by the type.
            noise = (numpy.random.rand() / 2.0 + 0.51) / 10.0
            noise *= 1 if index == 0 else -1

            data_set_entries.append(_entry(property_type, temperature))
            data_set_entries.append(
                _entry(property_type, temperature + noise))

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*data_set_entries)

    return data_set.to_pandas()
Ejemplo n.º 14
0
def test_same_component_batching():
    """Check that properties are batched by the components they contain.

    A density and an enthalpy of vaporization are created for each of the
    ``O`` / ``C`` and ``O`` / ``CO`` substances; the server is expected to
    return two batches of two queued properties each.
    """
    state = ThermodynamicState(temperature=1.0 * unit.kelvin,
                               pressure=1.0 * unit.atmosphere)

    properties = []

    for second_component in ["C", "CO"]:
        properties.append(
            Density(
                thermodynamic_state=state,
                substance=Substance.from_components("O", second_component),
                value=0.0 * unit.kilogram / unit.meter**3,
            ))
        properties.append(
            EnthalpyOfVaporization(
                thermodynamic_state=state,
                substance=Substance.from_components("O", second_component),
                value=0.0 * unit.kilojoule / unit.mole,
            ))

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*properties)

    submission = EvaluatorClient._Submission()
    submission.dataset = data_set
    submission.options = RequestOptions()

    with DaskLocalCluster() as calculation_backend:

        server = EvaluatorServer(calculation_backend)
        batches = server._batch_by_same_component(submission, "")

    assert len(batches) == 2

    first_batch, second_batch = batches[0], batches[1]

    assert len(first_batch.queued_properties) == 2
    assert len(second_batch.queued_properties) == 2
Ejemplo n.º 15
0
def test_benchmark_analysis(caplog, monkeypatch, dummy_conda_env):
    """Run the benchmark analysis on one estimated and one failed property.

    The expected input files are written next to ``dummy_conda_env``. The
    analysis must warn about the property that could not be estimated and
    write ``analysis/benchmark-results.json`` with non-empty calculation and
    analysis environments.
    """
    from openff.evaluator.client import RequestResult
    from openff.evaluator.datasets import PhysicalPropertyDataSet

    benchmark = create_benchmark(
        "project-1", "study-1", "benchmark-1", ["data-set-1"], "optimization-1", None
    )

    # Create a reference data set.
    reference_data_set = create_data_set("data-set-1")
    reference_data_set.entries.append(reference_data_set.entries[0].copy())

    estimated_entry = reference_data_set.entries[0]
    failed_entry = reference_data_set.entries[1]

    estimated_entry.id = 1
    failed_entry.id = 2

    # Create a set of evaluator results
    estimated_data_set = PhysicalPropertyDataSet()
    estimated_data_set.add_properties(estimated_entry.to_evaluator())

    unsuccessful_properties = PhysicalPropertyDataSet()
    unsuccessful_properties.add_properties(failed_entry.to_evaluator())

    results = RequestResult()
    results.estimated_properties = estimated_data_set
    results.unsuccessful_properties = unsuccessful_properties

    results_path = os.path.join("analysis", "benchmark-results.json")

    with temporary_cd(os.path.dirname(dummy_conda_env)):

        # Save the expected input files.
        with open("benchmark.json", "w") as file:
            file.write(benchmark.json())

        with open("test-set-collection.json", "w") as file:
            file.write(DataSetCollection(data_sets=[reference_data_set]).json())

        results.json("results.json")

        with caplog.at_level(logging.WARNING):
            BenchmarkAnalysisFactory.analyze(True)

        expected_warning = (
            "1 properties could not be estimated and so were not analyzed"
        )
        assert expected_warning in caplog.text

        assert os.path.isdir("analysis")
        assert os.path.isfile(results_path)

        results_object = BenchmarkResult.parse_file(results_path)

        assert len(results_object.calculation_environment) > 0
        assert len(results_object.analysis_environment) > 0
Ejemplo n.º 16
0
def test_default_options():
    """Test creating the default estimation options.

    The options generated for one dummy property of each type in
    ``property_types`` must validate, use two calculation layers and define
    a schema for every layer of every property type.
    """

    data_set = PhysicalPropertyDataSet()
    force_field_source = SmirnoffForceFieldSource.from_path(
        "smirnoff99Frosst-1.1.0.offxml")

    for property_type in property_types:
        data_set.add_properties(create_dummy_property(property_type))

    options = EvaluatorClient.default_request_options(data_set,
                                                      force_field_source)
    options.validate()

    n_layers = len(options.calculation_layers)
    assert n_layers == 2

    assert len(options.calculation_schemas) == len(property_types)

    for layer_schemas in options.calculation_schemas.values():
        assert len(layer_schemas) == n_layers
Ejemplo n.º 17
0
def data_frame() -> pandas.DataFrame:
    """Build a data frame of three liquid densities of ``C``.

    The entries are identical except for their state: a base state at
    298.15 K and 101.325 kPa, one at a higher temperature (305.15 K) and one
    at a higher pressure (105.325 kPa).

    Returns
    -------
        The pandas representation of the generated data set.
    """

    # (temperature in kelvin, pressure in kilopascal)
    states = [
        (298.15, 101.325),
        (305.15, 101.325),
        (298.15, 105.325),
    ]

    densities = [
        Density(
            thermodynamic_state=ThermodynamicState(
                temperature=temperature * unit.kelvin,
                pressure=pressure * unit.kilopascal,
            ),
            phase=PropertyPhase.Liquid,
            value=1.0 * Density.default_unit(),
            uncertainty=1.0 * Density.default_unit(),
            source=MeasurementSource(doi=" "),
            substance=Substance.from_components("C"),
        ) for temperature, pressure in states
    ]

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*densities)

    return data_set.to_pandas()
Ejemplo n.º 18
0
def test_launch_batch():
    """Launch a batch of two dummy densities on a local server.

    The batch only uses the ``QuickCalculationLayer``. Once the queue has
    been emptied, one property is expected to be estimated and the other to
    be reported as unsuccessful.
    """

    # Set up a dummy data set
    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(
        *(create_dummy_property(Density) for _ in range(2)))

    batch = Batch()
    batch.force_field_id = ""
    batch.options = RequestOptions()
    batch.options.calculation_layers = ["QuickCalculationLayer"]
    batch.options.calculation_schemas = {
        "Density": {"QuickCalculationLayer": CalculationLayerSchema()},
    }
    batch.parameter_gradient_keys = []
    batch.queued_properties = list(data_set)
    batch.validate()

    with tempfile.TemporaryDirectory() as directory, \
            temporarily_change_directory(directory), \
            DaskLocalCluster() as calculation_backend:

        server = EvaluatorServer(
            calculation_backend=calculation_backend,
            working_directory=directory,
        )

        server._queued_batches[batch.id] = batch
        server._launch_batch(batch)

        # Poll until no properties are left in the queue.
        while len(batch.queued_properties) > 0:
            sleep(0.01)

        assert len(batch.estimated_properties) == 1
        assert len(batch.unsuccessful_properties) == 1
Ejemplo n.º 19
0
class RequestResult(AttributeClass):
    """The current results of an estimation request - these
    results may be partial if the server hasn't yet completed
    the request.

    The properties of the request are split across three data sets (queued,
    estimated and unsuccessful), alongside a list of exceptions.
    """

    # Properties the server has not finished with yet.
    queued_properties = Attribute(
        docstring="The set of properties which have yet to be, or "
        "are currently being estimated.",
        type_hint=PhysicalPropertyDataSet,
        default_value=PhysicalPropertyDataSet(),
    )

    # Properties for which an estimate is available.
    estimated_properties = Attribute(
        docstring=
        "The set of properties which have been successfully estimated.",
        type_hint=PhysicalPropertyDataSet,
        default_value=PhysicalPropertyDataSet(),
    )
    # Properties for which no estimate could be produced.
    unsuccessful_properties = Attribute(
        docstring=
        "The set of properties which could not be successfully estimated.",
        type_hint=PhysicalPropertyDataSet,
        default_value=PhysicalPropertyDataSet(),
    )

    # ``validate`` requires every item to be an ``EvaluatorException``.
    exceptions = Attribute(
        docstring="The set of exceptions which were raised while "
        "estimating the properties.",
        type_hint=list,
        default_value=[],
    )

    def validate(self, attribute_type=None):
        """Validate the attributes of this object.

        In addition to the validation performed by the parent class, every
        item of ``exceptions`` must be an ``EvaluatorException``.

        Parameters
        ----------
        attribute_type
            Forwarded unchanged to the parent class' ``validate``.

        Raises
        ------
        AssertionError
            If ``exceptions`` contains an object of another type.
        """
        super().validate(attribute_type)
        assert all(
            isinstance(x, EvaluatorException) for x in self.exceptions)
Ejemplo n.º 20
0
    def apply(cls, data_set, schema, n_processes=1):
        """Apply each component of this curation workflow to an initial data set in
        sequence.

        Parameters
        ----------
        data_set
            The data set to apply the workflow to. This may either be a
            data set object or its pandas representation.
        schema
            The schema which defines the components to apply.
        n_processes
            The number of processes that each component is allowed to
            parallelize across.

        Returns
        -------
            The data set which has had the curation workflow applied to it,
            returned in the same representation that was passed in.
        """

        registered_components = CurationComponent.components

        # Remember the input representation so the same one can be returned.
        convert_back = isinstance(data_set, PhysicalPropertyDataSet)

        working_frame = data_set.to_pandas() if convert_back else data_set

        # Work on a copy so that the caller's frame is never modified.
        working_frame = working_frame.copy().fillna(value=numpy.nan)

        for component_schema in schema.component_schemas:

            # Each component is looked up by its schema name without the
            # "Schema" text.
            schema_name = component_schema.__class__.__name__
            component_name = schema_name.replace("Schema", "")

            component = registered_components[component_name]

            logger.info(f"Applying {component_name}")

            working_frame = component.apply(working_frame, component_schema,
                                            n_processes)

            logger.info(f"{component_name} applied")

            working_frame = working_frame.fillna(value=numpy.nan)

        if convert_back:
            return PhysicalPropertyDataSet.from_pandas(working_frame)

        return working_frame
Ejemplo n.º 21
0
def test_from_pandas():
    """A test to ensure that data sets may be created from pandas objects.

    A data set holding a density, an enthalpy of vaporization and a
    dielectric constant is converted to a data frame and back, and every
    recreated property is compared against the original with the same id.
    """

    state = ThermodynamicState(temperature=298.15 * unit.kelvin,
                               pressure=1.0 * unit.atmosphere)

    density = Density(
        thermodynamic_state=state,
        phase=PropertyPhase.Liquid,
        substance=Substance.from_components("CO", "O"),
        value=1.0 * unit.kilogram / unit.meter**3,
        uncertainty=1.0 * unit.kilogram / unit.meter**3,
        source=MeasurementSource(doi="10.5281/zenodo.596537"),
    )
    # The remaining two properties deliberately carry no uncertainty.
    enthalpy_of_vaporization = EnthalpyOfVaporization(
        thermodynamic_state=state,
        phase=PropertyPhase.from_string("Liquid + Gas"),
        substance=Substance.from_components("C"),
        value=2.0 * unit.kilojoule / unit.mole,
        source=MeasurementSource(reference="2"),
    )
    dielectric_constant = DielectricConstant(
        thermodynamic_state=state,
        phase=PropertyPhase.Liquid,
        substance=Substance.from_components("C"),
        value=3.0 * unit.dimensionless,
        source=MeasurementSource(reference="3"),
    )

    original_data_set = PhysicalPropertyDataSet()
    original_data_set.add_properties(density, enthalpy_of_vaporization,
                                     dielectric_constant)

    recreated_data_set = PhysicalPropertyDataSet.from_pandas(
        original_data_set.to_pandas())

    assert len(original_data_set) == len(recreated_data_set)

    for original_property in original_data_set:

        # Find the recreated counterpart through the shared id.
        candidates = (candidate for candidate in recreated_data_set
                      if candidate.id == original_property.id)
        recreated_property = next(candidates)

        assert (original_property.thermodynamic_state ==
                recreated_property.thermodynamic_state)
        assert original_property.phase == recreated_property.phase
        assert original_property.substance == recreated_property.substance
        assert numpy.isclose(original_property.value, recreated_property.value)

        if original_property.uncertainty == UNDEFINED:
            assert original_property.uncertainty == recreated_property.uncertainty
        else:
            assert numpy.isclose(original_property.uncertainty,
                                 recreated_property.uncertainty)

        original_source = original_property.source
        recreated_source = recreated_property.source

        assert original_source.doi == recreated_source.doi
        assert original_source.reference == recreated_source.reference
Ejemplo n.º 22
0
def simple_evaluator_data_set():
    """Build a minimal evaluator `PhysicalPropertyDataSet` holding a single
    density measurement of a binary mixture.

    Returns
    -------
    PhysicalPropertyDataSet
        A data set containing one liquid `Density` entry whose id is "1".
    """

    state = ThermodynamicState(
        298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
    )
    mixture = Substance.from_components("O", "CC=O")

    density = Density(
        thermodynamic_state=state,
        phase=PropertyPhase.Liquid,
        substance=mixture,
        value=1.0 * unit.kilogram / unit.meter**3,
        uncertainty=0.1 * unit.kilogram / unit.meter**3,
        source=MeasurementSource(doi="10.1000/xyz123"),
    )
    # Give the property a fixed id so that callers can refer to it directly.
    density.id = "1"

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(density)
    return data_set
Ejemplo n.º 23
0
    def test_generate_evaluator_target(self, requests_mock):
        """Check that generating an evaluator target writes the training set
        and the options file into the working directory."""

        reference_set = create_data_set("data-set-1")
        mock_get_data_set(requests_mock, reference_set)

        evaluator_target = create_evaluator_target(
            "evaluator-target-1", [reference_set.id]
        )

        with temporary_cd():

            OptimizationInputFactory._generate_evaluator_target(
                evaluator_target, 8000, None
            )

            training_set_path = "training-set.json"
            assert os.path.isfile(training_set_path)

            # The stored training set must match the evaluator form of the
            # mocked data set exactly.
            stored_set = PhysicalPropertyDataSet.from_json(training_set_path)
            expected_json = reference_set.to_evaluator().json()
            assert stored_set.json() == expected_json

            assert os.path.isfile("options.json")
Ejemplo n.º 24
0
def test_serialization():
    """Check that a data set survives a round trip through JSON unchanged."""

    original = PhysicalPropertyDataSet()
    original.add_properties(create_dummy_property(Density))
    original_json = original.json()

    round_tripped = PhysicalPropertyDataSet.parse_json(original_json)

    # Neither the number of properties nor the serialized form may change.
    assert len(original) == len(round_tripped)
    assert round_tripped.json() == original_json
Ejemplo n.º 25
0
    def apply(cls, data_set, schema, n_processes=1):
        """Run this curation component on a data set.

        Parameters
        ----------
        data_set
            The data to apply the component to. A `PhysicalPropertyDataSet`
            is converted to a pandas data frame first; anything else is
            handed to the component as-is.
        schema
            The schema which defines how this component should be applied.
        n_processes
            The number of processes that this component is allowed to
            parallelize across.

        Returns
        -------
            The data with the component applied, as a
            `PhysicalPropertyDataSet` if one was passed in, otherwise
            whatever `_apply` produced.
        """

        is_property_set = isinstance(data_set, PhysicalPropertyDataSet)
        input_frame = data_set.to_pandas() if is_property_set else data_set

        output_frame = cls._apply(input_frame, schema, n_processes)

        # Both sizes are measured after the component has run.
        n_before = len(input_frame)
        n_after = len(output_frame)

        if n_after != n_before:

            if n_after < n_before:
                direction = "removed"
            else:
                direction = "added"

            n_changed = abs(n_after - n_before)

            logger.info(
                f"{n_changed} data points were {direction} after "
                f"applying the {cls.__name__} component.")

        if not is_property_set:
            return output_frame

        # Hand back the same kind of object that the caller passed in.
        return PhysicalPropertyDataSet.from_pandas(output_frame)
Ejemplo n.º 26
0
def main():
    """Estimate the properties stored in `hydration_data_set.json` with the
    smirnoff99Frosst 1.1.0 force field using a local GPU server, and save the
    results to `results.json`."""

    setup_timestamp_logging()

    # The server and the client must agree on the port.
    server_port = 8002

    # Load in the force field
    force_field_source = SmirnoffForceFieldSource.from_path(
        "smirnoff99Frosst-1.1.0.offxml"
    )

    # Load in the data set and write it straight back out, formatted.
    hydration_set = PhysicalPropertyDataSet.from_json("hydration_data_set.json")
    hydration_set.json("hydration_data_set.json", format=True)

    # Set up a server object to run the calculations using.
    server = setup_server(
        backend_type=BackendType.LocalGPU,
        max_number_of_workers=1,
        port=server_port,
    )

    with server:

        client = EvaluatorClient(ConnectionOptions(server_port=server_port))

        # Only use the simulation layer, with a custom schema for solvation
        # free energies.
        request_options = RequestOptions()
        request_options.calculation_layers = ["SimulationLayer"]
        request_options.add_schema(
            "SimulationLayer",
            "SolvationFreeEnergy",
            _get_fixed_lambda_schema(),
        )

        # Request the estimates.
        request, _ = client.request_estimate(
            property_set=hydration_set,
            force_field_source=force_field_source,
            options=request_options,
        )

        # Wait for the results.
        results, _ = request.results(True, 60)

        # Save the result to file.
        results.json("results.json", True)
Ejemplo n.º 27
0
def test_properties_by_type():
    """Check that `properties_by_type` yields exactly the properties of the
    requested type."""

    expected_by_type = {
        "Density": create_dummy_property(Density),
        "DielectricConstant": create_dummy_property(DielectricConstant),
    }

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*expected_by_type.values())

    # Each type was added once, so each query must return that one property.
    for type_name, expected_property in expected_by_type.items():

        matches = list(data_set.properties_by_type(type_name))

        assert len(matches) == 1
        assert matches[0] == expected_property
Ejemplo n.º 28
0
def data_set(data_frame: pandas.DataFrame) -> PhysicalPropertyDataSet:
    """Convert ``data_frame`` into a `PhysicalPropertyDataSet` using
    `PhysicalPropertyDataSet.from_pandas`."""
    converted_set = PhysicalPropertyDataSet.from_pandas(data_frame)
    return converted_set
Ejemplo n.º 29
0
def test_analysed_result_from_evaluator():
    """Check that `DataSetResult.from_evaluator` pairs estimated and reference
    densities and reports the expected RMSE."""

    reference_value = 0.0
    true_spread = numpy.random.rand() + 1.0

    sampled_values = numpy.random.normal(reference_value, true_spread, 1000)

    estimated_properties = []
    reference_entries = []

    # Property ids start at one and are shared between each estimated
    # property and its reference entry.
    for property_id, sampled_value in enumerate(sampled_values, start=1):

        estimated = Density(
            thermodynamic_state=ThermodynamicState(
                298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
            ),
            phase=PropertyPhase.Liquid,
            substance=Substance.from_components("O"),
            value=sampled_value * Density.default_unit(),
            uncertainty=0.0 * Density.default_unit(),
        )
        estimated.id = str(property_id)
        estimated_properties.append(estimated)

        reference_entries.append(
            DataSetEntry(
                id=property_id,
                property_type="Density",
                temperature=298.15,
                pressure=101.325,
                value=reference_value,
                std_error=None,
                doi=" ",
                components=[Component(smiles="O", mole_fraction=1.0)],
            )
        )

    estimated_data_set = PhysicalPropertyDataSet()
    estimated_data_set.add_properties(*estimated_properties)

    reference_data_set = DataSet(
        id="ref",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=reference_entries,
    )

    analysed_results = DataSetResult.from_evaluator(
        reference_data_set=reference_data_set,
        estimated_data_set=estimated_data_set,
        analysis_environments=[ChemicalEnvironment.Aqueous],
        statistic_types=[StatisticType.RMSE],
        bootstrap_iterations=1000,
    )

    assert len(analysed_results.result_entries) == len(estimated_properties)

    # Pick the first statistic which is not tied to a category.
    overall_statistic = next(
        entry
        for entry in analysed_results.statistic_entries
        if entry.category is None
    )

    assert overall_statistic.property_type == "Density"
    assert overall_statistic.n_components == 1
    assert overall_statistic.statistic_type == StatisticType.RMSE

    # Every reference value is `reference_value`, so the RMSE of the sampled
    # values should be close to the spread that they were drawn with.
    assert numpy.isclose(overall_statistic.value, true_spread, rtol=0.10)
def main():
    """Convert the raw `propertyestimator` data sets stored in the `raw_data`
    directory into `PhysicalPropertyDataSet` objects, and store each one as
    both a JSON and a CSV file in the `raw_data_v2` directory."""

    os.makedirs("raw_data_v2", exist_ok=True)

    # Each name is the stem of a JSON file in `raw_data`, and is reused for the
    # converted files.
    for data_set_name in [
            "curated_data_set",
            "gaff 1.81",
            "gaff 2.11",
            "parsley 1.0.0",
            "smirnoff99frosst 1.1.0",
    ]:

        with open(os.path.join("raw_data", f"{data_set_name}.json")) as file:
            raw_data_set = json.load(file)

        # Only serialized `propertyestimator` data sets are supported.
        assert (raw_data_set["@type"] ==
                "propertyestimator.datasets.datasets.PhysicalPropertyDataSet")

        physical_properties = []

        # `properties` maps to lists of entries - the keys are not needed here.
        for raw_data_set_entries in raw_data_set["properties"].values():

            for raw_data_set_entry in raw_data_set_entries:

                # Extract the substance this entry was measured for.
                substance = Substance()

                for raw_component in raw_data_set_entry["substance"][
                        "components"]:

                    component = Component(
                        smiles=raw_component["smiles"],
                        role=Component.Role[raw_component["role"]["value"]],
                    )

                    # The amounts of a component are looked up by its SMILES.
                    raw_amounts = raw_data_set_entry["substance"]["amounts"][
                        raw_component["smiles"]]

                    # The component is added once per amount, and any amount
                    # type other than a mole fraction or an exact amount is
                    # rejected.
                    for raw_amount in raw_amounts["value"]:

                        if (raw_amount["@type"] ==
                                "propertyestimator.substances.Substance->MoleFraction"
                            ):

                            substance.add_component(
                                component, MoleFraction(raw_amount["value"]))

                        elif (raw_amount["@type"] ==
                              "propertyestimator.substances.Substance->ExactAmount"
                              ):

                            substance.add_component(
                                component, ExactAmount(raw_amount["value"]))

                        else:
                            raise NotImplementedError()

                # Extract the source of the property
                if (raw_data_set_entry["source"]["@type"] ==
                        "propertyestimator.properties.properties.CalculationSource"
                    ):
                    source = CalculationSource(
                        fidelity=raw_data_set_entry["source"]["fidelity"])
                elif (raw_data_set_entry["source"]["@type"] ==
                      "propertyestimator.properties.properties.MeasurementSource"
                      ):
                    # The raw `reference` field is passed through `correct_doi`
                    # and stored as the DOI of the new source.
                    source = MeasurementSource(doi=correct_doi(
                        raw_data_set_entry["source"]["reference"]))
                else:
                    raise NotImplementedError()

                # Generate the new property object.
                # The property class is looked up on `properties` using the
                # last dot separated part of the entry's `@type`.
                property_class = getattr(
                    properties, raw_data_set_entry["@type"].split(".")[-1])

                # Quantities are rebuilt from their raw `value` and `unit`
                # fields. No uncertainty is stored for measured properties -
                # only properties with any other source keep theirs.
                physical_property = property_class(
                    thermodynamic_state=ThermodynamicState(
                        temperature=(
                            raw_data_set_entry["thermodynamic_state"]
                            ["temperature"]["value"] *
                            unit.Unit(raw_data_set_entry["thermodynamic_state"]
                                      ["temperature"]["unit"])),
                        pressure=(
                            raw_data_set_entry["thermodynamic_state"]
                            ["pressure"]["value"] *
                            unit.Unit(raw_data_set_entry["thermodynamic_state"]
                                      ["pressure"]["unit"])),
                    ),
                    phase=PropertyPhase(raw_data_set_entry["phase"]),
                    substance=substance,
                    value=(raw_data_set_entry["value"]["value"] *
                           unit.Unit(raw_data_set_entry["value"]["unit"])),
                    uncertainty=(
                        None if isinstance(source, MeasurementSource) else
                        (raw_data_set_entry["uncertainty"]["value"] *
                         unit.Unit(raw_data_set_entry["uncertainty"]["unit"])
                         )),
                    source=source,
                )
                # Keep the id of the raw entry.
                physical_property.id = raw_data_set_entry["id"]

                physical_properties.append(physical_property)

        data_set = PhysicalPropertyDataSet()
        data_set.add_properties(*physical_properties)

        # Store the converted data set as both JSON and CSV.
        data_set.json(os.path.join("raw_data_v2", f"{data_set_name}.json"),
                      format=True)
        data_set.to_pandas().to_csv(
            os.path.join("raw_data_v2", f"{data_set_name}.csv"))