Ejemplo n.º 1
0
def test_filter_ionic_liquid():
    """Check that ``FilterByIonicLiquid`` removes one of two density entries.

    Two liquid densities which differ only in their substance
    (``[Na+].[Cl-]`` and ``C``) are stored in a data set, converted to a
    pandas data frame and passed through the filter with a default schema.
    Exactly one row is expected to remain.
    """
    state = ThermodynamicState(
        temperature=298.15 * unit.kelvin,
        pressure=101.325 * unit.kilopascal,
    )

    # Ensure ionic liquids are filtered: the first entry is a salt, the
    # second a neutral molecule.
    densities = [
        Density(
            thermodynamic_state=state,
            phase=PropertyPhase.Liquid,
            value=1.0 * Density.default_unit(),
            uncertainty=1.0 * Density.default_unit(),
            source=MeasurementSource(doi=" "),
            substance=Substance.from_components(smiles),
        )
        for smiles in ("[Na+].[Cl-]", "C")
    ]

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*densities)

    remaining = FilterByIonicLiquid.apply(
        data_set.to_pandas(),
        FilterByIonicLiquidSchema(),
    )

    assert len(remaining) == 1
Ejemplo n.º 2
0
def create_filterable_data_set():
    """Build a dummy data set whose entries differ in every filterable field.

    The set contains three properties:

        - a liquid density at 298 K and 0.5 atm for a dummy substance
          requested with 1 component and the element ``C``.
        - a gaseous dielectric constant at 288 K and 1 atm for a dummy
          substance requested with 2 components and the element ``N``.
        - a solid enthalpy of mixing at 308 K and 1.5 atm for a dummy
          substance requested with 3 components and the element ``O``.

    Returns
    -------
    PhysicalPropertyDataSet
        The created data set.
    """
    dummy_source = CalculationSource("Dummy", {})

    # Liquid density on the carbon substance.
    liquid_density = Density(
        thermodynamic_state=ThermodynamicState(
            temperature=298 * unit.kelvin,
            pressure=0.5 * unit.atmosphere,
        ),
        phase=PropertyPhase.Liquid,
        substance=create_dummy_substance(number_of_components=1, elements=["C"]),
        value=1 * unit.gram / unit.milliliter,
        uncertainty=0.11 * unit.gram / unit.milliliter,
        source=dummy_source,
    )

    # Gaseous dielectric constant on the nitrogen substance.
    gas_dielectric = DielectricConstant(
        thermodynamic_state=ThermodynamicState(
            temperature=288 * unit.kelvin,
            pressure=1 * unit.atmosphere,
        ),
        phase=PropertyPhase.Gas,
        substance=create_dummy_substance(number_of_components=2, elements=["N"]),
        value=1 * unit.dimensionless,
        uncertainty=0.11 * unit.dimensionless,
        source=dummy_source,
    )

    # Solid enthalpy of mixing on the oxygen substance.
    solid_enthalpy = EnthalpyOfMixing(
        thermodynamic_state=ThermodynamicState(
            temperature=308 * unit.kelvin,
            pressure=1.5 * unit.atmosphere,
        ),
        phase=PropertyPhase.Solid,
        substance=create_dummy_substance(number_of_components=3, elements=["O"]),
        value=1 * unit.kilojoules / unit.mole,
        uncertainty=0.11 * unit.kilojoules / unit.mole,
        source=dummy_source,
    )

    filterable_set = PhysicalPropertyDataSet()
    filterable_set.add_properties(liquid_density, gas_dielectric, solid_enthalpy)

    return filterable_set
Ejemplo n.º 3
0
def estimated_reference_sets():
    """Create a matching pair of estimated and reference data sets.

    Both sets describe a density (id 1) and an enthalpy of mixing (id 2) of
    an ``O`` / ``CC=O`` mixture at 298.15 K. The estimated set is an
    evaluator ``PhysicalPropertyDataSet``; the reference set is a
    ``DataSet`` made of ``DataSetEntry`` objects.

    Returns
    -------
    tuple
        ``(estimated_data_set, reference_data_set)``.
    """

    def _ambient_state():
        # A fresh state per property, as in a hand-written fixture.
        return ThermodynamicState(
            298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
        )

    def _reference_components():
        # Equimolar binary mixture used by both reference entries.
        return [
            Component(smiles="O", mole_fraction=0.5),
            Component(smiles="CC=O", mole_fraction=0.5),
        ]

    # --- estimated (evaluator) side -------------------------------------
    density_estimate = Density(
        thermodynamic_state=_ambient_state(),
        phase=PropertyPhase.Liquid,
        substance=Substance.from_components("O", "CC=O"),
        value=1.0 * unit.kilogram / unit.meter**3,
        uncertainty=0.1 * unit.kilogram / unit.meter**3,
    )
    density_estimate.id = "1"

    enthalpy_estimate = EnthalpyOfMixing(
        thermodynamic_state=_ambient_state(),
        phase=PropertyPhase.Liquid,
        substance=Substance.from_components("O", "CC=O"),
        value=1.0 * unit.kilocalorie / unit.mole,
        uncertainty=0.1 * unit.kilojoule / unit.mole,
    )
    enthalpy_estimate.id = "2"

    estimated_data_set = PhysicalPropertyDataSet()
    estimated_data_set.add_properties(density_estimate, enthalpy_estimate)

    # --- reference side --------------------------------------------------
    density_reference = DataSetEntry(
        id=1,
        property_type="Density",
        temperature=298.15,
        pressure=101.325,
        value=0.001,
        std_error=0.0001,
        doi=" ",
        components=_reference_components(),
    )
    enthalpy_reference = DataSetEntry(
        id=2,
        property_type="EnthalpyOfMixing",
        temperature=298.15,
        pressure=101.325,
        value=4.184,
        std_error=0.1,
        doi=" ",
        components=_reference_components(),
    )

    reference_data_set = DataSet(
        id="ref",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=[density_reference, enthalpy_reference],
    )

    return estimated_data_set, reference_data_set
Ejemplo n.º 4
0
def test_sources_substances():
    """Check the ``sources`` and ``substances`` views of a one-entry data set.

    The first item yielded by each view must equal the corresponding
    attribute of the single property which was added.
    """
    dummy_density = create_dummy_property(Density)

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(dummy_density)

    first_source = next(iter(data_set.sources))
    first_substance = next(iter(data_set.substances))

    assert first_source == dummy_density.source
    assert first_substance == dummy_density.substance
Ejemplo n.º 5
0
def test_from_pandas():
    """A test to ensure that data sets may be created from pandas objects.

    A data set with three properties is converted to a data frame and back;
    each recreated property is matched to its original by ``id`` and
    compared field by field.
    """
    state = ThermodynamicState(
        temperature=298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
    )

    # Only the density carries an uncertainty and a DOI; the other two
    # properties are identified by a plain reference string.
    density = Density(
        thermodynamic_state=state,
        phase=PropertyPhase.Liquid,
        substance=Substance.from_components("CO", "O"),
        value=1.0 * unit.kilogram / unit.meter**3,
        uncertainty=1.0 * unit.kilogram / unit.meter**3,
        source=MeasurementSource(doi="10.5281/zenodo.596537"),
    )
    vaporization = EnthalpyOfVaporization(
        thermodynamic_state=state,
        phase=PropertyPhase.from_string("Liquid + Gas"),
        substance=Substance.from_components("C"),
        value=2.0 * unit.kilojoule / unit.mole,
        source=MeasurementSource(reference="2"),
    )
    dielectric = DielectricConstant(
        thermodynamic_state=state,
        phase=PropertyPhase.Liquid,
        substance=Substance.from_components("C"),
        value=3.0 * unit.dimensionless,
        source=MeasurementSource(reference="3"),
    )

    original_data_set = PhysicalPropertyDataSet()
    original_data_set.add_properties(density, vaporization, dielectric)

    # Round trip through pandas.
    recreated_data_set = PhysicalPropertyDataSet.from_pandas(
        original_data_set.to_pandas()
    )
    assert len(original_data_set) == len(recreated_data_set)

    for expected in original_data_set:

        # Take the first recreated property carrying the same id.
        candidates = (x for x in recreated_data_set if x.id == expected.id)
        actual = next(candidates)

        assert expected.thermodynamic_state == actual.thermodynamic_state
        assert expected.phase == actual.phase
        assert expected.substance == actual.substance
        assert numpy.isclose(expected.value, actual.value)

        if expected.uncertainty == UNDEFINED:
            # An undefined uncertainty must be compared directly.
            assert expected.uncertainty == actual.uncertainty
        else:
            assert numpy.isclose(expected.uncertainty, actual.uncertainty)

        assert expected.source.doi == actual.source.doi
        assert expected.source.reference == actual.source.reference
Ejemplo n.º 6
0
    def to_evaluator(self) -> "PhysicalPropertyDataSet":
        """Convert this object into an evaluator ``PhysicalPropertyDataSet``.

        Each item of ``self.entries`` is converted through its own
        ``to_evaluator`` method and the results are added to a new data set.

        Returns
        -------
        PhysicalPropertyDataSet
            The data set holding the converted entries.
        """
        # Imported here rather than at module level, as in the original code.
        from openff.evaluator.datasets import PhysicalPropertyDataSet

        converted_entries = []

        for entry in self.entries:
            converted_entries.append(entry.to_evaluator())

        converted_set = PhysicalPropertyDataSet()
        converted_set.add_properties(*converted_entries)

        return converted_set
Ejemplo n.º 7
0
def test_reindex_data_set_no_mole_fraction():
    """Tests that the ``reindex_data_set`` function behaves as expected
    when exact amounts are present.

    The evaluator data set holds a single solvation free energy whose solute
    is specified by an exact amount; after re-indexing against the reference
    ``DataSet`` the property id must be ``"1"``.
    """
    setup_timestamp_logging(logging.INFO)

    # Solvent by mole fraction, solute by exact amount.
    solvent = substances.Component(smiles="O")
    solute = substances.Component(
        smiles="CO", role=substances.Component.Role.Solute
    )

    solvated_substance = substances.Substance()
    solvated_substance.add_component(
        solvent, amount=substances.MoleFraction(1.0)
    )
    solvated_substance.add_component(
        solute, amount=substances.ExactAmount(1)
    )

    free_energy = SolvationFreeEnergy(
        thermodynamic_state=ThermodynamicState(
            temperature=298.15 * unit.kelvin,
            pressure=1.0 * unit.atmosphere,
        ),
        phase=PropertyPhase.Liquid,
        substance=solvated_substance,
        value=1.0 * SolvationFreeEnergy.default_unit(),
        uncertainty=1.0 * SolvationFreeEnergy.default_unit(),
    )

    evaluator_data_set = PhysicalPropertyDataSet()
    evaluator_data_set.add_properties(free_energy)

    # The reference entry which the evaluator property should map onto.
    reference_entry = DataSetEntry(
        id=1,
        property_type="SolvationFreeEnergy",
        temperature=298.15,
        pressure=101.325,
        value=1.0,
        std_error=1.0,
        doi=" ",
        components=[
            Component(smiles="O", mole_fraction=1.0),
            Component(
                smiles="CO",
                mole_fraction=0.0,
                exact_amount=1,
                role="Solute",
            ),
        ],
    )

    data_set = DataSet(
        id="data-set",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=[reference_entry],
    )

    reindex_data_set(evaluator_data_set, data_set)

    reindexed_property = evaluator_data_set.properties[0]
    assert reindexed_property.id == "1"
Ejemplo n.º 8
0
def test_benchmark_analysis(caplog, monkeypatch, dummy_conda_env):
    """Run the benchmark analysis on one estimated and one failed property.

    The expected input files are written next to ``dummy_conda_env``, the
    analysis is run, and the test checks the logged warning about the
    unsuccessful property together with the contents of the
    ``analysis/benchmark-results.json`` output.
    """
    from openff.evaluator.client import RequestResult
    from openff.evaluator.datasets import PhysicalPropertyDataSet

    benchmark = create_benchmark(
        "project-1", "study-1", "benchmark-1", ["data-set-1"], "optimization-1", None
    )

    # Create a reference data set with two entries, the second being a copy
    # of the first, and give them distinct ids.
    reference_data_set = create_data_set("data-set-1")
    duplicate_entry = reference_data_set.entries[0].copy()
    reference_data_set.entries.append(duplicate_entry)
    reference_data_set.entries[0].id = 1
    reference_data_set.entries[1].id = 2

    # Create a set of evaluator results: entry 1 was estimated, entry 2
    # could not be.
    estimated = PhysicalPropertyDataSet()
    estimated.add_properties(reference_data_set.entries[0].to_evaluator())

    unsuccessful = PhysicalPropertyDataSet()
    unsuccessful.add_properties(reference_data_set.entries[1].to_evaluator())

    results = RequestResult()
    results.estimated_properties = estimated
    results.unsuccessful_properties = unsuccessful

    results_path = os.path.join("analysis", "benchmark-results.json")
    collection = DataSetCollection(data_sets=[reference_data_set])

    with temporary_cd(os.path.dirname(dummy_conda_env)):

        # Save the expected input files.
        with open("benchmark.json", "w") as handle:
            handle.write(benchmark.json())

        with open("test-set-collection.json", "w") as handle:
            handle.write(collection.json())

        results.json("results.json")

        with caplog.at_level(logging.WARNING):
            BenchmarkAnalysisFactory.analyze(True)

        expected_warning = (
            "1 properties could not be estimated and so were not analyzed"
        )
        assert expected_warning in caplog.text

        assert os.path.isdir("analysis")
        assert os.path.isfile(results_path)

        results_object = BenchmarkResult.parse_file(results_path)

        assert len(results_object.calculation_environment) > 0
        assert len(results_object.analysis_environment) > 0
Ejemplo n.º 9
0
def test_serialization():
    """A test to ensure that data sets are JSON serializable.

    The data set is serialized, parsed back and serialized again; both the
    length and the JSON text must survive the round trip unchanged.
    """
    original = PhysicalPropertyDataSet()
    original.add_properties(create_dummy_property(Density))

    original_json = original.json()

    round_tripped = PhysicalPropertyDataSet.parse_json(original_json)
    assert len(original) == len(round_tripped)

    assert round_tripped.json() == original_json
Ejemplo n.º 10
0
def test_protocol_replacement(force_field_source, expected_protocol_type):
    """Check the protocols named in the default request options.

    The default options for a data set containing one dummy property per
    entry of ``property_types`` are serialized to JSON. The text
    ``BaseBuildSystem"`` must be absent from that JSON and
    ``expected_protocol_type`` must be present.
    """
    data_set = PhysicalPropertyDataSet()

    for property_type in property_types:
        data_set.add_properties(create_dummy_property(property_type))

    default_options = EvaluatorClient.default_request_options(
        data_set, force_field_source
    )
    serialized_options = default_options.json(format=True)

    assert 'BaseBuildSystem"' not in serialized_options
    assert expected_protocol_type in serialized_options
Ejemplo n.º 11
0
def data_frame() -> pandas.DataFrame:
    """Build a data frame covering a grid of states, properties and mixtures.

    Every combination of two temperatures, two pressures, two property
    types and four mole fraction tuples is expanded over the SMILES tuples
    which have the same number of components as the mole fraction tuple.

    Returns
    -------
    pandas.DataFrame
        The pandas representation of the generated data set.
    """
    temperatures = [298.15, 318.15]
    pressures = [101.325, 101.0]

    property_classes = [Density, EnthalpyOfMixing]

    mole_fractions = [(1.0, ), (1.0, ), (0.25, 0.75), (0.75, 0.25)]
    # SMILES tuples keyed by the number of components they describe.
    smiles_by_size = {
        1: [("C(F)(Cl)(Br)", ), ("C", )],
        2: [("CO", "C"), ("C", "CO")],
    }

    def _build_substance(smiles_tuple, fractions):
        # Pair each SMILES pattern with its mole fraction, in order.
        built = Substance()

        for pattern, fraction in zip(smiles_tuple, fractions):
            built.add_component(Component(pattern), MoleFraction(fraction))

        return built

    grid = [
        (temperature, pressure, property_class, fractions)
        for temperature in temperatures
        for pressure in pressures
        for property_class in property_classes
        for fractions in mole_fractions
    ]

    generated = []

    for temperature, pressure, property_class, fractions in grid:

        for smiles_tuple in smiles_by_size[len(fractions)]:

            mixture = _build_substance(smiles_tuple, fractions)

            generated.append(
                property_class(
                    thermodynamic_state=ThermodynamicState(
                        temperature=temperature * unit.kelvin,
                        pressure=pressure * unit.kilopascal,
                    ),
                    phase=PropertyPhase.Liquid,
                    value=1.0 * property_class.default_unit(),
                    uncertainty=1.0 * property_class.default_unit(),
                    source=MeasurementSource(doi=" "),
                    substance=mixture,
                )
            )

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*generated)

    return data_set.to_pandas()
Ejemplo n.º 12
0
def data_frame() -> pandas.DataFrame:
    """Build a data frame of property pairs measured at nearby temperatures.

    For each of two temperatures and each of two property types, one entry
    is created at the nominal temperature and one at the nominal temperature
    shifted by a small random offset. The offset is positive for the first
    property type and negative for the second.

    Returns
    -------
    pandas.DataFrame
        The pandas representation of the generated data set.
    """
    nominal_temperatures = [303.15, 298.15]
    property_classes = [Density, EnthalpyOfVaporization]

    def _random_offset():
        # rand() lies in [0, 1), so the magnitude lies in [0.051, 0.101).
        return (numpy.random.rand() / 2.0 + 0.51) / 10.0

    entries = []

    for nominal in nominal_temperatures:

        for position, property_class in enumerate(property_classes):

            # One random draw per (temperature, property type) pair.
            offset = _random_offset()
            offset *= 1 if position == 0 else -1

            # The unshifted entry first, then its shifted counterpart.
            for sampled_temperature in (nominal, nominal + offset):

                entries.append(
                    property_class(
                        thermodynamic_state=ThermodynamicState(
                            temperature=sampled_temperature * unit.kelvin,
                            pressure=101.325 * unit.kilopascal,
                        ),
                        phase=PropertyPhase.Liquid,
                        value=1.0 * property_class.default_unit(),
                        uncertainty=1.0 * property_class.default_unit(),
                        source=MeasurementSource(doi=" "),
                        substance=Substance.from_components("C"),
                    )
                )

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*entries)

    return data_set.to_pandas()
Ejemplo n.º 13
0
def test_properties_by_type():
    """Check that ``properties_by_type`` yields only the requested type.

    A data set holding one density and one dielectric constant must return
    exactly that property when queried by the matching type name.
    """
    expected_by_type = {
        "Density": create_dummy_property(Density),
        "DielectricConstant": create_dummy_property(DielectricConstant),
    }

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(
        expected_by_type["Density"], expected_by_type["DielectricConstant"]
    )

    for type_name in ("Density", "DielectricConstant"):

        found = list(data_set.properties_by_type(type_name))

        assert len(found) == 1
        assert found[0] == expected_by_type[type_name]
Ejemplo n.º 14
0
def test_same_component_batching():
    """Check that properties are batched by their substance components.

    A density and an enthalpy of vaporization are queued for each of two
    binary substances (``O`` + ``C`` and ``O`` + ``CO``). Two batches of two
    properties each are expected.
    """
    state = ThermodynamicState(
        temperature=1.0 * unit.kelvin, pressure=1.0 * unit.atmosphere
    )

    # For each substance: a density followed by an enthalpy of vaporization.
    queued = []

    for second_component in ("C", "CO"):

        queued.append(
            Density(
                thermodynamic_state=state,
                substance=Substance.from_components("O", second_component),
                value=0.0 * unit.kilogram / unit.meter**3,
            )
        )
        queued.append(
            EnthalpyOfVaporization(
                thermodynamic_state=state,
                substance=Substance.from_components("O", second_component),
                value=0.0 * unit.kilojoule / unit.mole,
            )
        )

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*queued)

    submission = EvaluatorClient._Submission()
    submission.dataset = data_set
    submission.options = RequestOptions()

    with DaskLocalCluster() as calculation_backend:

        server = EvaluatorServer(calculation_backend)
        batches = server._batch_by_same_component(submission, "")

    assert len(batches) == 2

    for batch_index in (0, 1):
        assert len(batches[batch_index].queued_properties) == 2
Ejemplo n.º 15
0
def test_default_options():
    """Test creating the default estimation options.

    The default options must validate, contain two calculation layers, and
    define one schema per layer for every entry of ``property_types``.
    """
    force_field_source = SmirnoffForceFieldSource.from_path(
        "smirnoff99Frosst-1.1.0.offxml"
    )

    data_set = PhysicalPropertyDataSet()

    for property_type in property_types:
        data_set.add_properties(create_dummy_property(property_type))

    options = EvaluatorClient.default_request_options(
        data_set, force_field_source
    )
    options.validate()

    n_layers = len(options.calculation_layers)

    assert n_layers == 2
    assert len(options.calculation_schemas) == len(property_types)

    # Each property type must have a schema for every calculation layer.
    for layer_schemas in options.calculation_schemas.values():
        assert len(layer_schemas) == n_layers
Ejemplo n.º 16
0
def data_frame() -> pandas.DataFrame:
    """Build a data frame of three densities at different state points.

    Relative to the first entry (298.15 K, 101.325 kPa), the second entry
    differs only in temperature and the third only in pressure.

    Returns
    -------
    pandas.DataFrame
        The pandas representation of the generated data set.
    """
    # (temperature in K, pressure in kPa)
    state_points = [
        (298.15, 101.325),
        (305.15, 101.325),
        (298.15, 105.325),
    ]

    densities = [
        Density(
            thermodynamic_state=ThermodynamicState(
                temperature=temperature * unit.kelvin,
                pressure=pressure * unit.kilopascal,
            ),
            phase=PropertyPhase.Liquid,
            value=1.0 * Density.default_unit(),
            uncertainty=1.0 * Density.default_unit(),
            source=MeasurementSource(doi=" "),
            substance=Substance.from_components("C"),
        )
        for temperature, pressure in state_points
    ]

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*densities)

    return data_set.to_pandas()
Ejemplo n.º 17
0
def test_validate_data_set():
    """Check that data set validation rejects a negative temperature.

    A density at 298 K is accepted. A density at -1 K is rejected by
    ``add_properties`` unless ``validate=False`` is passed, after which
    ``validate`` on the data set must fail.
    """

    def _water_density(temperature):
        # Positional arguments, in the order used by the property
        # constructor call being tested.
        return Density(
            ThermodynamicState(temperature, 1 * unit.atmosphere),
            PropertyPhase.Liquid,
            Substance.from_components("O"),
            0.0 * unit.gram / unit.milliliter,
            0.0 * unit.gram / unit.milliliter,
        )

    data_set = PhysicalPropertyDataSet()

    # A physically sensible property passes validation.
    data_set.add_properties(_water_density(298 * unit.kelvin))
    data_set.validate()

    negative_temperature_density = _water_density(-1 * unit.kelvin)

    # Validation on insertion is the default.
    with pytest.raises(AssertionError):
        data_set.add_properties(negative_temperature_density)

    # Bypassing it lets the bad entry in, so the whole set becomes invalid.
    data_set.add_properties(negative_temperature_density, validate=False)

    with pytest.raises(AssertionError):
        data_set.validate()
Ejemplo n.º 18
0
def test_launch_batch():
    """Launch a two-property batch on a local server and check the outcome.

    A batch holding two dummy densities is configured with a single
    ``QuickCalculationLayer`` and launched through a private server method.
    Once the queue has drained, one property is expected to have been
    estimated and one to be unsuccessful.

    The wait for the queue to drain is bounded: if the batch still has
    queued properties after the timeout the test fails instead of hanging
    the whole test run.
    """
    import time

    # Generous upper bound (seconds) on how long the batch may take.
    drain_timeout = 120.0

    # Set up a dummy data set
    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(create_dummy_property(Density),
                            create_dummy_property(Density))

    batch = Batch()
    batch.force_field_id = ""
    batch.options = RequestOptions()
    batch.options.calculation_layers = ["QuickCalculationLayer"]
    batch.options.calculation_schemas = {
        "Density": {
            "QuickCalculationLayer": CalculationLayerSchema()
        }
    }
    batch.parameter_gradient_keys = []
    batch.queued_properties = [*data_set]
    batch.validate()

    with tempfile.TemporaryDirectory() as directory:

        with temporarily_change_directory(directory):

            with DaskLocalCluster() as calculation_backend:

                server = EvaluatorServer(
                    calculation_backend=calculation_backend,
                    working_directory=directory,
                )

                server._queued_batches[batch.id] = batch
                server._launch_batch(batch)

                # Poll until the server has processed every queued property,
                # but never wait longer than the deadline.
                deadline = time.monotonic() + drain_timeout

                while len(batch.queued_properties) > 0:

                    assert time.monotonic() < deadline, (
                        "the batch still had queued properties after "
                        f"{drain_timeout} seconds"
                    )
                    sleep(0.01)

                assert len(batch.estimated_properties) == 1
                assert len(batch.unsuccessful_properties) == 1
Ejemplo n.º 19
0
def simple_evaluator_data_set():
    """Create a simple evaluator `PhysicalPropertyDataSet` which contains
    a single binary density measurement with the id ``"1"``.

    Returns
    -------
    PhysicalPropertyDataSet
        The data set holding the single density.
    """
    state = ThermodynamicState(
        298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
    )
    binary_mixture = Substance.from_components("O", "CC=O")

    density = Density(
        thermodynamic_state=state,
        phase=PropertyPhase.Liquid,
        substance=binary_mixture,
        value=1.0 * unit.kilogram / unit.meter**3,
        uncertainty=0.1 * unit.kilogram / unit.meter**3,
        source=MeasurementSource(doi="10.1000/xyz123"),
    )
    # Use a fixed id so that callers can refer to the entry.
    density.id = "1"

    simple_set = PhysicalPropertyDataSet()
    simple_set.add_properties(density)

    return simple_set
Ejemplo n.º 20
0
    def _apply(
        cls,
        data_frame: pandas.DataFrame,
        schema: ImportFreeSolvSchema,
        n_processes,
    ) -> pandas.DataFrame:
        """Append the FreeSolv solvation free energies to a data frame.

        The FreeSolv table is obtained from ``cls._download_free_solv()``.
        Each row is turned into a ``SolvationFreeEnergy`` of one solute
        molecule in water at 298.15 K and 101.325 kPa, and the resulting
        data set is concatenated onto ``data_frame``.

        Parameters
        ----------
        data_frame
            The data frame to append the FreeSolv entries to.
        schema
            The schema for this component; not read by this method.
        n_processes
            Not read by this method.

        Returns
        -------
        pandas.DataFrame
            ``data_frame`` followed by the FreeSolv entries.
        """
        from openff.evaluator import properties, substances, unit

        # Retrieve the FreeSolv table; each row becomes one data entry.
        raw_frame = cls._download_free_solv()

        solvation_entries = []

        for _, row in raw_frame.iterrows():

            # Extract and standardize the SMILES pattern of the solute by
            # passing it through an evaluator component.
            raw_smiles = row["SMILES"].strip()
            solute_smiles = substances.Component(raw_smiles).smiles

            # Build the substance: water as the solvent plus exactly one
            # solute molecule.
            solvated = Substance()
            solvated.add_component(Component(smiles="O"), MoleFraction(1.0))
            solvated.add_component(
                Component(smiles=solute_smiles, role=Component.Role.Solute),
                ExactAmount(1),
            )

            # Extract the value and uncertainty, tabulated in kcal / mol.
            kcal_per_mole = unit.kilocalorie / unit.mole

            value = (
                float(row["experimental value (kcal/mol)"]) *
                unit.kilocalorie / unit.mole
            )
            std_error = (
                float(row["experimental uncertainty (kcal/mol)"]) *
                unit.kilocalorie / unit.mole
            )

            # Attempt to extract a DOI
            reference_text = row[
                "experimental reference (original or paper this value was taken from)"
            ]
            doi = cls._validate_doi(reference_text)

            target_unit = properties.SolvationFreeEnergy.default_unit()

            solvation_entries.append(
                SolvationFreeEnergy(
                    thermodynamic_state=ThermodynamicState(
                        temperature=298.15 * unit.kelvin,
                        pressure=101.325 * unit.kilopascal,
                    ),
                    phase=PropertyPhase.Liquid,
                    substance=solvated,
                    value=value.to(target_unit),
                    uncertainty=std_error.to(
                        properties.SolvationFreeEnergy.default_unit()
                    ),
                    source=MeasurementSource(doi=doi),
                )
            )

        free_solv_set = PhysicalPropertyDataSet()
        free_solv_set.add_properties(*solvation_entries)

        return pandas.concat(
            [data_frame, free_solv_set.to_pandas()],
            ignore_index=True,
            sort=False,
        )
Ejemplo n.º 21
0
def test_to_pandas():
    """A test to ensure that data sets are convertible to pandas objects.

    Twelve properties are generated (two pure-substance and two binary
    properties at each of three temperatures). The resulting frame is
    checked for a set of required columns and for its shape, both before
    and after dropping columns which are entirely empty.
    """
    dummy_source = CalculationSource("Dummy", {})

    pure = Substance.from_components("C")
    binary = Substance.from_components("C", "O")

    # Temperatures in kelvin, shared by both groups of properties.
    kelvin_values = (298, 300, 302)

    def _state_at(kelvin_value):
        return ThermodynamicState(
            temperature=kelvin_value * unit.kelvin,
            pressure=1.0 * unit.atmosphere,
        )

    data_set = PhysicalPropertyDataSet()

    # Pure-substance properties first ...
    for kelvin_value in kelvin_values:

        state = _state_at(kelvin_value)

        density = Density(
            thermodynamic_state=state,
            phase=PropertyPhase.Liquid,
            substance=pure,
            value=1 * unit.gram / unit.milliliter,
            uncertainty=0.11 * unit.gram / unit.milliliter,
            source=dummy_source,
        )
        dielectric = DielectricConstant(
            thermodynamic_state=state,
            phase=PropertyPhase.Liquid,
            substance=pure,
            value=1 * unit.dimensionless,
            uncertainty=0.11 * unit.dimensionless,
            source=dummy_source,
        )

        data_set.add_properties(density)
        data_set.add_properties(dielectric)

    # ... followed by the binary-mixture properties.
    for kelvin_value in kelvin_values:

        state = _state_at(kelvin_value)

        enthalpy = EnthalpyOfMixing(
            thermodynamic_state=state,
            phase=PropertyPhase.Liquid,
            substance=binary,
            value=1 * unit.kilojoules / unit.mole,
            uncertainty=0.11 * unit.kilojoules / unit.mole,
            source=dummy_source,
        )
        excess_volume = ExcessMolarVolume(
            thermodynamic_state=state,
            phase=PropertyPhase.Liquid,
            substance=binary,
            value=1 * unit.meter**3 / unit.mole,
            uncertainty=0.11 * unit.meter**3 / unit.mole,
            source=dummy_source,
        )

        data_set.add_properties(enthalpy)
        data_set.add_properties(excess_volume)

    frame = data_set.to_pandas()

    # General columns, then the four per-component columns for each of the
    # two components.
    required_columns = [
        "Id",
        "Temperature (K)",
        "Pressure (kPa)",
        "Phase",
        "N Components",
        "Source",
    ]

    for component_index in (1, 2):
        required_columns.extend(
            [
                f"Component {component_index}",
                f"Role {component_index}",
                f"Mole Fraction {component_index}",
                f"Exact Amount {component_index}",
            ]
        )

    for column in required_columns:
        assert column in frame

    assert frame is not None
    assert frame.shape == (12, 22)

    # Columns which hold no data at all are expected to number exactly two.
    populated_frame = frame.dropna(axis=1, how="all")
    assert populated_frame.shape == (12, 20)
Ejemplo n.º 22
0
def test_reindex_data_set():
    """Exercise ``reindex_data_set`` against a ``DataSet`` and a
    ``DataSetCollection``.

    Three liquid densities are created: ``"O"`` at 298.15 K, a ``"C"`` /
    ``"O"`` mixture at 298.15 K and the same mixture at 300.0 K. The
    reference entries only describe the two 298.15 K systems, so the
    assertions expect the first two properties to take on the reference ids
    while the third keeps the id it had before re-indexing.
    """

    setup_timestamp_logging(logging.INFO)

    def liquid_density(kelvin, *smiles):
        # All properties share the pressure, phase, value and uncertainty.
        return Density(
            thermodynamic_state=ThermodynamicState(
                temperature=kelvin * unit.kelvin,
                pressure=1.0 * unit.atmosphere),
            phase=PropertyPhase.Liquid,
            substance=substances.Substance.from_components(*smiles),
            value=1.0 * Density.default_unit(),
            uncertainty=1.0 * Density.default_unit(),
        )

    def reference_entry(entry_id, composition):
        # ``composition`` is a sequence of (smiles, mole fraction) pairs.
        return DataSetEntry(
            id=entry_id,
            property_type="Density",
            temperature=298.15,
            pressure=101.325,
            value=1.0,
            std_error=1.0,
            doi=" ",
            components=[
                Component(smiles=smiles, mole_fraction=mole_fraction)
                for smiles, mole_fraction in composition
            ],
        )

    def reference_set(set_id, *entries):
        return DataSet(
            id=set_id,
            description=" ",
            authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
            entries=list(entries),
        )

    def assert_ids(*expected_ids):
        for position, expected_id in enumerate(expected_ids):
            assert evaluator_data_set.properties[position].id == expected_id

    binary_composition = (("O", 0.5), ("C", 0.5))
    pure_composition = (("O", 1.0),)

    evaluator_data_set = PhysicalPropertyDataSet()
    evaluator_data_set.add_properties(
        liquid_density(298.15, "O"),
        liquid_density(298.15, "C", "O"),
        liquid_density(300.0, "C", "O"),
    )

    data_set = reference_set(
        "data-set",
        reference_entry(1, binary_composition),
        reference_entry(2, pure_composition),
    )

    # The 300.0 K property has no reference counterpart; remember its id so
    # that it can be checked to be unchanged after each re-index.
    un_indexed_id = evaluator_data_set.properties[2].id

    # Re-index against a single data set.
    reindex_data_set(evaluator_data_set, data_set)
    assert_ids("2", "1", un_indexed_id)

    # Re-index against a collection whose entries are split over two sets.
    data_set_collection = DataSetCollection(data_sets=[
        reference_set("0", reference_entry(3, binary_composition)),
        reference_set("1", reference_entry(4, pure_composition)),
    ])

    reindex_data_set(evaluator_data_set, data_set_collection)
    assert_ids("4", "3", un_indexed_id)
Ejemplo n.º 23
0
def test_analysed_result_from_evaluator():
    """Tests the ``DataSetResult.from_evaluator`` function.

    The estimated densities are drawn from a normal distribution centred on
    the reference value, and the test asserts that the RMSE reported for the
    statistics entry whose category is ``None`` lies within 10 % of that
    distribution's standard deviation.
    """
    expected_mean = 0.0
    expected_std = numpy.random.rand() + 1.0

    sampled_values = numpy.random.normal(expected_mean, expected_std, 1000)

    estimated_properties, reference_entries = [], []

    # Ids start from one and are shared between each estimated property and
    # its reference entry.
    for property_id, sampled_value in enumerate(sampled_values, start=1):

        estimate = Density(
            thermodynamic_state=ThermodynamicState(
                298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere),
            phase=PropertyPhase.Liquid,
            substance=Substance.from_components("O"),
            value=sampled_value * Density.default_unit(),
            uncertainty=0.0 * Density.default_unit(),
        )
        estimate.id = str(property_id)
        estimated_properties.append(estimate)

        reference_entries.append(
            DataSetEntry(
                id=property_id,
                property_type="Density",
                temperature=298.15,
                pressure=101.325,
                value=expected_mean,
                std_error=None,
                doi=" ",
                components=[Component(smiles="O", mole_fraction=1.0)],
            ))

    estimated_data_set = PhysicalPropertyDataSet()
    estimated_data_set.add_properties(*estimated_properties)

    reference_data_set = DataSet(
        id="ref",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=reference_entries,
    )

    analysed_results = DataSetResult.from_evaluator(
        reference_data_set=reference_data_set,
        estimated_data_set=estimated_data_set,
        analysis_environments=[ChemicalEnvironment.Aqueous],
        statistic_types=[StatisticType.RMSE],
        bootstrap_iterations=1000,
    )

    assert len(analysed_results.result_entries) == len(estimated_properties)

    # Pick the first statistics entry which is not tied to a category.
    full_statistics = next(entry
                           for entry in analysed_results.statistic_entries
                           if entry.category is None)

    assert full_statistics.property_type == "Density"
    assert full_statistics.n_components == 1
    assert full_statistics.statistic_type == StatisticType.RMSE
    assert numpy.isclose(full_statistics.value, expected_std, rtol=0.10)
def main():
    """Convert the raw JSON data sets found in ``raw_data`` into
    ``PhysicalPropertyDataSet`` objects.

    Each file is expected to be tagged with the ``propertyestimator``
    ``PhysicalPropertyDataSet`` type. Every entry is rebuilt as a property
    object, and the resulting set is written to ``raw_data_v2`` as both a
    JSON and a CSV file.

    Raises
    ------
    NotImplementedError
        If an entry contains an amount or source type other than those
        handled below.
    """

    mole_fraction_type = "propertyestimator.substances.Substance->MoleFraction"
    exact_amount_type = "propertyestimator.substances.Substance->ExactAmount"
    calculation_source_type = (
        "propertyestimator.properties.properties.CalculationSource")
    measurement_source_type = (
        "propertyestimator.properties.properties.MeasurementSource")

    def to_quantity(raw_quantity):
        # Combine a serialized magnitude with its serialized unit.
        return raw_quantity["value"] * unit.Unit(raw_quantity["unit"])

    def build_substance(raw_substance):
        # Extract the substance an entry was measured for.
        substance = Substance()

        for raw_component in raw_substance["components"]:

            component = Component(
                smiles=raw_component["smiles"],
                role=Component.Role[raw_component["role"]["value"]],
            )

            # The amounts are stored separately, keyed by component SMILES.
            raw_amounts = raw_substance["amounts"][raw_component["smiles"]]

            for raw_amount in raw_amounts["value"]:

                amount_type = raw_amount["@type"]

                if amount_type == mole_fraction_type:
                    amount = MoleFraction(raw_amount["value"])
                elif amount_type == exact_amount_type:
                    amount = ExactAmount(raw_amount["value"])
                else:
                    raise NotImplementedError()

                substance.add_component(component, amount)

        return substance

    def build_source(raw_source):
        # Extract the source of the property.
        source_type = raw_source["@type"]

        if source_type == calculation_source_type:
            return CalculationSource(fidelity=raw_source["fidelity"])

        if source_type == measurement_source_type:
            return MeasurementSource(doi=correct_doi(raw_source["reference"]))

        raise NotImplementedError()

    def build_property(raw_entry):
        substance = build_substance(raw_entry["substance"])
        source = build_source(raw_entry["source"])

        # The property class shares its name with the last part of the
        # serialized type string.
        property_class = getattr(properties,
                                 raw_entry["@type"].split(".")[-1])

        raw_state = raw_entry["thermodynamic_state"]

        physical_property = property_class(
            thermodynamic_state=ThermodynamicState(
                temperature=to_quantity(raw_state["temperature"]),
                pressure=to_quantity(raw_state["pressure"]),
            ),
            phase=PropertyPhase(raw_entry["phase"]),
            substance=substance,
            value=to_quantity(raw_entry["value"]),
            # Entries with a measurement source are given no uncertainty.
            uncertainty=(None if isinstance(source, MeasurementSource) else
                         to_quantity(raw_entry["uncertainty"])),
            source=source,
        )
        physical_property.id = raw_entry["id"]

        return physical_property

    os.makedirs("raw_data_v2", exist_ok=True)

    data_set_names = (
        "curated_data_set",
        "gaff 1.81",
        "gaff 2.11",
        "parsley 1.0.0",
        "smirnoff99frosst 1.1.0",
    )

    for data_set_name in data_set_names:

        with open(os.path.join("raw_data", f"{data_set_name}.json")) as file:
            raw_data_set = json.load(file)

        assert (raw_data_set["@type"] ==
                "propertyestimator.datasets.datasets.PhysicalPropertyDataSet")

        # The raw properties are grouped into lists; flatten them in order.
        physical_properties = [
            build_property(raw_entry)
            for raw_entries in raw_data_set["properties"].values()
            for raw_entry in raw_entries
        ]

        data_set = PhysicalPropertyDataSet()
        data_set.add_properties(*physical_properties)

        output_stem = os.path.join("raw_data_v2", data_set_name)

        data_set.json(f"{output_stem}.json", format=True)
        data_set.to_pandas().to_csv(f"{output_stem}.csv")
Ejemplo n.º 25
0
def complete_evaluator_data_set():
    """Build a `PhysicalPropertyDataSet` holding a single measurement of
    each of the following, in this order:

        * pure density
        * binary density
        * pure enthalpy of vaporization
        * binary enthalpy of mixing
        * binary excess molar volume
        * hydration free energy

    All entries share one thermodynamic state (298.15 K, 1 atm) and one
    measurement source, and are given the ids "1" to "6".

    Returns
    -------
    PhysicalPropertyDataSet
    """
    shared_state = ThermodynamicState(298.15 * unit.kelvin,
                                      pressure=1.0 * unit.atmosphere)
    shared_source = MeasurementSource(doi="10.1000/xyz123")

    def measurement(property_class,
                    substance,
                    value_unit,
                    phase=PropertyPhase.Liquid):
        # Every entry has a value of 1.0 and an uncertainty of 0.1.
        return property_class(
            thermodynamic_state=shared_state,
            phase=phase,
            substance=substance,
            value=1.0 * value_unit,
            uncertainty=0.1 * value_unit,
            source=shared_source,
        )

    # "O" in a mole fraction amount with a single "CCCO" molecule.
    solvation_substance = Substance()
    solvation_substance.add_component(Component("O"), MoleFraction(1.0))
    solvation_substance.add_component(Component("CCCO"), ExactAmount(1))

    evaluator_properties = [
        measurement(
            Density,
            Substance.from_components("O"),
            unit.kilogram / unit.meter**3,
        ),
        measurement(
            Density,
            Substance.from_components("O", "CC=O"),
            unit.kilogram / unit.meter**3,
        ),
        measurement(
            EnthalpyOfVaporization,
            Substance.from_components("CCO"),
            EnthalpyOfVaporization.default_unit(),
            phase=PropertyPhase(PropertyPhase.Liquid | PropertyPhase.Gas),
        ),
        measurement(
            EnthalpyOfMixing,
            Substance.from_components("CCCCO", "CC(C=O)C"),
            EnthalpyOfMixing.default_unit(),
        ),
        measurement(
            ExcessMolarVolume,
            Substance.from_components("C(=O)CCCO", "CCCCCC"),
            ExcessMolarVolume.default_unit(),
        ),
        measurement(
            SolvationFreeEnergy,
            solvation_substance,
            SolvationFreeEnergy.default_unit(),
        ),
    ]

    # Number the properties from one, in list order.
    for number, evaluator_property in enumerate(evaluator_properties,
                                                start=1):
        evaluator_property.id = str(number)

    evaluator_data_set = PhysicalPropertyDataSet()
    evaluator_data_set.add_properties(*evaluator_properties)

    return evaluator_data_set
Ejemplo n.º 26
0
def define_data_set(reweighting: bool) -> PhysicalPropertyDataSet:
    """Assemble a data set of zero-valued properties at 1 atm and three
    temperatures (296.15 K, 298.15 K and 300.15 K).

    Parameters
    ----------
    reweighting
        When ``False``, solvation free energies are also included: two
        ethanol / ethanal systems at 298.15 K plus whatever the FreeSolv
        curation workflow (filtered to the ``("O", "CO")`` substance)
        returns. When ``True`` they are left out.

    Returns
    -------
    PhysicalPropertyDataSet
        The data set containing, for every state, an excess molar volume and
        enthalpy of mixing of "CC=O" + "CCO", and a density, enthalpy of
        vaporization and dielectric constant of "CCO".
    """

    def zero_valued(property_class,
                    state,
                    substance,
                    phase=PropertyPhase.Liquid):
        # Only the state, phase and substance matter; values are all zero.
        return property_class(
            thermodynamic_state=state,
            phase=phase,
            substance=substance,
            value=0.0 * property_class.default_unit(),
        )

    def solvated(solvent_smiles, solute_smiles):
        # A solvent with exactly one solute molecule added to it.
        substance = Substance.from_components(solvent_smiles)
        substance.add_component(
            Component(solute_smiles, Component.Role.Solute), ExactAmount(1))
        return substance

    # Define the states to compute estimates at
    states = [
        ThermodynamicState(temperature=kelvin * unit.kelvin,
                           pressure=1.0 * unit.atmosphere)
        for kelvin in (296.15, 298.15, 300.15)
    ]

    data_set = PhysicalPropertyDataSet()

    # Solvation free energies.
    if not reweighting:

        ethanol_substance = solvated("CCO", "CC=O")
        ethanal_substance = solvated("CC=O", "CCO")

        free_solv_schema = CurationWorkflowSchema(component_schemas=[
            ImportFreeSolvSchema(),
            FilterBySubstancesSchema(substances_to_include=[("O", "CO")]),
        ])

        data_set.add_properties(
            zero_valued(SolvationFreeEnergy, states[1], ethanol_substance),
            zero_valued(SolvationFreeEnergy, states[1], ethanal_substance),
            *CurationWorkflow.apply(PhysicalPropertyDataSet(),
                                    free_solv_schema),
        )

    for state in states:

        # Excess properties.
        data_set.add_properties(
            zero_valued(
                ExcessMolarVolume,
                state,
                Substance.from_components("CC=O", "CCO"),
            ),
            zero_valued(
                EnthalpyOfMixing,
                state,
                Substance.from_components("CC=O", "CCO"),
            ),
        )
        # Pure properties
        data_set.add_properties(
            zero_valued(Density, state, Substance.from_components("CCO")),
            zero_valued(
                EnthalpyOfVaporization,
                state,
                Substance.from_components("CCO"),
                phase=PropertyPhase(PropertyPhase.Liquid | PropertyPhase.Gas),
            ),
            zero_valued(
                DielectricConstant,
                state,
                Substance.from_components("CCO"),
            ),
        )

    return data_set
Ejemplo n.º 27
0
def test_filter_by_environment_list():
    """Check the ``FilterByEnvironments`` filter when driven through the
    ``environments`` schema option."""

    entry_smiles = (
        ("O", ),
        ("C", ),
        ("C", "O"),
        ("O", "CC(=O)CC=O"),
        ("CC(=O)CC=O", "O"),
    )

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*(_build_entry(*smiles)
                              for smiles in entry_smiles))

    data_frame = data_set.to_pandas()

    def apply_filter(**schema_options):
        return FilterByEnvironments.apply(
            data_frame, FilterByEnvironmentsSchema(**schema_options))

    def unique_smiles(frame, *columns):
        # The set of distinct values found across the requested columns.
        return {
            smiles
            for column in columns for smiles in frame[column].unique()
        }

    aqueous_and_aldehyde = [
        ChemicalEnvironment.Aqueous, ChemicalEnvironment.Aldehyde
    ]

    # Only aqueous functionality is allowed: just the "O" entry is expected.
    filtered_frame = apply_filter(environments=[ChemicalEnvironment.Aqueous],
                                  at_least_one_environment=True)

    assert len(filtered_frame) == 1
    assert filtered_frame["N Components"].max() == 1
    assert unique_smiles(filtered_frame, "Component 1") == {"O"}

    # Aqueous and aldehyde functionality are both allowed, non-strictly.
    filtered_frame = apply_filter(environments=aqueous_and_aldehyde,
                                  at_least_one_environment=True)

    assert len(filtered_frame) == 3

    assert filtered_frame["N Components"].min() == 1
    assert filtered_frame["N Components"].max() == 2

    pure_data = filtered_frame[filtered_frame["N Components"] == 1]
    binary_data = filtered_frame[filtered_frame["N Components"] == 2]

    assert len(pure_data) == 1
    assert unique_smiles(pure_data, "Component 1") == {"O"}

    assert len(binary_data) == 2

    assert unique_smiles(binary_data, "Component 1", "Component 2") == {
        "CC(=O)CC=O",
        "O",
    }

    # With the strict behaviour enforced, the component which combines
    # aldehyde and ketone functionality should be filtered out when only
    # aldehyde and aqueous are permitted.
    filtered_frame = apply_filter(
        environments=aqueous_and_aldehyde,
        at_least_one_environment=False,
        strictly_specified_environments=True,
    )

    assert len(filtered_frame) == 1
    assert filtered_frame["N Components"].max() == 1
    assert unique_smiles(filtered_frame, "Component 1") == {"O"}
Ejemplo n.º 28
0
def test_filter_by_environment_per_component():
    """Test that the ``FilterByEnvironments`` filter works well with the
    ``per_component_environments`` schema option"""

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(
        _build_entry("O"),
        _build_entry("C"),
        _build_entry("C", "O"),
        _build_entry("O", "CC(=O)CC=O"),
        _build_entry("CC(=O)CC=O", "O"),
    )

    data_frame = data_set.to_pandas()

    # Retain only aqueous functionality
    filtered_frame = FilterByEnvironments.apply(
        data_frame,
        FilterByEnvironmentsSchema(
            per_component_environments={
                1: [[ChemicalEnvironment.Aqueous]],
                2: [[ChemicalEnvironment.Aqueous],
                    [ChemicalEnvironment.Aqueous]],
            },
            at_least_one_environment=True,
        ),
    )

    assert len(filtered_frame) == 1
    assert filtered_frame["N Components"].max() == 1
    assert {*filtered_frame["Component 1"].unique()} == {"O"}

    # Retain any pure component data, and only aqueous aldehyde mixture data.
    filtered_frame = FilterByEnvironments.apply(
        data_frame,
        FilterByEnvironmentsSchema(
            per_component_environments={
                2: [[ChemicalEnvironment.Aldehyde],
                    [ChemicalEnvironment.Aqueous]]
            },
            at_least_one_environment=True,
        ),
    )
    assert len(filtered_frame) == 4

    assert filtered_frame["N Components"].min() == 1
    assert filtered_frame["N Components"].max() == 2

    pure_data = filtered_frame[filtered_frame["N Components"] == 1]
    binary_data = filtered_frame[filtered_frame["N Components"] == 2]

    assert len(pure_data) == 2
    assert {*pure_data["Component 1"].unique()} == {"O", "C"}

    assert len(binary_data) == 2

    assert {
        *binary_data["Component 1"].unique(),
        *binary_data["Component 2"].unique(),
    } == {"CC(=O)CC=O", "O"}

    # Repeat the last test but this time make the filtering strict.
    filtered_frame = FilterByEnvironments.apply(
        data_frame,
        FilterByEnvironmentsSchema(
            per_component_environments={
                2: [[ChemicalEnvironment.Aldehyde],
                    [ChemicalEnvironment.Aqueous]]
            },
            at_least_one_environment=False,
            strictly_specified_environments=True,
        ),
    )
    assert len(filtered_frame) == 2

    assert filtered_frame["N Components"].max() == 1
    assert {*filtered_frame["Component 1"].unique()} == {"O", "C"}

    filtered_frame = FilterByEnvironments.apply(
        data_frame,
        FilterByEnvironmentsSchema(
            per_component_environments={
                2: [
                    [
                        ChemicalEnvironment.Aldehyde,
                        ChemicalEnvironment.Ketone,
                        ChemicalEnvironment.Carbonyl,
                    ],
                    [ChemicalEnvironment.Aqueous],
                ]
            },
            at_least_one_environment=False,
            strictly_specified_environments=True,
        ),
    )
    assert len(filtered_frame) == 4

    assert filtered_frame["N Components"].min() == 1
    assert filtered_frame["N Components"].max() == 2

    pure_data = filtered_frame[filtered_frame["N Components"] == 1]
    binary_data = filtered_frame[filtered_frame["N Components"] == 2]

    assert len(pure_data) == 2
    assert {*pure_data["Component 1"].unique()} == {"O", "C"}

    assert len(binary_data) == 2