Ejemplo n.º 1
0
def test_filter_ionic_liquid():
    """Check that ``FilterByIonicLiquid`` leaves one row of a two row frame.

    The input frame holds one density entry for an ionic substance
    (``[Na+].[Cl-]``) and one for a neutral substance (``C``).
    """

    ambient_state = ThermodynamicState(
        temperature=298.15 * unit.kelvin,
        pressure=101.325 * unit.kilopascal,
    )

    # Both entries are identical apart from the substance they describe.
    density_entries = [
        Density(
            thermodynamic_state=ambient_state,
            phase=PropertyPhase.Liquid,
            value=1.0 * Density.default_unit(),
            uncertainty=1.0 * Density.default_unit(),
            source=MeasurementSource(doi=" "),
            substance=Substance.from_components(smiles),
        )
        for smiles in ("[Na+].[Cl-]", "C")
    ]

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*density_entries)

    # Ensure ionic liquids are filtered.
    filtered_frame = FilterByIonicLiquid.apply(
        data_set.to_pandas(),
        FilterByIonicLiquidSchema(),
    )

    assert len(filtered_frame) == 1
Ejemplo n.º 2
0
def test_from_pandas():
    """Round-trip a data set through ``to_pandas`` / ``from_pandas`` and check
    that each property is recovered with matching attributes."""

    state = ThermodynamicState(
        temperature=298.15 * unit.kelvin,
        pressure=1.0 * unit.atmosphere,
    )

    # A binary density with an uncertainty and a DOI source.
    density = Density(
        thermodynamic_state=state,
        phase=PropertyPhase.Liquid,
        substance=Substance.from_components("CO", "O"),
        value=1.0 * unit.kilogram / unit.meter**3,
        uncertainty=1.0 * unit.kilogram / unit.meter**3,
        source=MeasurementSource(doi="10.5281/zenodo.596537"),
    )
    # Two properties without uncertainties whose sources are free-text
    # references rather than DOIs.
    enthalpy_of_vaporization = EnthalpyOfVaporization(
        thermodynamic_state=state,
        phase=PropertyPhase.from_string("Liquid + Gas"),
        substance=Substance.from_components("C"),
        value=2.0 * unit.kilojoule / unit.mole,
        source=MeasurementSource(reference="2"),
    )
    dielectric_constant = DielectricConstant(
        thermodynamic_state=state,
        phase=PropertyPhase.Liquid,
        substance=Substance.from_components("C"),
        value=3.0 * unit.dimensionless,
        source=MeasurementSource(reference="3"),
    )

    original_data_set = PhysicalPropertyDataSet()
    original_data_set.add_properties(
        density,
        enthalpy_of_vaporization,
        dielectric_constant,
    )

    recreated_data_set = PhysicalPropertyDataSet.from_pandas(
        original_data_set.to_pandas()
    )
    assert len(original_data_set) == len(recreated_data_set)

    for expected in original_data_set:

        # Pair the properties up by their unique ids.
        candidates = (x for x in recreated_data_set if x.id == expected.id)
        actual = next(candidates)

        assert expected.thermodynamic_state == actual.thermodynamic_state
        assert expected.phase == actual.phase
        assert expected.substance == actual.substance
        assert numpy.isclose(expected.value, actual.value)

        if expected.uncertainty == UNDEFINED:
            assert expected.uncertainty == actual.uncertainty
        else:
            assert numpy.isclose(expected.uncertainty, actual.uncertainty)

        assert expected.source.doi == actual.source.doi
        assert expected.source.reference == actual.source.reference
Ejemplo n.º 3
0
def _build_entry(*smiles: str) -> Density:
    """Builds a liquid density data entry at 298.15 K and 101.325 kPa for a
    substance created from the specified smiles patterns via
    ``Substance.from_components``.

    Parameters
    ----------
    smiles
        The smiles patterns of the components of the substance. At least
        one must be provided.

    Returns
    -------
        The built density entry.
    """
    assert len(smiles) > 0

    ambient_state = ThermodynamicState(
        temperature=298.15 * unit.kelvin,
        pressure=101.325 * unit.kilopascal,
    )
    density_unit = Density.default_unit()

    entry = Density(
        thermodynamic_state=ambient_state,
        phase=PropertyPhase.Liquid,
        value=1.0 * density_unit,
        uncertainty=1.0 * Density.default_unit(),
        source=MeasurementSource(doi=" "),
        substance=Substance.from_components(*smiles),
    )
    return entry
Ejemplo n.º 4
0
def data_frame() -> pandas.DataFrame:
    """Build a frame of methane entries measured at two temperatures and at
    slightly perturbed copies of those temperatures.

    For each temperature (303.15 K, 298.15 K) and each property type
    (density, enthalpy of vaporization) two entries are created: one at the
    exact temperature, and one offset by a random amount in the range
    [0.051, 0.101) K. The offset is positive for the first property type and
    negative for the second.
    """

    def _entry(property_type, temperature):
        # All entries share everything except the type and temperature.
        return property_type(
            thermodynamic_state=ThermodynamicState(
                temperature=temperature * unit.kelvin,
                pressure=101.325 * unit.kilopascal,
            ),
            phase=PropertyPhase.Liquid,
            value=1.0 * property_type.default_unit(),
            uncertainty=1.0 * property_type.default_unit(),
            source=MeasurementSource(doi=" "),
            substance=Substance.from_components("C"),
        )

    entries = []

    for temperature in (303.15, 298.15):
        for index, property_type in enumerate((Density, EnthalpyOfVaporization)):

            magnitude = (numpy.random.rand() / 2.0 + 0.51) / 10.0
            offset = magnitude if index == 0 else -magnitude

            entries.append(_entry(property_type, temperature))
            entries.append(_entry(property_type, temperature + offset))

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*entries)

    return data_set.to_pandas()
Ejemplo n.º 5
0
    def to_evaluator(self) -> "PhysicalProperty":
        """Convert this object into an ``openff.evaluator`` physical property.

        The temperature and pressure stored on this object are interpreted as
        kelvin and kilopascal respectively, and the value (and standard error,
        when present) are expressed in the unit named by ``self.units``.

        Returns
        -------
            The evaluator property, with its ``id`` set to ``str(self.id)``.

        Raises
        ------
        UnrecognisedPropertyType
            If ``openff.evaluator.properties`` has no attribute named
            ``self.property_type``.
        """

        from openff.evaluator import properties, substances, unit
        from openff.evaluator.attributes import UNDEFINED
        from openff.evaluator.datasets import MeasurementSource, PropertyPhase
        from openff.evaluator.thermodynamics import ThermodynamicState

        if not hasattr(properties, self.property_type):
            raise UnrecognisedPropertyType(self.property_type)

        property_class: Type[PhysicalProperty] = getattr(
            properties, self.property_type
        )

        state = ThermodynamicState(
            temperature=self.temperature * unit.kelvin,
            pressure=self.pressure * unit.kilopascal,
        )
        phase = PropertyPhase.from_string(self.phase)

        substance = substances.Substance()

        for component in self.components:

            evaluator_component = substances.Component(
                smiles=component.smiles,
                role=substances.Component.Role[component.role],
            )

            # A component may carry a mole fraction, an exact amount, or both;
            # only strictly positive amounts are added to the substance.
            for amount_value, amount_type in (
                (component.mole_fraction, substances.MoleFraction),
                (component.exact_amount, substances.ExactAmount),
            ):
                if amount_value > 0:
                    substance.add_component(
                        evaluator_component, amount_type(amount_value)
                    )

        value_unit = unit.Unit(self.units)

        if self.std_error is None:
            uncertainty = UNDEFINED
        else:
            uncertainty = self.std_error * value_unit

        evaluator_property = property_class(
            thermodynamic_state=state,
            phase=phase,
            substance=substance,
            value=self.value * value_unit,
            uncertainty=uncertainty,
            source=MeasurementSource(doi=self.doi),
        )
        evaluator_property.id = str(self.id)

        return evaluator_property
Ejemplo n.º 6
0
def data_frame() -> pandas.DataFrame:
    """Build a frame of three methane density entries: a reference state
    point, one at a higher temperature, and one at a higher pressure."""

    # (temperature / K, pressure / kPa)
    state_points = [
        (298.15, 101.325),
        (305.15, 101.325),
        (298.15, 105.325),
    ]

    density_entries = [
        Density(
            thermodynamic_state=ThermodynamicState(
                temperature=temperature * unit.kelvin,
                pressure=pressure * unit.kilopascal,
            ),
            phase=PropertyPhase.Liquid,
            value=1.0 * Density.default_unit(),
            uncertainty=1.0 * Density.default_unit(),
            source=MeasurementSource(doi=" "),
            substance=Substance.from_components("C"),
        )
        for temperature, pressure in state_points
    ]

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*density_entries)

    return data_set.to_pandas()
Ejemplo n.º 7
0
def data_frame() -> pandas.DataFrame:
    """Build a frame of density and enthalpy of mixing entries covering every
    combination of two temperatures, two pressures, and a set of pure and
    binary compositions."""

    temperatures = [298.15, 318.15]
    pressures = [101.325, 101.0]

    property_types = [Density, EnthalpyOfMixing]

    # The pure composition is intentionally listed twice, matching the
    # number of binary compositions.
    compositions = [(1.0, ), (1.0, ), (0.25, 0.75), (0.75, 0.25)]

    # The smiles tuples to combine with each composition, keyed by the
    # number of components.
    smiles_by_size = {
        1: [("C(F)(Cl)(Br)", ), ("C", )],
        2: [("CO", "C"), ("C", "CO")],
    }

    entries = []

    for temperature in temperatures:
        for pressure in pressures:
            for property_type in property_types:
                for composition in compositions:
                    for smiles_tuple in smiles_by_size[len(composition)]:

                        substance = Substance()

                        for pattern, fraction in zip(smiles_tuple, composition):
                            substance.add_component(
                                Component(pattern), MoleFraction(fraction)
                            )

                        state = ThermodynamicState(
                            temperature=temperature * unit.kelvin,
                            pressure=pressure * unit.kilopascal,
                        )

                        entries.append(
                            property_type(
                                thermodynamic_state=state,
                                phase=PropertyPhase.Liquid,
                                value=1.0 * property_type.default_unit(),
                                uncertainty=1.0 * property_type.default_unit(),
                                source=MeasurementSource(doi=" "),
                                substance=substance,
                            )
                        )

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*entries)

    return data_set.to_pandas()
Ejemplo n.º 8
0
def simple_evaluator_data_set():
    """Create a minimal evaluator `PhysicalPropertyDataSet` holding a single
    binary density measurement whose id is ``"1"``.

    Returns
    -------
    PhysicalPropertyDataSet
    """

    state = ThermodynamicState(298.15 * unit.kelvin,
                               pressure=1.0 * unit.atmosphere)

    density = Density(
        thermodynamic_state=state,
        phase=PropertyPhase.Liquid,
        substance=Substance.from_components("O", "CC=O"),
        value=1.0 * unit.kilogram / unit.meter**3,
        uncertainty=0.1 * unit.kilogram / unit.meter**3,
        source=MeasurementSource(doi="10.1000/xyz123"),
    )
    # Replace the default id with a fixed, predictable one.
    density.id = "1"

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(density)

    return data_set
def main():
    """Convert serialized ``propertyestimator`` data sets into new data sets.

    Each named JSON file in ``raw_data`` is parsed manually, every entry is
    rebuilt as a property object, and the resulting data set is written to
    ``raw_data_v2`` both as JSON and as CSV.
    """

    output_directory = "raw_data_v2"
    os.makedirs(output_directory, exist_ok=True)

    data_set_names = [
        "curated_data_set",
        "gaff 1.81",
        "gaff 2.11",
        "parsley 1.0.0",
        "smirnoff99frosst 1.1.0",
    ]

    def _to_quantity(raw_quantity):
        # Serialized quantities are stored as a value / unit string pair.
        return raw_quantity["value"] * unit.Unit(raw_quantity["unit"])

    for data_set_name in data_set_names:

        input_path = os.path.join("raw_data", f"{data_set_name}.json")

        with open(input_path) as file:
            raw_data_set = json.load(file)

        assert (
            raw_data_set["@type"]
            == "propertyestimator.datasets.datasets.PhysicalPropertyDataSet"
        )

        converted_properties = []

        for raw_entries in raw_data_set["properties"].values():

            for raw_entry in raw_entries:

                # Extract the substance this entry was measured for.
                raw_substance = raw_entry["substance"]
                substance = Substance()

                for raw_component in raw_substance["components"]:

                    component = Component(
                        smiles=raw_component["smiles"],
                        role=Component.Role[raw_component["role"]["value"]],
                    )

                    # Amounts are stored separately, keyed by smiles.
                    raw_amounts = raw_substance["amounts"][raw_component["smiles"]]

                    for raw_amount in raw_amounts["value"]:

                        amount_type = raw_amount["@type"]

                        if (
                            amount_type
                            == "propertyestimator.substances.Substance->MoleFraction"
                        ):
                            amount = MoleFraction(raw_amount["value"])
                        elif (
                            amount_type
                            == "propertyestimator.substances.Substance->ExactAmount"
                        ):
                            amount = ExactAmount(raw_amount["value"])
                        else:
                            raise NotImplementedError()

                        substance.add_component(component, amount)

                # Extract the source of the property
                raw_source = raw_entry["source"]
                source_type = raw_source["@type"]

                if (
                    source_type
                    == "propertyestimator.properties.properties.CalculationSource"
                ):
                    source = CalculationSource(fidelity=raw_source["fidelity"])
                elif (
                    source_type
                    == "propertyestimator.properties.properties.MeasurementSource"
                ):
                    source = MeasurementSource(
                        doi=correct_doi(raw_source["reference"])
                    )
                else:
                    raise NotImplementedError()

                # Generate the new property object. The class is looked up
                # from the last segment of the serialized type name.
                property_class = getattr(
                    properties, raw_entry["@type"].split(".")[-1]
                )

                raw_state = raw_entry["thermodynamic_state"]

                state = ThermodynamicState(
                    temperature=_to_quantity(raw_state["temperature"]),
                    pressure=_to_quantity(raw_state["pressure"]),
                )
                phase = PropertyPhase(raw_entry["phase"])
                value = _to_quantity(raw_entry["value"])

                # The stored uncertainty is only read for entries which did
                # not come from a measurement source.
                if isinstance(source, MeasurementSource):
                    uncertainty = None
                else:
                    uncertainty = _to_quantity(raw_entry["uncertainty"])

                converted_property = property_class(
                    thermodynamic_state=state,
                    phase=phase,
                    substance=substance,
                    value=value,
                    uncertainty=uncertainty,
                    source=source,
                )
                converted_property.id = raw_entry["id"]

                converted_properties.append(converted_property)

        data_set = PhysicalPropertyDataSet()
        data_set.add_properties(*converted_properties)

        data_set.json(
            os.path.join(output_directory, f"{data_set_name}.json"),
            format=True,
        )
        data_set.to_pandas().to_csv(
            os.path.join(output_directory, f"{data_set_name}.csv")
        )
Ejemplo n.º 10
0
    def from_pandas(cls,
                    data_frame: pandas.DataFrame) -> "PhysicalPropertyDataSet":
        """Constructs a data set object from a pandas ``DataFrame`` object.

        Notes
        -----
        * All physical properties are assumed to be sourced from experimental
          measurements.
        * Currently this method only supports data frames containing properties
          which are built-in to the framework (e.g. Density).
        * This method assumes the data frame has a structure identical to that
          produced by the ``PhysicalPropertyDataSet.to_pandas`` function.

        Parameters
        ----------
        data_frame
            The data frame to construct the data set from.

        Returns
        -------
            The constructed data set.

        Raises
        ------
        AssertionError
            If a column whose name contains ``" Value "`` does not have the
            form ``"<PropertyType> Value (<unit>)"``, does not name a built-in
            property type, or specifies a unit whose dimensionality differs
            from that of the property type's default unit.
        """

        from openff.evaluator import properties

        # The unit portion must accept '*' and '^' so that units containing
        # exponents (e.g. "cm ** 3 / mol") are recognised, in addition to the
        # '+', ',', '-', '.', '/' and whitespace characters which were
        # already permitted.
        header_pattern = re.compile(
            r"^([a-zA-Z]+) Value \(([a-zA-Z0-9+,\-./*^\s]*)\)$")

        # Compile the DOI patterns once rather than for every property in
        # every row.
        doi_patterns = [
            re.compile(pattern, re.I) for pattern in [
                r"^10.\d{4,9}/[-._;()/:A-Z0-9]+$",
                r"^10.1002/[^\s]+$",
                r"^10.\d{4}/\d+-\d+X?(\d+)\d+<[\d\w]+:[\d\w]*>\d+.\d+.\w+;\d$",
                r"^10.1021/\w\w\d+$",
                r"^10.1207/[\w\d]+\&\d+_\d+$",
            ]
        ]

        def _is_doi(source_string):
            # A source may list several DOIs separated by " + "; every part
            # must look like a DOI for the whole string to be treated as one.
            return all(
                any(pattern.match(split_string) for pattern in doi_patterns)
                for split_string in source_string.split(" + "))

        # Validate that the headers have the correct format, specify a
        # built-in property type, and specify correctly the properties
        # units.
        property_headers = {}

        for header in data_frame:

            if header.find(" Value ") < 0:
                continue

            match = header_pattern.match(header)
            assert match, f"unrecognised property value header: {header!r}"

            property_type_string, property_unit_string = match.groups()

            assert hasattr(properties, property_type_string), (
                f"{property_type_string} is not a built-in property type")
            property_type = getattr(properties, property_type_string)

            property_unit = unit.Unit(property_unit_string)
            assert property_unit is not None

            assert (property_unit.dimensionality ==
                    property_type.default_unit().dimensionality), (
                        f"the unit of {header!r} is incompatible with "
                        f"{property_type_string}")

            property_headers[match.group(0)] = (property_type, property_unit)

        # Convert the data rows to property objects.
        physical_properties = []

        for _, data_row in data_frame.iterrows():

            # Drop the empty cells so that ``in`` / ``get`` can be used to
            # check which optional columns this row actually populates.
            data_row = data_row.dropna()

            # Extract the state at which the measurement was made.
            thermodynamic_state = ThermodynamicState(
                temperature=data_row["Temperature (K)"] * unit.kelvin,
                pressure=data_row["Pressure (kPa)"] * unit.kilopascal,
            )
            property_phase = PropertyPhase.from_string(data_row["Phase"])

            # Extract the substance the measurement was made for.
            substance = Substance()

            # ``int`` guards against the count having been up-cast to a
            # float, which ``range`` would reject.
            for i in range(int(data_row["N Components"])):

                component = Component(
                    smiles=data_row[f"Component {i + 1}"],
                    role=Component.Role[data_row.get(f"Role {i + 1}",
                                                     "Solvent")],
                )

                mole_fraction = data_row.get(f"Mole Fraction {i + 1}", 0.0)
                exact_amount = data_row.get(f"Exact Amount {i + 1}", 0)

                if not numpy.isclose(mole_fraction, 0.0):
                    substance.add_component(component,
                                            MoleFraction(mole_fraction))
                if not numpy.isclose(exact_amount, 0.0):
                    substance.add_component(component,
                                            ExactAmount(exact_amount))

            for (
                    property_header,
                (property_type, property_unit),
            ) in property_headers.items():

                # Check to see whether the row contains a value for this
                # type of property.
                if property_header not in data_row:
                    continue

                uncertainty_header = property_header.replace(
                    "Value", "Uncertainty")

                # Sources which look like DOIs are stored as such, anything
                # else is stored as a free-text reference.
                source_string = data_row["Source"]
                is_doi = _is_doi(source_string)

                physical_property = property_type(
                    thermodynamic_state=thermodynamic_state,
                    phase=property_phase,
                    value=data_row[property_header] * property_unit,
                    uncertainty=None if uncertainty_header not in data_row else
                    data_row[uncertainty_header] * property_unit,
                    substance=substance,
                    source=MeasurementSource(
                        doi="" if not is_doi else source_string,
                        reference=source_string if not is_doi else "",
                    ),
                )

                # Preserve the original identifier when the frame has one.
                identifier = data_row.get("Id", None)

                if identifier:
                    physical_property.id = identifier

                physical_properties.append(physical_property)

        data_set = PhysicalPropertyDataSet()
        data_set.add_properties(*physical_properties)

        return data_set
Ejemplo n.º 11
0
def complete_evaluator_data_set():
    """Create a more comprehensive `PhysicalPropertyDataSet` containing one
    measurement of each of the following, with ids ``"1"`` to ``"6"`` assigned
    in this order:

        * pure density
        * binary density
        * pure enthalpy of vaporization
        * binary enthalpy of mixing
        * binary excess molar volume
        * hydration free energy

    Returns
    -------
    PhysicalPropertyDataSet
    """
    state = ThermodynamicState(298.15 * unit.kelvin,
                               pressure=1.0 * unit.atmosphere)
    source = MeasurementSource(doi="10.1000/xyz123")

    def _density(substance):
        # Densities are specified in explicit kg / m^3 rather than the
        # default density unit.
        return Density(
            thermodynamic_state=state,
            phase=PropertyPhase.Liquid,
            substance=substance,
            value=1.0 * unit.kilogram / unit.meter**3,
            uncertainty=0.1 * unit.kilogram / unit.meter**3,
            source=source,
        )

    def _in_default_units(property_type, phase, substance):
        # The remaining properties are specified in their default units.
        return property_type(
            thermodynamic_state=state,
            phase=phase,
            substance=substance,
            value=1.0 * property_type.default_unit(),
            uncertainty=0.1 * property_type.default_unit(),
            source=source,
        )

    # Water as the solvent with a single solute molecule.
    solvation_substance = Substance()
    solvation_substance.add_component(Component("O"), MoleFraction(1.0))
    solvation_substance.add_component(Component("CCCO"), ExactAmount(1))

    evaluator_properties = [
        _density(Substance.from_components("O")),
        _density(Substance.from_components("O", "CC=O")),
        _in_default_units(
            EnthalpyOfVaporization,
            PropertyPhase(PropertyPhase.Liquid | PropertyPhase.Gas),
            Substance.from_components("CCO"),
        ),
        _in_default_units(
            EnthalpyOfMixing,
            PropertyPhase.Liquid,
            Substance.from_components("CCCCO", "CC(C=O)C"),
        ),
        _in_default_units(
            ExcessMolarVolume,
            PropertyPhase.Liquid,
            Substance.from_components("C(=O)CCCO", "CCCCCC"),
        ),
        _in_default_units(
            SolvationFreeEnergy,
            PropertyPhase.Liquid,
            solvation_substance,
        ),
    ]

    # Assign sequential, one-based ids.
    for identifier, evaluator_property in enumerate(evaluator_properties, 1):
        evaluator_property.id = str(identifier)

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*evaluator_properties)

    return data_set
Ejemplo n.º 12
0
    def _apply(
        cls,
        data_frame: pandas.DataFrame,
        schema: ImportFreeSolvSchema,
        n_processes,
    ) -> pandas.DataFrame:
        """Append the FreeSolv solvation free energies to a data frame.

        The data returned by ``cls._download_free_solv`` is converted row by
        row into ``SolvationFreeEnergy`` entries at 298.15 K and 101.325 kPa,
        and the resulting frame is concatenated onto ``data_frame``.

        Parameters
        ----------
        data_frame
            The frame to append the FreeSolv entries to.
        schema
            Not read by this implementation.
        n_processes
            Not read by this implementation.

        Returns
        -------
            A new frame containing the rows of ``data_frame`` followed by
            the FreeSolv rows, with a fresh index.
        """

        from openff.evaluator import properties, substances, unit

        raw_free_solv_frame = cls._download_free_solv()

        reference_column = (
            "experimental reference (original or paper this value was taken from)"
        )

        # Convert the data frame into data rows.
        entries = []

        for _, row in raw_free_solv_frame.iterrows():

            # Extract and standardize the SMILES pattern of the solute.
            raw_smiles = row["SMILES"].strip()
            solute_smiles = substances.Component(raw_smiles).smiles

            # Build the substance: water as the solvent with a single
            # solute molecule.
            solvent = Component(smiles="O")
            solute = Component(smiles=solute_smiles, role=Component.Role.Solute)

            substance = Substance()
            substance.add_component(solvent, MoleFraction(1.0))
            substance.add_component(solute, ExactAmount(1))

            # Extract the value and uncertainty
            value = (
                float(row["experimental value (kcal/mol)"])
                * unit.kilocalorie
                / unit.mole
            )
            std_error = (
                float(row["experimental uncertainty (kcal/mol)"])
                * unit.kilocalorie
                / unit.mole
            )

            # Attempt to extract a DOI
            doi = cls._validate_doi(row[reference_column])

            state = ThermodynamicState(
                temperature=298.15 * unit.kelvin,
                pressure=101.325 * unit.kilopascal,
            )

            entries.append(
                SolvationFreeEnergy(
                    thermodynamic_state=state,
                    phase=PropertyPhase.Liquid,
                    substance=substance,
                    value=value.to(properties.SolvationFreeEnergy.default_unit()),
                    uncertainty=std_error.to(
                        properties.SolvationFreeEnergy.default_unit()
                    ),
                    source=MeasurementSource(doi=doi),
                )
            )

        data_set = PhysicalPropertyDataSet()
        data_set.add_properties(*entries)

        return pandas.concat(
            [data_frame, data_set.to_pandas()],
            ignore_index=True,
            sort=False,
        )