Example #1
def create_filterable_data_set():
    """Creates a dummy data with a diverse set of properties to
    be filtered, namely:

        - a liquid density measured at 298 K and 0.5 atm with 1 component containing only carbon.
        - a gaseous dielectric measured at 288 K and 1 atm with 2 components containing only nitrogen.
        - a solid enthalpy of mixing measured at 308 K and 1.5 atm with 3 components containing only oxygen.

    Returns
    -------
    PhysicalPropertyDataSet
        The created data set.
    """

    source = CalculationSource("Dummy", {})
    carbon_substance = create_dummy_substance(number_of_components=1,
                                              elements=["C"])

    density_property = Density(
        thermodynamic_state=ThermodynamicState(temperature=298 * unit.kelvin,
                                               pressure=0.5 * unit.atmosphere),
        phase=PropertyPhase.Liquid,
        substance=carbon_substance,
        value=1 * unit.gram / unit.milliliter,
        uncertainty=0.11 * unit.gram / unit.milliliter,
        source=source,
    )

    nitrogen_substance = create_dummy_substance(number_of_components=2,
                                                elements=["N"])

    dielectric_property = DielectricConstant(
        thermodynamic_state=ThermodynamicState(temperature=288 * unit.kelvin,
                                               pressure=1 * unit.atmosphere),
        phase=PropertyPhase.Gas,
        substance=nitrogen_substance,
        value=1 * unit.dimensionless,
        uncertainty=0.11 * unit.dimensionless,
        source=source,
    )

    oxygen_substance = create_dummy_substance(number_of_components=3,
                                              elements=["O"])

    enthalpy_property = EnthalpyOfMixing(
        thermodynamic_state=ThermodynamicState(temperature=308 * unit.kelvin,
                                               pressure=1.5 * unit.atmosphere),
        phase=PropertyPhase.Solid,
        substance=oxygen_substance,
        value=1 * unit.kilojoules / unit.mole,
        uncertainty=0.11 * unit.kilojoules / unit.mole,
        source=source,
    )

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(density_property, dielectric_property,
                            enthalpy_property)

    return data_set
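
The dummy set above is intended to exercise the data set querying API. A minimal usage sketch follows, assuming that `properties_by_type` accepts the property class name as a string, as it does in Example #5 below:

# Minimal usage sketch (assumption: properties_by_type takes the class
# name as a string, as in Example #5 below).
data_set = create_filterable_data_set()
assert len(data_set) == 3

densities = list(data_set.properties_by_type("Density"))
dielectrics = list(data_set.properties_by_type("DielectricConstant"))
enthalpies = list(data_set.properties_by_type("EnthalpyOfMixing"))

assert len(densities) == len(dielectrics) == len(enthalpies) == 1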
Example #2
def test_sources_substances():

    physical_property = create_dummy_property(Density)

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(physical_property)

    assert next(iter(data_set.sources)) == physical_property.source
    assert next(iter(data_set.substances)) == physical_property.substance
Example #3
def test_protocol_replacement(force_field_source, expected_protocol_type):

    data_set = PhysicalPropertyDataSet()

    for property_type in property_types:
        physical_property = create_dummy_property(property_type)
        data_set.add_properties(physical_property)

    options = EvaluatorClient.default_request_options(data_set, force_field_source)
    options_json = options.json(format=True)

    assert options_json.find('BaseBuildSystem"') < 0
    assert options_json.find(expected_protocol_type) >= 0
Example #4
def test_serialization():
    """A test to ensure that data sets are JSON serializable."""

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(create_dummy_property(Density))

    data_set_json = data_set.json()

    parsed_data_set = PhysicalPropertyDataSet.parse_json(data_set_json)
    assert len(data_set) == len(parsed_data_set)

    parsed_data_set_json = parsed_data_set.json()
    assert parsed_data_set_json == data_set_json
Example #5
def test_properties_by_type():

    density = create_dummy_property(Density)
    dielectric = create_dummy_property(DielectricConstant)

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(density, dielectric)

    densities = [x for x in data_set.properties_by_type("Density")]
    assert len(densities) == 1
    assert densities[0] == density

    dielectrics = [
        x for x in data_set.properties_by_type("DielectricConstant")
    ]
    assert len(dielectrics) == 1
    assert dielectrics[0] == dielectric
Example #6
def test_same_component_batching():

    thermodynamic_state = ThermodynamicState(temperature=1.0 * unit.kelvin,
                                             pressure=1.0 * unit.atmosphere)

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(
        Density(
            thermodynamic_state=thermodynamic_state,
            substance=Substance.from_components("O", "C"),
            value=0.0 * unit.kilogram / unit.meter**3,
        ),
        EnthalpyOfVaporization(
            thermodynamic_state=thermodynamic_state,
            substance=Substance.from_components("O", "C"),
            value=0.0 * unit.kilojoule / unit.mole,
        ),
        Density(
            thermodynamic_state=thermodynamic_state,
            substance=Substance.from_components("O", "CO"),
            value=0.0 * unit.kilogram / unit.meter**3,
        ),
        EnthalpyOfVaporization(
            thermodynamic_state=thermodynamic_state,
            substance=Substance.from_components("O", "CO"),
            value=0.0 * unit.kilojoule / unit.mole,
        ),
    )

    options = RequestOptions()

    submission = EvaluatorClient._Submission()
    submission.dataset = data_set
    submission.options = options

    with DaskLocalCluster() as calculation_backend:

        server = EvaluatorServer(calculation_backend)
        batches = server._batch_by_same_component(submission, "")

    assert len(batches) == 2

    assert len(batches[0].queued_properties) == 2
    assert len(batches[1].queued_properties) == 2
Example #7
def test_default_options():
    """Test creating the default estimation options."""

    data_set = PhysicalPropertyDataSet()
    force_field_source = SmirnoffForceFieldSource.from_path(
        "smirnoff99Frosst-1.1.0.offxml"
    )

    for property_type in property_types:
        physical_property = create_dummy_property(property_type)
        data_set.add_properties(physical_property)

    options = EvaluatorClient.default_request_options(data_set, force_field_source)
    options.validate()

    assert len(options.calculation_layers) == 2
    assert len(options.calculation_schemas) == len(property_types)
    assert all(
        len(x) == len(options.calculation_layers)
        for x in options.calculation_schemas.values()
    )
Example #8
def test_validate_data_set():

    valid_property = Density(
        ThermodynamicState(298 * unit.kelvin, 1 * unit.atmosphere),
        PropertyPhase.Liquid,
        Substance.from_components("O"),
        0.0 * unit.gram / unit.milliliter,
        0.0 * unit.gram / unit.milliliter,
    )

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(valid_property)

    data_set.validate()

    invalid_property = Density(
        ThermodynamicState(-1 * unit.kelvin, 1 * unit.atmosphere),
        PropertyPhase.Liquid,
        Substance.from_components("O"),
        0.0 * unit.gram / unit.milliliter,
        0.0 * unit.gram / unit.milliliter,
    )

    with pytest.raises(AssertionError):
        data_set.add_properties(invalid_property)

    data_set.add_properties(invalid_property, validate=False)

    with pytest.raises(AssertionError):
        data_set.validate()
Example #9
def test_launch_batch():

    # Set up a dummy data set
    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(create_dummy_property(Density),
                            create_dummy_property(Density))

    batch = Batch()
    batch.force_field_id = ""
    batch.options = RequestOptions()
    batch.options.calculation_layers = ["QuickCalculationLayer"]
    batch.options.calculation_schemas = {
        "Density": {
            "QuickCalculationLayer": CalculationLayerSchema()
        }
    }
    batch.parameter_gradient_keys = []
    batch.queued_properties = [*data_set]
    batch.validate()

    with tempfile.TemporaryDirectory() as directory:

        with temporarily_change_directory(directory):

            with DaskLocalCluster() as calculation_backend:

                server = EvaluatorServer(
                    calculation_backend=calculation_backend,
                    working_directory=directory,
                )

                server._queued_batches[batch.id] = batch
                server._launch_batch(batch)

                while len(batch.queued_properties) > 0:
                    sleep(0.01)

                assert len(batch.estimated_properties) == 1
                assert len(batch.unsuccessful_properties) == 1
Example #10
def test_filter_by_smiles():
    """A test to ensure that data sets may be filtered by which smiles their
    measured properties contain."""

    methanol_substance = Substance()
    methanol_substance.add_component(Component("CO"), MoleFraction(1.0))

    ethanol_substance = Substance()
    ethanol_substance.add_component(Component("CCO"), MoleFraction(1.0))

    property_a = create_dummy_property(Density)
    property_a.substance = methanol_substance

    property_b = create_dummy_property(Density)
    property_b.substance = ethanol_substance

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(property_a, property_b)

    data_set.filter_by_smiles("CO")

    assert len(data_set) == 1
    assert methanol_substance in data_set.substances
    assert ethanol_substance not in data_set.substances
Example #11
def select_data_points(data_directory, chosen_substances, target_state_points):
    """The method attempts to find a set of data points for each
    property which are clustered around the set of conditions specified
    in the `target_state_points` input array.

    The points will be chosen so as to try and maximise the number of
    properties measured at the same condition (e.g. ideally we would
    have a data point for each property at T=298.15 and p=1atm) as this
    will maximise the chances that we can extract all properties from a
    single simulation.

    Parameters
    ----------
    data_directory: str
        The directory which contains the processed pandas
        data sets
    chosen_substances: list of tuple of str, optional
        The substances to choose data points for. If None,
        no filtering of substances will be performed by this function.
    target_state_points: dict of tuple of type and SubstanceType and list of StatePoint
        The state points for which we would ideally have data points,
        keyed by (property type, substance type). Each state point should
        be of the form (temperature, pressure, (mole fraction 0, ..., mole fraction N)).

    Returns
    -------
    PhysicalPropertyDataSet
        A data set which contains the chosen data points.
    """

    # Load the full data set from the processed data files
    data_frames = []

    for property_type, substance_type in target_state_points:

        data_frame = load_processed_data_set(data_directory, property_type,
                                             substance_type)
        data_frames.append(data_frame)

    full_data_frame = pandas.concat(data_frames, ignore_index=True, sort=False)
    data_set = data_set_from_data_frame(full_data_frame)

    properties_by_substance = defaultdict(list)

    # Partition the properties by their substance components,
    # filtering out any not chosen substances.
    for substance in data_set.substances:

        substance_tuple = tuple(
            sorted([component.smiles for component in substance.components]))

        if chosen_substances is not None and substance_tuple not in chosen_substances:
            continue

        properties_by_substance[substance_tuple].extend(
            data_set.properties_by_substance(substance))

    # Start to choose the state points.
    return_data_set = PhysicalPropertyDataSet()

    for substance_tuple in properties_by_substance:

        # Cluster the data points around the closest states of interest.
        clustered_properties = _cluster_properties_around_states(
            properties_by_substance[substance_tuple], target_state_points)

        # For each cluster, we try to find the state points for which we have
        # measured the most types of properties (i.e. prioritise states
        # for which we have a density, dielectric and enthalpy measurement
        # over those for which we only have a density measurement).
        for target_state_point, physical_properties in clustered_properties.items():

            properties_per_state = defaultdict(list)
            property_types_per_state = defaultdict(set)

            # Refactor the properties into more convenient data structures.
            for physical_property in physical_properties:

                state_point = StatePoint.from_physical_property(
                    physical_property)
                property_tuple = property_to_type_tuple(physical_property)

                properties_per_state[state_point].append(physical_property)
                property_types_per_state[state_point].add(property_tuple)

            # Sort the state points based on their distance to the target state.
            sorted_states_points = list(
                sorted(
                    properties_per_state.keys(),
                    key=functools.partial(StatePoint.individual_distances,
                                          target_state_point),
                ))

            # Keep track of the properties which we need to choose a state point for
            properties_to_cover = set(
                property_tuple for property_tuple in target_state_points)
            # as well as the chosen state points
            chosen_state_points = set()

            # Iteratively consider state points which have all data points, down
            # to state points for which we only have single property measurements.
            for target_number_of_properties in reversed(
                    range(1,
                          len(target_state_points) + 1)):

                for state_point in sorted_states_points:

                    property_types_at_state = property_types_per_state[
                        state_point]

                    if len(property_types_at_state
                           ) != target_number_of_properties:
                        continue

                    if (len(
                            properties_to_cover.intersection(
                                property_types_at_state)) == 0):
                        continue

                    chosen_state_points.add(state_point)

                    properties_to_cover = properties_to_cover.symmetric_difference(
                        properties_to_cover.intersection(
                            property_types_at_state))

            # Add the properties which were measured at the chosen state points
            # to the returned data set.
            for state_point in chosen_state_points:

                if len(properties_per_state[state_point]) == 0:
                    continue

                return_data_set.add_properties(
                    *properties_per_state[state_point])

    return return_data_set
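
A hedged sketch of how `select_data_points` might be invoked. The `SubstanceType` enum value and the layout of the `target_state_points` keys are assumptions inferred from the docstring and from the `StatePoint` usage in Example #13:

# Hypothetical call sketch; the (property type, substance type) key layout
# follows the docstring above, SubstanceType.Pure is an assumed enum value,
# and the StatePoint constructor mirrors Example #13.
target_state_points = {
    (Density, SubstanceType.Pure): [
        StatePoint(
            temperature=298.15 * unit.kelvin,
            pressure=1.0 * unit.atmosphere,
            mole_fractions=(1.0,),
        )
    ],
}

selected_data_set = select_data_points(
    data_directory="processed_data",  # assumed directory of processed data sets
    chosen_substances=None,           # retain all substances
    target_state_points=target_state_points,
)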
Example #12
def data_set_from_data_frame(data_frame):
    """Converts a `pandas.DataFrame` to a `PhysicalPropertyDataSet` object.
    See the `PhysicalPropertyDataSet.to_pandas()` function for information
    on the required columns.

    Parameters
    ----------
    data_frame: pandas.DataFrame
        The data frame to convert.

    Returns
    -------
    PhysicalPropertyDataSet
        The converted data set.
    """

    return_value = PhysicalPropertyDataSet()

    if len(data_frame) == 0:
        return return_value

    # Make sure the base columns are present.
    required_base_columns = [
        "Temperature (K)",
        "Pressure (kPa)",
        "Phase",
        "N Components",
        "Source",
    ]

    assert all(x in data_frame for x in required_base_columns)

    # Make sure the substance columns are present.
    max_components = max(int(x) for x in data_frame["N Components"])
    assert max_components > 0

    required_components_columns = [
        x for i in range(max_components) for x in [
            f"Component {i + 1}",
            f"Role {i + 1}",
            f"Mole Fraction {i + 1}",
            f"Exact Amount {i + 1}",
        ]
    ]

    assert all(x in data_frame for x in required_components_columns)

    property_types = []

    for column_name in data_frame:

        if " Value" not in column_name:
            continue

        column_name_split = column_name.split(" ")

        assert len(column_name_split) >= 2

        property_type = getattr(evaluator.properties, column_name_split[0])
        property_types.append(property_type)

    assert len(property_types) > 0

    # Make sure we don't have duplicate property columns.
    assert len(set(property_types)) == len(property_types)

    properties = []

    for _, row in data_frame.iterrows():

        # Create the substance from the component columns
        number_of_components = row["N Components"]

        substance = Substance()

        for component_index in range(number_of_components):

            smiles = row[f"Component {component_index + 1}"]
            role = Component.Role[row[f"Role {component_index + 1}"]]
            mole_fraction = row[f"Mole Fraction {component_index + 1}"]
            exact_amount = row[f"Exact Amount {component_index + 1}"]

            assert not numpy.isnan(mole_fraction) or not numpy.isnan(
                exact_amount)

            component = Component(smiles, role)

            if not numpy.isnan(mole_fraction):
                substance.add_component(component, MoleFraction(mole_fraction))
            if not numpy.isnan(exact_amount):
                substance.add_component(component, ExactAmount(exact_amount))

        # Extract the state
        pressure = row["Pressure (kPa)"] * unit.kilopascal
        temperature = row["Temperature (K)"] * unit.kelvin

        thermodynamic_state = ThermodynamicState(temperature, pressure)

        phase = PropertyPhase.from_string(row["Phase"])

        source = MeasurementSource(reference=row["Source"])

        for property_type in property_types:

            default_unit = property_type.default_unit()
            value_header = f"{property_type.__name__} Value ({default_unit:~})"

            if numpy.isnan(row[value_header]):
                continue

            value = row[value_header] * default_unit
            uncertainty = 0.0 * default_unit

            physical_property = property_type(
                thermodynamic_state=thermodynamic_state,
                phase=phase,
                substance=substance,
                value=value,
                uncertainty=uncertainty,
                source=source,
            )

            properties.append(physical_property)

    return_value.add_properties(*properties)
    return return_value
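
Because `data_set_from_data_frame` is the inverse of `PhysicalPropertyDataSet.to_pandas()`, a simple round-trip sketch looks like the following (an explicit `MeasurementSource` is used so that the "Source" column is populated):

# Round-trip sketch: data set -> pandas -> data set.
original_data_set = PhysicalPropertyDataSet()
original_data_set.add_properties(
    Density(
        thermodynamic_state=ThermodynamicState(
            298.15 * unit.kelvin, 101.325 * unit.kilopascal
        ),
        phase=PropertyPhase.Liquid,
        substance=Substance.from_components("O"),
        value=1.0 * unit.gram / unit.milliliter,
        uncertainty=0.1 * unit.gram / unit.milliliter,
        source=MeasurementSource(reference="dummy reference"),
    )
)

data_frame = original_data_set.to_pandas()
round_tripped_data_set = data_set_from_data_frame(data_frame)

assert len(round_tripped_data_set) == len(original_data_set)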
Example #13
def main():

    training_set_smiles = [
        "CCO",
        "CC(=O)O",
        "COC=O",
        "CC(C)(C)O",
        "CC(C)O",
        "CO",
        "CCOC(C)=O",
        "CCOC(=O)CC(=O)OCC",
        "CC(C)CO",
        "CCCCO",
        "CCCCOC(C)=O",
        "CCCOC(C)=O",
    ]

    # Ensure the smiles patterns are standardized.
    smiles = [Component(x).smiles for x in training_set_smiles]

    # Load in the Hvap data
    h_vap_data_frame = pandas.read_csv(
        os.path.join(
            "..",
            "..",
            "..",
            "data_availability",
            "sourced_h_vap_data",
            "enthalpy_of_vaporization_pure.csv",
        ))
    h_vap_data_frame = filter_by_smiles(h_vap_data_frame,
                                        smiles_to_include=smiles,
                                        smiles_to_exclude=None)

    h_vap_data_set = data_set_from_data_frame(h_vap_data_frame)

    # Load in the density data
    density_data_frame = pandas.read_csv(
        os.path.join(
            "..",
            "..",
            "..",
            "data_availability",
            "data_by_environments",
            "alcohol_ester",
            "all_data",
            "density_pure.csv",
        ))
    density_data_frame = filter_by_smiles(density_data_frame,
                                          smiles_to_include=smiles,
                                          smiles_to_exclude=None)

    density_data_set = data_set_from_data_frame(density_data_frame)

    # Retain the density measurements which were made closest to 298.15 K and 1 atm.
    target_state_point = StatePoint(
        temperature=298.15 * unit.kelvin,
        pressure=1.0 * unit.atmosphere,
        mole_fractions=(1.0, ),
    )

    final_data_set = PhysicalPropertyDataSet()

    for substance in density_data_set.substances:

        properties_per_state = defaultdict(list)

        # Refactor the properties into more convenient data structures.
        for physical_property in density_data_set.properties_by_substance(
                substance):

            state_point = StatePoint.from_physical_property(physical_property)
            properties_per_state[state_point].append(physical_property)

        # Sort the state points based on their distance to the target state.
        sorted_states_points = list(
            sorted(
                properties_per_state.keys(),
                key=functools.partial(StatePoint.individual_distances,
                                      target_state_point),
            ))

        final_data_set.add_properties(
            properties_per_state[sorted_states_points[0]][0])

    final_data_set.merge(h_vap_data_set)

    final_data_set.json("training_set.json", format=True)
    final_data_set.to_pandas().to_csv("training_set.csv", index=False)
Example #14
def _parse_thermoml_archives(file_paths, retain_values, retain_uncertainties, **_):
    """Loads a number of ThermoML data xml files (making sure to
    catch errors raised by individual files), and concatenates
    them into data sets containing a single type of property.

    Parameters
    ----------
    file_paths: list of str
        The file paths of the ThermoML xml files to load.
    retain_values: bool
        If False, all values for the measured properties will
        be stripped from the final data set.
    retain_uncertainties: bool
        If False, all uncertainties in measured property values will
        be stripped from the final data set.

    Returns
    -------
    dict of str and pandas.DataFrame
        The parsed data.
    """

    properties_by_type = defaultdict(list)

    try:

        # We make sure to wrap each of the 'error prone' calls in this method
        # in try-catch blocks to stop workers from being killed.
        for file_path in file_paths:

            try:
                data_set = ThermoMLDataSet.from_file(file_path)

            except Exception:

                logger.exception(f"An exception was raised when loading {file_path}")
                continue

            # A data set will be None if no 'valid' properties were found
            # in the archive file.
            if data_set is None:
                continue

            for physical_property in data_set:

                if not retain_values:
                    physical_property.value = UNDEFINED
                if not retain_uncertainties:
                    physical_property.uncertainty = UNDEFINED

                property_type = physical_property.__class__.__name__
                properties_by_type[property_type].append(physical_property)

    except Exception:

        logger.exception(f"An uncaught exception was raised.")
        properties_by_type = {}

    data_frames = {}

    for property_type in properties_by_type:

        if len(properties_by_type[property_type]) == 0:
            continue

        data_set = PhysicalPropertyDataSet()
        data_set.add_properties(*properties_by_type[property_type])

        data_frames[property_type] = data_set.to_pandas()

    return data_frames
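
A short, hedged example of driving this helper directly; the archive paths are placeholders for real ThermoML files on disk:

# Hypothetical invocation with placeholder file paths.
data_frames = _parse_thermoml_archives(
    file_paths=["archive_1.xml", "archive_2.xml"],
    retain_values=True,
    retain_uncertainties=False,
)

for property_type, data_frame in data_frames.items():
    print(property_type, len(data_frame))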
Example #15
def test_to_pandas():
    """A test to ensure that data sets are convertable to pandas objects."""

    source = CalculationSource("Dummy", {})

    pure_substance = Substance.from_components("C")
    binary_substance = Substance.from_components("C", "O")

    data_set = PhysicalPropertyDataSet()

    for temperature in [
            298 * unit.kelvin, 300 * unit.kelvin, 302 * unit.kelvin
    ]:

        thermodynamic_state = ThermodynamicState(temperature=temperature,
                                                 pressure=1.0 * unit.atmosphere)

        density_property = Density(
            thermodynamic_state=thermodynamic_state,
            phase=PropertyPhase.Liquid,
            substance=pure_substance,
            value=1 * unit.gram / unit.milliliter,
            uncertainty=0.11 * unit.gram / unit.milliliter,
            source=source,
        )

        dielectric_property = DielectricConstant(
            thermodynamic_state=thermodynamic_state,
            phase=PropertyPhase.Liquid,
            substance=pure_substance,
            value=1 * unit.dimensionless,
            uncertainty=0.11 * unit.dimensionless,
            source=source,
        )

        data_set.add_properties(density_property)
        data_set.add_properties(dielectric_property)

    for temperature in [
            298 * unit.kelvin, 300 * unit.kelvin, 302 * unit.kelvin
    ]:

        thermodynamic_state = ThermodynamicState(temperature=temperature,
                                                 pressure=1.0 * unit.atmosphere)

        enthalpy_property = EnthalpyOfMixing(
            thermodynamic_state=thermodynamic_state,
            phase=PropertyPhase.Liquid,
            substance=binary_substance,
            value=1 * unit.kilojoules / unit.mole,
            uncertainty=0.11 * unit.kilojoules / unit.mole,
            source=source,
        )

        excess_property = ExcessMolarVolume(
            thermodynamic_state=thermodynamic_state,
            phase=PropertyPhase.Liquid,
            substance=binary_substance,
            value=1 * unit.meter**3 / unit.mole,
            uncertainty=0.11 * unit.meter**3 / unit.mole,
            source=source,
        )

        data_set.add_properties(enthalpy_property)
        data_set.add_properties(excess_property)

    data_set_pandas = data_set.to_pandas()

    required_columns = [
        "Temperature (K)",
        "Pressure (kPa)",
        "Phase",
        "N Components",
        "Source",
        "Component 1",
        "Role 1",
        "Mole Fraction 1",
        "Exact Amount 1",
        "Component 2",
        "Role 2",
        "Mole Fraction 2",
        "Exact Amount 2",
    ]

    assert all(x in data_set_pandas for x in required_columns)

    assert data_set_pandas is not None
    assert data_set_pandas.shape == (12, 21)

    data_set_without_na = data_set_pandas.dropna(axis=1, how="all")
    assert data_set_without_na.shape == (12, 19)