def filter_data(data_directory, properties_of_interest, chemical_environments,
                output_directory):
    """Filter the processed data sets by element and chemical environment.

    For each requested property the processed data set is loaded from
    ``data_directory``, filtered by a fixed list of elements and then by
    the requested chemical environments. The filtered set is saved to
    ``output_directory`` together with a pdf rendering of the data set.

    Parameters
    ----------
    data_directory: str
        The directory containing the unfiltered data.
    properties_of_interest: list of tuple of PropertyType and SubstanceType
        The types of properties to extract data for.
    chemical_environments: list of list of str
        The chemical environments to filter by. Each inner list holds the
        chemical environments which should be matched by one of the
        components in the system.
    output_directory: str
        The directory to store the extracted data in.
    """
    # The elements passed to the element filter.
    allowed_elements = ("C", "H", "O", "N", "F", "Cl", "Br", "S")

    for property_type, substance_type in properties_of_interest:

        unfiltered_set = processing.load_processed_data_set(
            data_directory, property_type, substance_type)

        # First the element filter, then the caller supplied chemical
        # environments.
        element_filtered_set = filter_by_elements(unfiltered_set,
                                                  *allowed_elements)
        filtered_set = filter_by_checkmol(element_filtered_set,
                                          *chemical_environments)

        processing.save_processed_data_set(output_directory, filtered_set,
                                           property_type, substance_type)

        # Write a pdf of the filtered set next to the saved data.
        snake_case_name = property_to_snake_case(property_type)
        pdf_name = f"{snake_case_name}_{str(substance_type.value)}.pdf"

        data_frame_to_pdf(filtered_set,
                          os.path.join(output_directory, pdf_name))
def filter_data(data_directory, property_type, substance_type,
                output_directory):
    """Filter a single processed data set and save the result.

    The data set is loaded from ``data_directory``, filtered by
    temperature, by a list of SMIRKS patterns and by stereochemistry, and
    then saved to ``output_directory`` alongside a pdf rendering of the
    filtered set.

    Parameters
    ----------
    data_directory: str
        The directory to load the processed data set from.
    property_type
        The type of property whose data set should be filtered.
    substance_type
        The substance type of the data set. Its ``value`` attribute is used
        to build the pdf file name.
    output_directory: str
        The directory to save the filtered data set and the pdf to.
    """
    loaded_frame = load_processed_data_set(data_directory, property_type,
                                           substance_type)

    # Keep only the measurements made close to ambient temperature.
    near_ambient_frame = filter_by_temperature(loaded_frame,
                                               290.0 * unit.kelvin,
                                               305 * unit.kelvin)

    # Patterns for aromatics, 3 + 4 membered rings, alkenes, long chain
    # molecules (>= hept) and ethers.
    excluded_smirks = [
        "[#6a]",
        "[#6r3]",
        "[#6r4]",
        "[#6]=[#6]",
        "[#6]~[#6]~[#6]~[#6]~[#6]~[#6]~[#6]",
        "[#6H2]-[#8X2]-[#6H2]",
    ]
    # NOTE(review): the second argument is presumably the list of patterns
    # to retain (none here) and the third the patterns to exclude - confirm
    # against `filter_by_smirks`.
    smirks_filtered_frame = filter_by_smirks(near_ambient_frame, None,
                                             excluded_smirks)

    # Drop any molecules with undefined stereochemistry.
    filtered_frame = filter_undefined_stereochemistry(smirks_filtered_frame)

    save_processed_data_set(output_directory, filtered_frame, property_type,
                            substance_type)

    snake_case_name = property_to_snake_case(property_type)
    pdf_name = f"{snake_case_name}_{str(substance_type.value)}.pdf"
    pdf_path = os.path.join(output_directory, pdf_name)

    data_frame_to_pdf(filtered_frame, pdf_path)
def main():
    """Process the raw archive files and fill in missing enthalpy of
    vaporization pressures.

    The raw files in ``raw_archives`` are converted into processed data
    sets stored in ``processed_data``. Afterwards any missing entries in
    the ``Pressure (kPa)`` column of the pure, binary and ternary enthalpy
    of vaporization sets are filled with one atmosphere (expressed in kPa)
    and the sets are saved back in place.
    """
    raw_data_directory = "raw_archives"
    processed_data_directory = "processed_data"

    # Convert the raw ThermoML data files into `pandas.DataFrame` objects.
    process_raw_data(
        directory=raw_data_directory,
        output_directory=processed_data_directory,
        retain_values=True,
        retain_uncertainties=False,
        n_processes=20,
        files_per_worker=50,
    )

    # The enthalpy of vaporization entries are 'fixed' so that they all
    # have a pressure (approximated as ambient).
    substance_types = (
        SubstanceType.Pure,
        SubstanceType.Binary,
        SubstanceType.Ternary,
    )

    pressure_column = "Pressure (kPa)"
    ambient_pressure = 1.0 * unit.atmosphere
    ambient_pressure_kpa = ambient_pressure.to(unit.kilopascal).magnitude

    for substance_type in substance_types:

        h_vap_frame = load_processed_data_set(processed_data_directory,
                                              EnthalpyOfVaporization,
                                              substance_type)

        h_vap_frame[pressure_column] = h_vap_frame[pressure_column].fillna(
            ambient_pressure_kpa)

        save_processed_data_set(processed_data_directory, h_vap_frame,
                                EnthalpyOfVaporization, substance_type)
def choose_data_points(
    property_of_interest,
    chosen_substances,
    target_states,
    environments_of_interest,
):
    """Select the data points to include in the benchmark set
    for each of the chosen substances.

    The data available for each environment is loaded from the
    ``data_availability/data_by_environments`` tree, combined, filtered
    down to the chosen substances and staged in a temporary directory from
    which the data points are selected. Environments whose data file is
    missing or which contain no rows are skipped.

    Parameters
    ----------
    property_of_interest: tuple of type of PhysicalProperty and SubstanceType
        The type of property to select data points for.
    chosen_substances: list of tuple of str
        The substances to choose data points for.
    target_states: list of StatePoint
        The target states to select data points at.
    environments_of_interest: list of str
        The names of the environment folders to source the data from.

    Returns
    -------
    pandas.DataFrame
        The selected data points whose ``Mole Fraction 1`` lies strictly
        between 0.15 and 0.85.

    Notes
    -----
    If no environment yields any data, ``pandas.concat`` is called with an
    empty list, which raises a ``ValueError``.
    """
    environments_root = os.path.join(
        "..",
        "..",
        "..",
        "data_availability",
        "data_by_environments",
    )

    with TemporaryDirectory() as data_directory:

        available_frames = []

        for environment in environments_of_interest:

            environment_folder = os.path.join(environments_root, environment,
                                              "all_data")

            try:
                environment_frame = load_processed_data_set(
                    environment_folder, *property_of_interest)
            except FileNotFoundError:
                # No data was stored for this environment.
                continue

            if len(environment_frame) > 0:
                available_frames.append(environment_frame)

        combined_frame = pandas.concat(available_frames,
                                       ignore_index=True,
                                       sort=False)
        combined_frame = filter_by_substance_composition(
            combined_frame, chosen_substances, None)

        # Fill in the missing columns.
        for amount_column in ("Exact Amount 1", "Exact Amount 2"):
            if amount_column not in combined_frame:
                combined_frame[amount_column] = numpy.nan

        save_processed_data_set(data_directory, combined_frame,
                                *property_of_interest)

        # The substances were already filtered above, so no substances are
        # passed to the selection itself.
        selected_data_set = select_data_points(
            data_directory=data_directory,
            chosen_substances=None,
            target_state_points={property_of_interest: target_states},
        )

    selected_data_frame = selected_data_set.to_pandas()

    # Prune any data points measured for too low or too high
    # mole fractions.
    mole_fractions = selected_data_frame["Mole Fraction 1"]
    within_range = (mole_fractions > 0.15) & (mole_fractions < 0.85)

    return selected_data_frame[within_range]
def main():
    """Build the pure test set and write it to the ``test_sets`` directory.

    The pure density and enthalpy of vaporization data of each environment
    of interest is combined, filtered, and stripped of the training
    components. Data points are then selected for every component which
    has an enthalpy of vaporization measurement, and the selection is
    stored as ``pure_set.json``, ``pure_set.csv`` and ``pure_set.pdf``.
    """
    logging.basicConfig(level=logging.INFO)

    root_output_directory = "test_sets"
    os.makedirs(root_output_directory, exist_ok=True)

    # The types of property which are of interest.
    properties_of_interest = [
        (Density, SubstanceType.Pure),
        (EnthalpyOfVaporization, SubstanceType.Pure),
    ]

    # The state we would ideally choose data points at, shared by every
    # property type.
    ideal_states = [
        StatePoint(298.15 * unit.kelvin, 1.0 * unit.atmosphere, (1.0, )),
    ]
    target_states = {
        property_key: ideal_states
        for property_key in properties_of_interest
    }

    # The environments of interest.
    environments_of_interest = [
        "alcohol", "ester", "alkane", "ether", "ketone"
    ]

    # The training substances are loaded so that they can be excluded
    # from the test set.
    training_smiles = load_training_components()

    environments_root = os.path.join(
        "..",
        "..",
        "..",
        "data_availability",
        "data_by_environments",
    )

    with TemporaryDirectory() as data_directory:

        # Apply the filters to the available data.
        for property_of_interest in properties_of_interest:

            environment_frames = [
                load_processed_data_set(
                    os.path.join(
                        environments_root,
                        f"{environment}_{environment}",
                        "all_data",
                    ),
                    *property_of_interest,
                ) for environment in environments_of_interest
            ]

            combined_frame = pandas.concat(environment_frames,
                                           ignore_index=True,
                                           sort=False)

            combined_frame = filter_data(combined_frame)
            combined_frame = filter_by_smiles(combined_frame,
                                              training_smiles, None)

            save_processed_data_set(data_directory, combined_frame,
                                    *property_of_interest)

        # The components with enthalpy of vaporization measurements are
        # the compounds which will be included in the pure test set.
        h_vap_data_frame = load_processed_data_set(data_directory,
                                                   EnthalpyOfVaporization,
                                                   SubstanceType.Pure)

        unique_components = {*h_vap_data_frame["Component 1"]}
        test_set_components = [(smiles, ) for smiles in unique_components]

        # Select the data points.
        selected_data_set = select_data_points(
            data_directory=data_directory,
            chosen_substances=test_set_components,
            target_state_points=target_states,
        )

    json_path = os.path.join(root_output_directory, "pure_set.json")
    selected_data_set.json(json_path)

    selected_data_frame = selected_data_set.to_pandas()

    csv_path = os.path.join(root_output_directory, "pure_set.csv")
    selected_data_frame.to_csv(csv_path, index=False)

    pdf_path = os.path.join(root_output_directory, "pure_set.pdf")
    data_frame_to_pdf(selected_data_frame, pdf_path)
def filter_common_data(output_directory, substances):
    """Collect the common enthalpy of mixing / binary density data for a
    set of substances.

    For both the binary enthalpy of mixing and the binary density, the
    ``h_mix_rho_x`` common data of each environment mixture is loaded,
    filtered by substance composition and restricted to measurements whose
    ``Mole Fraction 1`` lies strictly between 0.10 and 0.90. The combined
    data of all mixtures is saved to the ``h_mix_and_rho_x`` folder of
    ``output_directory`` together with a pdf rendering.

    Parameters
    ----------
    output_directory: str
        The directory in which the ``h_mix_and_rho_x`` folder is created.
    substances
        The substances passed to ``filter_by_substance_composition``.
    """
    target_directory = os.path.join(output_directory, "h_mix_and_rho_x")
    os.makedirs(target_directory, exist_ok=True)

    environments_root = os.path.join(
        "..",
        "..",
        "..",
        "data_availability",
        "data_by_environments",
    )

    property_types = [
        (EnthalpyOfMixing, SubstanceType.Binary),
        (Density, SubstanceType.Binary),
    ]
    environment_mixes = [
        "alcohol_ester",
        "alcohol_alkane",
        "ether_alkane",
        "ether_ketone",
    ]

    for property_type, substance_type in property_types:

        mix_frames = []

        for environment_mix in environment_mixes:

            source_directory = os.path.join(environments_root,
                                            environment_mix, "common_data",
                                            "h_mix_rho_x")

            mix_frame = load_processed_data_set(source_directory,
                                                property_type,
                                                substance_type)
            mix_frame = filter_by_substance_composition(
                mix_frame, substances, None)

            # Discard the measurements made too close to either pure
            # component.
            mole_fractions = mix_frame["Mole Fraction 1"]
            within_range = (mole_fractions > 0.10) & (mole_fractions < 0.90)

            mix_frames.append(mix_frame[within_range])

        full_data_frame = pandas.concat(mix_frames)

        save_processed_data_set(target_directory, full_data_frame,
                                property_type, substance_type)

        pdf_name = property_to_file_name(property_type,
                                         substance_type) + ".pdf"
        data_frame_to_pdf(full_data_frame,
                          os.path.join(target_directory, pdf_name))
def main():
    """Apply the environment filters to the data of interest in parallel.

    The pure and binary data sets of interest are gathered from their
    respective source directories into a temporary directory. Then
    ``apply_filters`` is run over a process pool for every environment
    paired with itself and for every combination of two distinct
    environments.
    """
    root_output_directory = "data_by_environments"

    # The number of processes to parallelize over.
    n_processes = 20

    logging.basicConfig(level=logging.INFO)

    # The properties we are interested in.
    pure_properties_of_interest = [
        (Density, SubstanceType.Pure),
        (EnthalpyOfVaporization, SubstanceType.Pure),
    ]
    mixture_properties_of_interest = [
        (Density, SubstanceType.Binary),
        (ExcessMolarVolume, SubstanceType.Binary),
        (EnthalpyOfMixing, SubstanceType.Binary),
    ]

    # The environments we are interested in. The insertion order is kept
    # as it determines the order of the environment pairs built below.
    codes = chemical_environment_codes

    environments_of_interest = {
        "alcohol": [codes["hydroxy"], codes["alcohol"]],
        "ester": [codes["caboxylic_acid"], codes["ester"]],
    }
    environments_of_interest.update(
        (name, [codes[name]]) for name in (
            "ether",
            "aldehyde",
            "ketone",
            "thiocarbonyl",
            "phenol",
            "amine",
            "halogenated",
            "amide",
            "nitro",
            "aromatic",
            "heterocycle",
        ))
    environments_of_interest["alkane"] = [""]
    environments_of_interest["alkene"] = [codes["alkene"]]

    properties_of_interest = [
        *pure_properties_of_interest,
        *mixture_properties_of_interest,
    ]

    with TemporaryDirectory() as data_directory:

        root_data_directory = os.path.join("..", "..", "shared",
                                           "filtered_data")

        # Gather the converted mass density / excess molar volume data and
        # the other data of interest in the temporary directory.
        for property_type, substance_type in properties_of_interest:

            is_binary_volumetric = (
                property_type in [Density, ExcessMolarVolume]
                and substance_type == SubstanceType.Binary)

            if is_binary_volumetric:
                # Binary mass density and excess molar volume data comes
                # from the full set of converted density data.
                source_directory = "converted_density_data"
            elif property_type == EnthalpyOfVaporization:
                source_directory = "sourced_h_vap_data"
            else:
                source_directory = root_data_directory

            data_set = load_processed_data_set(source_directory,
                                               property_type,
                                               substance_type)

            save_processed_data_set(data_directory, data_set, property_type,
                                    substance_type)

        # Each environment paired with itself, followed by every
        # combination of two distinct environments.
        environment_pairs = [(name, name)
                             for name in environments_of_interest]
        environment_pairs.extend(
            itertools.combinations(environments_of_interest, 2))

        filter_pair = functools.partial(
            apply_filters,
            data_directory=data_directory,
            environments_of_interest=environments_of_interest,
            mixture_properties_of_interest=mixture_properties_of_interest,
            pure_properties_of_interest=pure_properties_of_interest,
            root_output_directory=root_output_directory,
        )

        with Pool(n_processes) as pool:

            # The list call drains the lazy iterator so that every pair is
            # processed before the pool is closed.
            pending = pool.imap(filter_pair, environment_pairs)
            results = list(
                tqdm.tqdm(pending, total=len(environment_pairs)))

        assert results is not None
# Example no. 8
# 0
def main():
    """Collate the NIST ThermoML archive files, filter them and select a
    pure density data set.

    The raw archive files are converted into processed csv data, a standard
    set of filters is applied, substances are chosen to cover the target
    chemical environments, and data points are selected at the target
    states. The resulting set is written to ``pure_data_set.json`` and
    ``pure_data_set.csv`` and a report is generated from the json file.
    """
    logging.basicConfig(level=logging.INFO)

    raw_data_directory = resource_filename("nistdataselection",
                                           os.path.join("data", "thermoml"))
    processed_data_directory = "processed_data"
    filtered_data_directory = "filtered_data"

    # Convert the raw ThermoML data files into `pandas.DataFrame` objects.
    processing.process_raw_data(
        directory=raw_data_directory,
        output_directory=processed_data_directory,
        retain_values=True,
        retain_uncertainties=True,
    )

    # The ranges of temperatures and pressures of interest: temperatures
    # which are biologically relevant (15 C - 45 C) and pressures which
    # are close to ambient.
    temperature_range = (288.15 * unit.kelvin, 323.15 * unit.kelvin)
    pressure_range = (0.95 * unit.atmosphere, 1.05 * unit.atmosphere)

    # The elements of interest - a subset of those elements for which
    # Parsley has parameters, and for which there exists plentiful data in
    # the ThermoML archives.
    allowed_elements = ["H", "N", "C", "O", "S", "F", "Cl", "Br", "I"]

    # The target number of unique substances to choose for each type of
    # property of interest.
    target_substances_per_property = {
        (Density, SubstanceType.Pure): 1,
    }

    # The filtered data is stored in its own directory.
    os.makedirs(filtered_data_directory, exist_ok=True)

    # Perform basic filtering on the data sets.
    for property_type, substance_type in target_substances_per_property:

        logging.info(f"Applying filters to the {substance_type.value} "
                     f"{property_type.__name__} data set.")

        full_set = processing.load_processed_data_set(
            processed_data_directory, property_type, substance_type)

        filtered_set = filtering.apply_standard_filters(
            full_set, temperature_range, pressure_range, allowed_elements)

        logging.info(f"The filtered data set contains {len(filtered_set)} "
                     f"properties.")

        processing.save_processed_data_set(filtered_data_directory,
                                           filtered_set, property_type,
                                           substance_type)

    # The regions of chemical space to cover when choosing the substances
    # to train the VdW parameters against. This is mainly driven by the VdW
    # parameters we wish to exercise, but may also be supplemented with
    # additional environments which are poorly represented.
    target_environments = [
        "[#1:1]-[#6X4]",
        "[#1:1]-[#6X3]",
        "[#1:1]-[#8]",
        "[#6:1]",
        "[#6X4:1]",
        "[#8:1]",
        "[#8X2H0+0:1]",
        "[#8X2H1+0:1]",
        "[#7:1]",
        "[#16:1]",
        "[#9:1]",
        "[#17:1]",
        "[#35:1]",
    ]

    # The chosen substances are just tuples of smiles patterns which define
    # the composition of the substance. The actual mole fractions of the
    # components are chosen in a later step.
    chosen_substances = selection.select_substances(
        filtered_data_directory,
        target_substances_per_property,
        target_environments,
    )

    logging.info(f"{len(chosen_substances)} substances where chosen.")

    # The specific states at which we wish to select data: a temperature,
    # a pressure, and a tuple of the mole fractions of each component.
    density_target_state_points = [
        selection.StatePoint(temperature * unit.kelvin,
                             101.325 * unit.kilopascal, (1.0, ))
        for temperature in (298.15, 318.15)
    ]

    target_property_state_points = {
        (Density, SubstanceType.Pure): density_target_state_points,
    }

    # The base name of the output files.
    data_set_name = "pure_data_set"
    json_path = f"{data_set_name}.json"

    # Choose the final data set containing the chosen substances, and
    # data points at the target state points.
    selected_set = selection.select_data_points(filtered_data_directory,
                                                chosen_substances,
                                                target_property_state_points)

    with open(json_path, "w") as json_file:
        json_file.write(selected_set.json())

    selected_set.to_pandas().to_csv(f"{data_set_name}.csv")

    # Generate a pdf report detailing the chosen set.
    reporting.generate_report(json_path,
                              vdw_smirks_of_interest=target_environments)
def filter_common_data(output_directory):
    """Filter the common data to a smaller temperature range - this
    seems to help the state selection method get closer to the target
    states.

    The ``alcohol_ester`` common data is loaded for two groups of binary
    properties (enthalpy of mixing + excess molar volume, and enthalpy of
    mixing + density), filtered by temperature and saved to the
    ``h_mix_and_v_excess`` and ``h_mix_and_binary_density`` folders of
    ``output_directory`` respectively.

    Parameters
    ----------
    output_directory: str
        The directory in which the two output folders are created.
    """
    common_data_root = os.path.join(
        "..",
        "..",
        "..",
        "data_availability",
        "data_by_environments",
        "alcohol_ester",
        "common_data",
    )

    # Each entry pairs a folder of common data with the folder to store the
    # filtered data in and the property types to process.
    data_groups = [
        (
            "h_mix_v_excess",
            "h_mix_and_v_excess",
            [
                (EnthalpyOfMixing, SubstanceType.Binary),
                (ExcessMolarVolume, SubstanceType.Binary),
            ],
        ),
        (
            "h_mix_rho_x",
            "h_mix_and_binary_density",
            [
                (EnthalpyOfMixing, SubstanceType.Binary),
                (Density, SubstanceType.Binary),
            ],
        ),
    ]

    # Create every output folder before any data is processed.
    for _, output_folder, _ in data_groups:
        os.makedirs(os.path.join(output_directory, output_folder),
                    exist_ok=True)

    for source_folder, output_folder, property_types in data_groups:

        source_directory = os.path.join(common_data_root, source_folder)
        target_directory = os.path.join(output_directory, output_folder)

        for property_type, substance_type in property_types:

            loaded_frame = load_processed_data_set(source_directory,
                                                   property_type,
                                                   substance_type)
            filtered_frame = filter_by_temperature(loaded_frame,
                                                   290.0 * unit.kelvin,
                                                   305 * unit.kelvin)

            save_processed_data_set(target_directory, filtered_frame,
                                    property_type, substance_type)
def main():
    """Extract the data of substances which have measurements for every
    property type in a group.

    For each folder found in ``data_by_environments`` and each group of
    binary property types, the substances which are common to all of the
    property types in the group are determined. A pdf of the common
    substances is written (when there are any) and the data of each
    property type, filtered to the common substances, is saved to the
    ``common_data`` folder of the environment. A group is skipped for an
    environment if any of its data sets is empty.
    """
    root_output_directory = "data_by_environments"

    logging.basicConfig(level=logging.INFO)

    density_type = (Density, SubstanceType.Binary)
    h_mix_type = (EnthalpyOfMixing, SubstanceType.Binary)
    v_excess_type = (ExcessMolarVolume, SubstanceType.Binary)

    # The groups of data types to find common substances for.
    properties_of_interest = [
        [h_mix_type, density_type],
        [h_mix_type, v_excess_type],
        [h_mix_type, density_type, v_excess_type],
    ]

    # Some shorter file names to use for each data type.
    type_to_file_name = {
        density_type: "rho_x",
        h_mix_type: "h_mix",
        v_excess_type: "v_excess",
    }

    # The types of mixtures we are interested in, e.g. alcohol-alcohol,
    # alcohol-ester etc., taken from the folders which already exist.
    environments_of_interest = [
        os.path.basename(path) for path in glob("data_by_environments/*")
    ]

    for environment_of_interest in environments_of_interest:

        data_directory = os.path.join("data_by_environments",
                                      environment_of_interest, "all_data")

        common_directory = os.path.join(root_output_directory,
                                        environment_of_interest,
                                        "common_data")
        os.makedirs(common_directory, exist_ok=True)

        for property_type_set in properties_of_interest:

            # Collect the set of substances measured for each of the
            # specified property types.
            all_substance_smiles = []

            for property_type, substance_type in property_type_set:

                type_frame = load_processed_data_set(data_directory,
                                                     property_type,
                                                     substance_type)

                if len(type_frame) == 0:
                    # Without data for this type nothing can be common to
                    # all of the types.
                    all_substance_smiles = []
                    break

                all_substance_smiles.append(
                    set(data_frame_to_smiles_tuples(type_frame)))

            if not all_substance_smiles:
                continue

            common_substance_smiles = set.intersection(*all_substance_smiles)

            file_name = "_".join(
                [type_to_file_name[key] for key in property_type_set])

            # Save the common substances to a pdf file.
            if len(common_substance_smiles) > 0:
                pdf_path = os.path.join(common_directory, f"{file_name}.pdf")
                smiles_to_pdf(list(common_substance_smiles), pdf_path)

            # Output the common data to the `common_data` directory.
            output_directory = os.path.join(common_directory, file_name)

            for property_type, substance_type in property_type_set:

                type_frame = load_processed_data_set(data_directory,
                                                     property_type,
                                                     substance_type)
                common_frame = filter_by_substance_composition(
                    type_frame, common_substance_smiles, None)

                save_processed_data_set(output_directory, common_frame,
                                        property_type, substance_type)