Example #1
0
def main():

    raw_data_directory = "../raw_archives"
    processed_data_directory = "data_with_uncertainties"

    output_directory = "uncertainties"
    os.makedirs(output_directory, exist_ok=True)

    # Convert the raw ThermoML data files into more easily manipulable
    # `pandas.DataFrame` objects.
    if not os.path.isdir(processed_data_directory):

        process_raw_data(
            directory=raw_data_directory,
            output_directory=processed_data_directory,
            retain_values=True,
            retain_uncertainties=True,
            n_processes=20,
            files_per_worker=50,
        )

    # Specify the properties to extract the modal uncertainties of.
    properties_of_interest = [
        (Density, SubstanceType.Pure),
        (Density, SubstanceType.Binary),
        (EnthalpyOfVaporization, SubstanceType.Pure),
        (ExcessMolarVolume, SubstanceType.Binary),
        (EnthalpyOfMixing, SubstanceType.Binary),
    ]

    for property_type, substance_type in properties_of_interest:

        data_frame = load_processed_data_set(processed_data_directory,
                                             property_type, substance_type)

        if len(data_frame) == 0:
            continue

        default_unit = property_type.default_unit()
        uncertainty_header = f"{property_type.__name__} Uncertainty ({default_unit:~})"

        # Drop NaN or unbelievably high uncertainties.
        data_frame.dropna(subset=[uncertainty_header], inplace=True)
        data_frame = data_frame[data_frame[uncertainty_header] < 5.0]

        raw_uncertainties = data_frame[uncertainty_header]

        uncertainties = {
            "minimum": float(numpy.min(raw_uncertainties)),
            "maximum": float(numpy.max(raw_uncertainties)),
            "mean": float(numpy.mean(raw_uncertainties)),
            "mode": float(scipy.stats.mode(raw_uncertainties).mode),
        }

        # Save the uncertainties to a JSON file.
        property_type = property_to_snake_case(property_type)
        file_name = f"{property_type}_{str(substance_type.value)}.json"

        with open(os.path.join(output_directory, file_name), "w") as file:
            json.dump(uncertainties, file)
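# The loop above reduces each uncertainty column to a handful of summary
# statistics. Below is a minimal, self-contained sketch of that pattern on a
# toy pandas Series, with `Series.mode()` standing in for the
# `scipy.stats.mode` call used above; the values and file name are
# illustrative only.
import json

import numpy
import pandas

toy_uncertainties = pandas.Series([0.1, 0.1, 0.2, 0.5, numpy.nan])

# Drop NaN entries and implausibly large values, as in `main`.
toy_uncertainties = toy_uncertainties.dropna()
toy_uncertainties = toy_uncertainties[toy_uncertainties < 5.0]

summary = {
    "minimum": float(toy_uncertainties.min()),
    "maximum": float(toy_uncertainties.max()),
    "mean": float(toy_uncertainties.mean()),
    # `Series.mode()` may return several modal values; take the first.
    "mode": float(toy_uncertainties.mode().iloc[0]),
}

with open("toy_uncertainties.json", "w") as file:
    json.dump(summary, file)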
def filter_data(data_directory, properties_of_interest, chemical_environments,
                output_directory):
    """Filters out any measurements which where made for components which
    do not contain the chemical environments of interest.

    Parameters
    ----------
    data_directory: str
        The directory containing the unfiltered data.
    properties_of_interest: list of tuple of PropertyType and SubstanceType
        The types of properties to extract data for.
    chemical_environments: list of list of str
        A list of those chemical environments to filter by. Each list in the
        full list corresponds to the chemical environments which should be
        matched by one of the components in the system.
    output_directory: str
        The directory to store the extracted data in.
    """

    for property_tuple in properties_of_interest:

        property_type, substance_type = property_tuple

        data_set = processing.load_processed_data_set(data_directory,
                                                      property_type,
                                                      substance_type)

        # Start by filtering out any substances not composed of O, C, H, N, F, Cl, Br, S
        data_set = filter_by_elements(data_set, "C", "H", "O", "N", "F", "Cl",
                                      "Br", "S")

        # Next, filter out any substances which do not match the chemical
        # environments of interest.
        data_set = filter_by_checkmol(data_set, *chemical_environments)

        # Save the filtered data set.
        processing.save_processed_data_set(output_directory, data_set,
                                           property_type, substance_type)

        # Save out a PDF of all SMILES patterns (or tuples of SMILES patterns).
        property_type = property_to_snake_case(property_type)

        file_name = f"{property_type}_{str(substance_type.value)}.pdf"
        file_path = os.path.join(output_directory, file_name)

        data_frame_to_pdf(data_set, file_path)
def filter_data(data_directory, property_type, substance_type,
                output_directory):

    # Load in the data set
    data_frame = load_processed_data_set(data_directory, property_type,
                                         substance_type)

    # Filter to be close to ambient.
    data_frame = filter_by_temperature(data_frame, 290.0 * unit.kelvin,
                                       305 * unit.kelvin)

    # Filter out aromatics, long-chain molecules (seven or more carbons),
    # alkenes, ethers, and 3- and 4-membered rings.
    data_frame = filter_by_smirks(
        data_frame,
        None,
        [
            "[#6a]",  # aromatic carbon
            "[#6r3]",  # carbon in a 3-membered ring
            "[#6r4]",  # carbon in a 4-membered ring
            "[#6]=[#6]",  # alkene
            "[#6]~[#6]~[#6]~[#6]~[#6]~[#6]~[#6]",  # chains of 7+ carbons
            "[#6H2]-[#8X2]-[#6H2]",  # ethers
        ],
    )

    # Filter out any molecules with undefined stereochemistry
    data_frame = filter_undefined_stereochemistry(data_frame)

    # Save the filtered set.
    save_processed_data_set(
        output_directory,
        data_frame,
        property_type,
        substance_type,
    )

    property_type = property_to_snake_case(property_type)
    file_name = f"{property_type}_{str(substance_type.value)}.pdf"

    data_frame_to_pdf(data_frame, os.path.join(output_directory, file_name))
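# The SMIRKS patterns passed to `filter_by_smirks` above are plain SMARTS
# queries. The short sketch below (using RDKit, which is an assumption here
# and not necessarily what `filter_by_smirks` uses internally) shows how such
# a pattern flags an aromatic molecule while leaving ethanol untouched.
from rdkit import Chem

aromatic_carbon = Chem.MolFromSmarts("[#6a]")

print(Chem.MolFromSmiles("c1ccccc1").HasSubstructMatch(aromatic_carbon))  # True
print(Chem.MolFromSmiles("CCO").HasSubstructMatch(aromatic_carbon))  # False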
def main():

    raw_data_directory = "raw_archives"
    processed_data_directory = "processed_data"

    # Convert the raw ThermoML data files into more easily manipulable
    # `pandas.DataFrame` objects.
    process_raw_data(
        directory=raw_data_directory,
        output_directory=processed_data_directory,
        retain_values=True,
        retain_uncertainties=False,
        n_processes=20,
        files_per_worker=50,
    )

    # Here we will also 'fix' the enthalpy of vaporization entries so that
    # they have a pressure (approximated as ambient).
    h_vap_properties = [
        (EnthalpyOfVaporization, SubstanceType.Pure),
        (EnthalpyOfVaporization, SubstanceType.Binary),
        (EnthalpyOfVaporization, SubstanceType.Ternary),
    ]

    pressure = 1.0 * unit.atmosphere

    for property_tuple in h_vap_properties:

        data_set = load_processed_data_set(processed_data_directory,
                                           *property_tuple)

        data_set["Pressure (kPa)"] = data_set["Pressure (kPa)"].fillna(
            pressure.to(unit.kilopascal).magnitude)

        save_processed_data_set(processed_data_directory, data_set,
                                *property_tuple)
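# A minimal sketch of the unit-conversion / `fillna` pattern used above, with
# a plain `pint.UnitRegistry` standing in for the project's `unit` object and
# a toy data frame in place of the processed data set.
import pandas
import pint

unit = pint.UnitRegistry()

pressure = 1.0 * unit.atmosphere
pressure_kpa = pressure.to(unit.kilopascal).magnitude  # 101.325

data_set = pandas.DataFrame({"Pressure (kPa)": [100.0, None, 99.8]})
data_set["Pressure (kPa)"] = data_set["Pressure (kPa)"].fillna(pressure_kpa)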
def plot_estimated_vs_reference(property_types, study_names, output_directory):

    # Refactor the data into a single frame.
    for property_type, substance_type in property_types:

        data_frames = []

        for study_name in study_names:

            results_directory = os.path.join("partitioned_data", study_name)

            environments = [
                os.path.basename(x)
                for x in glob(os.path.join(results_directory, "*"))
            ]
            environments = [
                x for x in environments if len(tuple(x.split("_"))) ==
                substance_type_to_int[substance_type]
            ]

            for environment in environments:

                try:

                    data_frame = load_processed_data_set(
                        os.path.join(results_directory, environment),
                        property_type,
                        substance_type,
                    )

                except FileNotFoundError:
                    continue

                if len(data_frame) == 0:
                    continue

                default_unit = property_type.default_unit()

                reference_values = data_frame[
                    f"Reference {property_type.__name__} Value ({default_unit:~})"]
                estimated_values = data_frame[
                    f"Estimated {property_type.__name__} Value ({default_unit:~})"]
                estimated_std = data_frame[
                    f"Estimated {property_type.__name__} Uncertainty ({default_unit:~})"]

                data_frame = pandas.DataFrame()

                data_frame["Reference Value"] = reference_values
                data_frame["Reference Std"] = 0.0

                data_frame["Estimated Value"] = estimated_values
                data_frame["Estimated Std"] = estimated_std

                data_frame["Study"] = study_name
                data_frame["Environment"] = environment

                data_frames.append(data_frame)

        data_frame = pandas.concat(data_frames, ignore_index=True, sort=False)

        environments = list(sorted(set(data_frame["Environment"])))

        palette = seaborn.color_palette("Set1", len(environments))

        plot = seaborn.FacetGrid(
            data_frame,
            col="Study",
            sharex="row",
            sharey="row",
            hue_order=environments,
            palette=palette,
            size=4.0,
            aspect=0.8,
        )
        plot.map_dataframe(
            plot_scatter,
            "Estimated Value",
            "Reference Value",
            "Reference Std",
            "Estimated Std",
            "Environment",
            environments,
            color=palette,
            marker="o",
            linestyle="None",
        )

        plot.set_titles("{col_name}")
        plot.add_legend()

        pyplot.subplots_adjust(top=0.85)

        property_title = property_to_title(property_type, substance_type)
        plot.fig.suptitle(property_title)

        file_name = property_to_file_name(property_type, substance_type)
        plot.savefig(os.path.join(output_directory, f"{file_name}.png"))
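# A toy version of the faceted scatter plot built above. Note that newer
# seaborn releases spell the figure size keyword `height` rather than the
# `size` used in the original call; the column names and data here are
# illustrative only.
import pandas
import seaborn
from matplotlib import pyplot

toy_frame = pandas.DataFrame({
    "Estimated Value": [1.0, 2.0, 1.5, 2.5],
    "Reference Value": [1.1, 1.9, 1.4, 2.6],
    "Study": ["A", "A", "B", "B"],
})

grid = seaborn.FacetGrid(toy_frame, col="Study", height=4.0, aspect=0.8)
grid.map(pyplot.scatter, "Estimated Value", "Reference Value")
grid.set_titles("{col_name}")
grid.savefig("toy_scatter.png")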
def choose_data_points(
    property_of_interest,
    chosen_substances,
    target_states,
    environments_of_interest,
):
    """Select the data points to include in the benchmark set
    for each of the chosen substances.

    Parameters
    ----------
    property_of_interest: tuple of type of PhysicalProperty and SubstanceType
        The type of property to select data points for.
    chosen_substances: list of tuple of str
        The substances to choose data points for.
    target_states: list of StatePoint
        The target states to select data points at.
    environments_of_interest: list of str
        The chemical environments (e.g. 'alcohol_ester') whose data
        directories should be searched for candidate data points.

    Returns
    -------
    pandas.DataFrame
        The selected data points.
    """

    with TemporaryDirectory() as data_directory:

        data_frames = []

        for environment in environments_of_interest:

            data_folder = os.path.join(
                "..",
                "..",
                "..",
                "data_availability",
                "data_by_environments",
                environment,
                "all_data",
            )

            try:
                data_frame = load_processed_data_set(data_folder,
                                                     *property_of_interest)
            except FileNotFoundError:
                continue

            if len(data_frame) == 0:
                continue

            data_frames.append(data_frame)

        data_frame = pandas.concat(data_frames, ignore_index=True, sort=False)
        data_frame = filter_by_substance_composition(data_frame,
                                                     chosen_substances, None)
        # Fill in the missing columns
        if "Exact Amount 1" not in data_frame:
            data_frame["Exact Amount 1"] = numpy.nan
        if "Exact Amount 2" not in data_frame:
            data_frame["Exact Amount 2"] = numpy.nan
        save_processed_data_set(data_directory, data_frame,
                                *property_of_interest)

        target_states = {property_of_interest: target_states}

        selected_data_set = select_data_points(
            data_directory=data_directory,
            chosen_substances=None,
            target_state_points=target_states,
        )

    selected_data_frame = selected_data_set.to_pandas()

    # Prune any data points measured for too low or too high
    # mole fractions.
    selected_data_frame = selected_data_frame[
        (selected_data_frame["Mole Fraction 1"] > 0.15)
        & (selected_data_frame["Mole Fraction 1"] < 0.85)]

    return selected_data_frame
def choose_substances(
    property_of_interest,
    environment,
    finger_print_type,
    n_mixtures_per_environment,
    training_mixtures,
):
    """A function which aims to select a set of substances which are
    as distinct as possible from both the training and currently selected
    test set.

    This proceeds by:

    1. Selecting the molecule which is 'furthest' away from both the training
       set and the currently selected test set (which starts off empty), where
       the distance is defined as:

       sqrt(compute_distance_with_set(unselected_substance, training_set) ** 2 +
            compute_distance_with_set(unselected_substance, test_set) ** 2)

    2. Moving the selected molecule from the unselected set into the test set.

    3. Repeat steps 1 and 2 until either the target number of molecules has
       been selected, or there are no more unselected molecules to choose from.

    Parameters
    ----------
    property_of_interest: tuple of type of PhysicalProperty and SubstanceType
        The property of interest.
    environment: str
        The environment (e.g. alcohol_alkane) to select molecules for.
    finger_print_type: OEFPTypeBase
        The type of fingerprint to base the distance metric on.
    n_mixtures_per_environment: int
        The target number of molecules to select.
    training_mixtures: list of tuple of str
        The substances in the training set.

    Returns
    -------
    list of tuple of str
        The selected molecules.
    """

    property_name = property_to_file_name(*property_of_interest)

    logger.info(f"{property_name}_{environment}: Starting.")

    try:

        data_frame = load_processed_data_set(
            os.path.join(
                "..",
                "..",
                "..",
                "data_availability",
                "data_by_environments",
                environment,
                "all_data",
            ),
            *property_of_interest,
        )

    except FileNotFoundError:
        return []

    # Filter out the training mixtures.
    data_frame = filter_by_substance_composition(data_frame, None,
                                                 training_mixtures)
    data_frame = filter_data(data_frame)

    mixtures = {
        tuple(sorted((x["Component 1"], x["Component 2"])))
        for _, x in data_frame.iterrows()
    }

    open_list = [*mixtures]
    closed_list = []

    max_n_possible = min(len(open_list), n_mixtures_per_environment)

    while len(open_list) > 0 and len(closed_list) < n_mixtures_per_environment:

        def distance_metric(mixture):

            training_distance = compute_distance_with_set(
                mixture, training_mixtures, finger_print_type)
            test_distance = compute_distance_with_set(mixture, mixtures,
                                                      finger_print_type)

            return sqrt(training_distance**2 + test_distance**2)

        least_similar = sorted(open_list, key=distance_metric, reverse=True)[0]

        open_list.remove(least_similar)
        closed_list.append(least_similar)

        logger.info(f"{property_name}_{environment}: "
                    f"{len(closed_list)} / {max_n_possible} selected")

    return closed_list
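# An illustrative, self-contained sketch of the greedy 'most distant first'
# strategy described in the `choose_substances` docstring, using toy numeric
# fingerprints and a stand-in distance function in place of
# `compute_distance_with_set`.
from math import sqrt


def distance_to_set(candidate, selected):
    # Distance from a candidate to a set = distance to its closest member.
    return min(abs(candidate - x) for x in selected) if selected else 0.0


training_set = [0.0]
open_list = [1.0, 4.0, 9.0]
closed_list = []

while open_list and len(closed_list) < 2:

    least_similar = max(
        open_list,
        key=lambda x: sqrt(
            distance_to_set(x, training_set) ** 2
            + distance_to_set(x, closed_list) ** 2
        ),
    )

    open_list.remove(least_similar)
    closed_list.append(least_similar)

print(closed_list)  # [9.0, 1.0] for this toy data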
Example #8
0
def main():

    root_data_directory = "data_by_environments"

    # Set up logging
    logging.basicConfig(level=logging.INFO)

    # Define the properties and environments we are interested in.
    environments_of_interest = [
        os.path.basename(x) for x in glob("data_by_environments/*")
    ]

    properties_of_interest = [
        (EnthalpyOfMixing, SubstanceType.Binary),
        (Density, SubstanceType.Binary),
        # (ExcessMolarVolume, SubstanceType.Binary),
    ]
    friendly_names = {
        (EnthalpyOfMixing, SubstanceType.Binary): "Hmix(x)",
        (Density, SubstanceType.Binary): "rho(x)",
        # (ExcessMolarVolume, SubstanceType.Binary): "Vexcess(x)",
    }

    property_combinations = [(x, ) for x in properties_of_interest]
    property_combinations.extend(
        itertools.combinations(properties_of_interest, 2))

    data_rows = []

    for environment_of_interest in environments_of_interest:

        environment_1, environment_2 = environment_of_interest.split("_")

        data_row = {
            "Environment 1": environment_1,
            "Environment 2": environment_2
        }

        data_directory = os.path.join(root_data_directory,
                                      "_".join([environment_1,
                                                environment_2]), "all_data")

        for property_combination in property_combinations:

            # Find the set of substances which are common to all of the
            # specified property types.
            all_substance_smiles = []
            property_names = []

            for property_tuple in property_combination:

                property_names.append(friendly_names[property_tuple])

                data_frame = load_processed_data_set(data_directory,
                                                     *property_tuple)

                if len(data_frame) == 0:
                    all_substance_smiles = []
                    break

                substance_smiles = set(data_frame_to_smiles_tuples(data_frame))
                all_substance_smiles.append(substance_smiles)

            common_substance_smiles = set()

            if len(all_substance_smiles) > 0:
                common_substance_smiles = set.intersection(
                    *all_substance_smiles)

            property_string = " + ".join(property_names)
            data_row[property_string] = len(common_substance_smiles)

        data_rows.append(data_row)

    columns = [
        "Environment 1",
        "Environment 2",
        *[
            " + ".join([friendly_names[x] for x in y])
            for y in property_combinations
        ],
    ]

    summary_frame = pandas.DataFrame(data=data_rows, columns=columns)
    summary_frame.fillna(0, inplace=True)
    summary_frame.sort_values(["Hmix(x) + rho(x)"],
                              ascending=False,
                              inplace=True)

    summary_frame.to_csv("summary.csv", index=False)

    with open("summary.md", "w") as file:
        summary_frame.to_markdown(file, showindex=False)
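# A small sketch of the set-intersection pattern used above to count the
# substances common to several property data sets; the SMILES tuples are toy
# examples.
density_substances = {("CCO",), ("CCO", "CCCCO")}
h_mix_substances = {("CCO", "CCCCO"), ("CO", "CCO")}

common_substances = set.intersection(density_substances, h_mix_substances)
print(len(common_substances))  # 1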
def select_data_points(data_directory, chosen_substances, target_state_points):
    """The method attempts to find a set of data points for each
    property which are clustered around the set of conditions specified
    in the `target_state_points` input array.

    The points will be chosen so as to try and maximise the number of
    properties measured at the same condition (e.g. ideally we would
    have a data point for each property at T=298.15 and p=1atm) as this
    will maximise the chances that we can extract all properties from a
    single simulation.

    Parameters
    ----------
    data_directory: str
        The directory which contains the processed pandas
        data sets
    chosen_substances: list of tuple of str, optional
        The substances to choose data points for. If None,
        no filtering of substances will be performed by this function.
    target_state_points: dict of tuple of type and SubstanceType and list of StatePoint
        A list of the state points for which we would ideally have data
        points for. The value tuple should be of the form
        (temperature, pressure, (mole fraction 0, ..., mole fraction N))

    Returns
    -------
    PhysicalPropertyDataSet
        A data set which contains the chosen data points.
    """

    # Load the full data set from the processed data files
    data_frames = []

    for property_type, substance_type in target_state_points:

        data_frame = load_processed_data_set(data_directory, property_type,
                                             substance_type)
        data_frames.append(data_frame)

    full_data_frame = pandas.concat(data_frames, ignore_index=True, sort=False)
    data_set = data_set_from_data_frame(full_data_frame)

    properties_by_substance = defaultdict(list)

    # Partition the properties by their substance components,
    # filtering out any not chosen substances.
    for substance in data_set.substances:

        substance_tuple = tuple(
            sorted([component.smiles for component in substance.components]))

        if chosen_substances is not None and substance_tuple not in chosen_substances:
            continue

        properties_by_substance[substance_tuple].extend(
            data_set.properties_by_substance(substance))

    # Start to choose the state points.
    return_data_set = PhysicalPropertyDataSet()

    for substance_tuple in properties_by_substance:

        # Cluster the data points around the closest states of interest.
        clustered_properties = _cluster_properties_around_states(
            properties_by_substance[substance_tuple], target_state_points)

        # For each cluster, we try to find the state points for which we have
        # measured the most types of properties (i.e. prioritise states
        # for which we have a density, dielectric and enthalpy measurement
        # over those for which we only have a density measurement).
        for target_state_point, physical_properties in clustered_properties.items(
        ):

            properties_per_state = defaultdict(list)
            property_types_per_state = defaultdict(set)

            # Refactor the properties into more convenient data structures.
            for physical_property in physical_properties:

                state_point = StatePoint.from_physical_property(
                    physical_property)
                property_tuple = property_to_type_tuple(physical_property)

                properties_per_state[state_point].append(physical_property)
                property_types_per_state[state_point].add(property_tuple)

            # Sort the state points based on their distance to the target state.
            sorted_states_points = list(
                sorted(
                    properties_per_state.keys(),
                    key=functools.partial(StatePoint.individual_distances,
                                          target_state_point),
                ))

            # Keep track of the properties which we need to choose a state point for
            properties_to_cover = set(
                property_tuple for property_tuple in target_state_points)
            # as well as the chosen state points
            chosen_state_points = set()

            # Iteratively consider state points which have all data points, down
            # to state points for which we only have single property measurements.
            for target_number_of_properties in reversed(
                    range(1,
                          len(target_state_points) + 1)):

                for state_point in sorted_states_points:

                    property_types_at_state = property_types_per_state[
                        state_point]

                    if len(property_types_at_state
                           ) != target_number_of_properties:
                        continue

                    if (len(
                            properties_to_cover.intersection(
                                property_types_at_state)) == 0):
                        continue

                    chosen_state_points.add(state_point)

                    properties_to_cover = properties_to_cover.symmetric_difference(
                        properties_to_cover.intersection(
                            property_types_at_state))

            # Add the properties which were measured at the chosen state points
            # to the returned data set.
            for state_point in chosen_state_points:

                if len(properties_per_state[state_point]) == 0:
                    continue

                return_data_set.add_properties(
                    *properties_per_state[state_point])

    return return_data_set
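# A toy sketch of the covering logic inside `select_data_points`: prefer state
# points at which the most property types were measured, then mark those types
# as covered (`A - B` is equivalent to the symmetric-difference-of-intersection
# expression used above). The state points and property names are illustrative.
property_types_per_state = {
    (298.15, 101.325): {"Density", "EnthalpyOfMixing"},
    (303.15, 101.325): {"Density"},
}

properties_to_cover = {"Density", "EnthalpyOfMixing"}
chosen_state_points = set()

for target_number_of_properties in reversed(range(1, len(properties_to_cover) + 1)):

    for state_point, types_at_state in property_types_per_state.items():

        if len(types_at_state) != target_number_of_properties:
            continue

        if len(properties_to_cover & types_at_state) == 0:
            continue

        chosen_state_points.add(state_point)
        properties_to_cover = properties_to_cover - types_at_state

print(chosen_state_points)  # {(298.15, 101.325)}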
def _build_substance_data(data_directory, target_substances_per_property,
                          smirks_to_exercise):
    """Loads all of the different data sets for each property type of
    interest and converts them into a single list of `SubstanceData`
    objects.

    Any substances which don't exercise at least one of the chemical
    environments of interest are ignored.

    Parameters
    ----------
    data_directory: str
        The directory which contains the processed pandas
        data sets
    target_substances_per_property: dict of tuple of type and SubstanceType and int
        The target number of unique substances to choose for each
        type of property of interest.
    smirks_to_exercise: list of str
        A list of the SMIRKS patterns which represent the chemical
        environments which we aim to exercise.

    Returns
    -------
    list of SubstanceData
        The loaded substance data.
    """
    all_substance_tuples = defaultdict(set)
    all_smiles_patterns = set()

    for property_type, substance_type in target_substances_per_property:

        # Load the full data sets from the processed data file
        data_frame = load_processed_data_set(data_directory, property_type,
                                             substance_type)

        substance_tuples = data_frame_to_smiles_tuples(data_frame)

        for substance_tuple in substance_tuples:
            all_substance_tuples[substance_tuple].add(
                (property_type, substance_type))

        substance_smiles = set(x for y in substance_tuples for x in y)
        all_smiles_patterns.update(substance_smiles)

    # Build the list of substances which we can choose from
    all_substance_data = []

    for substance_tuple in all_substance_tuples:

        # Make sure that this smiles tuple does actually exercise at least one
        # of the chemical environments of interest.
        smiles_per_smirks = find_smirks_matches(tuple(smirks_to_exercise),
                                                *substance_tuple)
        all_exercised_smirks = {
            smirks for smirks, smiles in smiles_per_smirks.items()
            if len(smiles) > 0
        }

        smirks_per_smiles = invert_dict_of_iterable(smiles_per_smirks)

        exercised_smirks_of_interest = set()

        for smiles_pattern in substance_tuple:

            if (smiles_pattern not in smirks_per_smiles
                    or len(smirks_per_smiles[smiles_pattern]) == 0):
                continue

            exercised_smirks_of_interest.update(
                smirks_per_smiles[smiles_pattern])

        if len(exercised_smirks_of_interest) == 0:
            continue

        substance_data = SubstanceData(
            substance_tuple=substance_tuple,
            smirks_exercised=all_exercised_smirks,
            property_types=all_substance_tuples[substance_tuple],
        )

        all_substance_data.append(substance_data)

    return all_substance_data
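# `invert_dict_of_iterable` is used above to turn a smirks -> smiles mapping
# into a smiles -> smirks mapping. The snippet below is a stand-in sketch of
# that idea (an assumption, not the project's implementation), using toy
# patterns and molecules.
from collections import defaultdict

smiles_per_smirks = {"[#8X2H1+0:1]": ["CCO"], "[#6X4:1]": ["CCO", "CCCC"]}

smirks_per_smiles = defaultdict(set)

for smirks, smiles_list in smiles_per_smirks.items():
    for smiles in smiles_list:
        smirks_per_smiles[smiles].add(smirks)

# "CCO" now maps to both patterns, while "CCCC" maps only to "[#6X4:1]".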
def main():

    root_output_directory = "data_by_environments"

    # Set up logging
    logging.basicConfig(level=logging.INFO)

    # Define the types of data to find.
    properties_of_interest = [
        [(EnthalpyOfMixing, SubstanceType.Binary),
         (Density, SubstanceType.Binary)],
        [
            (EnthalpyOfMixing, SubstanceType.Binary),
            (ExcessMolarVolume, SubstanceType.Binary),
        ],
        [
            (EnthalpyOfMixing, SubstanceType.Binary),
            (Density, SubstanceType.Binary),
            (ExcessMolarVolume, SubstanceType.Binary),
        ],
    ]

    # Define some shorter file names to use:
    type_to_file_name = {
        (Density, SubstanceType.Binary): "rho_x",
        (EnthalpyOfMixing, SubstanceType.Binary): "h_mix",
        (ExcessMolarVolume, SubstanceType.Binary): "v_excess",
    }

    # Define which types of mixtures we are interested in, e.g.
    # alcohol-alcohol, alcohol-ester etc.
    environments_of_interest = [
        os.path.basename(x) for x in glob("data_by_environments/*")
    ]

    for environment_of_interest in environments_of_interest:

        data_directory = os.path.join("data_by_environments",
                                      environment_of_interest, "all_data")

        os.makedirs(
            os.path.join(root_output_directory, environment_of_interest,
                         "common_data"),
            exist_ok=True,
        )

        for property_type_set in properties_of_interest:

            # Find the set of substances which are common to all of the
            # specified property types.
            all_substance_smiles = []

            for property_type, substance_type in property_type_set:

                data_frame = load_processed_data_set(data_directory,
                                                     property_type,
                                                     substance_type)

                if len(data_frame) == 0:

                    all_substance_smiles = []
                    break

                substance_smiles = set(data_frame_to_smiles_tuples(data_frame))
                all_substance_smiles.append(substance_smiles)

            if len(all_substance_smiles) == 0:
                continue

            common_substance_smiles = set.intersection(*all_substance_smiles)

            # Save the common substances to a pdf file.
            file_name = "_".join(type_to_file_name[x]
                                 for x in property_type_set)

            file_path = os.path.join(
                root_output_directory,
                environment_of_interest,
                "common_data",
                f"{file_name}.pdf",
            )

            if len(common_substance_smiles) > 0:
                smiles_to_pdf(list(common_substance_smiles), file_path)

            # Output the common data to the `common_data` directory.
            output_directory = os.path.join(root_output_directory,
                                            environment_of_interest,
                                            "common_data", file_name)

            for property_type, substance_type in property_type_set:

                data_frame = load_processed_data_set(data_directory,
                                                     property_type,
                                                     substance_type)

                data_frame = filter_by_substance_composition(
                    data_frame, common_substance_smiles, None)

                save_processed_data_set(output_directory, data_frame,
                                        property_type, substance_type)
def main():

    logging.basicConfig(level=logging.INFO)

    root_output_directory = "test_sets"
    os.makedirs(root_output_directory, exist_ok=True)

    # Define the types of property which are of interest.
    properties_of_interest = [
        (Density, SubstanceType.Pure),
        (EnthalpyOfVaporization, SubstanceType.Pure),
    ]

    # Define the state we would ideally choose data points at.
    target_states = [
        StatePoint(298.15 * unit.kelvin, 1.0 * unit.atmosphere, (1.0, )),
    ]
    target_states = {x: target_states for x in properties_of_interest}

    # Define the environments of interest.
    environments_of_interest = [
        "alcohol", "ester", "alkane", "ether", "ketone"
    ]

    # Load in the training substances so we can avoid selecting
    # them for the test set.
    training_smiles = load_training_components()

    with TemporaryDirectory() as data_directory:

        # Apply the filters to the available data.
        for property_of_interest in properties_of_interest:

            data_frames = []

            for environment in environments_of_interest:

                data_frame = load_processed_data_set(
                    os.path.join(
                        "..",
                        "..",
                        "..",
                        "data_availability",
                        "data_by_environments",
                        f"{environment}_{environment}",
                        "all_data",
                    ),
                    *property_of_interest,
                )

                data_frames.append(data_frame)

            data_frame = pandas.concat(data_frames,
                                       ignore_index=True,
                                       sort=False)

            data_frame = filter_data(data_frame)
            data_frame = filter_by_smiles(data_frame, training_smiles, None)

            save_processed_data_set(data_directory, data_frame,
                                    *property_of_interest)

        # Determine which components have enthalpy of vaporization
        # measurements. These will be the compounds which will be
        # included in the pure test set.
        h_vap_data_frame = load_processed_data_set(data_directory,
                                                   EnthalpyOfVaporization,
                                                   SubstanceType.Pure)

        test_set_components = {*h_vap_data_frame["Component 1"]}
        test_set_components = [(x, ) for x in test_set_components]

        # Select the data points.
        selected_data_set = select_data_points(
            data_directory=data_directory,
            chosen_substances=test_set_components,
            target_state_points=target_states,
        )

    selected_data_set.json(os.path.join(root_output_directory,
                                        "pure_set.json"))

    selected_data_frame = selected_data_set.to_pandas()
    selected_data_frame.to_csv(os.path.join(root_output_directory,
                                            "pure_set.csv"),
                               index=False)

    data_frame_to_pdf(selected_data_frame,
                      os.path.join(root_output_directory, "pure_set.pdf"))
def filter_common_data(output_directory, substances):
    """Filter the common data to a smaller temperature range - this
    seems to help the state selection method get closer to the target
    states.
    """
    os.makedirs(os.path.join(output_directory, "h_mix_and_rho_x"),
                exist_ok=True)

    for property_type, substance_type in [
        (EnthalpyOfMixing, SubstanceType.Binary),
        (Density, SubstanceType.Binary),
    ]:

        data_frames = []

        for environment_mix in [
                "alcohol_ester",
                "alcohol_alkane",
                "ether_alkane",
                "ether_ketone",
        ]:

            data_frame = load_processed_data_set(
                os.path.join(
                    "..",
                    "..",
                    "..",
                    "data_availability",
                    "data_by_environments",
                    environment_mix,
                    "common_data",
                    "h_mix_rho_x",
                ),
                property_type,
                substance_type,
            )

            data_frame = filter_by_substance_composition(
                data_frame, substances, None)

            data_frame = data_frame[(data_frame["Mole Fraction 1"] > 0.10)
                                    & (data_frame["Mole Fraction 1"] < 0.90)]

            data_frames.append(data_frame)

        full_data_frame = pandas.concat(data_frames)

        save_processed_data_set(
            os.path.join(output_directory, "h_mix_and_rho_x"),
            full_data_frame,
            property_type,
            substance_type,
        )
        data_frame_to_pdf(
            full_data_frame,
            os.path.join(
                output_directory,
                "h_mix_and_rho_x",
                property_to_file_name(property_type, substance_type) + ".pdf",
            ),
        )
def filter_common_data(output_directory):
    """Filter the common data to a smaller temperature range - this
    seems to help the state selection method get closer to the target
    states.
    """
    os.makedirs(os.path.join(output_directory, "h_mix_and_v_excess"),
                exist_ok=True)
    os.makedirs(os.path.join(output_directory, "h_mix_and_binary_density"),
                exist_ok=True)

    for property_type, substance_type in [
        (EnthalpyOfMixing, SubstanceType.Binary),
        (ExcessMolarVolume, SubstanceType.Binary),
    ]:

        data_frame = load_processed_data_set(
            os.path.join(
                "..",
                "..",
                "..",
                "data_availability",
                "data_by_environments",
                "alcohol_ester",
                "common_data",
                "h_mix_v_excess",
            ),
            property_type,
            substance_type,
        )
        data_frame = filter_by_temperature(data_frame, 290.0 * unit.kelvin,
                                           305 * unit.kelvin)
        save_processed_data_set(
            os.path.join(output_directory, "h_mix_and_v_excess"),
            data_frame,
            property_type,
            substance_type,
        )

    for property_type, substance_type in [
        (EnthalpyOfMixing, SubstanceType.Binary),
        (Density, SubstanceType.Binary),
    ]:

        data_frame = load_processed_data_set(
            os.path.join(
                "..",
                "..",
                "..",
                "data_availability",
                "data_by_environments",
                "alcohol_ester",
                "common_data",
                "h_mix_rho_x",
            ),
            property_type,
            substance_type,
        )
        data_frame = filter_by_temperature(data_frame, 290.0 * unit.kelvin,
                                           305 * unit.kelvin)
        save_processed_data_set(
            os.path.join(output_directory, "h_mix_and_binary_density"),
            data_frame,
            property_type,
            substance_type,
        )
def main():

    root_output_directory = "data_by_environments"

    # Define the number of processes to parallelize over.
    n_processes = 20

    # Set up logging
    logging.basicConfig(level=logging.INFO)

    # Define the properties and environments we are interested in.
    pure_properties_of_interest = [
        (Density, SubstanceType.Pure),
        (EnthalpyOfVaporization, SubstanceType.Pure),
    ]
    mixture_properties_of_interest = [
        (Density, SubstanceType.Binary),
        (ExcessMolarVolume, SubstanceType.Binary),
        (EnthalpyOfMixing, SubstanceType.Binary),
    ]

    environments_of_interest = {
        "alcohol": [
            chemical_environment_codes["hydroxy"],
            chemical_environment_codes["alcohol"],
        ],
        "ester": [
            chemical_environment_codes["caboxylic_acid"],
            chemical_environment_codes["ester"],
        ],
        "ether": [chemical_environment_codes["ether"]],
        "aldehyde": [chemical_environment_codes["aldehyde"]],
        "ketone": [chemical_environment_codes["ketone"]],
        "thiocarbonyl": [chemical_environment_codes["thiocarbonyl"]],
        "phenol": [chemical_environment_codes["phenol"]],
        "amine": [chemical_environment_codes["amine"]],
        "halogenated": [chemical_environment_codes["halogenated"]],
        "amide": [chemical_environment_codes["amide"]],
        "nitro": [chemical_environment_codes["nitro"]],
        "aromatic": [chemical_environment_codes["aromatic"]],
        "heterocycle": [chemical_environment_codes["heterocycle"]],
        "alkane": [""],
        "alkene": [chemical_environment_codes["alkene"]],
    }

    properties_of_interest = [
        *pure_properties_of_interest,
        *mixture_properties_of_interest,
    ]

    with TemporaryDirectory() as data_directory:

        root_data_directory = os.path.join("..", "..", "shared",
                                           "filtered_data")

        # Create a temporary directory which contains both the converted
        # mass density / excess molar volume data, and the other data of
        # interest
        for property_type, substance_type in properties_of_interest:

            if (property_type in [Density, ExcessMolarVolume]
                    and substance_type == SubstanceType.Binary):
                # Source any binary mass density or excess molar
                # volume from the full set of converted density
                # data.
                data_set = load_processed_data_set("converted_density_data",
                                                   property_type,
                                                   substance_type)

            elif property_type == EnthalpyOfVaporization:

                data_set = load_processed_data_set("sourced_h_vap_data",
                                                   property_type,
                                                   substance_type)

            else:

                data_set = load_processed_data_set(root_data_directory,
                                                   property_type,
                                                   substance_type)

            save_processed_data_set(data_directory, data_set, property_type,
                                    substance_type)

        # Determine all combinations of the environments of interest.
        environment_pairs = [(x, x) for x in environments_of_interest]
        environment_pairs.extend(
            itertools.combinations(environments_of_interest, 2))

        with Pool(n_processes) as pool:

            x = list(
                tqdm.tqdm(
                    pool.imap(
                        functools.partial(
                            apply_filters,
                            data_directory=data_directory,
                            environments_of_interest=environments_of_interest,
                            mixture_properties_of_interest=
                            mixture_properties_of_interest,
                            pure_properties_of_interest=
                            pure_properties_of_interest,
                            root_output_directory=root_output_directory,
                        ),
                        environment_pairs,
                    ),
                    total=len(environment_pairs),
                ))

        assert x is not None
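# A minimal sketch of the parallel-filtering pattern used above:
# `functools.partial` fixes the shared keyword arguments, `Pool.imap` streams
# the work items, and `tqdm` reports progress. `apply_filters_sketch` is a
# hypothetical stand-in for the real `apply_filters` worker.
import functools
from multiprocessing import Pool

import tqdm


def apply_filters_sketch(environment_pair, data_directory):
    return environment_pair, data_directory


if __name__ == "__main__":

    environment_pairs = [("alcohol", "alcohol"), ("alcohol", "ester")]

    with Pool(2) as pool:

        results = list(
            tqdm.tqdm(
                pool.imap(
                    functools.partial(apply_filters_sketch,
                                      data_directory="all_data"),
                    environment_pairs,
                ),
                total=len(environment_pairs),
            ))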
def main():

    root_output_directory = "partitioned_data"

    # Define the types of property which are of interest.
    properties_of_interest = [
        (Density, SubstanceType.Pure),
        (EnthalpyOfVaporization, SubstanceType.Pure),
        (EnthalpyOfMixing, SubstanceType.Binary),
        (ExcessMolarVolume, SubstanceType.Binary),
        (Density, SubstanceType.Binary),
    ]

    # Define the types of mixture which are of interest
    environment_types = ["alcohol_alcohol", "alcohol_ester", "ester_ester"]

    # Find all of the substances which appeared in the training set
    training_smiles = find_training_smiles()

    for environment_type in environment_types:

        output_directory = os.path.join(root_output_directory, environment_type)
        os.makedirs(output_directory, exist_ok=True)

        for property_type, substance_type in properties_of_interest:

            full_data_frame = load_processed_data_set(
                os.path.join("filtered_data", environment_type),
                property_type,
                substance_type,
            )

            property_type = property_to_snake_case(property_type)
            file_name = f"{property_type}_{str(substance_type.value)}"

            # Extract properties where neither component appears in
            # the training set.
            if substance_type == SubstanceType.Binary:

                data_frame = full_data_frame[
                    ~full_data_frame["Component 1"].isin(training_smiles)
                    & ~full_data_frame["Component 2"].isin(training_smiles)
                ]

            elif substance_type == SubstanceType.Pure:

                data_frame = full_data_frame[
                    ~full_data_frame["Component 1"].isin(training_smiles)
                ]

            else:

                raise NotImplementedError()

            base_directory = os.path.join(output_directory, "not_in_training")
            os.makedirs(base_directory, exist_ok=True)

            data_frame.to_csv(
                os.path.join(base_directory, file_name + ".csv"), index=False
            )
            data_frame_to_pdf(
                data_frame, os.path.join(base_directory, file_name + ".pdf")
            )

            if substance_type == SubstanceType.Pure:
                continue

            # Extract properties where both components appear in
            # the training set.
            data_frame = full_data_frame[
                full_data_frame["Component 1"].isin(training_smiles)
                & full_data_frame["Component 2"].isin(training_smiles)
            ]

            base_directory = os.path.join(output_directory, "both_in_training")
            os.makedirs(base_directory, exist_ok=True)

            data_frame.to_csv(
                os.path.join(base_directory, file_name + ".csv"), index=False
            )
            data_frame_to_pdf(
                data_frame, os.path.join(base_directory, file_name + ".pdf")
            )

            # Extract properties where only one component appears in
            # the training set.
            data_frame = full_data_frame[
                (
                    full_data_frame["Component 1"].isin(training_smiles)
                    & ~full_data_frame["Component 2"].isin(training_smiles)
                )
                | (
                    ~full_data_frame["Component 1"].isin(training_smiles)
                    & full_data_frame["Component 2"].isin(training_smiles)
                )
            ]

            base_directory = os.path.join(output_directory, "one_in_training")
            os.makedirs(base_directory, exist_ok=True)

            data_frame.to_csv(
                os.path.join(base_directory, file_name + ".csv"), index=False
            )
            data_frame_to_pdf(
                data_frame, os.path.join(base_directory, file_name + ".pdf")
            )
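# A toy sketch of the `isin`-based partitioning above: split binary mixtures
# according to how many of their components appear in the training-set SMILES.
import pandas

training_smiles = {"CCO", "CCCCO"}

full_data_frame = pandas.DataFrame({
    "Component 1": ["CCO", "CCO", "CCCCCC"],
    "Component 2": ["CCCCO", "CCCCCC", "CCCCCCC"],
})

in_training_1 = full_data_frame["Component 1"].isin(training_smiles)
in_training_2 = full_data_frame["Component 2"].isin(training_smiles)

both_in_training = full_data_frame[in_training_1 & in_training_2]
one_in_training = full_data_frame[in_training_1 ^ in_training_2]
not_in_training = full_data_frame[~in_training_1 & ~in_training_2]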
Example #17
0
def main():
    """Collates a directory of NIST ThermoML archive files into
    more readily manipulable pandas csv files.
    """

    # Set up logging
    logging.basicConfig(level=logging.INFO)

    raw_data_directory = resource_filename("nistdataselection",
                                           os.path.join("data", "thermoml"))
    processed_data_directory = "processed_data"

    # Convert the raw ThermoML data files into more easily manipulable
    # `pandas.DataFrame` objects.
    processing.process_raw_data(
        directory=raw_data_directory,
        output_directory=processed_data_directory,
        retain_values=True,
        retain_uncertainties=True,
    )

    # Define the ranges of temperatures and pressures of interest.
    # Here we choose a range of temperatures which are biologically
    # relevant (15 C - 50 C) and pressures which are close to ambient.
    temperature_range = (288.15 * unit.kelvin, 323.15 * unit.kelvin)
    pressure_range = (0.95 * unit.atmosphere, 1.05 * unit.atmosphere)

    # Define the elements that we are interested in. Here we only allow
    # a subset of those elements for which Parsley has parameters, and
    # for which plentiful data exists in the ThermoML archives.
    allowed_elements = ["H", "N", "C", "O", "S", "F", "Cl", "Br", "I"]

    # Define the target number of unique substances to choose for each
    # type of property of interest.
    target_substances_per_property = {
        (Density, SubstanceType.Pure): 1,
    }

    # Create a directory to store the filtered data in.
    filtered_data_directory = "filtered_data"
    os.makedirs(filtered_data_directory, exist_ok=True)

    # Perform basic filtering on the data sets.
    for property_type, substance_type in target_substances_per_property:

        # Load the full data sets from the processed data file
        logging.info(f"Applying filters to the {substance_type.value} "
                     f"{property_type.__name__} data set.")
        data_set = processing.load_processed_data_set(processed_data_directory,
                                                      property_type,
                                                      substance_type)

        # Apply a standard set of filters.
        data_set = filtering.apply_standard_filters(data_set,
                                                    temperature_range,
                                                    pressure_range,
                                                    allowed_elements)

        logging.info(f"The filtered data set contains {len(data_set)} "
                     f"properties.")

        # Save the filtered data set.
        processing.save_processed_data_set(filtered_data_directory, data_set,
                                           property_type, substance_type)

    # Choose a set of unique substances to train the VdW parameters against.
    # These are just tuples of smiles patterns which define the composition of
    # the substance. We choose the actual mole fractions of components in a later
    # step.
    #
    # Here we specify which regions of chemical space we want to cover. This
    # is mainly driven by the VdW parameters we wish to exercise, but may also
    # be supplemented with additional environments which are poorly represented.
    target_environments = [
        "[#1:1]-[#6X4]",
        "[#1:1]-[#6X3]",
        "[#1:1]-[#8]",
        "[#6:1]",
        "[#6X4:1]",
        "[#8:1]",
        "[#8X2H0+0:1]",
        "[#8X2H1+0:1]",
        "[#7:1]",
        "[#16:1]",
        "[#9:1]",
        "[#17:1]",
        "[#35:1]",
    ]

    chosen_substances = selection.select_substances(
        filtered_data_directory, target_substances_per_property,
        target_environments)

    logging.info(f"{len(chosen_substances)} substances where chosen.")

    # Define the specific states at which we wish to select data. These are currently
    # tuples of temperature, pressure, and a tuple of the mole fractions of each of the
    # components.
    density_target_state_points = [
        selection.StatePoint(298.15 * unit.kelvin, 101.325 * unit.kilopascal,
                             (1.0, )),
        selection.StatePoint(318.15 * unit.kelvin, 101.325 * unit.kilopascal,
                             (1.0, )),
    ]

    target_property_state_points = {
        (Density, SubstanceType.Pure): density_target_state_points,
    }

    # Set the output path to the data set.
    data_set_name = "pure_data_set"

    # Choose the final data set containing the chosen substances, and
    # data points at the target state points.
    data_set = selection.select_data_points(filtered_data_directory,
                                            chosen_substances,
                                            target_property_state_points)

    with open(f"{data_set_name}.json", "w") as file:
        file.write(data_set.json())

    data_set.to_pandas().to_csv(f"{data_set_name}.csv")

    # Generate a pdf report detailing the chosen set.
    reporting.generate_report(f"{data_set_name}.json",
                              vdw_smirks_of_interest=target_environments)
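# A toy sketch of the kind of temperature / pressure window filtering that
# `filtering.apply_standard_filters` performs above (the real helper also
# filters by elements and by other criteria). The "Temperature (K)" column
# name is an assumption; a plain `pint.UnitRegistry` stands in for the
# project's `unit` object.
import pandas
import pint

unit = pint.UnitRegistry()

temperature_range = (288.15 * unit.kelvin, 323.15 * unit.kelvin)
pressure_range = (0.95 * unit.atmosphere, 1.05 * unit.atmosphere)

data_set = pandas.DataFrame({
    "Temperature (K)": [298.15, 350.0],
    "Pressure (kPa)": [101.325, 101.325],
})

data_set = data_set[
    data_set["Temperature (K)"].between(
        temperature_range[0].to(unit.kelvin).magnitude,
        temperature_range[1].to(unit.kelvin).magnitude,
    )
    & data_set["Pressure (kPa)"].between(
        pressure_range[0].to(unit.kilopascal).magnitude,
        pressure_range[1].to(unit.kilopascal).magnitude,
    )
]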