def main():
    """Summarise the experimental uncertainties reported for each property.

    The raw ThermoML archives are converted into processed data frames (only
    when the processed data directory does not already exist). For each
    property / substance type of interest, the minimum, maximum, mean and
    modal reported uncertainty are then written to a JSON file in the
    ``uncertainties`` directory.

    Missing uncertainties, and uncertainties which are not below 5.0 (in the
    default unit of the property), are ignored. Property types for which no
    uncertainties remain are skipped and no file is written for them.
    """
    raw_data_directory = "../raw_archives"
    processed_data_directory = "data_with_uncertainties"

    output_directory = "uncertainties"
    os.makedirs(output_directory, exist_ok=True)

    # Convert the raw ThermoML data files into more easily manipulable
    # `pandas.DataFrame` objects. This is skipped when the processed data
    # directory is already present.
    if not os.path.isdir(processed_data_directory):

        process_raw_data(
            directory=raw_data_directory,
            output_directory=processed_data_directory,
            retain_values=True,
            retain_uncertainties=True,
            n_processes=20,
            files_per_worker=50,
        )

    # Specify the properties to extract the modal uncertainties of.
    properties_of_interest = [
        (Density, SubstanceType.Pure),
        (Density, SubstanceType.Binary),
        (EnthalpyOfVaporization, SubstanceType.Pure),
        (ExcessMolarVolume, SubstanceType.Binary),
        (EnthalpyOfMixing, SubstanceType.Binary),
    ]

    for property_type, substance_type in properties_of_interest:

        data_frame = load_processed_data_set(
            processed_data_directory, property_type, substance_type
        )

        if len(data_frame) == 0:
            continue

        default_unit = property_type.default_unit()

        uncertainty_header = (
            f"{property_type.__name__} Uncertainty ({default_unit:~})"
        )

        # Drop NaN or unbelievably high uncertainties.
        raw_uncertainties = data_frame[uncertainty_header].dropna()
        raw_uncertainties = raw_uncertainties[raw_uncertainties < 5.0]

        # Nothing survived the filters, so there are no statistics to report.
        if len(raw_uncertainties) == 0:
            continue

        # Depending on the SciPy version, `mode` is either a scalar or a
        # length one array; `ravel` normalises both to an indexable array.
        modal_value = numpy.ravel(scipy.stats.mode(raw_uncertainties).mode)[0]

        uncertainties = {
            "minimum": float(numpy.min(raw_uncertainties)),
            "maximum": float(numpy.max(raw_uncertainties)),
            "mean": float(numpy.mean(raw_uncertainties)),
            "mode": float(modal_value),
        }

        # Save the uncertainties to a JSON file.
        property_name = property_to_snake_case(property_type)
        file_name = f"{property_name}_{str(substance_type.value)}.json"

        with open(os.path.join(output_directory, file_name), "w") as file:
            json.dump(uncertainties, file)
def filter_data(data_directory, properties_of_interest, chemical_environments, output_directory):
    """Filter each data set of interest by element and chemical environment.

    For every property / substance type the processed data set is loaded,
    passed through an element filter and then a checkmol based chemical
    environment filter, and finally stored (together with a pdf rendering
    of the remaining data) in the output directory.

    Parameters
    ----------
    data_directory: str
        The directory containing the unfiltered data.
    properties_of_interest: list of tuple of PropertyType and SubstanceType
        The types of properties to extract data for.
    chemical_environments: list of list of str
        A list of those chemical environments to filter by. Each list in
        the full list corresponds to the chemical environments which should
        be matched by one of the components in the system.
    output_directory: str
        The directory to store the extracted data in.
    """
    allowed_elements = ("C", "H", "O", "N", "F", "Cl", "Br", "S")

    for property_type, substance_type in properties_of_interest:

        filtered_set = processing.load_processed_data_set(
            data_directory, property_type, substance_type
        )

        # First apply the element filter (C, H, O, N, F, Cl, Br, S).
        filtered_set = filter_by_elements(filtered_set, *allowed_elements)

        # Then apply the checkmol based chemical environment filter.
        filtered_set = filter_by_checkmol(filtered_set, *chemical_environments)

        # Store the filtered data set.
        processing.save_processed_data_set(
            output_directory, filtered_set, property_type, substance_type
        )

        # Also store a pdf of all smiles patterns (/ tuples of smiles patterns).
        snake_case_name = property_to_snake_case(property_type)
        pdf_name = f"{snake_case_name}_{str(substance_type.value)}.pdf"

        data_frame_to_pdf(filtered_set, os.path.join(output_directory, pdf_name))
def filter_data(data_directory, property_type, substance_type, output_directory):
    """Filter a single processed data set and store the result.

    The data set is filtered by temperature (290.0 K to 305 K), by a list of
    SMIRKS patterns and by undefined stereochemistry, then saved to the
    output directory alongside a pdf rendering of the remaining data.

    Parameters
    ----------
    data_directory: str
        The directory containing the data set to filter.
    property_type
        The type of property to load the data for.
    substance_type
        The type of substance to load the data for.
    output_directory: str
        The directory to store the filtered data in.
    """
    minimum_temperature = 290.0 * unit.kelvin
    maximum_temperature = 305 * unit.kelvin

    # Aromatics, 3 + 4 membered rings, alkenes, long chain
    # molecules (>= hept) and ethers.
    smirks_to_filter = [
        "[#6a]",
        "[#6r3]",
        "[#6r4]",
        "[#6]=[#6]",
        "[#6]~[#6]~[#6]~[#6]~[#6]~[#6]~[#6]",
        "[#6H2]-[#8X2]-[#6H2]",
    ]

    filtered_frame = load_processed_data_set(data_directory, property_type, substance_type)

    # Retain only measurements made close to ambient temperature.
    filtered_frame = filter_by_temperature(
        filtered_frame, minimum_temperature, maximum_temperature
    )
    # Filter out the unwanted chemistries.
    filtered_frame = filter_by_smirks(filtered_frame, None, smirks_to_filter)
    # Filter out any molecules with undefined stereochemistry.
    filtered_frame = filter_undefined_stereochemistry(filtered_frame)

    # Store the filtered set and its pdf rendering.
    save_processed_data_set(output_directory, filtered_frame, property_type, substance_type)

    snake_case_name = property_to_snake_case(property_type)
    pdf_path = os.path.join(
        output_directory, f"{snake_case_name}_{str(substance_type.value)}.pdf"
    )

    data_frame_to_pdf(filtered_frame, pdf_path)
def main():
    """Process the raw ThermoML archives and patch missing h_vap pressures.

    The raw archives are converted into processed data sets (values are
    retained, uncertainties are not). Any enthalpy of vaporization entry
    without a pressure is then assigned a pressure of one atmosphere
    (expressed in kPa) and the data set is saved back in place.
    """
    raw_data_directory = "raw_archives"
    processed_data_directory = "processed_data"

    # Convert the raw ThermoML data files into more easily manipulable
    # `pandas.DataFrame` objects.
    process_raw_data(
        directory=raw_data_directory,
        output_directory=processed_data_directory,
        retain_values=True,
        retain_uncertainties=False,
        n_processes=20,
        files_per_worker=50,
    )

    # 'Fix' the enthalpy of vaporization entries so that they all
    # have a pressure (approximated as ambient).
    ambient_pressure = 1.0 * unit.atmosphere
    pressure_header = "Pressure (kPa)"

    for substance_type in (
        SubstanceType.Pure,
        SubstanceType.Binary,
        SubstanceType.Ternary,
    ):

        h_vap_data = load_processed_data_set(
            processed_data_directory, EnthalpyOfVaporization, substance_type
        )

        fill_value = ambient_pressure.to(unit.kilopascal).magnitude
        h_vap_data[pressure_header] = h_vap_data[pressure_header].fillna(fill_value)

        save_processed_data_set(
            processed_data_directory, h_vap_data, EnthalpyOfVaporization, substance_type
        )
def plot_estimated_vs_reference(property_types, study_names, output_directory):
    """Plot estimated against reference values for each type of property.

    For every property / substance type, the results of each study are read
    from ``partitioned_data/<study name>/<environment>``, merged into a
    single frame and plotted as one facet per study, coloured by
    environment. One ``.png`` file per property type is written to the
    output directory. Property types for which no data could be found are
    skipped.

    Parameters
    ----------
    property_types: list of tuple
        The (property type, substance type) pairs to plot.
    study_names: list of str
        The names of the study sub-directories of ``partitioned_data``.
    output_directory: str
        The directory to save the plots in.
    """
    for property_type, substance_type in property_types:

        default_unit = property_type.default_unit()
        type_name = property_type.__name__

        reference_header = f"Reference {type_name} Value ({default_unit:~})"
        estimated_header = f"Estimated {type_name} Value ({default_unit:~})"
        estimated_std_header = f"Estimated {type_name} Uncertainty ({default_unit:~})"

        # Refactor the data into a single frame.
        data_frames = []

        for study_name in study_names:

            results_directory = os.path.join("partitioned_data", study_name)

            environments = [
                os.path.basename(x)
                for x in glob(os.path.join(results_directory, "*"))
            ]
            # Only retain environments whose number of '_' separated parts
            # matches the value mapped to this substance type.
            environments = [
                x
                for x in environments
                if len(tuple(x.split("_"))) == substance_type_to_int[substance_type]
            ]

            for environment in environments:

                try:
                    data_frame = load_processed_data_set(
                        os.path.join(results_directory, environment),
                        property_type,
                        substance_type,
                    )
                except FileNotFoundError:
                    continue

                if len(data_frame) == 0:
                    continue

                data_frames.append(
                    pandas.DataFrame(
                        {
                            "Reference Value": data_frame[reference_header],
                            "Reference Std": 0.0,
                            "Estimated Value": data_frame[estimated_header],
                            "Estimated Std": data_frame[estimated_std_header],
                            "Study": study_name,
                            "Environment": environment,
                        }
                    )
                )

        # `pandas.concat` raises when given nothing to concatenate, so skip
        # any property type for which no data was found.
        if len(data_frames) == 0:
            continue

        data_frame = pandas.concat(data_frames, ignore_index=True, sort=False)

        environments = list(sorted(set(data_frame["Environment"])))
        palette = seaborn.color_palette("Set1", len(environments))

        # NOTE(review): newer seaborn releases name the `size` argument
        # `height` - confirm against the seaborn version in use.
        plot = seaborn.FacetGrid(
            data_frame,
            col="Study",
            sharex="row",
            sharey="row",
            hue_order=environments,
            palette=palette,
            size=4.0,
            aspect=0.8,
        )
        plot.map_dataframe(
            plot_scatter,
            "Estimated Value",
            "Reference Value",
            "Reference Std",
            "Estimated Std",
            "Environment",
            environments,
            color=palette,
            marker="o",
            linestyle="None",
        )
        plot.set_titles("{col_name}")
        plot.add_legend()

        pyplot.subplots_adjust(top=0.85)

        property_title = property_to_title(property_type, substance_type)
        plot.fig.suptitle(property_title)

        file_name = property_to_file_name(property_type, substance_type)
        plot.savefig(os.path.join(output_directory, f"{file_name}.png"))

        # Release the figure so that figures do not accumulate in memory
        # while looping over the property types.
        pyplot.close(plot.fig)
def choose_data_points(
    property_of_interest,
    chosen_substances,
    target_states,
    environments_of_interest,
):
    """Select the data points to include in the benchmark set for each
    of the chosen substances.

    The data of every environment of interest is pooled, restricted to the
    chosen substances and staged in a temporary directory from which the
    data points are selected. Selected points whose first mole fraction is
    not strictly between 0.15 and 0.85 are discarded.

    Parameters
    ----------
    property_of_interest: tuple of type of PhysicalProperty and SubstanceType
        The type of property to select data points for.
    chosen_substances: list of tuple of str
        The substances to choose data points for.
    target_states: list of StatePoint
        The target states to select data points at.
    environments_of_interest: list of str
        The names of the environment directories (found under
        ``data_availability/data_by_environments``) to load data from.

    Returns
    -------
    pandas.DataFrame
        The selected data points.
    """
    environments_root = os.path.join(
        "..", "..", "..", "data_availability", "data_by_environments"
    )

    with TemporaryDirectory() as data_directory:

        available_frames = []

        for environment in environments_of_interest:

            data_folder = os.path.join(environments_root, environment, "all_data")

            try:
                environment_frame = load_processed_data_set(
                    data_folder, *property_of_interest
                )
            except FileNotFoundError:
                continue

            if len(environment_frame) != 0:
                available_frames.append(environment_frame)

        pooled_frame = pandas.concat(available_frames, ignore_index=True, sort=False)
        pooled_frame = filter_by_substance_composition(
            pooled_frame, chosen_substances, None
        )

        # Fill in the missing columns
        for column_name in ("Exact Amount 1", "Exact Amount 2"):

            if column_name not in pooled_frame:
                pooled_frame[column_name] = numpy.nan

        save_processed_data_set(data_directory, pooled_frame, *property_of_interest)

        selected_data_set = select_data_points(
            data_directory=data_directory,
            chosen_substances=None,
            target_state_points={property_of_interest: target_states},
        )

        selected_data_frame = selected_data_set.to_pandas()

        # Prune any data points measured for too low or too high
        # mole fractions.
        mole_fractions = selected_data_frame["Mole Fraction 1"]
        within_range = (mole_fractions > 0.15) & (mole_fractions < 0.85)

        selected_data_frame = selected_data_frame[within_range]

    return selected_data_frame
def choose_substances(
    property_of_interest,
    environment,
    finger_print_type,
    n_mixtures_per_environment,
    training_mixtures,
):
    """A function which aims to select a set of substances which are as
    distinct as possible from both the training and currently selected
    test set.

    This proceeds by:

    1. Selecting the molecule which is 'furthest' away from both the
       training set and the currently selected test set (which starts
       off empty), where the distance is defined as:

       sqrt(compute_distance_with_set(unselected_substance, training_set) ** 2 +
            compute_distance_with_set(unselected_substance, test_set) ** 2)

       While the test set is still empty its contribution is taken to be
       zero, i.e. the first selection is based on the training set only.

    2. Moving the selected molecule from the unselected set into the
       test set.

    3. Repeat steps 1 and 2 until either the target number of molecules
       have been selected, or there are no more unselected molecules to
       choose from.

    Parameters
    ----------
    property_of_interest: tuple of type of PhysicalProperty and SubstanceType
        The properties of interest.
    environment: str
        The environment (e.g. alcohol_alkane) to select molecules for.
    finger_print_type: OEFPTypeBase
        The type of finger print to base the distance metrics on.
    n_mixtures_per_environment: int
        The target number of molecules to select.
    training_mixtures: list of tuple of str
        The substances in the training set.

    Returns
    -------
    list of tuple of str
        The selected molecules, in the order in which they were selected.
        An empty list is returned if no data file could be found for the
        environment.
    """
    property_name = property_to_file_name(*property_of_interest)

    logger.info(f"{property_name}_{environment}: Starting.")

    try:

        data_frame = load_processed_data_set(
            os.path.join(
                "..",
                "..",
                "..",
                "data_availability",
                "data_by_environments",
                environment,
                "all_data",
            ),
            *property_of_interest,
        )

    except FileNotFoundError:
        return []

    # Filter out the training mixtures.
    data_frame = filter_by_substance_composition(data_frame, None, training_mixtures)
    data_frame = filter_data(data_frame)

    mixtures = {
        tuple(sorted(components))
        for components in zip(data_frame["Component 1"], data_frame["Component 2"])
    }

    # Sort the candidates so that ties in the distance metric are broken
    # reproducibly rather than by the iteration order of a set.
    open_list = sorted(mixtures)
    closed_list = []

    max_n_possible = min(len(open_list), n_mixtures_per_environment)

    def distance_metric(mixture):
        """Combined distance of a candidate to the training set and to the
        mixtures selected so far."""

        training_distance = compute_distance_with_set(
            mixture, training_mixtures, finger_print_type
        )

        # The test set distance is measured against the mixtures which
        # have already been selected (not the full pool of candidates,
        # which would include the candidate itself).
        test_distance = 0.0

        if len(closed_list) > 0:
            test_distance = compute_distance_with_set(
                mixture, closed_list, finger_print_type
            )

        return sqrt(training_distance ** 2 + test_distance ** 2)

    while len(open_list) > 0 and len(closed_list) < n_mixtures_per_environment:

        # `max` returns the first maximal candidate, matching the behaviour
        # of taking the head of a stable descending sort.
        least_similar = max(open_list, key=distance_metric)

        open_list.remove(least_similar)
        closed_list.append(least_similar)

        logger.info(
            f"{property_name}_{environment}: "
            f"{len(closed_list)} / {max_n_possible} selected"
        )

    return closed_list
def main():
    """Summarise how many substances have data for each property combination.

    For every environment directory found under ``data_by_environments``
    (whose name is expected to be of the form ``<environment 1>_<environment 2>``)
    this counts the substances which have data for each single property of
    interest, and for each pair of properties of interest. The counts are
    written to ``summary.csv`` and ``summary.md``, sorted by the number of
    substances which have both enthalpy of mixing and density data.
    """
    root_data_directory = "data_by_environments"

    # Set up logging
    logging.basicConfig(level=logging.INFO)

    # Define the properties and environments we are interested in.
    environments_of_interest = [
        os.path.basename(x) for x in glob(os.path.join(root_data_directory, "*"))
    ]

    properties_of_interest = [
        (EnthalpyOfMixing, SubstanceType.Binary),
        (Density, SubstanceType.Binary),
        # (ExcessMolarVolume, SubstanceType.Binary),
    ]

    friendly_names = {
        (EnthalpyOfMixing, SubstanceType.Binary): "Hmix(x)",
        (Density, SubstanceType.Binary): "rho(x)",
        # (ExcessMolarVolume, SubstanceType.Binary): "Vexcess(x)",
    }

    property_combinations = [(x,) for x in properties_of_interest]
    property_combinations.extend(itertools.combinations(properties_of_interest, 2))

    # Build the column label of each combination once, so that a count is
    # always stored under the label of the full combination, even when the
    # search below stops early.
    combination_labels = [
        " + ".join(friendly_names[x] for x in property_combination)
        for property_combination in property_combinations
    ]

    data_rows = []

    for environment_of_interest in environments_of_interest:

        environment_1, environment_2 = environment_of_interest.split("_")

        data_row = {"Environment 1": environment_1, "Environment 2": environment_2}

        data_directory = os.path.join(
            root_data_directory, "_".join([environment_1, environment_2]), "all_data"
        )

        for property_combination, combination_label in zip(
            property_combinations, combination_labels
        ):

            # Find the set of substances which are common to all of the
            # specified property types.
            all_substance_smiles = []

            for property_tuple in property_combination:

                data_frame = load_processed_data_set(data_directory, *property_tuple)

                if len(data_frame) == 0:
                    # No data for one property means nothing can be common.
                    all_substance_smiles = []
                    break

                all_substance_smiles.append(
                    set(data_frame_to_smiles_tuples(data_frame))
                )

            common_substance_smiles = set()

            if len(all_substance_smiles) > 0:
                common_substance_smiles = set.intersection(*all_substance_smiles)

            data_row[combination_label] = len(common_substance_smiles)

        data_rows.append(data_row)

    columns = ["Environment 1", "Environment 2", *combination_labels]

    summary_frame = pandas.DataFrame(data=data_rows, columns=columns)
    summary_frame.fillna(0, inplace=True)
    summary_frame.sort_values(["Hmix(x) + rho(x)"], ascending=False, inplace=True)

    summary_frame.to_csv("summary.csv", index=False)

    # NOTE(review): `showindex` is forwarded to the markdown backend; newer
    # pandas releases expect `index=False` instead - confirm against the
    # pandas version in use.
    with open("summary.md", "w") as file:
        summary_frame.to_markdown(file, showindex=False)
def select_data_points(data_directory, chosen_substances, target_state_points):
    """Select, for each substance, the data points which are clustered
    around the conditions specified by `target_state_points`.

    The points are chosen so as to try and maximise the number of properties
    measured at the same condition (e.g. ideally we would have a data point
    for each property at T=298.15 and p=1atm) as this will maximise the
    chances that we can extract all properties from a single simulation.

    Parameters
    ----------
    data_directory: str
        The directory which contains the processed pandas
        data sets
    chosen_substances: list of tuple of str, optional
        The substances to choose data points for. If None,
        no filtering of substances will be performed by this function.
        Each substance is compared as a sorted tuple of its component
        smiles patterns.
    target_state_points: dict of tuple of type and SubstanceType and list of StatePoint
        A list of the state points for which we would ideally have data
        points for. The value tuple should be of the form
        (temperature, pressure, (mole fraction 0, ..., mole fraction N))

    Returns
    -------
    PhysicalPropertyDataSet
        A data set which contains the chosen data points.
    """

    # Load the full data set from the processed data files - one file per
    # (property type, substance type) key of `target_state_points`.
    data_frames = []

    for property_type, substance_type in target_state_points:

        data_frame = load_processed_data_set(data_directory, property_type, substance_type)
        data_frames.append(data_frame)

    full_data_frame = pandas.concat(data_frames, ignore_index=True, sort=False)

    data_set = data_set_from_data_frame(full_data_frame)

    properties_by_substance = defaultdict(list)

    # Partition the properties by their substance components,
    # filtering out any not chosen substances.
    for substance in data_set.substances:

        # Sorting makes the key independent of the component ordering.
        substance_tuple = tuple(
            sorted([component.smiles for component in substance.components]))

        if chosen_substances is not None and substance_tuple not in chosen_substances:
            continue

        properties_by_substance[substance_tuple].extend(
            data_set.properties_by_substance(substance))

    # Start to choose the state points.
    return_data_set = PhysicalPropertyDataSet()

    for substance_tuple in properties_by_substance:

        # Cluster the data points around the closest states of interest.
        clustered_properties = _cluster_properties_around_states(
            properties_by_substance[substance_tuple], target_state_points)

        # For each cluster, we try to find the state points for which we have
        # measured the most types of properties (i.e. prioritise states
        # for which we have a density, dielectric and enthalpy measurement
        # over those for which we only have a density measurement).
        for target_state_point, physical_properties in clustered_properties.items(
        ):

            properties_per_state = defaultdict(list)
            property_types_per_state = defaultdict(set)

            # Refactor the properties into more convenient data structures:
            # the properties, and the types of property, measured at each
            # distinct state point in this cluster.
            for physical_property in physical_properties:

                state_point = StatePoint.from_physical_property(
                    physical_property)
                property_tuple = property_to_type_tuple(physical_property)

                properties_per_state[state_point].append(physical_property)
                property_types_per_state[state_point].add(property_tuple)

            # Sort the state points based on their distance to the target state,
            # i.e. by `StatePoint.individual_distances(target_state_point, state_point)`.
            sorted_states_points = list(
                sorted(
                    properties_per_state.keys(),
                    key=functools.partial(StatePoint.individual_distances,
                                          target_state_point),
                ))

            # Keep track of the properties which we need to choose a state point for
            properties_to_cover = set(
                property_tuple for property_tuple in target_state_points)
            # as well as the chosen state points
            chosen_state_points = set()

            # Iteratively consider state points which have all data points, down
            # to state points for which we only have single property measurements.
            for target_number_of_properties in reversed(
                    range(1, len(target_state_points) + 1)):

                # Within each pass the state points are visited from the
                # closest to the furthest from the target state.
                for state_point in sorted_states_points:

                    property_types_at_state = property_types_per_state[
                        state_point]

                    # Only consider states with exactly the number of
                    # property types targeted by this pass.
                    if len(property_types_at_state
                           ) != target_number_of_properties:
                        continue

                    # Skip states which would not cover any property type
                    # that is still outstanding.
                    if (len(
                            properties_to_cover.intersection(
                                property_types_at_state)) == 0):
                        continue

                    chosen_state_points.add(state_point)

                    # The symmetric difference of a set with one of its own
                    # subsets removes that subset, i.e. this drops the newly
                    # covered property types from those left to cover.
                    properties_to_cover = properties_to_cover.symmetric_difference(
                        properties_to_cover.intersection(
                            property_types_at_state))

            # Add the properties which were measured at the chosen state points
            # to the returned data set. Note that all of the properties at a
            # chosen state are added, not only those of the newly covered types.
            for state_point in chosen_state_points:

                if len(properties_per_state[state_point]) == 0:
                    continue

                return_data_set.add_properties(
                    *properties_per_state[state_point])

    return return_data_set
def _build_substance_data(data_directory, target_substances_per_property, smirks_to_exercise):
    """Loads all of the different data sets for each property type of
    interest and converts them into a single list of `SubstanceData` objects.

    Any substances which don't exercise at least one of the chemical
    environments of interest are ignored.

    Parameters
    ----------
    data_directory: str
        The directory which contains the processed pandas
        data sets
    target_substances_per_property: dict of tuple of type and SubstanceType and int
        The target number of unique substances to choose for each
        type of property of interest. Only the keys are used here.
    smirks_to_exercise: list of str
        A list of those smirks patterns which represent those chemical
        environments which we aim to exercise.

    Returns
    -------
    list of SubstanceData
        The loaded substance data.
    """
    # Maps each substance (a tuple of smiles patterns) onto the set of
    # (property type, substance type) tuples for which it has data.
    all_substance_tuples = defaultdict(set)

    for property_type, substance_type in target_substances_per_property:

        # Load the full data sets from the processed data file
        data_frame = load_processed_data_set(data_directory, property_type, substance_type)

        for substance_tuple in data_frame_to_smiles_tuples(data_frame):
            all_substance_tuples[substance_tuple].add((property_type, substance_type))

    # Build the list of substances which we can choose from
    all_substance_data = []

    for substance_tuple, property_types in all_substance_tuples.items():

        smiles_per_smirks = find_smirks_matches(
            tuple(smirks_to_exercise), *substance_tuple
        )

        all_exercised_smirks = {
            smirks for smirks, smiles in smiles_per_smirks.items() if len(smiles) > 0
        }

        # Make sure that this smiles tuple does actually exercise at least one
        # of the chemical environments of interest.
        smirks_per_smiles = invert_dict_of_iterable(smiles_per_smirks)

        exercised_smirks_of_interest = set()

        for smiles_pattern in substance_tuple:

            if smiles_pattern not in smirks_per_smiles:
                continue

            exercised_smirks_of_interest.update(smirks_per_smiles[smiles_pattern])

        if len(exercised_smirks_of_interest) == 0:
            continue

        all_substance_data.append(
            SubstanceData(
                substance_tuple=substance_tuple,
                smirks_exercised=all_exercised_smirks,
                property_types=property_types,
            )
        )

    return all_substance_data
def main():
    """Extract the data of those substances which have measurements for
    every property within a set of property types.

    For each environment directory under ``data_by_environments``, and for
    each set of property types, the substances common to all of the property
    types are determined. A pdf of the common substances (when there are any)
    and the data restricted to those substances are stored under the
    ``common_data`` directory of the environment.
    """
    root_output_directory = "data_by_environments"

    # Set up logging
    logging.basicConfig(level=logging.INFO)

    binary_h_mix = (EnthalpyOfMixing, SubstanceType.Binary)
    binary_density = (Density, SubstanceType.Binary)
    binary_v_excess = (ExcessMolarVolume, SubstanceType.Binary)

    # Define the types of data to find.
    properties_of_interest = [
        [binary_h_mix, binary_density],
        [binary_h_mix, binary_v_excess],
        [binary_h_mix, binary_density, binary_v_excess],
    ]

    # Define some shorter file names to use:
    type_to_file_name = {
        binary_density: "rho_x",
        binary_h_mix: "h_mix",
        binary_v_excess: "v_excess",
    }

    # Define which types of mixtures we are interested in, e.g.
    # alcohol-alcohol, alcohol-ester etc.
    environments_of_interest = [
        os.path.basename(x) for x in glob("data_by_environments/*")
    ]

    for environment_of_interest in environments_of_interest:

        data_directory = os.path.join(
            "data_by_environments", environment_of_interest, "all_data"
        )
        common_directory = os.path.join(
            root_output_directory, environment_of_interest, "common_data"
        )

        os.makedirs(common_directory, exist_ok=True)

        for property_type_set in properties_of_interest:

            # Find the set of substances which are common to all of the
            # specified property types.
            smiles_per_property = []

            for property_type, substance_type in property_type_set:

                property_frame = load_processed_data_set(
                    data_directory, property_type, substance_type
                )

                if len(property_frame) == 0:
                    # A property without any data rules out the whole set.
                    smiles_per_property = []
                    break

                smiles_per_property.append(
                    set(data_frame_to_smiles_tuples(property_frame))
                )

            if len(smiles_per_property) == 0:
                continue

            common_substance_smiles = set.intersection(*smiles_per_property)

            # Save the common substances to a pdf file.
            file_name = "_".join(type_to_file_name[x] for x in property_type_set)
            file_path = os.path.join(common_directory, f"{file_name}.pdf")

            if len(common_substance_smiles) > 0:
                smiles_to_pdf(list(common_substance_smiles), file_path)

            # Output the common data to the `common_data` directory.
            output_directory = os.path.join(common_directory, file_name)

            for property_type, substance_type in property_type_set:

                common_frame = filter_by_substance_composition(
                    load_processed_data_set(data_directory, property_type, substance_type),
                    common_substance_smiles,
                    None,
                )

                save_processed_data_set(
                    output_directory, common_frame, property_type, substance_type
                )
def main():
    """Select the pure test set and store it in the ``test_sets`` directory.

    The available pure data of each environment of interest is pooled,
    filtered (which includes removing the training set components) and
    staged in a temporary directory. Data points are then selected for those
    components which have enthalpy of vaporization measurements, and the
    selection is stored as ``pure_set.json``, ``pure_set.csv`` and
    ``pure_set.pdf``.
    """
    logging.basicConfig(level=logging.INFO)

    root_output_directory = "test_sets"
    os.makedirs(root_output_directory, exist_ok=True)

    # Define the types of property which are of interest.
    properties_of_interest = [
        (Density, SubstanceType.Pure),
        (EnthalpyOfVaporization, SubstanceType.Pure),
    ]

    # Define the state we would ideally choose data points at. The same
    # list of states is shared by every property of interest.
    ambient_states = [
        StatePoint(298.15 * unit.kelvin, 1.0 * unit.atmosphere, (1.0,)),
    ]
    target_states = dict.fromkeys(properties_of_interest, ambient_states)

    # Define the environments of interest.
    environments_of_interest = ["alcohol", "ester", "alkane", "ether", "ketone"]

    # Load in the training substances so we can avoid selecting
    # them for the test set.
    training_smiles = load_training_components()

    environments_root = os.path.join(
        "..", "..", "..", "data_availability", "data_by_environments"
    )

    with TemporaryDirectory() as data_directory:

        # Apply the filters to the available data.
        for property_of_interest in properties_of_interest:

            environment_frames = [
                load_processed_data_set(
                    os.path.join(
                        environments_root, f"{environment}_{environment}", "all_data"
                    ),
                    *property_of_interest,
                )
                for environment in environments_of_interest
            ]

            pooled_frame = pandas.concat(
                environment_frames, ignore_index=True, sort=False
            )
            pooled_frame = filter_data(pooled_frame)
            pooled_frame = filter_by_smiles(pooled_frame, training_smiles, None)

            save_processed_data_set(data_directory, pooled_frame, *property_of_interest)

        # Determine which components have enthalpy of vaporization
        # measurements. These will be the compounds which will be
        # included in the pure test set.
        h_vap_data_frame = load_processed_data_set(
            data_directory, EnthalpyOfVaporization, SubstanceType.Pure
        )

        unique_components = set(h_vap_data_frame["Component 1"])
        test_set_components = [(component,) for component in unique_components]

        # Select the data points.
        selected_data_set = select_data_points(
            data_directory=data_directory,
            chosen_substances=test_set_components,
            target_state_points=target_states,
        )

    # Store the selection as a JSON data set, a csv table and a pdf.
    selected_data_set.json(os.path.join(root_output_directory, "pure_set.json"))

    selected_data_frame = selected_data_set.to_pandas()
    selected_data_frame.to_csv(
        os.path.join(root_output_directory, "pure_set.csv"), index=False
    )

    data_frame_to_pdf(
        selected_data_frame, os.path.join(root_output_directory, "pure_set.pdf")
    )
def filter_common_data(output_directory, substances):
    """Pool the common enthalpy of mixing and binary density data of the
    chosen substances across a fixed list of environment mixtures.

    Only data points of the provided substances whose first mole fraction
    lies strictly between 0.10 and 0.90 are retained. The pooled data of
    each property is stored, together with a pdf rendering, in the
    ``h_mix_and_rho_x`` sub-directory of the output directory.

    Parameters
    ----------
    output_directory: str
        The directory in which to create the ``h_mix_and_rho_x`` directory.
    substances
        The substances to retain, passed on to
        `filter_by_substance_composition`.
    """
    target_directory = os.path.join(output_directory, "h_mix_and_rho_x")
    os.makedirs(target_directory, exist_ok=True)

    environment_mixes = [
        "alcohol_ester",
        "alcohol_alkane",
        "ether_alkane",
        "ether_ketone",
    ]

    for property_type, substance_type in [
        (EnthalpyOfMixing, SubstanceType.Binary),
        (Density, SubstanceType.Binary),
    ]:

        filtered_frames = []

        for environment_mix in environment_mixes:

            source_directory = os.path.join(
                "..",
                "..",
                "..",
                "data_availability",
                "data_by_environments",
                environment_mix,
                "common_data",
                "h_mix_rho_x",
            )

            environment_frame = load_processed_data_set(
                source_directory, property_type, substance_type
            )
            environment_frame = filter_by_substance_composition(
                environment_frame, substances, None
            )

            # Discard points measured at too low or too high mole fractions.
            mole_fractions = environment_frame["Mole Fraction 1"]

            filtered_frames.append(
                environment_frame[(mole_fractions > 0.10) & (mole_fractions < 0.90)]
            )

        full_data_frame = pandas.concat(filtered_frames)

        save_processed_data_set(
            target_directory, full_data_frame, property_type, substance_type
        )

        pdf_name = property_to_file_name(property_type, substance_type) + ".pdf"
        data_frame_to_pdf(full_data_frame, os.path.join(target_directory, pdf_name))
def filter_common_data(output_directory):
    """Filter the common data to a smaller temperature range - this
    seems to help the state selection method get closer to the target
    states.

    The common alcohol-ester data is restricted to temperatures between
    290.0 K and 305 K and stored in the ``h_mix_and_v_excess`` and
    ``h_mix_and_binary_density`` sub-directories of the output directory.

    Parameters
    ----------
    output_directory: str
        The directory in which to create the two output sub-directories.
    """
    # (source directory name, output directory name, second property type);
    # the enthalpy of mixing is the first property of both sets.
    data_sets = [
        ("h_mix_v_excess", "h_mix_and_v_excess", ExcessMolarVolume),
        ("h_mix_rho_x", "h_mix_and_binary_density", Density),
    ]

    # Create both of the output directories up front.
    for _, target_name, _ in data_sets:
        os.makedirs(os.path.join(output_directory, target_name), exist_ok=True)

    for source_name, target_name, second_property_type in data_sets:

        source_directory = os.path.join(
            "..",
            "..",
            "..",
            "data_availability",
            "data_by_environments",
            "alcohol_ester",
            "common_data",
            source_name,
        )
        target_directory = os.path.join(output_directory, target_name)

        for property_type in (EnthalpyOfMixing, second_property_type):

            ambient_frame = filter_by_temperature(
                load_processed_data_set(
                    source_directory, property_type, SubstanceType.Binary
                ),
                290.0 * unit.kelvin,
                305 * unit.kelvin,
            )

            save_processed_data_set(
                target_directory, ambient_frame, property_type, SubstanceType.Binary
            )
def main():
    """Partition the available data by pairs of chemical environments.

    The data sets of interest are first gathered into a temporary directory
    (binary density / excess molar volume data from
    ``converted_density_data``, enthalpy of vaporization data from
    ``sourced_h_vap_data`` and everything else from the shared filtered
    data). `apply_filters` is then mapped over every pair of environments
    using a pool of worker processes.
    """
    root_output_directory = "data_by_environments"

    # Define the number of processes to parallelize over.
    n_processes = 20

    # Set up logging
    logging.basicConfig(level=logging.INFO)

    # Define the properties and environments we are interested in.
    pure_properties_of_interest = [
        (Density, SubstanceType.Pure),
        (EnthalpyOfVaporization, SubstanceType.Pure),
    ]
    mixture_properties_of_interest = [
        (Density, SubstanceType.Binary),
        (ExcessMolarVolume, SubstanceType.Binary),
        (EnthalpyOfMixing, SubstanceType.Binary),
    ]

    # The insertion order of this dictionary determines the order of the
    # environment pairs built below.
    environments_of_interest = {
        "alcohol": [
            chemical_environment_codes["hydroxy"],
            chemical_environment_codes["alcohol"],
        ],
        "ester": [
            # The key is spelt exactly as it is looked up in the code table.
            chemical_environment_codes["caboxylic_acid"],
            chemical_environment_codes["ester"],
        ],
    }

    for environment_name in (
        "ether",
        "aldehyde",
        "ketone",
        "thiocarbonyl",
        "phenol",
        "amine",
        "halogenated",
        "amide",
        "nitro",
        "aromatic",
        "heterocycle",
    ):
        environments_of_interest[environment_name] = [
            chemical_environment_codes[environment_name]
        ]

    environments_of_interest["alkane"] = [""]
    environments_of_interest["alkene"] = [chemical_environment_codes["alkene"]]

    properties_of_interest = pure_properties_of_interest + mixture_properties_of_interest

    with TemporaryDirectory() as data_directory:

        root_data_directory = os.path.join("..", "..", "shared", "filtered_data")

        # Create a temporary directory which contains both the converted
        # mass density / excess molar volume data, and the other data of
        # interest
        for property_type, substance_type in properties_of_interest:

            is_binary_volumetric = (
                property_type in [Density, ExcessMolarVolume]
                and substance_type == SubstanceType.Binary
            )

            if is_binary_volumetric:
                # Source any binary mass density or excess molar
                # volume from the full set of converted density
                # data.
                source_directory = "converted_density_data"
            elif property_type == EnthalpyOfVaporization:
                source_directory = "sourced_h_vap_data"
            else:
                source_directory = root_data_directory

            data_set = load_processed_data_set(
                source_directory, property_type, substance_type
            )
            save_processed_data_set(
                data_directory, data_set, property_type, substance_type
            )

        # Determine all combinations of the environments of interest.
        environment_pairs = [(x, x) for x in environments_of_interest]
        environment_pairs.extend(itertools.combinations(environments_of_interest, 2))

        filter_function = functools.partial(
            apply_filters,
            data_directory=data_directory,
            environments_of_interest=environments_of_interest,
            mixture_properties_of_interest=mixture_properties_of_interest,
            pure_properties_of_interest=pure_properties_of_interest,
            root_output_directory=root_output_directory,
        )

        with Pool(n_processes) as pool:

            # Wrapping the lazy iterator in `list` drives it to completion
            # while `tqdm` reports the progress.
            progress_iterator = tqdm.tqdm(
                pool.imap(filter_function, environment_pairs),
                total=len(environment_pairs),
            )
            results = list(progress_iterator)

        assert results is not None
def _write_partition(data_frame, output_directory, partition_name, file_name):
    """Stores one partition of a data set as both a csv and a pdf file.

    Parameters
    ----------
    data_frame: pandas.DataFrame
        The rows which belong to the partition.
    output_directory: str
        The directory which contains all of the partition directories.
    partition_name: str
        The name of the sub-directory to store this partition in. It is
        created if it does not already exist.
    file_name: str
        The extension-less name of the files to create.
    """
    base_directory = os.path.join(output_directory, partition_name)
    os.makedirs(base_directory, exist_ok=True)

    data_frame.to_csv(os.path.join(base_directory, file_name + ".csv"), index=False)
    data_frame_to_pdf(data_frame, os.path.join(base_directory, file_name + ".pdf"))


def main():
    """Partitions the filtered data by overlap with the training set.

    For each environment type and property of interest, the data set is loaded
    from ``filtered_data/<environment_type>`` and split according to whether
    its components are among the smiles returned by ``find_training_smiles``:

    * ``not_in_training`` - no component is in the training set.
    * ``both_in_training`` - both components are in the training set
      (binary data only).
    * ``one_in_training`` - exactly one component is in the training set
      (binary data only).

    Each partition is written as a csv and a pdf file under
    ``partitioned_data/<environment_type>/<partition>``.

    Raises
    ------
    NotImplementedError
        If a property is requested for a substance type which is neither
        pure nor binary.
    """
    root_output_directory = "partitioned_data"

    # Define the types of property which are of interest.
    properties_of_interest = [
        (Density, SubstanceType.Pure),
        (EnthalpyOfVaporization, SubstanceType.Pure),
        (EnthalpyOfMixing, SubstanceType.Binary),
        (ExcessMolarVolume, SubstanceType.Binary),
        (Density, SubstanceType.Binary),
    ]

    # Define the types of mixture which are of interest.
    environment_types = ["alcohol_alcohol", "alcohol_ester", "ester_ester"]

    # Find all of the substances which appeared in the training set.
    training_smiles = find_training_smiles()

    for environment_type in environment_types:

        output_directory = os.path.join(root_output_directory, environment_type)
        os.makedirs(output_directory, exist_ok=True)

        for property_type, substance_type in properties_of_interest:

            full_data_frame = load_processed_data_set(
                os.path.join("filtered_data", environment_type),
                property_type,
                substance_type,
            )

            property_name = property_to_snake_case(property_type)
            file_name = f"{property_name}_{str(substance_type.value)}"

            if substance_type not in (SubstanceType.Binary, SubstanceType.Pure):
                raise NotImplementedError(
                    f"Unsupported substance type: {substance_type}"
                )

            # Evaluate the training set membership of each component once and
            # reuse it for every partition.
            first_in_training = full_data_frame["Component 1"].isin(training_smiles)

            if substance_type == SubstanceType.Pure:
                # Pure data only has a single component, so the only partition
                # produced is the one where it is absent from the training set.
                _write_partition(
                    full_data_frame[~first_in_training],
                    output_directory,
                    "not_in_training",
                    file_name,
                )
                continue

            second_in_training = full_data_frame["Component 2"].isin(training_smiles)

            partitions = [
                # Neither component appears in the training set.
                ("not_in_training", ~first_in_training & ~second_in_training),
                # Both components appear in the training set.
                ("both_in_training", first_in_training & second_in_training),
                # Exactly one component appears in the training set.
                ("one_in_training", first_in_training ^ second_in_training),
            ]

            for partition_name, mask in partitions:
                _write_partition(
                    full_data_frame[mask], output_directory, partition_name, file_name
                )
def main():
    """Builds a pure density data set from the NIST ThermoML archive files.

    The raw archive files are converted into processed data sets, filtered
    with the standard set of filters, and a set of substances and data points
    is selected from what remains. The selection is written to
    ``pure_data_set.json`` and ``pure_data_set.csv``, and a report is then
    generated from the json file.
    """
    logging.basicConfig(level=logging.INFO)

    thermoml_directory = resource_filename(
        "nistdataselection", os.path.join("data", "thermoml")
    )
    processed_data_directory = "processed_data"

    # Turn the raw ThermoML archive files into processed data sets, keeping
    # both the measured values and their uncertainties.
    processing.process_raw_data(
        directory=thermoml_directory,
        output_directory=processed_data_directory,
        retain_values=True,
        retain_uncertainties=True,
    )

    # The window of states to retain: temperatures between 288.15 K and
    # 323.15 K (15 C - 50 C) and pressures within 5% of one atmosphere.
    temperature_range = (288.15 * unit.kelvin, 323.15 * unit.kelvin)
    pressure_range = (0.95 * unit.atmosphere, 1.05 * unit.atmosphere)

    # The only elements which a retained substance may contain. According to
    # the original authors these are a subset of the elements which have
    # force field parameters and plentiful ThermoML data.
    allowed_elements = ["H", "N", "C", "O", "S", "F", "Cl", "Br", "I"]

    # The target number of unique substances to pick per type of property.
    target_substances_per_property = {
        (Density, SubstanceType.Pure): 1,
    }

    filtered_data_directory = "filtered_data"
    os.makedirs(filtered_data_directory, exist_ok=True)

    # Filter each of the processed data sets in turn and store the result.
    for property_type, substance_type in target_substances_per_property:

        logging.info(
            f"Applying filters to the {substance_type.value} "
            f"{property_type.__name__} data set."
        )

        unfiltered_data_set = processing.load_processed_data_set(
            processed_data_directory, property_type, substance_type
        )
        filtered_data_set = filtering.apply_standard_filters(
            unfiltered_data_set, temperature_range, pressure_range, allowed_elements
        )

        logging.info(
            f"The filtered data set contains {len(filtered_data_set)} "
            f"properties."
        )

        processing.save_processed_data_set(
            filtered_data_directory, filtered_data_set, property_type, substance_type
        )

    # The chemical environments which the chosen substances should cover.
    # The same patterns are later passed to the report generator.
    target_environments = [
        "[#1:1]-[#6X4]",
        "[#1:1]-[#6X3]",
        "[#1:1]-[#8]",
        "[#6:1]",
        "[#6X4:1]",
        "[#8:1]",
        "[#8X2H0+0:1]",
        "[#8X2H1+0:1]",
        "[#7:1]",
        "[#16:1]",
        "[#9:1]",
        "[#17:1]",
        "[#35:1]",
    ]

    # Pick the substances to include. The states to take data at are chosen
    # in the next step.
    chosen_substances = selection.select_substances(
        filtered_data_directory, target_substances_per_property, target_environments
    )

    logging.info(f"{len(chosen_substances)} substances where chosen.")

    # The states to select data at: a temperature, a pressure and a tuple of
    # the mole fractions of each component.
    density_target_state_points = [
        selection.StatePoint(
            temperature * unit.kelvin, 101.325 * unit.kilopascal, (1.0,)
        )
        for temperature in (298.15, 318.15)
    ]

    target_property_state_points = {
        (Density, SubstanceType.Pure): density_target_state_points,
    }

    # The extension-less name of the files to store the data set in.
    data_set_name = "pure_data_set"
    json_path = f"{data_set_name}.json"

    # Select the data points for the chosen substances at the target states.
    chosen_data_set = selection.select_data_points(
        filtered_data_directory, chosen_substances, target_property_state_points
    )

    with open(json_path, "w") as file:
        file.write(chosen_data_set.json())

    chosen_data_set.to_pandas().to_csv(f"{data_set_name}.csv")

    # Generate a pdf report detailing the chosen set.
    reporting.generate_report(json_path, vdw_smirks_of_interest=target_environments)