def filter_data(data_directory, properties_of_interest, chemical_environments, output_directory):
    """Retain only the measurements made for substances whose components
    contain the chemical environments of interest.

    For each property type the processed data set is loaded, filtered by
    element and by chemical environment, stored in the output directory and
    summarised in a pdf file.

    Parameters
    ----------
    data_directory: str
        The directory containing the unfiltered data.
    properties_of_interest: list of tuple of PropertyType and SubstanceType
        The types of properties to extract data for.
    chemical_environments: list of list of str
        The chemical environments to filter by. Each inner list holds the
        environments which should be matched by one of the components in
        the system.
    output_directory: str
        The directory to store the extracted data in.
    """
    allowed_elements = ("C", "H", "O", "N", "F", "Cl", "Br", "S")

    for property_type, substance_type in properties_of_interest:

        data_set = processing.load_processed_data_set(
            data_directory, property_type, substance_type
        )

        # Drop any substances which contain elements outside of the
        # allowed set.
        data_set = filter_by_elements(data_set, *allowed_elements)

        # Drop any substances which do not match the requested chemical
        # environments.
        data_set = filter_by_checkmol(data_set, *chemical_environments)

        processing.save_processed_data_set(
            output_directory, data_set, property_type, substance_type
        )

        # Write a pdf of the smiles patterns (/ tuples of smiles patterns)
        # which survived the filtering.
        pdf_name = (
            f"{property_to_snake_case(property_type)}"
            f"_{str(substance_type.value)}.pdf"
        )
        data_frame_to_pdf(data_set, os.path.join(output_directory, pdf_name))
def filter_data(data_directory, property_type, substance_type, output_directory):
    """Filter a processed data set by temperature, chemistry and
    stereochemistry, then store the result alongside a pdf summary.

    Parameters
    ----------
    data_directory
        The directory to load the processed data set from.
    property_type
        The type of property to load and filter.
    substance_type
        The type of substance to load and filter. Its ``value`` is used
        when naming the pdf file.
    output_directory
        The directory to save the filtered data set and pdf file in.
    """
    data_frame = load_processed_data_set(data_directory, property_type, substance_type)

    # Only retain measurements made close to ambient temperature.
    minimum_temperature = 290.0 * unit.kelvin
    maximum_temperature = 305 * unit.kelvin

    data_frame = filter_by_temperature(
        data_frame, minimum_temperature, maximum_temperature
    )

    # The chemistries to remove from the data set.
    excluded_smirks = [
        # aromatics
        "[#6a]",
        # 3 + 4 membered rings
        "[#6r3]",
        "[#6r4]",
        # alkenes
        "[#6]=[#6]",
        # long chain molecules (>= hept)
        "[#6]~[#6]~[#6]~[#6]~[#6]~[#6]~[#6]",
        # ethers
        "[#6H2]-[#8X2]-[#6H2]",
    ]
    data_frame = filter_by_smirks(data_frame, None, excluded_smirks)

    # Remove any molecules whose stereochemistry is undefined.
    data_frame = filter_undefined_stereochemistry(data_frame)

    save_processed_data_set(output_directory, data_frame, property_type, substance_type)

    # Write a pdf of the retained data next to the filtered set.
    pdf_name = (
        f"{property_to_snake_case(property_type)}"
        f"_{str(substance_type.value)}.pdf"
    )
    data_frame_to_pdf(data_frame, os.path.join(output_directory, pdf_name))
def main():
    """Convert the raw ThermoML archives into processed data sets and fill in
    any missing enthalpy of vaporization pressures with ambient pressure.
    """
    raw_data_directory = "raw_archives"
    processed_data_directory = "processed_data"

    # Turn the raw ThermoML data files into more easily manipulable
    # `pandas.DataFrame` objects.
    process_raw_data(
        directory=raw_data_directory,
        output_directory=processed_data_directory,
        retain_values=True,
        retain_uncertainties=False,
        n_processes=20,
        files_per_worker=50,
    )

    # The enthalpy of vaporization entries may be missing a pressure, so
    # any missing value is 'fixed' by approximating it as ambient.
    ambient_pressure = 1.0 * unit.atmosphere
    ambient_pressure_kpa = ambient_pressure.to(unit.kilopascal).magnitude

    h_vap_substance_types = (
        SubstanceType.Pure,
        SubstanceType.Binary,
        SubstanceType.Ternary,
    )

    for substance_type in h_vap_substance_types:

        data_set = load_processed_data_set(
            processed_data_directory, EnthalpyOfVaporization, substance_type
        )

        pressures = data_set["Pressure (kPa)"]
        data_set["Pressure (kPa)"] = pressures.fillna(ambient_pressure_kpa)

        save_processed_data_set(
            processed_data_directory, data_set, EnthalpyOfVaporization, substance_type
        )
def choose_data_points(
    property_of_interest,
    chosen_substances,
    target_states,
    environments_of_interest,
):
    """Select the data points to include in the benchmark set for
    each of the chosen substances.

    Parameters
    ----------
    property_of_interest: tuple of type of PhysicalProperty and SubstanceType
        The type of property to select data points for.
    chosen_substances: list of tuple of str
        The substances to choose data points for.
    target_states: list of StatePoint
        The target states to select data points at.
    environments_of_interest: list of str
        The names of the sub-directories of the `data_by_environments`
        directory to source the available data from. Environments whose
        data file is missing, or which contain no data, are skipped.

    Returns
    -------
    pandas.DataFrame
        The selected data points.

    Raises
    ------
    ValueError
        If none of the environments of interest contain any data for the
        property of interest.
    """
    data_frames = []

    for environment in environments_of_interest:

        data_folder = os.path.join(
            "..",
            "..",
            "..",
            "data_availability",
            "data_by_environments",
            environment,
            "all_data",
        )

        try:
            data_frame = load_processed_data_set(data_folder, *property_of_interest)
        except FileNotFoundError:
            # Not every environment has data for every property.
            continue

        if len(data_frame) == 0:
            continue

        data_frames.append(data_frame)

    if not data_frames:
        # `pandas.concat` raises an opaque "No objects to concatenate"
        # ValueError on an empty list, so fail with a descriptive message
        # of the same exception type instead.
        raise ValueError(
            f"No data could be found for {property_of_interest} in any of "
            f"the environments of interest: {environments_of_interest}"
        )

    data_frame = pandas.concat(data_frames, ignore_index=True, sort=False)
    data_frame = filter_by_substance_composition(data_frame, chosen_substances, None)

    # Fill in the missing columns
    for column_name in ("Exact Amount 1", "Exact Amount 2"):
        if column_name not in data_frame:
            data_frame[column_name] = numpy.nan

    with TemporaryDirectory() as data_directory:

        # The selection routine reads its input from a directory, so the
        # combined data is staged in a temporary one.
        save_processed_data_set(data_directory, data_frame, *property_of_interest)

        # The data has already been restricted to the chosen substances
        # above, hence no substances are passed here.
        selected_data_set = select_data_points(
            data_directory=data_directory,
            chosen_substances=None,
            target_state_points={property_of_interest: target_states},
        )

        selected_data_frame = selected_data_set.to_pandas()

    # Prune any data points measured for too low or too high
    # mole fractions.
    mole_fractions = selected_data_frame["Mole Fraction 1"]

    selected_data_frame = selected_data_frame[
        (mole_fractions > 0.15) & (mole_fractions < 0.85)
    ]

    return selected_data_frame
def main():
    """Build the pure test set: collate the available pure data, remove the
    training substances, select data points close to ambient conditions and
    write the selection out as json, csv and pdf files.
    """
    logging.basicConfig(level=logging.INFO)

    root_output_directory = "test_sets"
    os.makedirs(root_output_directory, exist_ok=True)

    # The types of property which are of interest.
    properties_of_interest = [
        (Density, SubstanceType.Pure),
        (EnthalpyOfVaporization, SubstanceType.Pure),
    ]

    # The state we would ideally choose data points at. The same list of
    # states is used for every property type.
    ideal_states = [
        StatePoint(298.15 * unit.kelvin, 1.0 * unit.atmosphere, (1.0,)),
    ]

    target_states = {}

    for property_of_interest in properties_of_interest:
        target_states[property_of_interest] = ideal_states

    # The environments of interest.
    environments_of_interest = ["alcohol", "ester", "alkane", "ether", "ketone"]

    # The training substances are loaded so that they can be excluded
    # from the test set.
    training_smiles = load_training_components()

    with TemporaryDirectory() as data_directory:

        # Apply the filters to the available data.
        for property_of_interest in properties_of_interest:

            environment_frames = [
                load_processed_data_set(
                    os.path.join(
                        "..",
                        "..",
                        "..",
                        "data_availability",
                        "data_by_environments",
                        f"{environment}_{environment}",
                        "all_data",
                    ),
                    *property_of_interest,
                )
                for environment in environments_of_interest
            ]

            combined_frame = pandas.concat(
                environment_frames, ignore_index=True, sort=False
            )
            combined_frame = filter_data(combined_frame)
            combined_frame = filter_by_smiles(combined_frame, training_smiles, None)

            save_processed_data_set(
                data_directory, combined_frame, *property_of_interest
            )

        # The components which have enthalpy of vaporization measurements
        # are the compounds which will be included in the pure test set.
        h_vap_data_frame = load_processed_data_set(
            data_directory, EnthalpyOfVaporization, SubstanceType.Pure
        )

        unique_components = set(h_vap_data_frame["Component 1"])
        test_set_components = [(component,) for component in unique_components]

        # Select the data points.
        selected_data_set = select_data_points(
            data_directory=data_directory,
            chosen_substances=test_set_components,
            target_state_points=target_states,
        )

        # Store the selection as json, csv and pdf files.
        selected_data_set.json(os.path.join(root_output_directory, "pure_set.json"))

        selected_data_frame = selected_data_set.to_pandas()

        selected_data_frame.to_csv(
            os.path.join(root_output_directory, "pure_set.csv"), index=False
        )
        data_frame_to_pdf(
            selected_data_frame, os.path.join(root_output_directory, "pure_set.pdf")
        )
def filter_common_data(output_directory, substances):
    """Collect the common binary enthalpy of mixing and density data for a
    fixed set of environment mixtures, restricted to the given substances
    and to intermediate mole fractions.

    The combined data for each property type is stored, together with a pdf
    summary, in the `h_mix_and_rho_x` sub-directory of the output directory.

    Parameters
    ----------
    output_directory
        The directory in which the `h_mix_and_rho_x` directory is created.
    substances
        The substances to retain; passed to
        `filter_by_substance_composition`.
    """
    common_directory = os.path.join(output_directory, "h_mix_and_rho_x")
    os.makedirs(common_directory, exist_ok=True)

    property_types = (
        (EnthalpyOfMixing, SubstanceType.Binary),
        (Density, SubstanceType.Binary),
    )
    environment_mixes = (
        "alcohol_ester",
        "alcohol_alkane",
        "ether_alkane",
        "ether_ketone",
    )

    for property_type, substance_type in property_types:

        filtered_frames = []

        for environment_mix in environment_mixes:

            source_directory = os.path.join(
                "..",
                "..",
                "..",
                "data_availability",
                "data_by_environments",
                environment_mix,
                "common_data",
                "h_mix_rho_x",
            )

            frame = load_processed_data_set(
                source_directory, property_type, substance_type
            )
            frame = filter_by_substance_composition(frame, substances, None)

            # Discard measurements close to either pure component.
            mole_fractions = frame["Mole Fraction 1"]
            frame = frame[(mole_fractions > 0.10) & (mole_fractions < 0.90)]

            filtered_frames.append(frame)

        full_data_frame = pandas.concat(filtered_frames)

        save_processed_data_set(
            common_directory, full_data_frame, property_type, substance_type
        )

        pdf_name = property_to_file_name(property_type, substance_type) + ".pdf"
        data_frame_to_pdf(full_data_frame, os.path.join(common_directory, pdf_name))
def main():
    """Gather the data sets of interest into a temporary directory and apply
    the environment filters to every pair of environments in parallel.
    """
    root_output_directory = "data_by_environments"

    # The number of processes to parallelize over.
    n_processes = 20

    # Set up logging
    logging.basicConfig(level=logging.INFO)

    # The properties we are interested in.
    pure_properties_of_interest = [
        (Density, SubstanceType.Pure),
        (EnthalpyOfVaporization, SubstanceType.Pure),
    ]
    mixture_properties_of_interest = [
        (Density, SubstanceType.Binary),
        (ExcessMolarVolume, SubstanceType.Binary),
        (EnthalpyOfMixing, SubstanceType.Binary),
    ]

    # The environments we are interested in. The insertion order matters as
    # it determines the order of the environment pairs built below.
    environments_of_interest = {
        "alcohol": [
            chemical_environment_codes["hydroxy"],
            chemical_environment_codes["alcohol"],
        ],
        "ester": [
            chemical_environment_codes["caboxylic_acid"],
            chemical_environment_codes["ester"],
        ],
    }

    # These environments each map onto the single code of the same name.
    for environment_name in (
        "ether",
        "aldehyde",
        "ketone",
        "thiocarbonyl",
        "phenol",
        "amine",
        "halogenated",
        "amide",
        "nitro",
        "aromatic",
        "heterocycle",
    ):
        environments_of_interest[environment_name] = [
            chemical_environment_codes[environment_name]
        ]

    environments_of_interest["alkane"] = [""]
    environments_of_interest["alkene"] = [chemical_environment_codes["alkene"]]

    properties_of_interest = (
        pure_properties_of_interest + mixture_properties_of_interest
    )

    with TemporaryDirectory() as data_directory:

        root_data_directory = os.path.join("..", "..", "shared", "filtered_data")

        # Fill the temporary directory with both the converted mass
        # density / excess molar volume data, and the other data of
        # interest.
        for property_type, substance_type in properties_of_interest:

            is_binary_volumetric = (
                substance_type == SubstanceType.Binary
                and property_type in [Density, ExcessMolarVolume]
            )

            if is_binary_volumetric:
                # Any binary mass density or excess molar volume is
                # sourced from the full set of converted density data.
                source_directory = "converted_density_data"
            elif property_type == EnthalpyOfVaporization:
                source_directory = "sourced_h_vap_data"
            else:
                source_directory = root_data_directory

            data_set = load_processed_data_set(
                source_directory, property_type, substance_type
            )
            save_processed_data_set(
                data_directory, data_set, property_type, substance_type
            )

        # All combinations of the environments of interest: each
        # environment paired with itself, followed by every distinct pair.
        environment_pairs = [
            (environment, environment) for environment in environments_of_interest
        ] + list(itertools.combinations(environments_of_interest, 2))

        filter_environment_pair = functools.partial(
            apply_filters,
            data_directory=data_directory,
            environments_of_interest=environments_of_interest,
            mixture_properties_of_interest=mixture_properties_of_interest,
            pure_properties_of_interest=pure_properties_of_interest,
            root_output_directory=root_output_directory,
        )

        with Pool(n_processes) as pool:

            progress = tqdm.tqdm(
                pool.imap(filter_environment_pair, environment_pairs),
                total=len(environment_pairs),
            )

            # Exhausting the iterator is what drives the work to completion.
            x = list(progress)

        assert x is not None
def main():
    """Process the NIST ThermoML archive files, filter the resulting data,
    choose a set of substances and data points for pure densities, and
    write the chosen set out as json, csv and a pdf report.
    """
    # Set up logging
    logging.basicConfig(level=logging.INFO)

    processed_data_directory = "processed_data"
    filtered_data_directory = "filtered_data"
    data_set_name = "pure_data_set"

    raw_data_directory = resource_filename(
        "nistdataselection", os.path.join("data", "thermoml")
    )

    # Turn the raw ThermoML data files into more easily manipulable
    # `pandas.DataFrame` objects.
    processing.process_raw_data(
        directory=raw_data_directory,
        output_directory=processed_data_directory,
        retain_values=True,
        retain_uncertainties=True,
    )

    # The ranges of temperatures (288.15 K - 323.15 K, i.e. 15 C - 50 C)
    # and pressures (close to ambient) of interest.
    temperature_range = (288.15 * unit.kelvin, 323.15 * unit.kelvin)
    pressure_range = (0.95 * unit.atmosphere, 1.05 * unit.atmosphere)

    # The elements that we are interested in.
    allowed_elements = ["H", "N", "C", "O", "S", "F", "Cl", "Br", "I"]

    # The target number of unique substances to choose for each type of
    # property of interest.
    target_substances_per_property = {
        (Density, SubstanceType.Pure): 1,
    }

    # The directory to store the filtered data in.
    os.makedirs(filtered_data_directory, exist_ok=True)

    # Perform basic filtering on each of the data sets of interest.
    for property_type, substance_type in target_substances_per_property:

        logging.info(
            f"Applying filters to the {substance_type.value} "
            f"{property_type.__name__} data set."
        )

        # Load the full data set from the processed data directory and
        # apply the standard set of filters to it.
        data_set = processing.load_processed_data_set(
            processed_data_directory, property_type, substance_type
        )
        data_set = filtering.apply_standard_filters(
            data_set, temperature_range, pressure_range, allowed_elements
        )

        logging.info(
            f"The filtered data set contains {len(data_set)} " f"properties."
        )

        processing.save_processed_data_set(
            filtered_data_directory, data_set, property_type, substance_type
        )

    # The regions of chemical space we want the chosen substances to cover.
    # This is mainly driven by the VdW parameters we wish to exercise, but
    # may also be supplemented with additional environments which are
    # poorly represented.
    target_environments = [
        "[#1:1]-[#6X4]",
        "[#1:1]-[#6X3]",
        "[#1:1]-[#8]",
        "[#6:1]",
        "[#6X4:1]",
        "[#8:1]",
        "[#8X2H0+0:1]",
        "[#8X2H1+0:1]",
        "[#7:1]",
        "[#16:1]",
        "[#9:1]",
        "[#17:1]",
        "[#35:1]",
    ]

    # Choose a set of unique substances, i.e. tuples of smiles patterns
    # which define the composition of each substance. The actual mole
    # fractions of the components are chosen in a later step.
    chosen_substances = selection.select_substances(
        filtered_data_directory, target_substances_per_property, target_environments
    )

    logging.info(f"{len(chosen_substances)} substances where chosen.")

    # The specific states at which we wish to select data: a temperature,
    # a pressure, and a tuple of the mole fractions of each component.
    density_target_state_points = [
        selection.StatePoint(
            temperature * unit.kelvin, 101.325 * unit.kilopascal, (1.0,)
        )
        for temperature in (298.15, 318.15)
    ]

    target_property_state_points = {
        (Density, SubstanceType.Pure): density_target_state_points,
    }

    # Choose the final data set containing the chosen substances, and
    # data points at the target state points.
    data_set = selection.select_data_points(
        filtered_data_directory, chosen_substances, target_property_state_points
    )

    json_path = f"{data_set_name}.json"

    with open(json_path, "w") as file:
        file.write(data_set.json())

    data_set.to_pandas().to_csv(f"{data_set_name}.csv")

    # Generate a pdf report detailing the chosen set.
    reporting.generate_report(json_path, vdw_smirks_of_interest=target_environments)
def filter_common_data(output_directory):
    """Restrict the common alcohol-ester data to a narrower temperature
    range (290 K - 305 K) - this seems to help the state selection method
    get closer to the target states.

    Parameters
    ----------
    output_directory
        The directory in which the `h_mix_and_v_excess` and
        `h_mix_and_binary_density` directories are created and filled.
    """
    h_mix_type = (EnthalpyOfMixing, SubstanceType.Binary)

    # Each entry is the name of the source `common_data` sub-directory, the
    # name of the output sub-directory, and the property types to filter.
    filter_jobs = [
        (
            "h_mix_v_excess",
            "h_mix_and_v_excess",
            [h_mix_type, (ExcessMolarVolume, SubstanceType.Binary)],
        ),
        (
            "h_mix_rho_x",
            "h_mix_and_binary_density",
            [h_mix_type, (Density, SubstanceType.Binary)],
        ),
    ]

    # Create every output directory before any of the data is touched.
    for _, output_name, _ in filter_jobs:
        os.makedirs(os.path.join(output_directory, output_name), exist_ok=True)

    for source_name, output_name, property_types in filter_jobs:

        source_directory = os.path.join(
            "..",
            "..",
            "..",
            "data_availability",
            "data_by_environments",
            "alcohol_ester",
            "common_data",
            source_name,
        )
        target_directory = os.path.join(output_directory, output_name)

        for property_type, substance_type in property_types:

            frame = load_processed_data_set(
                source_directory, property_type, substance_type
            )
            frame = filter_by_temperature(
                frame, 290.0 * unit.kelvin, 305 * unit.kelvin
            )

            save_processed_data_set(
                target_directory, frame, property_type, substance_type
            )
def main():
    """For every environment directory, find the substances which have data
    for all property types in each set of interest, and store the data for
    those common substances (plus a pdf of the substances) in `common_data`.
    """
    root_output_directory = "data_by_environments"

    # Set up logging
    logging.basicConfig(level=logging.INFO)

    h_mix_type = (EnthalpyOfMixing, SubstanceType.Binary)
    density_type = (Density, SubstanceType.Binary)
    v_excess_type = (ExcessMolarVolume, SubstanceType.Binary)

    # The sets of data types to find common substances for.
    properties_of_interest = [
        [h_mix_type, density_type],
        [h_mix_type, v_excess_type],
        [h_mix_type, density_type, v_excess_type],
    ]

    # Some shorter file names to use for each data type.
    type_to_file_name = {
        density_type: "rho_x",
        h_mix_type: "h_mix",
        v_excess_type: "v_excess",
    }

    # The types of mixtures we are interested in, e.g. alcohol-alcohol,
    # alcohol-ester etc., taken from the entries which already exist in the
    # `data_by_environments` directory.
    environments_of_interest = [
        os.path.basename(path) for path in glob("data_by_environments/*")
    ]

    for environment_of_interest in environments_of_interest:

        data_directory = os.path.join(
            "data_by_environments", environment_of_interest, "all_data"
        )
        common_directory = os.path.join(
            root_output_directory, environment_of_interest, "common_data"
        )

        os.makedirs(common_directory, exist_ok=True)

        for property_type_set in properties_of_interest:

            # Collect the substances measured for each property type. If any
            # one of the types has no data there can be no common substances.
            smiles_per_type = []
            missing_data = False

            for property_type, substance_type in property_type_set:

                data_frame = load_processed_data_set(
                    data_directory, property_type, substance_type
                )

                if len(data_frame) == 0:
                    missing_data = True
                    break

                smiles_per_type.append(set(data_frame_to_smiles_tuples(data_frame)))

            if missing_data or len(smiles_per_type) == 0:
                continue

            # The substances which are common to all of the property types.
            common_substance_smiles = set.intersection(*smiles_per_type)

            # Save the common substances to a pdf file.
            file_name = "_".join(
                type_to_file_name[property_tuple]
                for property_tuple in property_type_set
            )

            if common_substance_smiles:
                smiles_to_pdf(
                    list(common_substance_smiles),
                    os.path.join(common_directory, f"{file_name}.pdf"),
                )

            # Output the common data to the `common_data` directory.
            output_directory = os.path.join(common_directory, file_name)

            for property_type, substance_type in property_type_set:

                data_frame = load_processed_data_set(
                    data_directory, property_type, substance_type
                )
                data_frame = filter_by_substance_composition(
                    data_frame, common_substance_smiles, None
                )

                save_processed_data_set(
                    output_directory, data_frame, property_type, substance_type
                )