def universal_dataset_check(self, dataset_name, object_headers=None,
                            numeric_headers=None, bool_headers=None,
                            metadata_headers=None):
    # Runs tests common to all datasets,
    # makes it quicker to write tests for new datasets

    # Get rid of dataset if it's on the disk already
    data_path = os.path.join(
        self.dataset_dir,
        dataset_name + "." + self.dataset_dict[dataset_name]['file_type']
    )
    if os.path.exists(data_path):
        os.remove(data_path)

    # Test that dataset can be downloaded
    load_dataset(dataset_name)
    self.assertTrue(os.path.exists(data_path))

    # Test that data is now available and has all its elements
    df = load_dataset(dataset_name, download_if_missing=False)
    self.assertEqual(
        len(df), self.dataset_dict[dataset_name]["num_entries"]
    )

    # Test all the non-metadata columns are there
    if metadata_headers is None:
        metadata_headers = set()
    self.assertEqual(sorted(list(df)), sorted(
        [header for header
         in self.dataset_dict[dataset_name]['columns'].keys()
         if header not in metadata_headers]
    ))

    # Test each column for appropriate type
    if object_headers is None:
        object_headers = []
    if numeric_headers is None:
        numeric_headers = []
    if bool_headers is None:
        bool_headers = []

    df = load_dataset(dataset_name, include_metadata=True,
                      download_if_missing=False)
    if object_headers:
        self.assertTrue(is_object_dtype(df[object_headers].values))
    if numeric_headers:
        self.assertTrue(is_numeric_dtype(df[numeric_headers].values))
    if bool_headers:
        self.assertTrue(is_bool_dtype(df[bool_headers].values))

    # Make sure all columns are accounted for
    column_headers = object_headers + numeric_headers + bool_headers
    self.assertEqual(sorted(list(df)), sorted(column_headers))
def test_elastic_tensor_2015(self):
    # Run set of universal dataset tests
    object_headers = ['material_id', 'formula', 'structure',
                      'compliance_tensor', 'elastic_tensor',
                      'elastic_tensor_original', 'cif', 'poscar']

    numeric_headers = ['nsites', 'space_group', 'volume',
                       'elastic_anisotropy', 'G_Reuss', 'G_VRH', 'G_Voigt',
                       'K_Reuss', 'K_VRH', 'K_Voigt', 'poisson_ratio',
                       'kpoint_density']

    metadata_headers = {'cif', 'kpoint_density', 'poscar'}

    self.universal_dataset_check(
        "elastic_tensor_2015", object_headers, numeric_headers,
        metadata_headers=metadata_headers
    )

    # Tests unique to this dataset
    df = load_dataset('elastic_tensor_2015', include_metadata=True,
                      download_if_missing=False)
    self.assertEqual(type(df['structure'][0]), Structure)
    tensor_headers = ['compliance_tensor', 'elastic_tensor',
                      'elastic_tensor_original']
    for c in tensor_headers:
        self.assertEqual(type(df[c][0]), np.ndarray)
def download_data(self, name, save_data=True):
    df = load_dataset(name=name)
    if save_data:
        df.to_excel(name + '.xlsx', index=False)
        with open('%s.pickle' % name, 'wb') as data_file:
            pickle.dump(df, data_file)
    return df
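# Hypothetical usage sketch for the helper above ("retriever" stands in for
# an assumed instance of the enclosing class; the dataset name comes from the
# scripts later in this section). By default it writes both an .xlsx and a
# .pickle copy of the dataframe to the working directory.
# retriever = DataRetriever()                              # assumed class name
# df = retriever.download_data("steel_strength")           # cache to disk
# df = retriever.download_data("steel_strength", save_data=False)  # in-memory only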
def setUp(self):
    df = load_dataset("elastic_tensor_2015").rename(
        columns={"formula": "composition"})
    self.df = df[["composition", "K_VRH"]]
    self.df_struc = df[["composition", "structure", "K_VRH"]]
    self.extra_features = df["G_VRH"]
    self.target = "K_VRH"
    self.config = get_preset_config("debug_single")
    self.config_cached = get_preset_config("debug_single",
                                           cache_src=CACHE_SRC)
    self.pipe = MatPipe(**self.config)
    self.pipe_cached = MatPipe(**self.config_cached)
def test_flla(self):
    # Universal tests
    object_headers = ['material_id', 'formula', 'structure']

    numeric_headers = ['e_above_hull', 'nsites', 'formation_energy',
                       'formation_energy_per_atom']

    self.universal_dataset_check("flla", object_headers, numeric_headers)

    # Unique tests
    df = load_dataset('flla', include_metadata=True,
                      download_if_missing=False)
    self.assertEqual(type(df['structure'][0]), Structure)
def test_load_dataset(self):
    # Can't find dataset or similar
    with self.assertRaises(ValueError):
        load_dataset("not_real_dataset")
    # Finds similar
    with self.assertRaises(ValueError):
        load_dataset("tensor")
    # Actual dataset is subset of passed dataset name
    dataset_name = sorted(self.dataset_dict.keys())[0]
    with self.assertRaises(ValueError):
        load_dataset("a" + dataset_name + "a")

    dataset_filename = (dataset_name + "."
                        + self.dataset_dict[dataset_name]["file_type"])
    data_home = os.path.expanduser("~")
    dataset_path = os.path.join(data_home, dataset_filename)
    if os.path.exists(dataset_path):
        os.remove(dataset_path)
    load_dataset(dataset_name, data_home)
    self.assertTrue(os.path.exists(dataset_path))
def test_piezoelectric_tensor(self):
    # Run universal tests
    object_headers = ['material_id', 'formula', 'structure', 'point_group',
                      'v_max', 'piezoelectric_tensor', 'cif', 'meta',
                      'poscar']

    numeric_headers = ['nsites', 'space_group', 'volume', 'eij_max']

    metadata_headers = {'cif', 'meta', 'poscar'}

    self.universal_dataset_check(
        "piezoelectric_tensor", object_headers, numeric_headers,
        metadata_headers=metadata_headers
    )

    # Dataset specific checks
    df = load_dataset('piezoelectric_tensor', include_metadata=True,
                      download_if_missing=False)
    self.assertEqual(type(df['structure'][0]), Structure)
    self.assertEqual(type(df['piezoelectric_tensor'][0]), np.ndarray)
def test_dielectric_constant(self):
    # Universal tests
    object_headers = ['material_id', 'formula', 'structure',
                      'e_electronic', 'e_total', 'cif', 'meta', 'poscar']

    numeric_headers = ['nsites', 'space_group', 'volume', 'band_gap',
                       'n', 'poly_electronic', 'poly_total']

    bool_headers = ['pot_ferroelectric']

    metadata_headers = {'cif', 'meta', 'poscar'}

    self.universal_dataset_check(
        "dielectric_constant", object_headers, numeric_headers,
        bool_headers=bool_headers, metadata_headers=metadata_headers
    )

    # Unique tests
    df = load_dataset("dielectric_constant", include_metadata=True,
                      download_if_missing=False)
    self.assertEqual(type(df['structure'][0]), Structure)
From matminer's dataset library.
"""
from matminer.datasets.dataset_retrieval import load_dataset
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
import pandas as pd

# pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

mpdr = MPDataRetrieval()

df = load_dataset("phonon_dielectric_mp")
print(df)
mpids = df["mpid"].tolist()
dfe = mpdr.get_dataframe(
    criteria={"material_id": {"$in": mpids}},
    properties=["e_above_hull", "formation_energy_per_atom", "material_id"],
    index_mpid=False)
dfe = dfe.rename(columns={"material_id": "mpid"})
df = pd.merge(df, dfe, how='inner')
df = df[(df["e_above_hull"] < 0.150)]
def setUp(self):
    self.test_df = load_dataset('elastic_tensor_2015').rename(
        columns={"formula": "composition"})
    self.limit = 5
""" This file makes the following benchmarking datasets: - steels From matminer's dataset library. """ from matminer.datasets.dataset_retrieval import load_dataset if __name__ == "__main__": df = load_dataset("steel_strength") df = df[["formula", "yield strength"]] df = df.rename(columns={"formula": "composition"}) print(df) df.to_pickle("steels.pickle.gz")
def universal_dataset_check(self, dataset_name, object_headers=None,
                            numeric_headers=None, bool_headers=None,
                            test_func=None):
    # "Hard" integrity checks that take a long time.
    # These tests only run if the MATMINER_DATASET_FULL_TEST
    # environment variable is set to True
    if do_complete_test:
        # Get rid of dataset if it's on the disk already
        data_path = os.path.join(
            self.dataset_dir,
            dataset_name + "." + self.dataset_dict[dataset_name]['file_type']
        )
        if os.path.exists(data_path):
            os.remove(data_path)

        # Test that dataset can be downloaded
        load_dataset(dataset_name)
        self.assertTrue(os.path.exists(data_path))

        # Test that data is now available and has all its elements
        df = load_dataset(dataset_name, download_if_missing=False)
        self.assertEqual(
            len(df), self.dataset_dict[dataset_name]["num_entries"]
        )

        # Test all columns are there
        self.assertEqual(
            sorted(list(df)),
            sorted(self.dataset_dict[dataset_name]['columns'].keys())
        )

        # Test each column for appropriate type
        if object_headers is None:
            object_headers = []
        if numeric_headers is None:
            numeric_headers = []
        if bool_headers is None:
            bool_headers = []

        df = load_dataset(dataset_name, download_if_missing=False)
        if object_headers:
            self.assertTrue(is_object_dtype(df[object_headers].values))
        if numeric_headers:
            self.assertTrue(is_numeric_dtype(df[numeric_headers].values))
        if bool_headers:
            self.assertTrue(is_bool_dtype(df[bool_headers].values))

        # Make sure all columns are accounted for
        column_headers = object_headers + numeric_headers + bool_headers
        self.assertEqual(sorted(list(df)), sorted(column_headers))

        # Run tests unique to the dataset
        if test_func is not None:
            test_func(df)

    # "Soft" check that just makes sure the dataset download page is active.
    # This runs on systems with the CI environment variable present
    # (e.g. when running on a continuous integration VCS system)
    else:
        download_page = requests.head(
            self.dataset_dict[dataset_name]["url"])
        self.assertTrue(download_page.ok)
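# Hedged sketch of how the test_func hook above can be used from a dataset
# test (the dataset and column names here are hypothetical, for illustration
# only):
#
#     def _check_structures(df):
#         self.assertEqual(type(df['structure'][0]), Structure)
#
#     self.universal_dataset_check(
#         "some_dataset", object_headers=['structure'],
#         test_func=_check_structures)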
out at the end. It appears there are none.
"""
from matminer.datasets.dataset_retrieval import load_dataset
from matminer.utils.io import store_dataframe_as_json
from matminer.featurizers.conversions import StrToComposition
from tqdm import tqdm
import pandas as pd

# pd.set_option('display.height', 1000)
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

df = load_dataset("glass_ternary_landolt")
df = df.rename(columns={"formula": "composition"})
df = df[["composition", "gfa"]]
df = StrToComposition(target_col_id="composition_obj").featurize_dataframe(
    df, "composition")
df["composition"] = [c.reduced_formula for c in df["composition_obj"]]
df = df.drop(columns=["composition_obj"])

# print("Ground truth")
# print(df[df["composition"] == "ZrTi9"])   # should be False in final dataframe also!!
# print(df[df["composition"] == "ZrVCo8"])  # should be True in final dataframe also!
# print(df["gfa"].value_counts())           # proportion is about 5000 GFA, 2054 no GFA
# raise ValueError
""" This file makes the following benchmarking datasets: - jdft2d From matminer's dataset library. """ from matminer.datasets.dataset_retrieval import load_dataset import pandas as pd # pd.set_option('display.height', 1000) pd.set_option('display.max_rows', 500) pd.set_option('display.max_columns', 500) pd.set_option('display.width', 1000) df = load_dataset("jarvis_dft_2d") df = df[["structure", "exfoliation_en"]] df = df.reset_index(drop=True) print(df) df.to_pickle("jdft2d.pickle.gz")
""" This file makes the following benchmarking datasets: - castelli From matminer's dataset library. """ from matminer.datasets.dataset_retrieval import load_dataset from matminer.data_retrieval.retrieve_MP import MPDataRetrieval import pandas as pd # pd.set_option('display.height', 1000) pd.set_option("display.max_rows", 500) pd.set_option("display.max_columns", 500) pd.set_option("display.width", 1000) mpdr = MPDataRetrieval() df = load_dataset("castelli_perovskites") df = df[["structure", "e_form"]] df = df.reset_index(drop=True) print(df) df.to_pickle("castelli.pickle.gz")
from matminer.datasets.dataset_retrieval import (load_dataset,
                                                 get_available_datasets,
                                                 get_all_dataset_info)

datasets = get_available_datasets(print_format=None)

for dataset in datasets:
    if "matbench_" in dataset:
        df = load_dataset(dataset)
        target_col = [col for col in df.columns
                      if col not in ["structure", "composition"]][0]
        print(f" * - :code:`{dataset}`\n - :code:`{target_col}`\n"
              f" - {df.shape[0]}")

# print(get_all_dataset_info("matbench_steels"))
# A configured MatPipe object will featurize, clean, and learn on a dataset
# automatically, and is made of 4 classes: AutoFeaturizer, DataCleaner,
# FeatureReducer, and an ML adaptor (e.g., TPOTAdaptor). The exact operations
# MatPipe executes are based entirely on how these 4 classes are configured.
# The easiest way to get started is by passing in a preset configuration to
# MatPipe. We can do this with the get_preset_config function; here, we'll
# use the "express" config, which provides decent results in a reasonable
# time frame (an hour or two).
pipe = MatPipe(**get_preset_config("express"))

# Let's download an example dataset and try predicting bulk moduli.
from sklearn.model_selection import train_test_split
from matminer.datasets.dataset_retrieval import load_dataset

df = load_dataset("elastic_tensor_2015")[["structure", "K_VRH"]]
train, test = train_test_split(df, shuffle=True, random_state=20190301,
                               test_size=0.2)
test_true = test['K_VRH']
test = test.drop(columns=["K_VRH"])

# MatPipe uses an sklearn-esque BaseEstimator API for fitting pipelines and
# predicting properties. Fitting a pipe trains it on the input data;
# predicting with a pipe will output predictions.
pipe.fit(train, target="K_VRH")

# Now we can predict our outputs. They'll appear in a column called
# "K_VRH predicted".
test_predicted = pipe.predict(test, "K_VRH")["K_VRH predicted"]
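# A short follow-up sketch (not part of the original walkthrough): score the
# held-out predictions against the true values with scikit-learn's mean
# absolute error. K_VRH in elastic_tensor_2015 is reported in GPa.
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(test_true, test_predicted)
print(f"MAE on held-out K_VRH: {mae:.3f} GPa")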
def pretty_column_map(columns_old):
    colmap = {}
    for col in columns_old:
        k = (col.replace("_", "|")
                .replace("-", "|")
                .replace(" ", "||")
                .replace("(", " ")
                .replace(")", ""))
        colmap[col] = k
    return colmap


if __name__ == "__main__":
    # Just trying it out with a single dataset, Dielectric from MP...
    for config in [DIELECTRIC]:
        project = config["data_file"].replace(".json.gz", "")
        df = load_dataset(project)
        pinput = "structure" if "structure" in df.columns else "composition"
        column_map_pretty = pretty_column_map(df.columns.tolist())
        df = df.rename(columns=column_map_pretty)
        target = column_map_pretty[config["target"]]
        # print(pinput)
        # raise ValueError
        # print(df)
        # raise ValueError

        # clean up
        has_more = True
        while has_more:
            resp = client.contributions.delete_entries(project=project,
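# Illustration of the mapping implemented by pretty_column_map above (the
# example column names are hypothetical): underscores and hyphens become a
# single pipe, spaces become a double pipe, and parentheses are rewritten.
#
#     >>> pretty_column_map(["e_form", "gap expt", "n (index)"])
#     {'e_form': 'e|form', 'gap expt': 'gap||expt', 'n (index)': 'n|| index'}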
def setUp(self):
    df = load_dataset("elastic_tensor_2015").rename(
        columns={"formula": "composition"})
    self.df = df[["composition", "K_VRH"]]
    self.extra_features = df["G_VRH"]
    self.target = "K_VRH"
""" This file makes the following benchmarking datasets: - expt_gap - expt_is_metal From matminer's dataset library. """ from matminer.datasets.dataset_retrieval import load_dataset import pandas as pd pd.set_option('display.max_rows', 500) pd.set_option('display.max_columns', 500) pd.set_option('display.width', 1000) df = load_dataset("expt_gap") df = df.rename(columns={"formula": "composition"}) df.to_pickle("expt_gap.pickle.gz") print(df) df["is_metal"] = df["gap expt"] == 0 df = df.drop(columns=["gap expt"]) print(df["is_metal"].value_counts()) df = df.reset_index(drop=True) print(df) df.to_pickle("expt_is_metal.pickle.gz")
                else:
                    removed_feat = idx
                if removed_feat not in rm_feats:
                    rm_feats.append(removed_feat)
                    self.logger.debug('"{}" correlates strongly with '
                                      '"{}"'.format(feature, idx))
                    self.logger.debug(
                        'removing "{}"...'.format(removed_feat))
                if removed_feat == feature:
                    break
        if len(rm_feats) > 0:
            df = df.drop(rm_feats, axis=1)
            self.logger.info('These {} features were removed due to cross '
                             'correlation with the current features more '
                             'than {}:\n{}'.format(len(rm_feats), R_max,
                                                   rm_feats))
        return df


if __name__ == "__main__":
    from matminer.datasets.dataset_retrieval import load_dataset
    from automatminer.pipeline import MatPipe, debug_config

    target = "eij_max"
    df = load_dataset("piezoelectric_tensor").rename(
        columns={"formula": "composition"})[[target, "composition",
                                             "structure"]]
    mp = MatPipe(**debug_config)
    df2 = mp.benchmark(df, target, test_spec=0.2)
    print(df2)