import os
import subprocess

from paramiko import SSHClient
from scp import SCPClient

from matminer.utils.io import store_dataframe_as_json


def transfer_data(df, worker, now):
    this_dir = os.path.dirname(os.path.abspath(__file__))
    user_folder = os.path.join(this_dir, "user_dfs")
    if not os.path.exists(user_folder):
        os.makedirs(user_folder)
    filename = "user_df_" + now + ".json"
    filepath = os.path.join(user_folder, filename)
    store_dataframe_as_json(df, filepath)

    if worker != "local":
        if worker == "cori":
            o = subprocess.check_output(
                ['bash', '-c', '. ~/.bash_profile; cori_get_password'])
            user = os.environ["CORI_USER"]
            host = "cori.nersc.gov"
        elif worker == "lrc":
            o = subprocess.check_output(
                ['bash', '-c', '. ~/.bash_profile; lrc_get_password'])
            user = os.environ["LRC_USER"]
            host = "lrc-login.lbl.gov"
        else:
            raise ValueError(f"Worker {worker} not valid!")

        # the password is the last non-empty line of the helper's output
        o_utf = o.decode("utf-8")
        o_all = o_utf.split("\n")
        o_all.remove("")
        password = o_all[-1]

        ssh = SSHClient()
        ssh.load_system_host_keys()
        ssh.connect(host, username=user, password=password,
                    look_for_keys=False)
        with SCPClient(ssh.get_transport()) as scp:
            scp.put(filepath, recursive=True,
                    remote_path="/global/home/users/ardunn")
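# Minimal usage sketch (an addition, not in the original source); the
# timestamp format is an assumption. With worker="local" the function only
# writes the JSON file locally and skips the SCP upload entirely.
if __name__ == "__main__":
    from datetime import datetime

    import pandas as pd

    example_df = pd.DataFrame({"x": [1, 2, 3]})
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    transfer_data(example_df, worker="local", now=timestamp)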
def test_store_dataframe_as_json(self):
    # check write produces correct file
    temp_file = os.path.join(self.temp_folder, 'test_dataframe.json')
    test_file = os.path.join(test_dir, "dataframe.json")
    store_dataframe_as_json(self.df, temp_file)
    with zopen(temp_file, 'rb') as f:
        temp_data = json.load(f)
    with zopen(test_file, 'rb') as f:
        test_data = json.load(f)
    # remove version otherwise this will have to be updated every time
    # the pymatgen version changes
    temp_data["data"][0][0].pop("@version")
    test_data["data"][0][0].pop("@version")
    self.assertDictsAlmostEqual(temp_data, test_data)

    # check writing gzipped json (comparing hashes doesn't work) so have to
    # compare contents
    temp_file = os.path.join(self.temp_folder, 'test_dataframe.json.gz')
    test_file = os.path.join(test_dir, "dataframe.json.gz")
    store_dataframe_as_json(self.df, temp_file, compression='gz')
    with zopen(temp_file, 'rb') as f:
        temp_data = json.load(f)
    with zopen(test_file, 'rb') as f:
        test_data = json.load(f)
    temp_data["data"][0][0].pop("@version")
    test_data["data"][0][0].pop("@version")
    self.assertDictsAlmostEqual(temp_data, test_data)

    # check writing bz2 compressed json (comparing hashes doesn't work) so
    # have to compare contents
    temp_file = os.path.join(self.temp_folder, 'test_dataframe.json.bz2')
    test_file = os.path.join(test_dir, "dataframe.json.bz2")
    store_dataframe_as_json(self.df, temp_file, compression='bz2')
    with zopen(temp_file, 'rb') as f:
        temp_data = json.load(f)
    with zopen(test_file, 'rb') as f:
        test_data = json.load(f)
    temp_data["data"][0][0].pop("@version")
    test_data["data"][0][0].pop("@version")
    self.assertDictsAlmostEqual(temp_data, test_data)
def generate_json_files():
    diamond = Structure(
        Lattice([[2.189, 0, 1.264], [0.73, 2.064, 1.264], [0, 0, 2.528]]),
        ["C0+", "C0+"],
        [[2.554, 1.806, 4.423], [0.365, 0.258, 0.632]],
        validate_proximity=False, to_unit_cell=False,
        coords_are_cartesian=True, site_properties=None
    )
    df = pd.DataFrame(data={'structure': [diamond]})

    plain_file = os.path.join(test_dir, "dataframe.json")
    store_dataframe_as_json(df, plain_file)

    gz_file = os.path.join(test_dir, "dataframe.json.gz")
    store_dataframe_as_json(df, gz_file, compression='gz')

    bz2_file = os.path.join(test_dir, "dataframe.json.bz2")
    store_dataframe_as_json(df, bz2_file, compression='bz2')
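# Round-trip sketch (an addition, not in the original source): read a file
# generated above back with load_dataframe_from_json, the matching reader
# from matminer.utils.io that other snippets in this collection already use.
def check_json_roundtrip():
    from matminer.utils.io import load_dataframe_from_json

    df_loaded = load_dataframe_from_json(
        os.path.join(test_dir, "dataframe.json.gz"))
    # the two carbon sites reduce to the formula "C"
    assert df_loaded["structure"][0].composition.reduced_formula == "C"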
def test_store_dataframe_as_json(self):
    # check write produces correct file
    temp_file = os.path.join(self.temp_folder, 'test_dataframe.json')
    test_file = os.path.join(test_dir, "dataframe.json")
    store_dataframe_as_json(self.df, temp_file)
    self.assertTrue(filecmp.cmp(temp_file, test_file),
                    "Json files do not match.")

    # check writing gzipped json (comparing hashes doesn't work) so have to
    # compare contents
    temp_file = os.path.join(self.temp_folder, 'test_dataframe.json.gz')
    test_file = os.path.join(test_dir, "dataframe.json.gz")
    store_dataframe_as_json(self.df, temp_file, compression='gz')
    with zopen(temp_file, 'rb') as f:
        temp_data = json.load(f)
    with zopen(test_file, 'rb') as f:
        test_data = json.load(f)
    self.assertEqual(temp_data, test_data,
                     "Compressed json files do not match.")

    # check writing bz2 compressed json (comparing hashes doesn't work) so
    # have to compare contents
    temp_file = os.path.join(self.temp_folder, 'test_dataframe.json.bz2')
    test_file = os.path.join(test_dir, "dataframe.json.bz2")
    store_dataframe_as_json(self.df, temp_file, compression='bz2')
    with zopen(temp_file, 'rb') as f:
        temp_data = json.load(f)
    with zopen(test_file, 'rb') as f:
        test_data = json.load(f)
    self.assertEqual(temp_data, test_data,
                     "Compressed json files do not match.")
# -- start F12
from matminer.featurizers.composition import TMetalFraction

tmf_feat = TMetalFraction()
fdf = tmf_feat.featurize_dataframe(fdf, col_id='composition',
                                   ignore_errors=True)
# -- end F12

# End of the basic featurization

# Saving the dataset
print("The final dataset has shape {}".format(fdf.shape))
fdf.to_csv(r"Batteries_predict.csv", index=False, header=True)
store_dataframe_as_json(fdf, 'Batteries_predict.json', compression=None,
                        orient='split')

''' Block 3 - Loading and making predictions '''

# Saving Id and Formula to an output dataframe odf
odf = pd.DataFrame()  # output dataframe
odf['Id'] = fdf['Id']
odf['Reduced Formula'] = fdf['Reduced Formula']

excluded = [
    'Id', 'Reduced Formula', 'composition', 'composition_oxid',
    'HOMO_character', 'HOMO_element', 'LUMO_character', 'LUMO_element'
]
any_gfa = any(per_comp_gfa)
all_gfa = all(per_comp_gfa)
gfa = None
if any_gfa and not all_gfa:
    # conflicting gfa reports for the same composition
    print(f"Problem composition {c}: {df_per_comp_gfa}\n")
    problem_compositions.append(c)
    continue
elif all_gfa and any_gfa:
    print(f"All gfa: {c}")
    gfa = 1
elif not all_gfa and not any_gfa:
    print(f"No gfa: {c}")
    gfa = 0
elif all_gfa and not any_gfa:
    raise ValueError("Impossible combination of gfa values.")
new_df_dict["composition"].append(c)
new_df_dict["gfa"].append(gfa)

df_new = pd.DataFrame(new_df_dict)
df_new = df_new.sort_values(by="composition")
df_new = df_new.reset_index(drop=True)

# convert to bools
df_new["gfa"] = df_new["gfa"] == 1

print(df_new)
print(df_new["gfa"].value_counts())
print(f"Problem compositions: {problem_compositions}")
store_dataframe_as_json(df_new, "glass.json.gz", compression="gz")
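# The any()/all() consensus logic above, distilled into a standalone sketch
# (an illustration, not from the original source): unanimous reports keep
# their label; conflicting reports return None so the caller can flag them.
def consensus_label(flags):
    """Return 1 if every report is truthy, 0 if every report is falsy,
    and None when the reports conflict. Assumes at least one report."""
    if all(flags):
        return 1
    if not any(flags):
        return 0
    return None


assert consensus_label([1, 1, 1]) == 1
assert consensus_label([0, 0]) == 0
assert consensus_label([1, 0]) is None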
# Defining the ions correctly
df[["Id", "Ion"]] = df["Battid"].str.split("_", expand=True)

# Final check to avoid errors in the matminer.composition module
clean3 = df["Reduced Formula"].str[0:1]  # 1st char
df["RF3"] = df["Reduced Formula"].str[1:]
df["Reduced Formula"] = np.where(clean3.str.isnumeric(), df["RF3"],
                                 df["Reduced Formula"])

excluded = [
    "Type", "Unnamed: 11", "Trash", "RF1", "RF2", "RF3", "Battid",
    "Reduced Cell Formula"
]
df = df.drop(excluded, axis=1)

# Print a summary before writing
print("A summary of the dataframe: {}".format(df.describe()))
print("The reduced formula column: {}".format(df["Reduced Formula"]))

# Exporting "df" to *.csv and *.json
df.to_csv(r"Batteries_raw.csv", index=False, header=True)
store_dataframe_as_json(df, 'Batteries_raw.json', compression=None,
                        orient='split')

# Debug routine
# reduced = df["Reduced Formula"]
# reduced.to_csv(r"reduced.csv", index=None, header=True)
print("Number of unique compositions:", len(unique)) # raise ValueError new_df_dict = {"composition": [], "gap expt": []} for c in tqdm(unique): df_per_comp_gaps = df[df["composition"] == c] per_comp_gaps = df_per_comp_gaps["gap expt"] measurement_range = max(per_comp_gaps) - min(per_comp_gaps) if measurement_range > 0.1: # print(df_per_comp_gaps) # big_diff += 1 excluded_compositions.append(c) else: mean_gap = per_comp_gaps.mean() gap_diffs = per_comp_gaps - mean_gap min_gap_diff = gap_diffs.min() min_gap_diff_index = gap_diffs.tolist().index(min_gap_diff) actual_gap_diff = per_comp_gaps.tolist()[min_gap_diff_index] # if len(per_comp_gaps) > 1: # print(f"{c} decided on {actual_gap_diff} from \n {per_comp_gaps} \n\n") new_df_dict["composition"].append(c) new_df_dict["gap expt"].append(actual_gap_diff) df_new = pd.DataFrame(new_df_dict) df_new = df_new.sort_values(by="composition") df_new = df_new.reset_index(drop=True) store_dataframe_as_json(df_new, "expt_gap.json.gz", compression="gz") print(df_new)
else:
    destination = args.destination
    makedirs(destination, exist_ok=True)

if args.hash_file is None:
    hash_file = join(destination, "file_hashes.txt")
else:
    hash_file = args.hash_file

if args.no_hashes:
    for file_path in csv_file_paths:
        dataframe = _csv_to_dataframe(file_path)
        store_dataframe_as_json(
            dataframe,
            join(destination, basename(file_path)[:-4] + ".json"),
            compression=args.compression_type
        )
else:
    with open(hash_file, "w") as out:
        for file_path in csv_file_paths:
            dataset_name = basename(file_path)[:-4]
            json_destination = join(destination, dataset_name + ".json")
            dataframe = _csv_to_dataframe(file_path)
            store_dataframe_as_json(dataframe, json_destination,
                                    compression=args.compression_type)
            if args.compression_type is not None:
                json_destination += ("." + args.compression_type)
def test_featurize_bsdos(self, refresh_df_init=False, limit=1):
    """
    Tests featurize_dos and featurize_bandstructure.

    Args:
        refresh_df_init (bool): for developers, if the test needs to be
            updated set to True. Otherwise set to False to make the final
            test independent of MPRester and faster.
        limit (int): the maximum final number of entries.

    Returns (None):
    """
    target = "color"
    df_bsdos_pickled = "mp_data_with_dos_bandstructure.pickle"
    save_path = os.path.join(TEST_DIR, df_bsdos_pickled)
    if refresh_df_init:
        mpdr = MPDataRetrieval()
        df = mpdr.get_dataframe(
            criteria={"material_id": "mp-149"},
            properties=[
                "pretty_formula",
                "dos",
                "bandstructure",
                "bandstructure_uniform",
            ],
        )
        store_dataframe_as_json(df, save_path)
    else:
        df = load_dataframe_from_json(save_path)
    df = df.dropna(axis=0)
    df = df.rename(
        columns={
            "bandstructure_uniform": "bandstructure",
            "bandstructure": "line bandstructure",
        }
    )
    df[target] = [["red"]]
    n_cols_init = df.shape[1]

    featurizer = AutoFeaturizer(
        preset="express", ignore_errors=False, multiindex=False
    )
    df = featurizer.fit_transform(df, target)

    # sanity checks
    self.assertEqual(len(df), limit)
    self.assertGreater(len(df.columns), n_cols_init)

    # DOSFeaturizer:
    self.assertEqual(df["cbm_character_1"][0], "p")

    # DopingFermi:
    self.assertAlmostEqual(df["fermi_c1e+20T300"][0], -0.539, 3)

    # Hybridization:
    self.assertAlmostEqual(df["vbm_sp"][0], 0.181, 3)
    self.assertAlmostEqual(df["cbm_s"][0], 0.4416, 3)
    self.assertAlmostEqual(df["cbm_sp"][0], 0.9864, 3)

    # BandFeaturizer:
    self.assertAlmostEqual(df["direct_gap"][0], 2.556, 3)
    self.assertAlmostEqual(df["n_ex1_norm"][0], 0.6285, 4)

    # BranchPointEnergy:
    self.assertAlmostEqual(df["branch_point_energy"][0], 5.7677, 4)
def generate_mp(max_nsites=None, properties=None, write_to_csv=False,
                write_to_compressed_json=True):
    """
    Grabs all mp materials. This will return two csv/json.gz files:
        * mp_nostruct: All MP materials, not including structures
        * mp_all: All MP materials, including structures

    Args:
        max_nsites (int): The maximum number of sites to include in the
            query.
        properties (iterable of strings): list of properties supported by
            MPDataRetrieval
        write_to_csv (bool): whether to write resulting dataframe to csv
        write_to_compressed_json (bool): whether to write resulting
            dataframe to json.gz file

    Returns (pandas.DataFrame): retrieved/generated data
    """
    # Set default properties if None and ensure is a list
    if properties is None:
        properties = ['pretty_formula', 'e_above_hull', 'band_gap',
                      'total_magnetization',
                      'elasticity.elastic_anisotropy', 'elasticity.K_VRH',
                      'elasticity.G_VRH', 'structure', 'energy',
                      'energy_per_atom', 'formation_energy_per_atom']
    elif not isinstance(properties, list):
        properties = list(properties)

    # Pick columns to drop structure data from
    drop_cols = []
    for col_name in ["structure", "initial_structure"]:
        if col_name in properties:
            drop_cols.append(col_name)

    mpdr = MPDataRetrieval()
    if max_nsites is not None:
        sites_list = list(range(1, max_nsites + 1))
    else:
        sites_list = list(range(1, 101)) + [{"$gt": 100}]

    df = pd.DataFrame()
    for site_specifier in tqdm(sites_list, desc="Querying Materials Project"):
        # Repeat the query until the server request succeeds
        while True:
            try:
                site_response = mpdr.get_dataframe(
                    criteria={"nsites": site_specifier},
                    properties=properties,
                    index_mpid=True
                )
                break
            except MPRestError:
                tqdm.write("Error querying materials project, "
                           "trying again after 5 sec")
                sleep(5)
        df = pd.concat([df, site_response])

    tqdm.write("DataFrame with {} entries created".format(len(df)))

    # Write data out to file if user so chooses; drop structure columns
    # only in the "nostruct" outputs and keep them in the returned df
    if write_to_csv:
        df.to_csv("mp_all.csv")
        df.drop(drop_cols, axis=1).to_csv("mp_nostruct.csv")

    if write_to_compressed_json:
        store_dataframe_as_json(df, "mp_all.json.gz", compression="gz")
        store_dataframe_as_json(df.drop(drop_cols, axis=1),
                                "mp_nostruct.json.gz", compression="gz")

    return df
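# Example invocation, a sketch only: this queries the live Materials Project
# API through MPDataRetrieval (a configured MP API key is assumed) and can
# take hours without a small max_nsites cap.
if __name__ == "__main__":
    df_mp = generate_mp(max_nsites=2, write_to_csv=False,
                        write_to_compressed_json=True)
    print(df_mp.head())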
def transform(self, df, target, prevent_cache_overwrite=False):
    """
    Decorate a dataframe containing composition, structure, bandstructure,
    and/or DOS objects with descriptors.

    Args:
        df (pandas.DataFrame): The dataframe not containing features.
        target (str): The ML-target property contained in the df.
        prevent_cache_overwrite (bool): If True, does not try to write any
            new features to the cache.

    Returns:
        df (pandas.DataFrame): Transformed dataframe containing features.
    """
    if self.cache_src and os.path.exists(self.cache_src):
        logger.debug(self._log_prefix +
                     "Reading cache_src {}".format(self.cache_src))
        cached_df = load_dataframe_from_json(self.cache_src)
        if not all([loc in cached_df.index for loc in df.index]):
            raise AutomatminerError("Feature cache does not contain all "
                                    "entries (by DataFrame index) needed "
                                    "to transform the input df.")
        else:
            cached_subdf = cached_df.loc[df.index]
            if target in cached_subdf.columns:
                if target not in df.columns:
                    logger.warning(
                        self._log_prefix +
                        "Target not present in both cached df and input df."
                        " Cannot perform comparison to ensure index match."
                    )
                else:
                    cached_targets = cached_subdf[target]
                    input_targets = df[target]
                    cached_type = regression_or_classification(
                        cached_targets)
                    input_type = regression_or_classification(
                        input_targets)
                    if cached_type != input_type:
                        raise AutomatminerError(
                            "Cached targets appear to be '{}' type, while "
                            "input targets appear to be '{}'."
                            "".format(cached_type, input_type))

                    # compare cached and input targets entry by entry,
                    # tolerating float rounding via math.isclose
                    problems = {}
                    for ix in input_targets.index:
                        iv = input_targets[ix]
                        cv = cached_targets[ix]
                        if iv != cv:
                            try:
                                if not math.isclose(iv, cv):
                                    problems[ix] = [iv, cv]
                            except TypeError:
                                pass
                    if problems:
                        logger.warning(
                            self._log_prefix +
                            "Mismatch between cached targets and input "
                            "targets: \n{}".format(problems))

            logger.info(self._log_prefix +
                        "Restored {} features on {} samples from "
                        "cache {}".format(len(cached_subdf.columns),
                                          len(df.index), self.cache_src))
            return cached_subdf
    else:
        transforming_on_fitted = df is self.fitted_input_df
        df = self._prescreen_df(df, inplace=True)

        if transforming_on_fitted:
            df = self.converted_input_df
        else:
            df = self._add_composition_from_structure(df)

        for featurizer_type, featurizers in self.featurizers.items():
            if featurizer_type in df.columns:
                if not transforming_on_fitted:
                    df = self._tidy_column(df, featurizer_type)
                for f in featurizers:
                    logger.info(self._log_prefix +
                                "Featurizing with {}."
                                "".format(f.__class__.__name__))
                    df = f.featurize_dataframe(
                        df,
                        featurizer_type,
                        ignore_errors=self.ignore_errors,
                        multiindex=self.multiindex,
                        inplace=False,
                    )
                if self.drop_inputs:
                    df = df.drop(columns=[featurizer_type])
            else:
                logger.info(self._log_prefix +
                            "Featurizer type {} not in the dataframe. "
                            "Skipping...".format(featurizer_type))

        if self.functionalize:
            ff = FunctionFeaturizer()
            ff.set_n_jobs(self.n_jobs)
            cols = df.columns.tolist()
            for ft in self.featurizers.keys():
                if ft in cols:
                    cols.remove(ft)
            df = ff.fit_featurize_dataframe(
                df,
                cols,
                ignore_errors=self.ignore_errors,
                multiindex=self.multiindex,
                inplace=False,
            )

        if (self.cache_src and not os.path.exists(self.cache_src)
                and not prevent_cache_overwrite):
            store_dataframe_as_json(df, self.cache_src)
        return df
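# Usage sketch, assuming the fit/transform pattern exercised by the
# AutoFeaturizer test earlier in this collection; passing cache_src to the
# constructor is an assumption inferred from the self.cache_src attribute
# used in transform() above.
#
#   af = AutoFeaturizer(preset="express", cache_src="features_cache.json")
#   af.fit(train_df, target="gap expt")
#   df_feat = af.transform(test_df, target="gap expt",
#                          prevent_cache_overwrite=True)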
# %% where there are several mp_ids, pick the one with lowest energy above convex hull
def get_e_above_hull(mp_id: str) -> float:
    return mpr.query(mp_id, ["e_above_hull"])["e_above_hull"]


phonons["es_above_hull"] = phonons.likely_mp_ids.progress_apply(
    lambda ids: [get_e_above_hull(mp_id) for mp_id in ids])

phonons["likely_mp_id"] = phonons.apply(
    lambda row: row.likely_mp_ids[np.argmin(row.es_above_hull)], axis=1)


# %%
cols = ["structure", "last phdos peak", "likely_mp_id"]
store_dataframe_as_json(phonons[cols], "matbench-phonons-with-mp-id.json.gz")

phonons[cols] = load_dataframe_from_json("matbench-phonons-with-mp-id.json.gz")


# %%
phonons[["sg_symbol", "sg_number"]] = phonons.progress_apply(
    lambda row: row.structure.get_space_group_info(), axis=1,
    result_type="expand")

phonons["crystal_system"] = phonons.structure.progress_apply(
    lambda struct: SpacegroupAnalyzer(struct).get_crystal_system())

phonons[["sg_symbol", "sg_number", "crystal_system", "volume",
         "formula"]].to_csv("additional-df-cols.csv", index=False)
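# Note (added for context): Structure.get_space_group_info() returns a
# (symbol, number) tuple, e.g. ("Fd-3m", 227) for diamond, which is why
# result_type="expand" above splits it into two columns.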
any_metals = any(per_comp_is_metal)
all_metals = all(per_comp_is_metal)
is_metal = None
if not all_metals and any_metals:
    # conflicting metal/nonmetal reports for the same composition
    print(f"Problem composition {c}: {df_per_comp_is_metal}\n")
    problem_compositions.append(c)
    continue
elif all_metals and any_metals:
    print(f"All metals: {c}")
    is_metal = 1
elif not all_metals and not any_metals:
    print(f"No metals: {c}")
    is_metal = 0
elif all_metals and not any_metals:
    raise ValueError("Impossible combination of metals.")
new_df_dict["composition"].append(c)
new_df_dict["is_metal"].append(is_metal)

df_new = pd.DataFrame(new_df_dict)
df_new = df_new.sort_values(by="composition")
df_new = df_new.reset_index(drop=True)

# convert to bools
df_new["is_metal"] = df_new["is_metal"] == 1

store_dataframe_as_json(df_new, "expt_is_metal.json.gz", compression="gz")
print(df_new)
print(df_new["is_metal"].value_counts())
print(f"Problem compositions: {problem_compositions}")