def get_feats_hot_encoded(feature_names=keys, filename="feats_encoded.json"):
    """Get OneHotEncoded features."""
    # Deprecated: kept for reference only
    from sklearn.preprocessing import OneHotEncoder
    import pandas as pd

    encoder = OneHotEncoder(categories="auto", sparse=False)
    dat = {}
    for i, j in chem_data.items():
        tmp = []
        for r, s in j.items():
            if r in feature_names:
                tmp.append(s)
        dat[Specie(i).Z] = tmp  # j.values()
    df = pd.DataFrame(dat)
    vals = []
    for i in range(len(df.values)):
        output = encoder.fit_transform(
            np.array(df.values[i], dtype="float").reshape(-1, 1)
        )  # .toarray()
        vals.extend(output.T)
    vals = np.array(vals, dtype="float").T
    cols = df.columns.tolist()
    new_dat = {}
    for i, j in zip(cols, vals):
        new_dat[i] = list(j)
    if filename is not None:
        from jarvis.db.jsonutils import dumpjson

        dumpjson(data=new_dat, filename=filename)
    return new_dat
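# A minimal usage sketch for the (deprecated) encoder above. It assumes the
# module-level `chem_data`, `Specie`, `keys`, and numpy (`np`) objects used in
# the function are in scope; passing filename=None skips writing the JSON file.
if __name__ == "__main__":
    feats = get_feats_hot_encoded(filename=None)
    # Keys are atomic numbers (Z); values are flattened one-hot vectors.
    print(len(feats), "elements encoded")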
def upload_jarvisdft_xmls(
    self,
    files="/rk2/knc6/DB/XMLs/VASPDASK/*.xml",
    template_id="5f626925ece4b00035e5277f",
    json_name="jarvisdft_xmls.json",
):
    """Upload JARVIS-DFT XML files."""
    mem = []
    for i in glob.glob(files):
        jid = i.split("/")[-1].split(".xml")[0]
        print(jid)
        try:
            upload_id = self.upload_xml_file(
                filename=i, template_id=template_id
            )
            print(jid)
            info = {}
            info["jid"] = jid
            info["api_id"] = upload_id
            mem.append(info)
        except Exception:
            info = {}
            info["jid"] = jid
            info["api_id"] = "Failed"
            # Record the failed upload as well so it is not silently dropped
            mem.append(info)
            print("Failed for", i)
    if json_name is not None:
        dumpjson(filename=json_name, data=mem)
    return mem
def upload_jarvisff_xmls(
    self,
    files="/rk2/knc6/DB/XMLs/LAMMPS/*.xml",
    template_id="5f6162b4ece4b00034e4fe19",
    json_name="jarvisff_xmls.json",
):
    """Upload JARVIS-FF XML files."""
    mem = []
    for i in glob.glob(files):
        jid = i.split("/")[-1].split(".xml")[0]
        try:
            upload_id = self.upload_xml_file(
                filename=i, template_id=template_id
            )
            info = {}
            info["jid"] = jid
            info["api_id"] = upload_id
            mem.append(info)
        except Exception:
            info = {}
            info["jid"] = jid
            info["api_id"] = "Failed"
            # Record the failed upload as well so it is not silently dropped
            mem.append(info)
            print("Failed for", i)
    if json_name is not None:
        dumpjson(filename=json_name, data=mem)
    return mem
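# A hedged usage sketch for the two upload helpers above. It assumes `self` is
# an instance of the surrounding API-client class (defined elsewhere in this
# module) with a working `upload_xml_file` method; the client constructor,
# paths, and file names below are illustrative placeholders only.
#
# client = ...  # instantiate the API client defined in this module
# records = client.upload_jarvisff_xmls(
#     files="/path/to/LAMMPS/*.xml",
#     template_id="5f6162b4ece4b00034e4fe19",
#     json_name="jarvisff_xmls.json",
# )
# print(len(records), "uploads attempted; 'api_id' is 'Failed' on errors.")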
def get_digitized_feats_hot_encoded(
    feature_names=keys, filename="feats_encoded.json"
):
    """Get OneHotEncoded features with digitized features."""
    from sklearn.preprocessing import OneHotEncoder
    import pandas as pd

    encoder = OneHotEncoder(categories="auto", sparse=False)
    dat = defaultdict()
    for i, j in chem_data.items():
        tmp = defaultdict()
        for r, s in j.items():
            if r in feature_names:
                tmp[r] = s
        dat[Specie(i).Z] = tmp  # j.values()
    df = pd.DataFrame(dat)
    df = df.T.replace(-9999.0, 0).replace(-0.0, 0).astype("float")
    for i in df.columns:
        df[i] = digitize_array(df[i])
    df = df.T
    vals = []
    for i in range(len(df.values)):
        output = encoder.fit_transform(
            np.array(df.values[i], dtype="float").reshape(-1, 1)
        )  # .toarray()
        vals.extend(output.T)
    vals = np.array(vals, dtype="float").T
    cols = df.columns.tolist()
    new_dat = {}
    for i, j in zip(cols, vals):
        new_dat[int(i)] = list([int(m) for m in j])
    if filename is not None:
        from jarvis.db.jsonutils import dumpjson

        dumpjson(data=new_dat, filename=filename)
    return new_dat
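# A short sketch under the same assumptions as above (`digitize_array`,
# `chem_data`, `Specie`, `keys`, and numpy in scope), showing that the
# digitized encoding returns integer 0/1 vectors keyed by atomic number.
if __name__ == "__main__":
    dig_feats = get_digitized_feats_hot_encoded(filename=None)
    z, vec = next(iter(dig_feats.items()))
    print("Z =", z, "one-hot length =", len(vec))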
def train_tasks(
    mb=None, config_template="config_example.json", file_format="poscar"
):
    """Train MatBench classification and regression tasks."""
    for task in mb.tasks:
        task.load()
        if task.metadata.task_type == CLF_KEY:
            classification = True
        else:
            classification = False
        # Classification tasks
        if classification:
            # rocs = []
            for ii, fold in enumerate(task.folds):
                train_df = task.get_train_and_val_data(fold, as_type="df")
                test_df = task.get_test_data(
                    fold, include_target=True, as_type="df"
                )
                train_df["is_metal"] = train_df["is_metal"].astype(int)
                test_df["is_metal"] = test_df["is_metal"].astype(int)
                # Name of the target property
                target = [
                    col
                    for col in train_df.columns
                    if col not in ("id", "structure", "composition")
                ][0]
                # Make sure there are no spaces or parentheses, which
                # can cause issues while creating the folder
                fold_name = (
                    task.dataset_name
                    + "_"
                    + target.replace(" ", "_")
                    .replace("(", "-")
                    .replace(")", "-")
                    + "_fold_"
                    + str(ii)
                )
                if not os.path.exists(fold_name):
                    os.makedirs(fold_name)
                os.chdir(fold_name)
                # ALIGNN requires the id_prop.csv file
                f = open("id_prop.csv", "w")
                for jj, j in train_df.iterrows():
                    id = j.name
                    atoms = pmg_to_atoms(j.structure)
                    pos_name = id
                    atoms.write_poscar(pos_name)
                    val = j[target]
                    line = str(pos_name) + "," + str(val) + "\n"
                    f.write(line)
                # There is no pre-defined validation split, so we will use
                # a portion of the training set as the validation set and
                # keep the test set intact
                val_df = train_df[0 : len(test_df)]
                for jj, j in val_df.iterrows():
                    # for jj, j in test_df.iterrows():
                    id = j.name
                    atoms = pmg_to_atoms(j.structure)
                    pos_name = id
                    atoms.write_poscar(pos_name)
                    val = j[target]
                    line = str(pos_name) + "," + str(val) + "\n"
                    f.write(line)
                for jj, j in test_df.iterrows():
                    id = j.name
                    atoms = pmg_to_atoms(j.structure)
                    pos_name = id
                    atoms.write_poscar(pos_name)
                    val = j[target]
                    line = str(pos_name) + "," + str(val) + "\n"
                    f.write(line)
                n_train = len(train_df)
                n_val = len(val_df)
                n_test = len(test_df)
                config = loadjson(config_template)
                config["n_train"] = n_train
                config["n_val"] = n_val
                config["n_test"] = n_test
                config["keep_data_order"] = True
                config["batch_size"] = 32
                config["epochs"] = 40
                config["classification_threshold"] = 0.01
                fname = "config_fold_" + str(ii) + ".json"
                dumpjson(data=config, filename=fname)
                f.close()
                os.chdir("..")
                outdir_name = (
                    task.dataset_name
                    + "_"
                    + target.replace(" ", "_")
                    .replace("(", "-")
                    .replace(")", "-")
                    + "_outdir_"
                    + str(ii)
                )
                cmd = (
                    "train_folder.py --root_dir "
                    + fold_name
                    + " --config "
                    + fold_name
                    + "/"
                    + fname
                    + " --file_format="
                    + file_format
                    + " --keep_data_order=True"
                    + " --classification_threshold=0.01"
                    + " --output_dir="
                    + outdir_name
                )
                print(cmd)
                os.system(cmd)
                test_csv = outdir_name + "/prediction_results_test_set.csv"
                df = pd.read_csv(test_csv)
                target_vals = df.target.values
                id_vals = df.id.values
        # Regression tasks
        # TODO: shorten the script by taking out repetitive lines
        if not classification:
            maes = []
            for ii, fold in enumerate(task.folds):
                train_df = task.get_train_and_val_data(fold, as_type="df")
                test_df = task.get_test_data(
                    fold, include_target=True, as_type="df"
                )
                # Name of the target property
                target = [
                    col
                    for col in train_df.columns
                    if col not in ("id", "structure", "composition")
                ][0]
                # Make sure there are no spaces or parentheses, which
                # can cause issues while creating the folder
                fold_name = (
                    task.dataset_name
                    + "_"
                    + target.replace(" ", "_")
                    .replace("(", "-")
                    .replace(")", "-")
                    + "_fold_"
                    + str(ii)
                )
                if not os.path.exists(fold_name):
                    os.makedirs(fold_name)
                os.chdir(fold_name)
                # ALIGNN requires the id_prop.csv file
                f = open("id_prop.csv", "w")
                for jj, j in train_df.iterrows():
                    id = j.name
                    atoms = pmg_to_atoms(j.structure)
                    pos_name = id
                    atoms.write_poscar(pos_name)
                    val = j[target]
                    line = str(pos_name) + "," + str(val) + "\n"
                    f.write(line)
                # There is no pre-defined validation split, so we will use
                # a portion of the training set as the validation set and
                # keep the test set intact
                val_df = train_df[0 : len(test_df)]
                for jj, j in val_df.iterrows():
                    # for jj, j in test_df.iterrows():
                    id = j.name
                    atoms = pmg_to_atoms(j.structure)
                    pos_name = id
                    atoms.write_poscar(pos_name)
                    val = j[target]
                    line = str(pos_name) + "," + str(val) + "\n"
                    f.write(line)
                for jj, j in test_df.iterrows():
                    id = j.name
                    atoms = pmg_to_atoms(j.structure)
                    pos_name = id
                    atoms.write_poscar(pos_name)
                    val = j[target]
                    line = str(pos_name) + "," + str(val) + "\n"
                    f.write(line)
                n_train = len(train_df)
                n_val = len(val_df)
                n_test = len(test_df)
                config = loadjson(config_template)
                config["n_train"] = n_train
                config["n_val"] = n_val
                config["n_test"] = n_test
                config["keep_data_order"] = True
                config["batch_size"] = 32
                config["epochs"] = 500
                fname = "config_fold_" + str(ii) + ".json"
                dumpjson(data=config, filename=fname)
                f.close()
                os.chdir("..")
                outdir_name = (
                    task.dataset_name
                    + "_"
                    + target.replace(" ", "_")
                    .replace("(", "-")
                    .replace(")", "-")
                    + "_outdir_"
                    + str(ii)
                )
                cmd = (
                    "train_folder.py --root_dir "
                    + fold_name
                    + " --config "
                    + fold_name
                    + "/"
                    + fname
                    + " --file_format="
                    + file_format
                    + " --keep_data_order=True"
                    + " --output_dir="
                    + outdir_name
                )
                print(cmd)
                os.system(cmd)
                test_csv = outdir_name + "/prediction_results_test_set.csv"
                df = pd.read_csv(test_csv)
                target_vals = df.target.values
                # id_vals = df.id.values
                pred_vals = df.prediction.values
                mae = mean_absolute_error(target_vals, pred_vals)
                maes.append(mae)
                task.record(fold, pred_vals, params=config)
                print(
                    "Dataset_name, Fold, MAE=",
                    task.dataset_name,
                    fold,
                    mean_absolute_error(target_vals, pred_vals),
                )
            maes = np.array(maes)
            print(maes, np.mean(maes), np.std(maes))
            print()
            print()
            print()
for jid in jids:
    d = get_jid_data(jid=jid, dataset="dft_3d")
    atoms = Atoms.from_dict(d["atoms"]).get_primitive_atoms
    mat = Poscar(atoms)
    mat.comment = "bulk@" + str(jid)
    cwd_home = os.getcwd()
    dir_name = d["jid"] + "_" + str("PBEBO")
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    os.chdir(dir_name)
    job = JobFactory(
        vasp_cmd=vasp_cmd,
        poscar=mat,
        copy_files=copy_files,
    )
    dumpjson(data=job.to_dict(), filename="job_fact.json")
    write_jobfact_optb88vdw(pyname="job_fact.py", job_json="job_fact.json")
    # Example job commands, need to change based on your cluster
    job_line = (
        "source ~/anaconda2/envs/my_jarvis/bin/activate my_jarvis \n"
        + "python job_fact.py"
    )
    name = jid
    directory = os.getcwd()
    Queue.pbs(
        job_line=job_line,
        jobname=name,
        directory=directory,
        submit_cmd=submit_cmd,
    )
    os.chdir(cwd_home)
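# A hedged sketch of the inputs the submission loop above expects; the JVASP
# ids, VASP command, copied files, and submit command below are illustrative
# placeholders and must be adapted to your own cluster before use.
#
# jids = ["JVASP-1002", "JVASP-816"]
# vasp_cmd = "mpirun vasp_std"
# copy_files = ["/path/to/vdw_kernel.bindat"]
# submit_cmd = ["qsub", "submit_job"]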