Example #1
def get_feats_hot_encoded(feature_names=keys, filename="feats_encoded.json"):
    """Get OneHotEncoded features."""
    # Deprecated; kept for reference only.
    # Assumes module-level `keys`, `chem_data`, and `Specie` are in scope.
    from sklearn.preprocessing import OneHotEncoder
    import numpy as np
    import pandas as pd

    encoder = OneHotEncoder(categories="auto", sparse=False)
    dat = {}
    # Map atomic number Z -> list of the selected feature values.
    for i, j in chem_data.items():
        tmp = []
        for r, s in j.items():
            if r in feature_names:
                tmp.append(s)
        dat[Specie(i).Z] = tmp  # j.values()
    df = pd.DataFrame(dat)

    # One-hot encode each feature (row) separately, then stack per element.
    vals = []
    for i in range(len(df.values)):
        output = encoder.fit_transform(
            np.array(df.values[i], dtype="float").reshape(-1, 1)
        )
        vals.extend(output.T)
    vals = np.array(vals, dtype="float").T
    cols = df.columns.tolist()
    new_dat = {}
    for i, j in zip(cols, vals):
        new_dat[i] = list(j)
    if filename is not None:
        from jarvis.db.jsonutils import dumpjson

        dumpjson(data=new_dat, filename=filename)
    return new_dat
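A minimal usage sketch (hypothetical call, assuming the module-level chem_data and keys are populated); the returned dict maps atomic number Z to the stacked one-hot vector:

# Hypothetical usage: build the encoding in memory without writing JSON.
feats = get_feats_hot_encoded(filename=None)
print(len(feats[14]))  # length of the one-hot vector for Si (Z=14), if present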
Example #2
    def upload_jarvisdft_xmls(
        self,
        files="/rk2/knc6/DB/XMLs/VASPDASK/*.xml",
        template_id="5f626925ece4b00035e5277f",
        json_name="jarvisdft_xmls.json",
    ):
        """Upload JARVIS-DFT XML files."""
        mem = []
        for i in glob.glob(files):
            jid = i.split("/")[-1].split(".xml")[0]
            print(jid)
            try:
                upload_id = self.upload_xml_file(filename=i,
                                                 template_id=template_id)
                info = {}
                info["jid"] = jid
                info["api_id"] = upload_id
                mem.append(info)
            except Exception:
                info = {}
                info["jid"] = jid
                info["api_id"] = "Failed"
                # Record the failure so every attempted file appears in mem.
                mem.append(info)
                print("Failed for", i)
        if json_name is not None:
            dumpjson(filename=json_name, data=mem)
        return mem
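A hedged usage sketch; Api is a stand-in name for whichever REST client class defines this method, and the URL and credentials are placeholders:

# Hypothetical usage; substitute your actual client class and credentials.
api = Api(base_url="https://jarvis.nist.gov", username="user", password="pass")
records = api.upload_jarvisdft_xmls(files="/path/to/xmls/*.xml")
print(len(records), "uploads attempted")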
Example #3
    def upload_jarvisff_xmls(
        self,
        files="/rk2/knc6/DB/XMLs/LAMMPS/*.xml",
        template_id="5f6162b4ece4b00034e4fe19",
        json_name="jarvisff_xmls.json",
    ):
        """Upload JARVIS-FF XML files."""
        mem = []
        for i in glob.glob(files):
            jid = i.split("/")[-1].split(".xml")[0]
            try:
                upload_id = self.upload_xml_file(filename=i,
                                                 template_id=template_id)
                info = {}
                info["jid"] = jid
                info["api_id"] = upload_id
                mem.append(info)

            except Exception:
                info = {}
                info["jid"] = jid
                info["api_id"] = "Failed"
                # Record the failure so every attempted file appears in mem.
                mem.append(info)
                print("Failed for", i)
        if json_name is not None:
            dumpjson(filename=json_name, data=mem)
        return mem
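upload_jarvisff_xmls mirrors upload_jarvisdft_xmls above, differing only in the default glob, template_id, and output JSON name. In both methods every attempted file now lands in mem (successes with their upload id, failures marked "Failed"), so the dumped JSON doubles as a complete upload log.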
Example #4
def get_digitized_feats_hot_encoded(
    feature_names=keys, filename="feats_encoded.json"
):
    """Get OneHotEncoded features with digitized features."""
    from sklearn.preprocessing import OneHotEncoder
    import pandas as pd

    encoder = OneHotEncoder(categories="auto", sparse=False)
    dat = defaultdict()
    for i, j in chem_data.items():
        tmp = defaultdict()
        for r, s in j.items():
            if r in feature_names:
                tmp[r] = s
        dat[Specie(i).Z] = tmp  # j.values()
    df = pd.DataFrame(dat)
    # Replace missing-value sentinels with 0, then bin each column
    # into integer levels before one-hot encoding.
    df = df.T.replace(-9999.0, 0).replace(-0.0, 0).astype("float")

    for i in df.columns:
        df[i] = digitize_array(df[i])
    df = df.T

    # One-hot encode each digitized feature (row), then stack per element.
    vals = []
    for i in range(len(df.values)):
        output = encoder.fit_transform(
            np.array(df.values[i], dtype="float").reshape(-1, 1)
        )
        vals.extend(output.T)
    vals = np.array(vals, dtype="float").T
    cols = df.columns.tolist()
    new_dat = {}
    for i, j in zip(cols, vals):
        new_dat[int(i)] = [int(m) for m in j]
    if filename is not None:
        from jarvis.db.jsonutils import dumpjson

        dumpjson(data=new_dat, filename=filename)
    return new_dat
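A minimal usage sketch (hypothetical values); the digitized variant returns int keys and 0/1 integer flags, which keeps the dumped JSON compact:

# Hypothetical usage: encode, then inspect one element's flag vector.
feats = get_digitized_feats_hot_encoded(filename=None)
z = sorted(feats)[0]  # smallest atomic number present
print(z, len(feats[z]), set(feats[z]))  # vector length and {0, 1}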
Example #5
def train_tasks(mb=None,
                config_template="config_example.json",
                file_format="poscar"):
    """Train MatBench clalssification and regression tasks."""
    for task in mb.tasks:
        task.load()
        if task.metadata.task_type == CLF_KEY:
            classification = True
        else:
            classification = False
        # Classification tasks
        if classification:
            # rocs = []
            for ii, fold in enumerate(task.folds):
                train_df = task.get_train_and_val_data(fold, as_type="df")
                test_df = task.get_test_data(fold,
                                             include_target=True,
                                             as_type="df")
                train_df["is_metal"] = train_df["is_metal"].astype(int)
                test_df["is_metal"] = test_df["is_metal"].astype(int)
                # Name of the target property
                target = [
                    col for col in train_df.columns
                    if col not in ("id", "structure", "composition")
                ][0]
                # Make sure there are no spaces or parentheses, which
                # can cause issues when creating the folder
                fold_name = (task.dataset_name + "_" + target.replace(
                    " ", "_").replace("(", "-").replace(")", "-") + "_fold_" +
                             str(ii))
                if not os.path.exists(fold_name):
                    os.makedirs(fold_name)
                os.chdir(fold_name)
                # ALIGNN requires the id_prop.csv file
                f = open("id_prop.csv", "w")
                for jj, j in train_df.iterrows():
                    id = j.name
                    atoms = pmg_to_atoms(j.structure)
                    pos_name = id
                    atoms.write_poscar(pos_name)
                    val = j[target]
                    line = str(pos_name) + "," + str(val) + "\n"
                    f.write(line)
                # There is no pre-defined validation split, so we will use
                # a portion of the training set as the validation set, and
                # keep the test set intact
                val_df = train_df[0:len(test_df)]
                for jj, j in val_df.iterrows():
                    # for jj, j in test_df.iterrows():
                    id = j.name
                    atoms = pmg_to_atoms(j.structure)
                    pos_name = id
                    atoms.write_poscar(pos_name)
                    val = j[target]
                    line = str(pos_name) + "," + str(val) + "\n"
                    f.write(line)
                for jj, j in test_df.iterrows():
                    id = j.name
                    atoms = pmg_to_atoms(j.structure)
                    pos_name = id
                    atoms.write_poscar(pos_name)
                    val = j[target]
                    line = str(pos_name) + "," + str(val) + "\n"
                    f.write(line)
                n_train = len(train_df)
                n_val = len(val_df)
                n_test = len(test_df)
                config = loadjson(config_template)
                config["n_train"] = n_train
                config["n_val"] = n_val
                config["n_test"] = n_test
                config["keep_data_order"] = True
                config["batch_size"] = 32
                config["epochs"] = 40
                config["classification_threshold"] = 0.01
                fname = "config_fold_" + str(ii) + ".json"
                dumpjson(data=config, filename=fname)
                f.close()
                os.chdir("..")
                outdir_name = (task.dataset_name + "_" + target.replace(
                    " ", "_").replace("(", "-").replace(")", "-") +
                               "_outdir_" + str(ii))
                cmd = ("train_folder.py --root_dir " + fold_name +
                       " --config " + fold_name + "/" + fname +
                       " --file_format=" + file_format +
                       " --keep_data_order=True" +
                       " --classification_threshold=0.01" + " --output_dir=" +
                       outdir_name)
                print(cmd)
                os.system(cmd)
                test_csv = outdir_name + "/prediction_results_test_set.csv"
                df = pd.read_csv(test_csv)
                target_vals = df.target.values
                id_vals = df.id.values
                pred_vals = df.prediction.values
                # Record predictions with MatBench so this fold is scored.
                task.record(fold, pred_vals, params=config)

        # Regression tasks
        # TODO: shorten the script by taking out repetitive lines
        if not classification:
            maes = []
            for ii, fold in enumerate(task.folds):
                train_df = task.get_train_and_val_data(fold, as_type="df")
                test_df = task.get_test_data(fold,
                                             include_target=True,
                                             as_type="df")
                # Name of the target property
                target = [
                    col for col in train_df.columns
                    if col not in ("id", "structure", "composition")
                ][0]
                # Make sure there are no spaces or parentheses, which
                # can cause issues when creating the folder
                fold_name = (task.dataset_name + "_" + target.replace(
                    " ", "_").replace("(", "-").replace(")", "-") + "_fold_" +
                             str(ii))
                if not os.path.exists(fold_name):
                    os.makedirs(fold_name)
                os.chdir(fold_name)
                # ALIGNN requires the id_prop.csv file
                f = open("id_prop.csv", "w")
                for jj, j in train_df.iterrows():
                    id = j.name
                    atoms = pmg_to_atoms(j.structure)
                    pos_name = id
                    atoms.write_poscar(pos_name)
                    val = j[target]
                    line = str(pos_name) + "," + str(val) + "\n"
                    f.write(line)
                # There is no pre-defined validation split, so we will use
                # a portion of the training set as the validation set, and
                # keep the test set intact
                val_df = train_df[0:len(test_df)]
                for jj, j in val_df.iterrows():
                    # for jj, j in test_df.iterrows():
                    id = j.name
                    atoms = pmg_to_atoms(j.structure)
                    pos_name = id
                    atoms.write_poscar(pos_name)
                    val = j[target]
                    line = str(pos_name) + "," + str(val) + "\n"
                    f.write(line)
                for jj, j in test_df.iterrows():
                    id = j.name
                    atoms = pmg_to_atoms(j.structure)
                    pos_name = id
                    atoms.write_poscar(pos_name)
                    val = j[target]
                    line = str(pos_name) + "," + str(val) + "\n"
                    f.write(line)
                n_train = len(train_df)
                n_val = len(val_df)
                n_test = len(test_df)
                config = loadjson(config_template)
                config["n_train"] = n_train
                config["n_val"] = n_val
                config["n_test"] = n_test
                config["keep_data_order"] = True
                config["batch_size"] = 32
                config["epochs"] = 500
                fname = "config_fold_" + str(ii) + ".json"
                dumpjson(data=config, filename=fname)
                f.close()
                os.chdir("..")
                outdir_name = (task.dataset_name + "_" + target.replace(
                    " ", "_").replace("(", "-").replace(")", "-") +
                               "_outdir_" + str(ii))
                cmd = ("train_folder.py --root_dir " + fold_name +
                       " --config " + fold_name + "/" + fname +
                       " --file_format=" + file_format +
                       " --keep_data_order=True" + " --output_dir=" +
                       outdir_name)
                print(cmd)
                os.system(cmd)
                test_csv = outdir_name + "/prediction_results_test_set.csv"
                df = pd.read_csv(test_csv)
                target_vals = df.target.values
                # id_vals = df.id.values
                pred_vals = df.prediction.values
                mae = mean_absolute_error(target_vals, pred_vals)
                maes.append(mae)
                task.record(fold, pred_vals, params=config)
                print(
                    "Dataset_name, Fold, MAE=",
                    task.dataset_name,
                    fold,
                    mean_absolute_error(target_vals, pred_vals),
                )
            maes = np.array(maes)
            print(maes, np.mean(maes), np.std(maes))
            print()
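A hedged driver sketch using the public MatBench API; the task subset and config path below are illustrative, not taken from the function above:

from matbench.bench import MatbenchBenchmark

# Illustrative: run a single structure-based task with the driver above.
mb = MatbenchBenchmark(autoload=False, subset=["matbench_dielectric"])
train_tasks(mb=mb, config_template="config_example.json", file_format="poscar")
mb.to_file("results.json.gz")  # save the recorded folds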
Example #6
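A minimal setup sketch for the loop below, assuming jarvis-tools' module layout (adjust imports if your version differs); the values of jids, vasp_cmd, copy_files, and submit_cmd are illustrative and must match your cluster:

import os
from jarvis.core.atoms import Atoms
from jarvis.db.figshare import get_jid_data
from jarvis.db.jsonutils import dumpjson
from jarvis.io.vasp.inputs import Poscar
from jarvis.tasks.queue_jobs import Queue
from jarvis.tasks.vasp.vasp import JobFactory, write_jobfact_optb88vdw

jids = ["JVASP-1002"]  # illustrative JARVIS-DFT identifiers
vasp_cmd = "mpirun vasp_std"  # adjust for your machine
copy_files = ["/path/to/vdw_kernel.bindat"]  # extra files each job needs
submit_cmd = ["qsub", "submit_job"]  # PBS submission command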
for jid in jids:
    d = get_jid_data(jid=jid, dataset="dft_3d")
    atoms = Atoms.from_dict(d["atoms"]).get_primitive_atoms
    mat = Poscar(atoms)
    mat.comment = "bulk@" + str(jid)
    cwd_home = os.getcwd()
    dir_name = d["jid"] + "_" + str("PBEBO")
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    os.chdir(dir_name)
    job = JobFactory(
        vasp_cmd=vasp_cmd,
        poscar=mat,
        copy_files=copy_files,
    )
    dumpjson(data=job.to_dict(), filename="job_fact.json")
    write_jobfact_optb88vdw(pyname="job_fact.py", job_json="job_fact.json")

    # Example job commands, need to change based on your cluster
    job_line = ("source ~/anaconda2/envs/my_jarvis/bin/activate my_jarvis \n" +
                "python job_fact.py")
    name = jid
    directory = os.getcwd()
    Queue.pbs(
        job_line=job_line,
        jobname=name,
        directory=directory,
        submit_cmd=submit_cmd,
    )
    os.chdir(cwd_home)