Example #1
def transfer_data(df, worker, now):
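    # Write the dataframe to a timestamped JSON file under ./user_dfs, then,
    # for non-local workers, copy that file to the remote cluster over SCP.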
    this_dir = os.path.dirname(os.path.abspath(__file__))
    user_folder = os.path.join(this_dir, "user_dfs")
    if not os.path.exists(user_folder):
        os.makedirs(user_folder)
    filename = "user_df_" + now + ".json"
    filepath = os.path.join(user_folder, filename)
    store_dataframe_as_json(df, filepath)

    if worker != "local":
        if worker == "cori":
            o = subprocess.check_output(
                ['bash', '-c', '. ~/.bash_profile; cori_get_password'])
            user = os.environ["CORI_USER"]
            host = "lrc-login.lbl.gov"
        elif worker == "lrc":
            o = subprocess.check_output(
                ['bash', '-c', '. ~/.bash_profile; lrc_get_password'])
            user = os.environ["LRC_USER"]
            host = "lrc-login.lbl.gov"
        else:
            raise ValueError(f"Worker {worker} not valid!")

        o_utf = o.decode("utf-8")
        o_all = o_utf.split("\n")
        o_all.remove("")
        password = o_all[-1]

        ssh = SSHClient()
        ssh.load_system_host_keys()
        ssh.connect(host,
                    username=user,
                    password=password,
                    look_for_keys=False)

        with SCPClient(ssh.get_transport()) as scp:
            scp.put(filepath,
                    recursive=True,
                    remote_path="/global/home/users/ardunn")
    else:
        pass
Example #2
    def test_store_dataframe_as_json(self):

        # check write produces correct file
        temp_file = os.path.join(self.temp_folder, 'test_dataframe.json')
        test_file = os.path.join(test_dir, "dataframe.json")
        store_dataframe_as_json(self.df, temp_file)

        with zopen(temp_file, 'rb') as f:
            temp_data = json.load(f)

        with zopen(test_file, 'rb') as f:
            test_data = json.load(f)

        # remove version otherwise this will have to be updated every time
        # the pymatgen version changes
        temp_data["data"][0][0].pop("@version")
        test_data["data"][0][0].pop("@version")

        self.assertDictsAlmostEqual(temp_data, test_data)

        # check writing gzipped json (comparing hashes doesn't work, so the
        # contents have to be compared instead)
        temp_file = os.path.join(self.temp_folder, 'test_dataframe.json.gz')
        test_file = os.path.join(test_dir, "dataframe.json.gz")
        store_dataframe_as_json(self.df, temp_file, compression='gz')

        with zopen(temp_file, 'rb') as f:
            temp_data = json.load(f)

        with zopen(test_file, 'rb') as f:
            test_data = json.load(f)

        temp_data["data"][0][0].pop("@version")
        test_data["data"][0][0].pop("@version")

        self.assertDictsAlmostEqual(temp_data, test_data)

        # check writing bz2 compressed json (comparing hashes doesn't work, so
        # the contents have to be compared instead)
        temp_file = os.path.join(self.temp_folder, 'test_dataframe.json.bz2')
        test_file = os.path.join(test_dir, "dataframe.json.bz2")
        store_dataframe_as_json(self.df, temp_file, compression='bz2')

        with zopen(temp_file, 'rb') as f:
            temp_data = json.load(f)

        with zopen(test_file, 'rb') as f:
            test_data = json.load(f)

        temp_data["data"][0][0].pop("@version")
        test_data["data"][0][0].pop("@version")

        self.assertDictsAlmostEqual(temp_data, test_data)
Example #3
def generate_json_files():
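    # Build a one-row dataframe holding a pymatgen diamond Structure and write
    # it out as plain, gzipped, and bz2-compressed reference JSON files.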
    diamond = Structure(
        Lattice([[2.189, 0, 1.264], [0.73, 2.064, 1.264], [0, 0, 2.528]]),
        ["C0+", "C0+"], [[2.554, 1.806, 4.423], [0.365, 0.258, 0.632]],
        validate_proximity=False,
        to_unit_cell=False, coords_are_cartesian=True,
        site_properties=None
    )
    df = pd.DataFrame(data={'structure': [diamond]})

    plain_file = os.path.join(test_dir, "dataframe.json")
    store_dataframe_as_json(df, plain_file)

    gz_file = os.path.join(test_dir, "dataframe.json.gz")
    store_dataframe_as_json(df, gz_file, compression='gz')

    bz2_file = os.path.join(test_dir, "dataframe.json.bz2")
    store_dataframe_as_json(df, bz2_file, compression='bz2')
Example #4
def generate_json_files():
    diamond = Structure(Lattice([[2.189, 0, 1.264], [0.73, 2.064, 1.264],
                                 [0, 0, 2.528]]), ["C0+", "C0+"],
                        [[2.554, 1.806, 4.423], [0.365, 0.258, 0.632]],
                        validate_proximity=False,
                        to_unit_cell=False,
                        coords_are_cartesian=True,
                        site_properties=None)
    df = pd.DataFrame(data={'structure': [diamond]})

    plain_file = os.path.join(test_dir, "dataframe.json")
    store_dataframe_as_json(df, plain_file)

    gz_file = os.path.join(test_dir, "dataframe.json.gz")
    store_dataframe_as_json(df, gz_file, compression='gz')

    bz2_file = os.path.join(test_dir, "dataframe.json.bz2")
    store_dataframe_as_json(df, bz2_file, compression='bz2')
Example #5
    def test_store_dataframe_as_json(self):

        # check write produces correct file
        temp_file = os.path.join(self.temp_folder, 'test_dataframe.json')
        test_file = os.path.join(test_dir, "dataframe.json")
        store_dataframe_as_json(self.df, temp_file)

        self.assertTrue(
            filecmp.cmp(temp_file, test_file), "Json files do not match.")

        # check writing gzipped json (comparing hashes doesn't work, so the
        # contents have to be compared instead)
        temp_file = os.path.join(self.temp_folder, 'test_dataframe.json.gz')
        test_file = os.path.join(test_dir, "dataframe.json.gz")
        store_dataframe_as_json(self.df, temp_file, compression='gz')

        with zopen(temp_file, 'rb') as f:
            temp_data = json.load(f)

        with zopen(test_file, 'rb') as f:
            test_data = json.load(f)

        self.assertTrue(temp_data == test_data,
                        "Compressed json files do not match.")

        # check writing bz2 compressed json (comparing hashes doesn't work, so
        # the contents have to be compared instead)
        temp_file = os.path.join(self.temp_folder, 'test_dataframe.json.bz2')
        test_file = os.path.join(test_dir, "dataframe.json.bz2")
        store_dataframe_as_json(self.df, temp_file, compression='bz2')

        with zopen(temp_file, 'rb') as f:
            temp_data = json.load(f)

        with zopen(test_file, 'rb') as f:
            test_data = json.load(f)

        self.assertTrue(temp_data == test_data,
                        "Compressed json files do not match.")
Example #6
    def test_store_dataframe_as_json(self):

        # check write produces correct file
        temp_file = os.path.join(self.temp_folder, 'test_dataframe.json')
        test_file = os.path.join(test_dir, "dataframe.json")
        store_dataframe_as_json(self.df, temp_file)

        self.assertTrue(filecmp.cmp(temp_file, test_file),
                        "Json files do not match.")

        # check writing gzipped json (comparing hashes doesn't work, so the
        # contents have to be compared instead)
        temp_file = os.path.join(self.temp_folder, 'test_dataframe.json.gz')
        test_file = os.path.join(test_dir, "dataframe.json.gz")
        store_dataframe_as_json(self.df, temp_file, compression='gz')

        with zopen(temp_file, 'rb') as f:
            temp_data = json.load(f)

        with zopen(test_file, 'rb') as f:
            test_data = json.load(f)

        self.assertTrue(temp_data == test_data,
                        "Compressed json files do not match.")

        # check writing bz2 compressed json (comparing hashes doesn't work, so
        # the contents have to be compared instead)
        temp_file = os.path.join(self.temp_folder, 'test_dataframe.json.bz2')
        test_file = os.path.join(test_dir, "dataframe.json.bz2")
        store_dataframe_as_json(self.df, temp_file, compression='bz2')

        with zopen(temp_file, 'rb') as f:
            temp_data = json.load(f)

        with zopen(test_file, 'rb') as f:
            test_data = json.load(f)

        self.assertTrue(temp_data == test_data,
                        "Compressed json files do not match.")
Example #7
# -- start F12
from matminer.featurizers.composition import TMetalFraction

tmf_feat = TMetalFraction()
fdf = tmf_feat.featurize_dataframe(fdf,
                                   col_id='composition',
                                   ignore_errors=True)
# -- end F12

# End of the basic featurization

# Saving the database
print("The final dataset has {}".format(fdf.shape))
fdf.to_csv(r"Batteries_predict.csv", index=None, header=True)
store_dataframe_as_json(fdf,
                        'Batteries_predict.json',
                        compression=None,
                        orient='split')
'''
Block 3 - Loading and making predictions
'''

# Saving Id and Formula to an output dataframe odf
odf = pd.DataFrame()  # output dataframe
odf['Id'] = fdf['Id']
odf['Reduced Formula'] = fdf['Reduced Formula']

excluded = [
    'Id', 'Reduced Formula', 'composition', 'composition_oxid',
    'HOMO_character', 'HOMO_element', 'LUMO_character', 'LUMO_element'
]
Example #8
    any_gfa = any(per_comp_gfa)
    all_gfa = all(per_comp_gfa)
    gfa = None
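    # Keep a composition only if all of its entries agree on gfa; compositions
    # with mixed labels are recorded as problems and skipped.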
    if any_gfa and not all_gfa:
        print(f"Problem composition {c}: {df_per_comp_gfa}\n")
        problem_compositions.append(c)
        continue
    elif all_gfa and any_gfa:
        print(f"All gfa: {c}")
        gfa = 1
    elif not all_gfa and not any_gfa:
        print(f"No gfa: {c}")
        gfa = 0
    elif all_gfa and not any_gfa:
        raise ValueError("Impossible combination of gfa values.")

    new_df_dict["composition"].append(c)
    new_df_dict["gfa"].append(gfa)

df_new = pd.DataFrame(new_df_dict)
df_new = df_new.sort_values(by="composition")
df_new = df_new.reset_index(drop=True)

# convert to bools
df_new["gfa"] = df_new["gfa"] == 1

print(df_new)
print(df_new["gfa"].value_counts())
print(f"Problem compositions: {problem_compositions}")

store_dataframe_as_json(df_new, "glass.json.gz", compression="gz")
Example #9
# Defining the ions correctly
df[["Id", "Ion"]] = df["Battid"].str.split("_", expand=True)

# Final check to avoid errors in the matminer composition module:
# drop a leading digit from "Reduced Formula" if one is present
clean3 = df["Reduced Formula"].str[0:1]  # first character
df["RF3"] = df["Reduced Formula"].str[1:]
df["Reduced Formula"] = np.where(clean3.str.isnumeric(), df["RF3"],
                                 df["Reduced Formula"])

excluded = [
    "Type", "Unnamed: 11", "Trash", "RF1", "RF2", "RF3", "Battid",
    "Reduced Cell Formula"
]
df = df.drop(excluded, axis=1)

# Print a summary before writing
print("A summary of the dataframe: {}".format(df.describe))
print("The reduced formula column: {}".format(df["Reduced Formula"]))

# Exporting "df" to *.csv and *.json
df.to_csv(r"Batteries_raw.csv", index=None, header=True)
store_dataframe_as_json(df,
                        'Batteries_raw.json',
                        compression=None,
                        orient='split')

# Debug routine
# reduced = df["Reduced Formula"]
# reduced.to_csv(r"reduced.csv", index = None, header = True)
Example #10
print("Number of unique compositions:", len(unique))
# raise ValueError

new_df_dict = {"composition": [], "gap expt": []}
for c in tqdm(unique):
    df_per_comp_gaps = df[df["composition"] == c]
    per_comp_gaps = df_per_comp_gaps["gap expt"]
    measurement_range = max(per_comp_gaps) - min(per_comp_gaps)
    if measurement_range > 0.1:
        # print(df_per_comp_gaps)
        # big_diff += 1
        excluded_compositions.append(c)
    else:
        mean_gap = per_comp_gaps.mean()
        gap_diffs = per_comp_gaps - mean_gap
        min_gap_diff = gap_diffs.min()
        min_gap_diff_index = gap_diffs.tolist().index(min_gap_diff)
        actual_gap_diff = per_comp_gaps.tolist()[min_gap_diff_index]
        # if len(per_comp_gaps) > 1:
        #     print(f"{c} decided on {actual_gap_diff} from \n {per_comp_gaps} \n\n")
        new_df_dict["composition"].append(c)
        new_df_dict["gap expt"].append(actual_gap_diff)

df_new = pd.DataFrame(new_df_dict)
df_new = df_new.sort_values(by="composition")
df_new = df_new.reset_index(drop=True)

store_dataframe_as_json(df_new, "expt_gap.json.gz", compression="gz")

print(df_new)
Example #11
    else:
        destination = args.destination

    makedirs(destination, exist_ok=True)

    if args.hash_file is None:
        hash_file = join(destination, "file_hashes.txt")
    else:
        hash_file = args.hash_file

    if args.no_hashes:
        for file_path in csv_file_paths:
            dataframe = _csv_to_dataframe(file_path)
            store_dataframe_as_json(
                dataframe,
                join(destination, basename(file_path)[:-4] + ".json"),
                compression=args.compression_type
            )
    else:
        with open(hash_file, "w") as out:
            for file_path in csv_file_paths:
                dataset_name = basename(file_path)[:-4]
                json_destination = join(destination, dataset_name + ".json")

                dataframe = _csv_to_dataframe(file_path)

                store_dataframe_as_json(dataframe, json_destination,
                                        compression=args.compression_type)

                if args.compression_type is not None:
                    json_destination += ("." + args.compression_type)
Example #12
    def test_featurize_bsdos(self, refresh_df_init=False, limit=1):
        """
        Tests featurize_dos and featurize_bandstructure.

        Args:
            refresh_df_init (bool): for developers; if the test needs to be
                updated, set to True. Otherwise set to False to make the final
                test independent of MPRester and faster.
            limit (int): the maximum final number of entries.

        Returns (None):
        """
        target = "color"
        df_bsdos_pickled = "mp_data_with_dos_bandstructure.pickle"
        save_path = os.path.join(TEST_DIR, df_bsdos_pickled)
        if refresh_df_init:
            mpdr = MPDataRetrieval()
            df = mpdr.get_dataframe(
                criteria={"material_id": "mp-149"},
                properties=[
                    "pretty_formula",
                    "dos",
                    "bandstructure",
                    "bandstructure_uniform",
                ],
            )
            store_dataframe_as_json(df, save_path)
        else:
            df = load_dataframe_from_json(save_path)
        df = df.dropna(axis=0)
        df = df.rename(
            columns={
                "bandstructure_uniform": "bandstructure",
                "bandstructure": "line bandstructure",
            }
        )
        df[target] = [["red"]]
        n_cols_init = df.shape[1]

        featurizer = AutoFeaturizer(
            preset="express", ignore_errors=False, multiindex=False
        )
        df = featurizer.fit_transform(df, target)

        # sanity checks
        self.assertEqual(len(df), limit)
        self.assertGreater(len(df.columns), n_cols_init)

        # DOSFeaturizer:
        self.assertEqual(df["cbm_character_1"][0], "p")

        # DopingFermi:
        self.assertAlmostEqual(df["fermi_c1e+20T300"][0], -0.539, 3)

        # Hybridization:
        self.assertAlmostEqual(df["vbm_sp"][0], 0.181, 3)
        self.assertAlmostEqual(df["cbm_s"][0], 0.4416, 3)
        self.assertAlmostEqual(df["cbm_sp"][0], 0.9864, 3)

        # BandFeaturizer:
        self.assertAlmostEqual(df["direct_gap"][0], 2.556, 3)
        self.assertAlmostEqual(df["n_ex1_norm"][0], 0.6285, 4)

        # BranchPointEnergy:
        self.assertAlmostEqual(df["branch_point_energy"][0], 5.7677, 4)
Example #13
def generate_mp(max_nsites=None, properties=None, write_to_csv=False,
                write_to_compressed_json=True):
    """
    Grabs all MP materials. Depending on the write flags, this writes two
    csv and/or json.gz files:
        * mp_nostruct: All MP materials, not including structures
        * mp_all: All MP materials, including structures

    Args:
        max_nsites (int): The maximum number of sites to include in the query.

        properties (iterable of strings): list of properties supported by
            MPDataRetrieval

        write_to_csv (bool): whether to write resulting dataframe to csv

        write_to_compressed_json (bool): whether to write resulting
            dataframe to json.gz file

    Returns (pandas.DataFrame):
        retrieved/generated data
    """

    # Set default properties if None and ensure is a list
    if properties is None:
        properties = ['pretty_formula', 'e_above_hull', 'band_gap',
                      'total_magnetization', 'elasticity.elastic_anisotropy',
                      'elasticity.K_VRH', 'elasticity.G_VRH', 'structure',
                      'energy', 'energy_per_atom', 'formation_energy_per_atom']
    elif not isinstance(properties, list):
        properties = list(properties)

    # Pick columns to drop structure data from
    drop_cols = []
    for col_name in ["structure", "initial_structure"]:
        if col_name in properties:
            drop_cols.append(col_name)

    mpdr = MPDataRetrieval()
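    # Chunk the query by number of sites (1..max_nsites, or 1..100 plus a final
    # "more than 100 sites" bucket) so each request stays manageable.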
    if max_nsites is not None:
        sites_list = [i for i in range(1, max_nsites + 1)]
    else:
        sites_list = [i for i in range(1, 101)] + [{"$gt": 100}]

    df = pd.DataFrame()
    for site_specifier in tqdm(sites_list, desc="Querying Materials Project"):
        # While loop to repeat queries if server request fails
        while True:
            try:
                site_response = mpdr.get_dataframe(
                    criteria={"nsites": site_specifier},
                    properties=properties, index_mpid=True
                )
                break

            except MPRestError:
                tqdm.write("Error querying materials project, "
                           "trying again after 5 sec")
                sleep(5)

        df = df.append(site_response)

    tqdm.write("DataFrame with {} entries created".format(len(df)))

    # Write data out to file if user so chooses
    if write_to_csv:
        df.to_csv("mp_all.csv")
        df.drop(drop_cols, axis=1, inplace=True)
        df.to_csv("mp_nostruct.csv")

    if write_to_compressed_json:
        store_dataframe_as_json(df, "mp_all.json.gz", compression="gz")
        df = df.drop(drop_cols, axis=1)
        store_dataframe_as_json(df, "mp_nostruct.json.gz", compression="gz")

    return df
Example #14
    def transform(self, df, target, prevent_cache_overwrite=False):
        """
        Decorate a dataframe containing composition, structure, bandstructure,
        and/or DOS objects with descriptors.

        Args:
            df (pandas.DataFrame): The dataframe not containing features.
            target (str): The ML-target property contained in the df.
            prevent_cache_overwrite (bool): If True, does not try to write any
                new features to the cache.

        Returns:
            df (pandas.DataFrame): Transformed dataframe containing features.
        """
        if self.cache_src and os.path.exists(self.cache_src):
            logger.debug(self._log_prefix +
                         "Reading cache_src {}".format(self.cache_src))
            cached_df = load_dataframe_from_json(self.cache_src)
            if not all([loc in cached_df.index for loc in df.index]):
                raise AutomatminerError("Feature cache does not contain all "
                                        "entries (by DataFrame index) needed "
                                        "to transform the input df.")
            else:
                cached_subdf = cached_df.loc[df.index]
                if target in cached_subdf.columns:
                    if target not in df.columns:
                        logger.warning(
                            self._log_prefix +
                            "Target not present in both cached df and input df."
                            " Cannot perform comparison to ensure index match."
                        )
                    else:
                        cached_targets = cached_subdf[target]
                        input_targets = df[target]
                        cached_type = regression_or_classification(
                            cached_targets)
                        input_type = regression_or_classification(
                            input_targets)
                        if cached_type != input_type:
                            raise AutomatminerError(
                                "Cached targets appear to be '{}' type, while "
                                "input targets appear to be '{}'."
                                "".format(cached_type, input_type))

                        problems = {}
                        for ix in input_targets.index:
                            iv = input_targets[ix]
                            cv = cached_targets[ix]
                            if iv != cv:
                                try:
                                    if not math.isclose(iv, cv):
                                        problems[ix] = [iv, cv]
                                except TypeError:
                                    pass
                        if problems:
                            logger.warning(
                                self._log_prefix +
                                "Mismatch between cached targets and input "
                                "targets: \n{}".format(problems))

                logger.info(self._log_prefix +
                            "Restored {} features on {} samples from "
                            "cache {}".format(len(cached_subdf.columns),
                                              len(df.index), self.cache_src))
                return cached_subdf
        else:
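            # No usable feature cache: featurize the dataframe from scratch.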
            transforming_on_fitted = df is self.fitted_input_df
            df = self._prescreen_df(df, inplace=True)

            if transforming_on_fitted:
                df = self.converted_input_df
            else:
                df = self._add_composition_from_structure(df)

            for featurizer_type, featurizers in self.featurizers.items():
                if featurizer_type in df.columns:
                    if not transforming_on_fitted:
                        df = self._tidy_column(df, featurizer_type)

                    for f in featurizers:
                        logger.info(self._log_prefix + "Featurizing with {}."
                                    "".format(f.__class__.__name__))
                        df = f.featurize_dataframe(
                            df,
                            featurizer_type,
                            ignore_errors=self.ignore_errors,
                            multiindex=self.multiindex,
                            inplace=False,
                        )
                    if self.drop_inputs:
                        df = df.drop(columns=[featurizer_type])
                else:
                    logger.info(self._log_prefix +
                                "Featurizer type {} not in the dataframe. "
                                "Skipping...".format(featurizer_type))
            if self.functionalize:
                ff = FunctionFeaturizer()
                ff.set_n_jobs(self.n_jobs)
                cols = df.columns.tolist()
                for ft in self.featurizers.keys():
                    if ft in cols:
                        cols.remove(ft)
                df = ff.fit_featurize_dataframe(
                    df,
                    cols,
                    ignore_errors=self.ignore_errors,
                    multiindex=self.multiindex,
                    inplace=False,
                )
            if (self.cache_src and not os.path.exists(self.cache_src)
                    and not prevent_cache_overwrite):
                store_dataframe_as_json(df, self.cache_src)
            return df
Example #15

# %% where there are several mp_ids, pick the one with lowest energy above convex hull
def get_e_above_hull(mp_id: str) -> float:
    return mpr.query(mp_id, ["e_above_hull"])[0]["e_above_hull"]


phonons["es_above_hull"] = phonons.likely_mp_ids.progress_apply(
    lambda ids: [get_e_above_hull(id) for id in ids])

phonons["likely_mp_id"] = phonons.apply(
    lambda row: row.likely_mp_ids[np.argmin(row.es_above_hull)], axis=1)

# %%
cols = ["structure", "last phdos peak", "likely_mp_id"]
store_dataframe_as_json(phonons[cols], "matbench-phonons-with-mp-id.json.gz")

phonons[cols] = load_dataframe_from_json("matbench-phonons-with-mp-id.json.gz")

# %%
phonons[["sg_symbol", "sg_number"]] = phonons.progress_apply(
    lambda row: row.structure.get_space_group_info(),
    axis=1,
    result_type="expand")

phonons["crystal_system"] = phonons.structure.progress_apply(
    lambda struct: SpacegroupAnalyzer(struct).get_crystal_system())

phonons[["sg_symbol", "sg_number", "crystal_system", "volume",
         "formula"]].to_csv("additional-df-cols.csv", index=False)
Example #16
    any_metals = any(per_comp_is_metal)
    all_metals = all(per_comp_is_metal)
    is_metal = None
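    # Keep a composition only if all of its entries agree on metallicity;
    # compositions with mixed labels are recorded as problems and skipped.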
    if not all_metals and any_metals:
        print(f"Problem composition {c}: {df_per_comp_is_metal}\n")
        problem_compositions.append(c)
        continue
    elif all_metals and any_metals:
        print(f"All metals: {c}")
        is_metal = 1
    elif not all_metals and not any_metals:
        print(f"No metals: {c}")
        is_metal = 0
    elif all_metals and not any_metals:
        raise ValueError("Impossible combination of metals.")

    new_df_dict["composition"].append(c)
    new_df_dict["is_metal"].append(is_metal)

df_new = pd.DataFrame(new_df_dict)
df_new = df_new.sort_values(by="composition")
df_new = df_new.reset_index(drop=True)

df_new["is_metal"] = df_new["is_metal"] == 1

store_dataframe_as_json(df_new, "expt_is_metal.json.gz", compression="gz")

print(df_new)
print(df_new["is_metal"].value_counts())
print(f"Problem compositions: {problem_compositions}")