def universal_dataset_check(self, dataset_name, object_headers=None,
                            numeric_headers=None, bool_headers=None,
                            metadata_headers=None):
    # Runs tests common to all datasets,
    # makes it quicker to write tests for new datasets

    # Get rid of dataset if it's on the disk already
    data_path = os.path.join(
        self.dataset_dir,
        dataset_name + "." + self.dataset_dict[dataset_name]['file_type']
    )
    if os.path.exists(data_path):
        os.remove(data_path)

    # Test that dataset can be downloaded
    load_dataset(dataset_name)
    self.assertTrue(os.path.exists(data_path))

    # Test that data is now available and has all its elements
    df = load_dataset(dataset_name, download_if_missing=False)
    self.assertEqual(
        len(df), self.dataset_dict[dataset_name]["num_entries"]
    )

    # Test all the non-metadata columns are there
    if metadata_headers is None:
        metadata_headers = set()
    self.assertEqual(sorted(list(df)), sorted(
        [header for header
         in self.dataset_dict[dataset_name]['columns'].keys()
         if header not in metadata_headers]
    ))

    # Test each column for appropriate type
    if object_headers is None:
        object_headers = []
    if numeric_headers is None:
        numeric_headers = []
    if bool_headers is None:
        bool_headers = []

    df = load_dataset(dataset_name, include_metadata=True,
                      download_if_missing=False)
    if object_headers:
        self.assertTrue(is_object_dtype(df[object_headers].values))
    if numeric_headers:
        self.assertTrue(is_numeric_dtype(df[numeric_headers].values))
    if bool_headers:
        self.assertTrue(is_bool_dtype(df[bool_headers].values))

    # Make sure all columns are accounted for
    column_headers = object_headers + numeric_headers + bool_headers
    self.assertEqual(sorted(list(df)), sorted(column_headers))
def test_elastic_tensor_2015(self):
    # Run set of universal dataset tests
    object_headers = ['material_id', 'formula', 'structure',
                      'compliance_tensor', 'elastic_tensor',
                      'elastic_tensor_original', 'cif', 'poscar']

    numeric_headers = ['nsites', 'space_group', 'volume',
                       'elastic_anisotropy', 'G_Reuss', 'G_VRH', 'G_Voigt',
                       'K_Reuss', 'K_VRH', 'K_Voigt', 'poisson_ratio',
                       'kpoint_density']

    metadata_headers = {'cif', 'kpoint_density', 'poscar'}

    self.universal_dataset_check(
        "elastic_tensor_2015", object_headers, numeric_headers,
        metadata_headers=metadata_headers
    )

    # Tests unique to this dataset
    df = load_dataset('elastic_tensor_2015', include_metadata=True,
                      download_if_missing=False)
    self.assertEqual(type(df['structure'][0]), Structure)
    tensor_headers = ['compliance_tensor', 'elastic_tensor',
                      'elastic_tensor_original']
    for c in tensor_headers:
        self.assertEqual(type(df[c][0]), np.ndarray)
def download_data(self, name, save_data=True):
    df = load_dataset(name=name)
    if save_data:
        df.to_excel(name + '.xlsx', index=False)
        with open('%s.pickle' % name, 'wb') as data_file:
            pickle.dump(df, data_file)
    return df
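# Hypothetical usage sketch for the helper above ("retriever" stands in for
# an assumed instance of the enclosing class; the dataset name comes from the
# scripts later in this section). By default it writes both an .xlsx and a
# .pickle copy of the dataframe to the working directory.
# retriever = DataRetriever()                              # assumed class name
# df = retriever.download_data("steel_strength")           # cache to disk
# df = retriever.download_data("steel_strength", save_data=False)  # in-memory only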
def setUp(self):
    df = load_dataset("elastic_tensor_2015").rename(
        columns={"formula": "composition"})
    self.df = df[["composition", "K_VRH"]]
    self.df_struc = df[["composition", "structure", "K_VRH"]]
    self.extra_features = df["G_VRH"]
    self.target = "K_VRH"
    self.config = get_preset_config("debug_single")
    self.config_cached = get_preset_config("debug_single",
                                           cache_src=CACHE_SRC)
    self.pipe = MatPipe(**self.config)
    self.pipe_cached = MatPipe(**self.config_cached)
def test_flla(self):
    # Universal tests
    object_headers = ['material_id', 'formula', 'structure']

    numeric_headers = ['e_above_hull', 'nsites', 'formation_energy',
                       'formation_energy_per_atom']

    self.universal_dataset_check("flla", object_headers, numeric_headers)

    # Unique tests
    df = load_dataset('flla', include_metadata=True,
                      download_if_missing=False)
    self.assertEqual(type(df['structure'][0]), Structure)
def test_load_dataset(self):
    # Can't find dataset or similar
    with self.assertRaises(ValueError):
        load_dataset("not_real_dataset")
    # Finds similar
    with self.assertRaises(ValueError):
        load_dataset("tensor")
    # Actual dataset is subset of passed dataset name
    dataset_name = sorted(self.dataset_dict.keys())[0]
    with self.assertRaises(ValueError):
        load_dataset("a" + dataset_name + "a")

    dataset_filename = (dataset_name + "."
                        + self.dataset_dict[dataset_name]["file_type"])
    data_home = os.path.expanduser("~")
    dataset_path = os.path.join(data_home, dataset_filename)
    if os.path.exists(dataset_path):
        os.remove(dataset_path)
    load_dataset(dataset_name, data_home)
    self.assertTrue(os.path.exists(dataset_path))
def test_piezoelectric_tensor(self):
    # Run universal tests
    object_headers = ['material_id', 'formula', 'structure', 'point_group',
                      'v_max', 'piezoelectric_tensor', 'cif', 'meta',
                      'poscar']

    numeric_headers = ['nsites', 'space_group', 'volume', 'eij_max']

    metadata_headers = {'cif', 'meta', 'poscar'}

    self.universal_dataset_check(
        "piezoelectric_tensor", object_headers, numeric_headers,
        metadata_headers=metadata_headers
    )

    # Dataset specific checks
    df = load_dataset('piezoelectric_tensor', include_metadata=True,
                      download_if_missing=False)
    self.assertEqual(type(df['structure'][0]), Structure)
    self.assertEqual(type(df['piezoelectric_tensor'][0]), np.ndarray)
def test_dielectric_constant(self):
    # Universal tests
    object_headers = ['material_id', 'formula', 'structure',
                      'e_electronic', 'e_total', 'cif', 'meta', 'poscar']

    numeric_headers = ['nsites', 'space_group', 'volume', 'band_gap',
                       'n', 'poly_electronic', 'poly_total']

    bool_headers = ['pot_ferroelectric']

    metadata_headers = {'cif', 'meta', 'poscar'}

    self.universal_dataset_check(
        "dielectric_constant", object_headers, numeric_headers,
        bool_headers=bool_headers, metadata_headers=metadata_headers
    )

    # Unique tests
    df = load_dataset("dielectric_constant", include_metadata=True,
                      download_if_missing=False)
    self.assertEqual(type(df['structure'][0]), Structure)
From matminer's dataset library.
"""
from matminer.datasets.dataset_retrieval import load_dataset
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
import pandas as pd

# pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

mpdr = MPDataRetrieval()

df = load_dataset("phonon_dielectric_mp")
print(df)
mpids = df["mpid"].tolist()
dfe = mpdr.get_dataframe(
    criteria={"material_id": {"$in": mpids}},
    properties=["e_above_hull", "formation_energy_per_atom", "material_id"],
    index_mpid=False)
dfe = dfe.rename(columns={"material_id": "mpid"})
df = pd.merge(df, dfe, how='inner')
df = df[(df["e_above_hull"] < 0.150)]
def setUp(self):
    self.test_df = load_dataset('elastic_tensor_2015').rename(
        columns={"formula": "composition"})
    self.limit = 5
""" This file makes the following benchmarking datasets: - steels From matminer's dataset library. """ from matminer.datasets.dataset_retrieval import load_dataset if __name__ == "__main__": df = load_dataset("steel_strength") df = df[["formula", "yield strength"]] df = df.rename(columns={"formula": "composition"}) print(df) df.to_pickle("steels.pickle.gz")
def universal_dataset_check(self, dataset_name, object_headers=None,
                            numeric_headers=None, bool_headers=None,
                            test_func=None):
    # "Hard" integrity checks that take a long time.
    # These tests only run if the MATMINER_DATASET_FULL_TEST
    # environment variable is set to True
    if do_complete_test:
        # Get rid of dataset if it's on the disk already
        data_path = os.path.join(
            self.dataset_dir,
            dataset_name + "." + self.dataset_dict[dataset_name]['file_type']
        )
        if os.path.exists(data_path):
            os.remove(data_path)

        # Test that dataset can be downloaded
        load_dataset(dataset_name)
        self.assertTrue(os.path.exists(data_path))

        # Test that data is now available and has all its elements
        df = load_dataset(dataset_name, download_if_missing=False)
        self.assertEqual(
            len(df), self.dataset_dict[dataset_name]["num_entries"]
        )

        # Test all columns are there
        self.assertEqual(
            sorted(list(df)),
            sorted(self.dataset_dict[dataset_name]['columns'].keys())
        )

        # Test each column for appropriate type
        if object_headers is None:
            object_headers = []
        if numeric_headers is None:
            numeric_headers = []
        if bool_headers is None:
            bool_headers = []

        df = load_dataset(dataset_name, download_if_missing=False)
        if object_headers:
            self.assertTrue(is_object_dtype(df[object_headers].values))
        if numeric_headers:
            self.assertTrue(is_numeric_dtype(df[numeric_headers].values))
        if bool_headers:
            self.assertTrue(is_bool_dtype(df[bool_headers].values))

        # Make sure all columns are accounted for
        column_headers = object_headers + numeric_headers + bool_headers
        self.assertEqual(sorted(list(df)), sorted(column_headers))

        # Run tests unique to the dataset
        if test_func is not None:
            test_func(df)

    # "Soft" check that just makes sure the dataset download page is active.
    # This runs on systems with the CI environment variable present
    # (e.g. when running on a continuous integration VCS system)
    else:
        download_page = requests.head(
            self.dataset_dict[dataset_name]["url"])
        self.assertTrue(download_page.ok)
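# Hedged sketch of how the test_func hook above can be used from a dataset
# test (the dataset and column names here are hypothetical, for illustration
# only):
#
#     def _check_structures(df):
#         self.assertEqual(type(df['structure'][0]), Structure)
#
#     self.universal_dataset_check(
#         "some_dataset", object_headers=['structure'],
#         test_func=_check_structures)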
out at the end. It appears there are none.
"""
from matminer.datasets.dataset_retrieval import load_dataset
from matminer.utils.io import store_dataframe_as_json
from matminer.featurizers.conversions import StrToComposition
from tqdm import tqdm
import pandas as pd

# pd.set_option('display.height', 1000)
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

df = load_dataset("glass_ternary_landolt")
df = df.rename(columns={"formula": "composition"})
df = df[["composition", "gfa"]]
df = StrToComposition(target_col_id="composition_obj").featurize_dataframe(
    df, "composition")
df["composition"] = [c.reduced_formula for c in df["composition_obj"]]
df = df.drop(columns=["composition_obj"])

# print("Ground truth")
# print(df[df["composition"] == "ZrTi9"])   # should be False in final dataframe also!!
# print(df[df["composition"] == "ZrVCo8"])  # should be True in final dataframe also!
# print(df["gfa"].value_counts())           # proportion is about 5000 GFA, 2054 no GFA
# raise ValueError
""" This file makes the following benchmarking datasets: - jdft2d From matminer's dataset library. """ from matminer.datasets.dataset_retrieval import load_dataset import pandas as pd # pd.set_option('display.height', 1000) pd.set_option('display.max_rows', 500) pd.set_option('display.max_columns', 500) pd.set_option('display.width', 1000) df = load_dataset("jarvis_dft_2d") df = df[["structure", "exfoliation_en"]] df = df.reset_index(drop=True) print(df) df.to_pickle("jdft2d.pickle.gz")
""" This file makes the following benchmarking datasets: - castelli From matminer's dataset library. """ from matminer.datasets.dataset_retrieval import load_dataset from matminer.data_retrieval.retrieve_MP import MPDataRetrieval import pandas as pd # pd.set_option('display.height', 1000) pd.set_option("display.max_rows", 500) pd.set_option("display.max_columns", 500) pd.set_option("display.width", 1000) mpdr = MPDataRetrieval() df = load_dataset("castelli_perovskites") df = df[["structure", "e_form"]] df = df.reset_index(drop=True) print(df) df.to_pickle("castelli.pickle.gz")
from matminer.datasets.dataset_retrieval import (load_dataset,
                                                 get_available_datasets,
                                                 get_all_dataset_info)

datasets = get_available_datasets(print_format=None)

for dataset in datasets:
    if "matbench_" in dataset:
        df = load_dataset(dataset)
        target_col = [col for col in df.columns
                      if col not in ["structure", "composition"]][0]
        print(f" * - :code:`{dataset}`\n - :code:`{target_col}`\n"
              f" - {df.shape[0]}")

# print(get_all_dataset_info("matbench_steels"))
# A configured MatPipe object will featurize, clean, and learn on a dataset
# automatically, and is made of 4 classes: AutoFeaturizer, DataCleaner,
# FeatureReducer, and an ML adaptor (e.g., TPOTAdaptor). The exact operations
# MatPipe executes are based entirely on how these 4 classes are configured.
# The easiest way to get started is by passing in a preset configuration to
# MatPipe. We can do this with the get_preset_config function; here, we'll
# use the "express" config, which provides decent results in a reasonable
# time frame (an hour or two).
pipe = MatPipe(**get_preset_config("express"))

# Let's download an example dataset and try predicting bulk moduli.
from sklearn.model_selection import train_test_split
from matminer.datasets.dataset_retrieval import load_dataset

df = load_dataset("elastic_tensor_2015")[["structure", "K_VRH"]]
train, test = train_test_split(df, shuffle=True, random_state=20190301,
                               test_size=0.2)
test_true = test['K_VRH']
test = test.drop(columns=["K_VRH"])

# MatPipe uses an sklearn-esque BaseEstimator API for fitting pipelines and
# predicting properties. Fitting a pipe trains it on the input data;
# predicting with a pipe will output predictions.
pipe.fit(train, target="K_VRH")

# Now we can predict our outputs. They'll appear in a column called
# "K_VRH predicted".
test_predicted = pipe.predict(test, "K_VRH")["K_VRH predicted"]
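# A short follow-up sketch (not part of the original walkthrough): score the
# held-out predictions against the true values with scikit-learn's mean
# absolute error. K_VRH in elastic_tensor_2015 is reported in GPa.
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(test_true, test_predicted)
print(f"MAE on held-out K_VRH: {mae:.3f} GPa")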
def pretty_column_map(columns_old):
    colmap = {}
    for col in columns_old:
        k = (col.replace("_", "|")
                .replace("-", "|")
                .replace(" ", "||")
                .replace("(", " ")
                .replace(")", ""))
        colmap[col] = k
    return colmap


if __name__ == "__main__":
    # Just trying it out with a single dataset, Dielectric from MP...
    for config in [DIELECTRIC]:
        project = config["data_file"].replace(".json.gz", "")
        df = load_dataset(project)
        pinput = "structure" if "structure" in df.columns else "composition"
        column_map_pretty = pretty_column_map(df.columns.tolist())
        df = df.rename(columns=column_map_pretty)
        target = column_map_pretty[config["target"]]
        # print(pinput)
        # raise ValueError
        # print(df)
        # raise ValueError

        # clean up
        has_more = True
        while has_more:
            resp = client.contributions.delete_entries(project=project,
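# Illustration of the mapping implemented by pretty_column_map above (the
# example column names are hypothetical): underscores and hyphens become a
# single pipe, spaces become a double pipe, and parentheses are rewritten.
#
#     >>> pretty_column_map(["e_form", "gap expt", "n (index)"])
#     {'e_form': 'e|form', 'gap expt': 'gap||expt', 'n (index)': 'n|| index'}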
def setUp(self):
    df = load_dataset("elastic_tensor_2015").rename(
        columns={"formula": "composition"})
    self.df = df[["composition", "K_VRH"]]
    self.extra_features = df["G_VRH"]
    self.target = "K_VRH"
""" This file makes the following benchmarking datasets: - expt_gap - expt_is_metal From matminer's dataset library. """ from matminer.datasets.dataset_retrieval import load_dataset import pandas as pd pd.set_option('display.max_rows', 500) pd.set_option('display.max_columns', 500) pd.set_option('display.width', 1000) df = load_dataset("expt_gap") df = df.rename(columns={"formula": "composition"}) df.to_pickle("expt_gap.pickle.gz") print(df) df["is_metal"] = df["gap expt"] == 0 df = df.drop(columns=["gap expt"]) print(df["is_metal"].value_counts()) df = df.reset_index(drop=True) print(df) df.to_pickle("expt_is_metal.pickle.gz")
                else:
                    removed_feat = idx
                if removed_feat not in rm_feats:
                    rm_feats.append(removed_feat)
                    self.logger.debug('"{}" correlates strongly with '
                                      '"{}"'.format(feature, idx))
                    self.logger.debug(
                        'removing "{}"...'.format(removed_feat))
                if removed_feat == feature:
                    break
        if len(rm_feats) > 0:
            df = df.drop(rm_feats, axis=1)
            self.logger.info('These {} features were removed due to cross '
                             'correlation with the current features more '
                             'than {}:\n{}'.format(len(rm_feats), R_max,
                                                   rm_feats))
        return df


if __name__ == "__main__":
    from matminer.datasets.dataset_retrieval import load_dataset
    from automatminer.pipeline import MatPipe, debug_config

    target = "eij_max"
    df = load_dataset("piezoelectric_tensor").rename(
        columns={"formula": "composition"})[[target, "composition",
                                             "structure"]]
    mp = MatPipe(**debug_config)
    df2 = mp.benchmark(df, target, test_spec=0.2)
    print(df2)