Example #1
    def setUp(self):
        self.dataset_names = [
            'flla',
            'elastic_tensor_2015',
            'piezoelectric_tensor',
            'dielectric_constant'
        ]
        self.dataset_attributes = [
            'file_type',
            'url',
            'hash',
            'reference',
            'description',
            'columns',
            'bibtex_refs',
            'num_entries'
        ]

        self.dataset_dict = _load_dataset_dict()

        # current directory, for storing and discarding test_dataset
        current_dir = os.path.dirname(os.path.abspath(__file__))

        # directory where in-use datasets should be stored,
        # either at MATMINER_DATA env var or under matminer/datasets/
        self.dataset_dir = os.environ.get(
            "MATMINER_DATA",
            os.path.abspath(os.path.join(current_dir, os.pardir))
        )

        # Shared set up for test_validate_dataset & test_fetch_external_dataset
        self._path = os.path.join(current_dir, "test_dataset.csv")
        self._url = "https://ndownloader.figshare.com/files/13039562"
        self._hash = "c487f59ce0d48505c36633b4b202027" \
                     "d0c915474b081e8fb0bde8d5474ee59a1"
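The _hash value in this setUp is a 64-character hex digest, which is consistent with SHA-256. As a hedged illustration (using hashlib directly, not necessarily the exact check matminer's _validate_dataset performs), a downloaded file could be verified against such a digest like this:

import hashlib

def sha256_of_file(path, chunk_size=65536):
    """Stream a file from disk and return its SHA-256 hex digest."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# expected = "c487f59ce0d48505c36633b4b202027d0c915474b081e8fb0bde8d5474ee59a1"
# if sha256_of_file("test_dataset.csv") != expected:
#     raise OSError("test_dataset.csv is corrupted or out of date")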
Example #2
    def setUp(self):
        self.dataset_names = [
            'flla', 'elastic_tensor_2015', 'piezoelectric_tensor',
            'dielectric_constant', 'castelli_perovskites', 'boltztrap_mp',
            'phonon_dielectric_mp', 'glass_ternary_hipt',
            'double_perovskites_gap', 'double_perovskites_gap_lumo', 'mp_all',
            'mp_nostruct', 'glass_ternary_landolt',
            'citrine_thermal_conductivity', 'wolverton_oxides',
            'heusler_magnetic', 'steel_strength', 'jarvis_ml_dft_training',
            'jarvis_dft_2d', 'jarvis_dft_3d', 'glass_binary', 'm2ax',
            'expt_gap', 'expt_formation_enthalpy'
        ]
        self.dataset_attributes = [
            'file_type', 'url', 'hash', 'reference', 'description', 'columns',
            'bibtex_refs', 'num_entries'
        ]

        self.dataset_dict = _load_dataset_dict()

        # current directory, for storing and discarding test_dataset
        current_dir = os.path.dirname(os.path.abspath(__file__))

        # directory where in-use datasets should be stored,
        # either at MATMINER_DATA env var or under matminer/datasets/
        self.dataset_dir = os.environ.get(
            "MATMINER_DATA",
            os.path.abspath(os.path.join(current_dir, os.pardir)))

        # Shared set up for test_validate_dataset & test_fetch_external_dataset
        self._path = os.path.join(current_dir, "test_dataset.csv")
        self._url = "https://ndownloader.figshare.com/files/13039562"
        self._hash = "c487f59ce0d48505c36633b4b202027" \
                     "d0c915474b081e8fb0bde8d5474ee59a1"
Example #3
    def test_load_dataset_dict(self):
        dataset_dict = _load_dataset_dict()
        # Check to make sure all datasets are present and have string type keys
        self.assertEqual(set(dataset_dict.keys()), set(self.dataset_names))
        # Check the validity of each set of values in each dataset
        for value in dataset_dict.values():
            # Check to make sure each dataset has all attributes
            # and string type keys
            self.assertEqual(set(value.keys()), set(self.dataset_attributes))
            # Make sure string attributes have string values
            for item in [
                    'file_type', 'url', 'hash', 'reference', 'description'
            ]:
                self.assertIsInstance(value[item], str)
            # Make sure int attributes have int values
            self.assertIsInstance(value['num_entries'], int)
            # Make sure refs are in a list and are strings
            self.assertIsInstance(value['bibtex_refs'], list)
            for ref in value['bibtex_refs']:
                self.assertIsInstance(ref, str)
            # Make sure columns is a dict and it has string valued entries
            self.assertIsInstance(value['columns'], dict)
            for column_name, column_description in value['columns'].items():
                self.assertIsInstance(column_name, str)
                self.assertIsInstance(column_description, str)
Example #4
def load_dataset(name, data_home=None, download_if_missing=True):
    """
    Loads a dataframe containing the dataset specified with the 'name' field.

    Dataset file is stored/loaded from data_home if specified, otherwise at
    the MATMINER_DATA environment variable if set or at matminer/datasets
    by default.

    Args:
        name (str): keyword specifying what dataset to load, run
            matminer.datasets.get_available_datasets() for options

        data_home (str): path to folder to look for dataset file

        download_if_missing (bool): whether to download the dataset if it is
            not found on disk

    Returns: (pd.DataFrame)
    """
    global _dataset_dict

    if _dataset_dict is None:
        _dataset_dict = _load_dataset_dict()

    if name not in _dataset_dict:
        error_string = "Unrecognized dataset name: {}. \n" \
                       "Use matminer.datasets.get_available_datasets() " \
                       "to see a list of currently available " \
                       "datasets".format(name)

        # Very simple attempt to match the unrecognized keyword to existing
        # dataset names, to give the user immediate feedback
        possible_matches = [
            x for x in _dataset_dict.keys() if name.lower() in x.lower()
        ]

        if possible_matches:
            error_string += "\nCould you have been looking for these similar " \
                            "matches?:\n{}".format(possible_matches)

        raise ValueError(error_string)

    dataset_metadata = _dataset_dict[name]
    data_path = os.path.join(_get_data_home(data_home),
                             name + "." + dataset_metadata['file_type'])
    _validate_dataset(data_path, dataset_metadata['url'],
                      dataset_metadata['hash'], download_if_missing)

    df = load_dataframe_from_json(data_path)

    return df
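A short usage sketch for load_dataset as defined above. The dataset name and the import path follow the docstring, which points at matminer.datasets.get_available_datasets() for valid names:

from matminer.datasets import load_dataset

# First call downloads the file to the data home; later calls reuse the cache.
df = load_dataset("dielectric_constant")
print(df.shape)

# An unrecognized name raises ValueError and suggests similar dataset names,
# e.g. "dielectric" matches "dielectric_constant" via the substring check.
try:
    load_dataset("dielectric")
except ValueError as err:
    print(err)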
Example #5
def load_dataset(name, data_home=None, download_if_missing=True):
    """
    Loads a dataframe containing the dataset specified with the 'name' field.

    Dataset file is stored/loaded from data_home if specified, otherwise at
    the MATMINER_DATA environment variable if set or at matminer/datasets
    by default.

    Args:
        name (str): keyword specifying what dataset to load, run
            matminer.datasets.get_available_datasets() for options

        data_home (str): path to folder to look for dataset file

        download_if_missing (bool): whether to download the dataset if it is
            not found on disk

    Returns: (pd.DataFrame)
    """
    global _dataset_dict

    if _dataset_dict is None:
        _dataset_dict = _load_dataset_dict()

    if name not in _dataset_dict:
        error_string = "Unrecognized dataset name: {}. \n" \
                       "Use matminer.datasets.get_available_datasets() " \
                       "to see a list of currently available " \
                       "datasets".format(name)

        # Very simple attempt to match the unrecognized keyword to existing
        # dataset names, to give the user immediate feedback
        possible_matches = [
            x for x in _dataset_dict.keys() if name.lower() in x.lower()
        ]

        if possible_matches:
            error_string += "\nCould you have been looking for these similar " \
                            "matches?:\n{}".format(possible_matches)

        raise ValueError(error_string)

    dataset_metadata = _dataset_dict[name]
    data_path = os.path.join(_get_data_home(data_home),
                             name + "." + dataset_metadata['file_type'])
    _validate_dataset(data_path, dataset_metadata['url'],
                      dataset_metadata['hash'], download_if_missing)

    df = load_dataframe_from_json(data_path)

    return df
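Example #5 repeats the same function; as a usage note, data_home and download_if_missing control where the file is looked up and whether a missing file triggers a download. A minimal sketch (the folder path is illustrative):

from matminer.datasets import load_dataset

# Look for/store the file under an explicit folder rather than MATMINER_DATA
# or the default matminer/datasets location.
df = load_dataset("flla", data_home="/tmp/matminer_data")

# With download_if_missing=False a missing local file is treated as an error
# by _validate_dataset instead of triggering a download.
df_cached = load_dataset("flla", data_home="/tmp/matminer_data",
                         download_if_missing=False)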
Example #6
def get_available_datasets(print_format="medium", sort_method='alphabetical'):
    """
    Function for retrieving the datasets available within matminer.

    Args:
        print_format (None, str): None, "short", "medium", or "long":
            None: Don't print anything
            "short": only the dataset names
            "medium": dataset names and their descriptions
            "long": All dataset info associated with the dataset

        sort_method (str): By what metric to sort the datasets when retrieving
            their information.

            alphabetical: sorts by dataset name,
            num_entries: sorts by number of dataset entries

    Returns: (list)
    """
    global _dataset_dict

    if _dataset_dict is None:
        _dataset_dict = _load_dataset_dict()

    if sort_method not in {"alphabetical", "num_entries"}:
        raise ValueError("Error, unsupported sorting metric {}"
                         " see docs for options".format(sort_method))

    if sort_method == 'num_entries':
        dataset_names = sorted(_dataset_dict.keys(),
                               key=lambda x: _dataset_dict[x]["num_entries"],
                               reverse=True)
    else:
        dataset_names = sorted(_dataset_dict.keys())

    # If-checks are done before the loop to avoid repeating them each iteration
    if print_format is not None:
        dataset_string = ""
        if print_format == "short":
            for dataset_name in dataset_names:
                dataset_string += f"{dataset_name}\n"
        elif print_format == "medium":
            for dataset_name in dataset_names:
                dataset_description = get_dataset_description(dataset_name)
                dataset_string += f"{dataset_name}: " \
                                  f"{dataset_description}\n\n"
        elif print_format == "long":
            for dataset_name in dataset_names:
                dataset_string += f"{get_all_dataset_info(dataset_name)}"
        print(dataset_string)

    return dataset_names
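A usage sketch for this print_format/sort_method variant of get_available_datasets; the keyword values are the ones accepted by the code above, and the import path mirrors the docstrings elsewhere in these examples:

from matminer.datasets import get_available_datasets

# Print only the names, largest datasets first
names = get_available_datasets(print_format="short", sort_method="num_entries")

# Print nothing and just work with the returned list
names = get_available_datasets(print_format=None)
print(len(names), "datasets available")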
Example #7
def get_available_datasets(print_datasets=True,
                           print_descriptions=True,
                           sort_method='alphabetical'):
    """
    Function for retrieving the datasets available within matminer.

    Args:
        print_datasets (bool): Whether to, along with returning a
            list of dataset names, also print info on each dataset

        print_descriptions (bool): Whether to print the description of the
            dataset along with the name. Ignored if print_datasets is False

        sort_method (str): By what metric to sort the datasets when retrieving
            their information.

            alphabetical: sorts by dataset name,
            num_entries: sorts by number of dataset entries

    Returns: (list)
    """
    global _dataset_dict

    if _dataset_dict is None:
        _dataset_dict = _load_dataset_dict()

    if sort_method not in {"alphabetical", "num_entries"}:
        raise ValueError("Error, unsupported sorting metric {}"
                         " see docs for options".format(sort_method))

    if sort_method == 'num_entries':
        dataset_names = sorted(_dataset_dict.keys(),
                               key=lambda x: _dataset_dict[x]["num_entries"],
                               reverse=True)
    else:
        dataset_names = sorted(_dataset_dict.keys())

    # If-checks are done before the loop to avoid repeating them each iteration
    if print_datasets and print_descriptions:
        for name in dataset_names:
            # Printing blank line with sep=\n to give extra line break
            print(name, _dataset_dict[name]["description"], "", sep="\n")
    elif print_datasets:
        for name in dataset_names:
            print(name)

    return dataset_names
Example #8
def get_available_datasets(print_datasets=True, print_descriptions=True,
                           sort_method='alphabetical'):
    """
    Function for retrieving the datasets available within matminer.

    Args:
        print_datasets (bool): Whether to, along with returning a
            list of dataset names, also print info on each dataset

        print_descriptions (bool): Whether to print the description of the
            dataset along with the name. Ignored if print_datasets is False

        sort_method (str): By what metric to sort the datasets when retrieving
            their information.

            alphabetical: sorts by dataset name,
            num_entries: sorts by number of dataset entries

    Returns: (list)
    """
    global _dataset_dict

    if _dataset_dict is None:
        _dataset_dict = _load_dataset_dict()

    if sort_method not in {"alphabetical", "num_entries"}:
        raise ValueError("Error, unsupported sorting metric {}"
                         " see docs for options".format(sort_method))

    if sort_method == 'num_entries':
        dataset_names = sorted(_dataset_dict.keys(),
                               key=lambda x: _dataset_dict[x]["num_entries"],
                               reverse=True)
    else:
        dataset_names = sorted(_dataset_dict.keys())

    # If-checks are done before the loop to avoid repeating them each iteration
    if print_datasets and print_descriptions:
        for name in dataset_names:
            # Printing blank line with sep=\n to give extra line break
            print(name, _dataset_dict[name]["description"], "", sep="\n")
    elif print_datasets:
        for name in dataset_names:
            print(name)

    return dataset_names
Example #9
def get_dataset_attribute(dataset_name, attrib_key):
    """
    Helper function for getting generic attributes of the dataset

    Args:
        dataset_name (str): Name of the dataset to query info from

        attrib_key (str): Name of attribute to pull

    Returns: Dataset attribute
    """
    # Load the dictionary into a global variable, keep around for future access
    global _dataset_dict

    if _dataset_dict is None:
        _dataset_dict = _load_dataset_dict()

    return _dataset_dict[dataset_name][attrib_key]
Example #10
def get_dataset_attribute(dataset_name, attrib_key):
    """
    Helper function for getting generic attributes of the dataset

    Args:
        dataset_name (str): Name of the dataset to query info from

        attrib_key (str): Name of attribute to pull

    Returns: Dataset attribute
    """
    # Load the dictionary into a global variable, keep around for future access
    global _dataset_dict

    if _dataset_dict is None:
        _dataset_dict = _load_dataset_dict()

    return _dataset_dict[dataset_name][attrib_key]
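A usage sketch for get_dataset_attribute; the attribute keys are the ones enumerated in the test setUp above (file_type, url, hash, reference, description, columns, bibtex_refs, num_entries), and the calls assume the helper is invoked from the same module:

# Number of rows the metadata claims for a dataset
n_entries = get_dataset_attribute("elastic_tensor_2015", "num_entries")

# Mapping of column name -> column description for the same dataset
columns = get_dataset_attribute("elastic_tensor_2015", "columns")
print(n_entries, sorted(columns)[:3])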
Example #11
    def test_load_dataset_dict(self):
        dataset_dict = _load_dataset_dict()
        # Check to make sure all datasets are present and have string type keys
        self.assertEqual(set(dataset_dict.keys()), set(self.dataset_names))
        # Check the validity of each set of values in each dataset
        for value in dataset_dict.values():
            # Check to make sure each dataset has all attributes
            # and string type keys
            self.assertEqual(set(value.keys()), set(self.dataset_attributes))
            # Make sure string attributes have string values
            for item in ['file_type', 'url', 'hash', 'reference',
                         'description']:
                self.assertIsInstance(value[item], str)
            # Make sure int attributes have int values
            self.assertIsInstance(value['num_entries'], int)
            # Make sure refs are in a list and are strings
            self.assertIsInstance(value['bibtex_refs'], list)
            for ref in value['bibtex_refs']:
                self.assertIsInstance(ref, str)
            # Make sure columns is a dict and it has string valued entries
            self.assertIsInstance(value['columns'], dict)
            for column_name, column_description in value['columns'].items():
                self.assertIsInstance(column_name, str)
                self.assertIsInstance(column_description, str)
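The test above fixes the shape of each metadata entry. Sketched as a hypothetical entry (all values are placeholders, not real metadata):

example_entry = {
    "file_type": "json.gz",                                # placeholder
    "url": "https://ndownloader.figshare.com/files/<id>",  # placeholder
    "hash": "<64-character sha-256 hex digest>",           # placeholder
    "reference": "<human-readable citation>",
    "description": "<short summary of the dataset>",
    "columns": {"formula": "<description of this column>"},
    "bibtex_refs": ["@article{placeholder, ...}"],
    "num_entries": 1056,                                   # placeholder int
}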
Example #12
def load_dataset(name,
                 data_home=None,
                 download_if_missing=True,
                 include_metadata=False):
    """
    Loads a dataframe containing the dataset specified with the 'name' field.

    Dataset file is stored/loaded from data_home if specified, otherwise at
    the MATMINER_DATA environment variable if set or at matminer/datasets
    by default.

    Args:
        name (str): keyword specifying what dataset to load, run
            matminer.datasets.available_datasets() for options

        data_home (str): path to folder to look for dataset file

        download_if_missing (bool): whether to download the dataset if it is
            not found on disk

        include_metadata (bool): optional argument for some datasets with
            metadata fields

    Returns: (pd.DataFrame)
    """
    dataset_dict = _load_dataset_dict()

    if name not in dataset_dict:
        error_string = "Unrecognized dataset name: {}. \n" \
                       "Use matminer.datasets.available_datasets() " \
                       "to see a list of currently available " \
                       "datasets".format(name)

        # Very simple attempt to match the unrecognized keyword to existing
        # dataset names, to give the user immediate feedback
        possible_matches = [
            x for x in dataset_dict.keys() if name.lower() in x.lower()
        ]

        if possible_matches:
            error_string += "\nCould you have been looking for these similar " \
                            "matches?:\n{}".format(possible_matches)

        raise ValueError(error_string)

    dataset_metadata = dataset_dict[name]
    data_path = os.path.join(_get_data_home(data_home),
                             name + "." + dataset_metadata['file_type'])
    _validate_dataset(data_path, dataset_metadata['url'],
                      dataset_metadata['hash'], download_if_missing)

    df = load_dataframe_from_json(data_path)

    if not include_metadata:
        if name == "elastic_tensor_2015":
            df = df.drop(['cif', 'kpoint_density', 'poscar'], axis=1)

        elif name in {"piezoelectric_tensor", "dielectric_constant"}:
            df = df.drop(['cif', 'meta', 'poscar'], axis=1)

    return df
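A usage sketch for the include_metadata flag added in this variant; the dropped column names come from the code above:

# By default the structural metadata columns are removed, e.g. for
# elastic_tensor_2015 the 'cif', 'kpoint_density' and 'poscar' columns.
df = load_dataset("elastic_tensor_2015")
assert "cif" not in df.columns

# Passing include_metadata=True keeps those raw columns in the dataframe.
df_full = load_dataset("elastic_tensor_2015", include_metadata=True)
assert "poscar" in df_full.columns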
Example #13
                ref = input()
            new_reference = "\n".join(reference_lines).strip()
            if new_reference:
                print('The following will be added:')
                print(new_reference)
                reference = new_reference

        elif attrib_name == "url":
            url = input("Add a file download url: ").strip()

        else:
            print("Invalid option")


if __name__ == '__main__':
    _dataset_dict = _load_dataset_dict()
    with open(".dataset_data_backup.json", 'w') as outfile:
        json.dump(_dataset_dict, outfile, indent=4, sort_keys=True)

    unsaved_changes = False
    _temp_dataset = deepcopy(_dataset_dict)

    quit_flag = False
    while not quit_flag:
        print(intro_message)
        command = input(">>> ")
        command = command.strip().lower()
        # Show current datasets
        if command == "1":
            print("Current Datasets:")
            pprint(_temp_dataset, width=150)
Example #14
                ref = input()
            new_reference = "\n".join(reference_lines).strip()
            if new_reference:
                print('The following will be added:')
                print(new_reference)
                reference = new_reference

        elif attrib_name == "url":
            url = input("Add a file download url: ").strip()

        else:
            print("Invalid option")


if __name__ == '__main__':
    _dataset_dict = _load_dataset_dict()
    with open(".dataset_data_backup.json", 'w') as outfile:
        json.dump(_dataset_dict, outfile, indent=4, sort_keys=True)

    unsaved_changes = False
    _temp_dataset = deepcopy(_dataset_dict)

    quit_flag = False
    while not quit_flag:
        print(intro_message)
        command = input(">>> ")
        command = command.strip().lower()
        # Show current datasets
        if command == "1":
            print("Current Datasets:")
            pprint(_temp_dataset, width=150)
Example #15
"""
Metadata for matbench.
"""

from matminer.datasets.utils import _load_dataset_dict
from monty.serialization import loadfn

from matbench.constants import (
    MBV01_DATASET_METADATA_PATH,
    MBV01_VALIDATION_DATA_PATH,
)
from matbench.util import RecursiveDotDict

MATMINER_DATASET_METADATA = _load_dataset_dict()

mbv01_metadata = loadfn(MBV01_DATASET_METADATA_PATH)
for d in mbv01_metadata.keys():
    mbv01_metadata[d].update(MATMINER_DATASET_METADATA[d])
mbv01_metadata = RecursiveDotDict(mbv01_metadata)


mbv01_validation = loadfn(MBV01_VALIDATION_DATA_PATH)
mbv01_validation = RecursiveDotDict(mbv01_validation)
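A hedged sketch of how the merged metadata could be consumed; the dataset key matbench_dielectric and the attribute-style access are assumptions about matbench's naming and RecursiveDotDict, not guaranteed by the snippet above:

# num_entries and description are merged in from the matminer metadata by the
# update() loop above; the key below is a hypothetical example.
meta = mbv01_metadata["matbench_dielectric"]
print(meta.num_entries, meta.description)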