def _check(self, data):
    """Verify that dataset *data* exists locally and warn if its sha256 is stale.

    Raises RuntimeError when the local ``<data>.pd.xz`` file is missing;
    otherwise compares its recorded sha256 against ``config(data)`` and emits a
    RuntimeWarning on mismatch.
    """
    data_path = self._dataset / (data + '.pd.xz')
    checksum_path = self._dataset / 'sha256.yml'

    # No local copy: the user must sync it from the remote repository first.
    if not data_path.exists():
        raise RuntimeError(
            "data {0} not exist, please run <preset.sync('{0}')> to download from the repository"
            .format(data),
            'See also: https://xenonpy.readthedocs.io/en/latest/tutorials/1-dataset.html')

    # Guarantee the checksum file exists before attempting to load it.
    checksum_path.touch()
    checksums = self._yaml.load(checksum_path)
    if checksums is None:
        checksums = {}

    if data in checksums:
        local_hash = checksums[data]
    else:
        # First sight of this dataset: record its hash for future checks.
        local_hash = get_sha256(str(data_path))
        checksums[data] = local_hash
        self._yaml.dump(checksums, checksum_path)

    # config(data) is presumably the published hash for this dataset — warn on drift.
    if local_hash != config(data):
        warn(
            "local version {0} is different from the latest version {1}."
            "you can use <Preset.sync('{0}', to='{1}')> to fix it.".format(
                data, config('db_version')), RuntimeWarning)
def __init__(self):
    """Initialise the preset loader.

    Points the base loader at ``<cfg_root>/dataset`` plus the user data dir and
    any configured external data locations, then prepares a safe-mode ruamel
    YAML handler used for the sha256 checksum file.
    """
    self._dataset = Path(__cfg_root__) / 'dataset'
    self._ext_data = config('ext_data')
    super().__init__(str(self._dataset),
                     config('userdata'),
                     *self._ext_data,
                     backend='pandas',
                     prefix=('dataset', ))

    # Safe-mode YAML; indentation settings keep the dumped checksum file readable.
    self._yaml = YAML(typ='safe')
    self._yaml.indent(mapping=2, sequence=4, offset=2)
def test_config_1():
    """config(): reads known keys, raises on unknown keys, and sets new keys."""
    assert config('name') == 'xenonpy'

    # Unknown keys raise rather than returning a default.
    with pytest.raises(RuntimeError):
        config('no_exist')

    # Writing via keyword returns None; the value is readable afterwards.
    assert config(new_key='test') is None
    assert config('new_key') == 'test'

    # A single call can read one key while writing another.
    assert config('github_username', other_key='other') == 'yoshida-lab'
    assert config('other_key') == 'other'
def build(self, *keys, save_to=None, **kwargs):
    """Build one of the known sample datasets and pickle it to disk.

    Parameters
    ----------
    *keys
        Dataset names to build; only ``'mp_samples'`` is handled here. The
        first matching key is built and the method returns.
    save_to : str or Path, optional
        Output path; defaults to ``<userdata>/mp_samples.pd.xz``.
    **kwargs
        ``api_key`` (required for Materials Project access) and optionally
        ``mp_ids`` — a list/tuple of ids or a path to an ids file.

    Raises
    ------
    RuntimeError
        If ``api_key`` is missing.
    ValueError
        If ``mp_ids`` has an unsupported type, or no key is recognised.
    """

    def mp_builder(api_key, mp_ids):
        # Fetch inorganic-compound records from Materials Project in batches.

        def grouper(iterable, n, fillvalue=None):
            """Collect data into fixed-length chunks or blocks"""
            # e.g. grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
            args = [iter(iterable)] * max(n, 1)
            return zip_longest(fillvalue=fillvalue, *args)

        # Properties requested for every material.
        mp_props = [
            'band_gap', 'density', 'volume', 'material_id', 'pretty_formula',
            'elements', 'efermi', 'e_above_hull', 'formation_energy_per_atom',
            'final_energy_per_atom', 'unit_cell_formula', 'structure'
        ]

        entries = []
        # Split the ids into ~10 request groups to keep each query small.
        mpid_groups = list(grouper(mp_ids, len(mp_ids) // 10))
        with MPRester(api_key) as mpr:
            for group in tqdm(mpid_groups):
                # Drop the zip_longest fill values (None) from the last group.
                mpid_list = list(filter(None, group))
                chunk = mpr.query({"material_id": {"$in": mpid_list}}, mp_props)
                entries.extend(chunk)

        df = pd.DataFrame(entries, index=[e['material_id'] for e in entries])
        df = df.drop('material_id', axis=1)
        df = df.rename(columns={'unit_cell_formula': 'composition'})
        df = df.reindex(columns=sorted(df.columns))
        return df

    for key in keys:
        # was `key is 'mp_samples'`: identity comparison on a str literal only
        # worked by CPython interning accident — use equality.
        if key == 'mp_samples':
            if 'api_key' not in kwargs:
                raise RuntimeError(
                    'api key of materials projects database is needed')

            if 'mp_ids' in kwargs:
                ids = kwargs['mp_ids']
                if isinstance(ids, (list, tuple)):
                    mp_ids = ids
                elif isinstance(ids, str):
                    mp_ids = [
                        s.decode('utf-8') for s in np.loadtxt(ids, 'S20')
                    ]
                else:
                    raise ValueError(
                        'parameter `mp_ids` can only be a str to specific the ids file path '
                        'or a list-like object contain the ids')
            else:
                # Fall back to the id list bundled next to this module.
                ids = Path(__file__).absolute().parent / 'mp_ids.txt'
                mp_ids = [
                    s.decode('utf-8') for s in np.loadtxt(str(ids), 'S20')
                ]

            data = mp_builder(kwargs['api_key'], mp_ids)
            if not save_to:
                save_to = Path(config('userdata')) / 'mp_samples.pd.xz'
            # Accept str paths too: Path() is idempotent on Path objects.
            save_to = Path(save_to).expanduser().absolute()
            data.to_pickle(save_to)

            self._make_index(prefix=['dataset'])
            return

    raise ValueError('no available key(s) in %s, these can only be %s' %
                     (keys, self.__builder__))