Ejemplo n.º 1
0
    def _check(self, data):
        """Verify that dataset ``data`` exists locally and matches the expected digest.

        The SHA-256 digest of ``<dataset dir>/<data>.pd.xz`` is looked up in
        the ``sha256.yml`` file stored in the same directory. When no entry
        exists yet, the digest is computed with ``get_sha256`` and written
        back to that file. The digest is then compared with ``config(data)``
        and a ``RuntimeWarning`` is issued when they differ.

        Parameters
        ----------
        data: str
            Dataset name without the ``.pd.xz`` suffix.

        Raises
        ------
        RuntimeError
            If the dataset file does not exist locally.
        """
        dataset = self._dataset / (data + '.pd.xz')
        sha256_file = self._dataset / 'sha256.yml'

        # Nothing is downloaded here; a missing file is reported to the caller.
        if not dataset.exists():
            # Build ONE message string: passing the hint as a second positional
            # argument makes the exception render as a tuple repr.
            raise RuntimeError(
                "data {0} not exist, please run <preset.sync('{0}')> to download from the repository. "
                "See also: https://xenonpy.readthedocs.io/en/latest/tutorials/1-dataset.html"
                .format(data))

        # check sha256 value
        sha256_file.touch()  # make sure the sha256 file exists before loading it
        sha256 = self._yaml.load(sha256_file)
        if sha256 is None:
            # nothing stored yet (e.g. the file was just created by touch())
            sha256 = {}
        if data not in sha256:
            # first check of this dataset: compute the digest and remember it
            sha256_ = get_sha256(str(dataset))
            sha256[data] = sha256_
            self._yaml.dump(sha256, sha256_file)
        else:
            sha256_ = sha256[data]

        if sha256_ != config(data):
            # note the space after the first sentence; the two literals are
            # concatenated before ``format`` is applied
            warn(
                "local version {0} is different from the latest version {1}. "
                "you can use <Preset.sync('{0}', to='{1}')> to fix it.".format(
                    data, config('db_version')), RuntimeWarning)
Ejemplo n.º 2
0
    def __init__(self):
        """Initialise the loader on top of the local dataset directories.

        The base class is initialised with the ``dataset`` directory under
        ``__cfg_root__``, the configured ``userdata`` location and every entry
        of the configured ``ext_data``, using the ``pandas`` backend and the
        ``dataset`` prefix. A safe-mode YAML handler is kept on the instance.
        """
        dataset_dir = Path(__cfg_root__) / 'dataset'
        self._dataset = dataset_dir
        self._ext_data = config('ext_data')

        # positional locations handed to the base class, in lookup order
        locations = (str(dataset_dir), config('userdata'), *self._ext_data)
        super().__init__(*locations, backend='pandas', prefix=('dataset', ))

        # YAML reader/writer with a fixed indentation layout
        handler = YAML(typ='safe')
        handler.indent(mapping=2, sequence=4, offset=2)
        self._yaml = handler
Ejemplo n.º 3
0
def test_config_1():
    """Exercise ``config`` for reading, setting, and reading while setting."""
    # reading an existing key
    assert config('name') == 'xenonpy'

    # reading an unknown key is an error
    with pytest.raises(RuntimeError):
        config('no_exist')

    # a pure "set" call returns nothing ...
    set_result = config(new_key='test')
    assert set_result is None
    # ... and the stored value can be read back afterwards
    assert config('new_key') == 'test'

    # one call may read a key and set another at the same time
    username = config('github_username', other_key='other')
    assert username == 'yoshida-lab'
    assert config('other_key') == 'other'
Ejemplo n.º 4
0
    def build(self, *keys, save_to=None, **kwargs):
        """Build a local dataset for the first supported key in ``keys``.

        Only ``'mp_samples'`` is handled here: the requested Materials Project
        entries are fetched through ``MPRester``, pickled to ``save_to``, and
        the index is rebuilt with ``self._make_index(prefix=['dataset'])``.
        The method returns as soon as one key has been built.

        Parameters
        ----------
        keys: str
            Names of the datasets to build.
        save_to: str or path-like, optional
            Destination of the pickled data frame. When omitted,
            ``mp_samples.pd.xz`` under ``config('userdata')`` is used.
        kwargs:
            ``api_key`` (required): Materials Project API key.
            ``mp_ids`` (optional): a list/tuple of material ids, or the path
            (``str``) of a text file listing them. Defaults to the
            ``mp_ids.txt`` file located next to this module.

        Raises
        ------
        RuntimeError
            If ``api_key`` is not given.
        ValueError
            If ``mp_ids`` is neither a str nor a list/tuple, or if none of
            ``keys`` is supported.
        """

        def load_ids(path):
            """Read material ids from a text file."""
            # ndmin=1 keeps the result iterable when the file holds one id only
            return [
                s.decode('utf-8')
                for s in np.loadtxt(str(path), 'S20', ndmin=1)
            ]

        # build materials project dataset
        def mp_builder(api_key, mp_ids):

            # split requests into fixed number groups
            # eg: grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
            def grouper(iterable, n, fillvalue=None):
                """Collect data into fixed-length chunks or blocks"""
                # max(n, 1) guards against a chunk size of 0 for short inputs
                args = [iter(iterable)] * max(n, 1)
                return zip_longest(fillvalue=fillvalue, *args)

            # the following props will be fetched
            mp_props = [
                'band_gap', 'density', 'volume', 'material_id',
                'pretty_formula', 'elements', 'efermi', 'e_above_hull',
                'formation_energy_per_atom', 'final_energy_per_atom',
                'unit_cell_formula', 'structure'
            ]

            entries = []
            mpid_groups = list(grouper(mp_ids, len(mp_ids) // 10))

            with MPRester(api_key) as mpr:
                for group in tqdm(mpid_groups):
                    # drop the None padding added to the last chunk
                    mpid_list = list(filter(None, group))
                    chunk = mpr.query({"material_id": {
                        "$in": mpid_list
                    }}, mp_props)
                    entries.extend(chunk)

            df = pd.DataFrame(entries,
                              index=[e['material_id'] for e in entries])
            df = df.drop('material_id', axis=1)
            df = df.rename(columns={'unit_cell_formula': 'composition'})
            df = df.reindex(columns=sorted(df.columns))

            return df

        for key in keys:
            # Compare by value: `is` tests object identity, which is not
            # guaranteed for equal strings created at runtime.
            if key != 'mp_samples':
                continue

            if 'api_key' not in kwargs:
                raise RuntimeError(
                    'api key of materials projects database is needed')

            if 'mp_ids' in kwargs:
                ids = kwargs['mp_ids']
                if isinstance(ids, (list, tuple)):
                    mp_ids = ids
                elif isinstance(ids, str):
                    mp_ids = load_ids(ids)
                else:
                    raise ValueError(
                        'parameter `mp_ids` can only be a str to specific the ids file path '
                        'or a list-like object contain the ids')
            else:
                # fall back to the id list stored next to this module
                mp_ids = load_ids(
                    Path(__file__).absolute().parent / 'mp_ids.txt')

            data = mp_builder(kwargs['api_key'], mp_ids)
            if not save_to:
                save_to = Path(config('userdata')) / 'mp_samples.pd.xz'
                save_to = save_to.expanduser().absolute()
            data.to_pickle(save_to)
            self._make_index(prefix=['dataset'])
            return

        raise ValueError('no available key(s) in %s, these can only be %s' %
                         (keys, self.__builder__))