Beispiel #1
0
def _generate_pkl(pot, dbs=None, **args):
    """Generates a pickle file of plots for a single potential.

    The plots are produced by :func:`matdb.plotting.potentials.generate` and
    the returned object is pickled to `{subset}-{plots}-plotgen.pkl` inside a
    sub-folder of `args["folder"]` named after `pot.fqn`.

    Args:
        pot: potential object; its `fqn`, `calculator` and `configs` members
          are used.
        dbs (list): optional databases. When given, the configurations from
          each database's `iconfigs` are combined and plotted; otherwise
          `pot.configs(args["subset"])` is plotted.
        args (dict): must contain the keys `folder`, `plots`, `base64` and
          `subset`; `valkey` is also required when `dbs` is given.
    """
    from matdb.plotting.potentials import generate
    from matdb.atoms import AtomsList
    try:
        from cPickle import dump
    except ImportError:
        # cPickle only exists on Python 2; fall back to the standard module.
        from pickle import dump

    outdir = path.join(args["folder"], pot.fqn)
    if not path.isdir(outdir):
        mkdir(outdir)

    if dbs is not None:
        configs = AtomsList()
        for db in dbs:
            configs.extend(list(db.iconfigs))

        pdis = generate(args["plots"],
                        pot.calculator,
                        configs,
                        outdir,
                        args["base64"],
                        valkey=args["valkey"])
    else:
        pdis = generate(args["plots"], pot.calculator,
                        pot.configs(args["subset"]), outdir, args["base64"])

    pklname = "{}-{}-plotgen.pkl".format(args["subset"], args["plots"])
    target = path.join(outdir, pklname)
    # Pickle data is a byte stream, so the file must be opened in binary mode.
    with open(target, 'wb') as f:
        dump(pdis, f)
Beispiel #2
0
    def rset(self):
        """Builds the reusable set of atoms for this group and its
        sub-sequences.

        Returns:
            When this group has no sub-sequences, an `AtomsList` holding
            `self.atoms` with `self.H` stored under the `"H"` key of its info
            dictionary. When the parent is a `Hessian`, the `rset` of the
            sub-sequence selected by `self._best_bands()`. Otherwise an
            `AtomsList` combining the `rset` of every sub-sequence.
        """
        if len(self.sequence) != 0:
            if isinstance(self.parent, Hessian):
                # Several candidate sequences exist; defer to the one that
                # `_best_bands` selects.
                chosen = self._best_bands()
                return self.sequence[chosen].rset

            # Not directly below another Hessian group: gather the results of
            # every sub-sequence.
            collected = AtomsList()
            for child in self.sequence.values():
                collected.extend(child.rset)
            return collected

        # Bottom of the stack: attach the Hessian matrix to the atoms object
        # it belongs to and hand back a one-element list.
        self.atoms.info["H"] = self.H
        leaf = AtomsList()
        leaf.append(self.atoms)
        return leaf
Beispiel #3
0
    def quantities(self,
                   params=None,
                   properties=None,
                   aggregators=None,
                   kind="train",
                   **kwargs):
        """Returns datasets derived from the atoms objects that are present in
        this trainer's compiled databases.

        Values are read as attributes of the configuration list returned by
        `self.configs`.

        Args:
            params (list): list of `str` parameter names to extract; each is
              converted to a :class:`numpy.ndarray`.
            properties (list): list of `str` property names to extract.
            aggregators (dict): keys are `str` property names; values are `str`
              FQN of importable functions that are applied to the extracted
              property value to reduce it. Properties without an aggregator
              (or all of them when this is `None`) are returned unchanged.
            kind (str): one of ['train', 'holdout', 'super', '*']. Specifies
              which of the database sets to use. If '*' is specified, then all
              of them are combined.
            kwargs (dict): additional dummy arguments that aren't needed, but
              allow the `**` syntax to be used.

        Returns:
            dict: keys are either property or parameter names. Values are
            :class:`numpy.ndarray` for parameters; for properties the value is
            whatever the attribute lookup (or its aggregator) produced.
        """
        assert kind in ["train", "holdout", "super", '*']
        if kind == '*':
            db = AtomsList()
            for k in ["train", "holdout", "super"]:
                db.extend(self.configs(k))
        else:
            db = self.configs(kind)

        result = {}
        if params is not None:
            for pname in params:
                result[pname] = np.array(getattr(db, pname))
        if properties is not None:
            for pname in properties:
                value = getattr(db, pname)
                # `aggregators` is optional; only look a property up in it
                # when the caller actually supplied the mapping.
                if aggregators is not None and pname in aggregators:
                    _, aggfun = import_fqdn(aggregators[pname])
                    result[pname] = aggfun(value)
                else:
                    result[pname] = value

        return result
Beispiel #4
0
    def fitting_configs(self):
        """Collects every configuration of this group into a single
        :class:`matdb.atoms.AtomsList`.

        A group without sub-sequences contributes the values of its own
        `config_atoms`; otherwise the `fitting_configs` of each sub-sequence
        are concatenated.
        """
        gathered = AtomsList()
        if len(self.sequence) != 0:
            # Delegate to the children and merge what they report.
            for child in self.sequence.values():
                gathered.extend(child.fitting_configs)
            return gathered

        for atoms in self.config_atoms.values():
            gathered.append(atoms)
        return gathered
Beispiel #5
0
 def fitting_configs(self):
     """Returns a :class:`~matdb.atoms.AtomsList` for all configs in this
     group.

     A group without sub-sequences contributes the values of its
     `config_atoms` once `self.ready()` is true, and nothing before that.
     Otherwise the `fitting_configs` of every sub-sequence are concatenated.

     Returns:
         AtomsList: always an `AtomsList`, possibly empty.
     """
     result = AtomsList()
     if len(self.sequence) == 0:
         if self.ready():
             # Copy into an AtomsList rather than handing back the raw
             # `values()` of the dictionary, so that every branch returns the
             # same type.
             for atoms in self.config_atoms.values():
                 result.append(atoms)
     else:
         for g in self.sequence.values():
             result.extend(g.fitting_configs)
     return result
Beispiel #6
0
def h5cat(files, target):
    """Merges several h5 AtomsList files into one and writes it to disk.

    Args:
        files (list): `str` paths of the h5 files to merge, in order.
        target (str): name/path of the output file that receives the merged
          list.
    """
    # Imported here rather than at module level to avoid a cyclic import.
    from matdb.atoms import AtomsList

    combined = AtomsList()
    for h5path in files:
        combined.extend(AtomsList(h5path))
    combined.write(target)
Beispiel #7
0
 def rset(self):
     """Returns a :class:`matdb.atoms.AtomsList` with one entry for each
     config in the latest result set.

     A group without sub-sequences loads `pre_comp_atoms.h5` from every path
     reported by `self.atoms_paths()`. Otherwise the `rset()` of every
     sub-sequence is concatenated.

     Returns:
         AtomsList: always an `AtomsList`, possibly empty.
     """
     # Both branches accumulate into an AtomsList so that the return type
     # does not depend on where this group sits in the stack.
     result = AtomsList()
     if len(self.sequence) == 0:
         # Bottom of the stack: return the configurations of this group.
         for epath in self.atoms_paths():
             result.append(Atoms(path.join(epath, 'pre_comp_atoms.h5')))
     else:
         for e in self.sequence.values():
             result.extend(e.rset())
     return result
Beispiel #8
0
    def rset(self):
        """Builds the reusable set that is handed to the next database group.

        Returns:
            AtomsList: for a group without sub-sequences, one
            :class:`~matdb.atoms.Atoms` loaded from the `atoms.h5` file under
            each entry of `self.fitting_configs`; otherwise the combined
            `rset` of every sub-sequence.
        """
        if len(self.sequence) != 0:
            # Not at the bottom of the stack: flatten the results of all the
            # sub-sequences into a single list.
            merged = [atoms
                      for group in self.sequence.values()
                      for atoms in group.rset]
            return AtomsList(merged)

        # Bottom of the stack: load the atoms stored for each configuration.
        loaded = AtomsList()
        for folder in self.fitting_configs:
            atoms_file = path.join(folder, "atoms.h5")
            loaded.append(Atoms(atoms_file))
        return loaded
Beispiel #9
0
def split(atlist,
          splits,
          targets,
          dbdir,
          ran_seed,
          dbfile=None,
          recalc=0,
          nonsplit=None):
    """Splits the :class:`~matdb.atoms.AtomsList` multiple times, one for
    each `split` setting in the database specification.

    For every split the shuffled ids and sizes are stored in
    `{name}-ids.pkl` under `dbdir` so the same split can be recreated later.

    Args:
        atlist (AtomsList or list): the list of :class:`matdb.atoms.Atoms`
          objects to be split or a list to the files containing the atoms
          objects.
        splits (dict): the splits to perform; keys are split names and values
          are the training fraction.
        targets (dict): callables that are invoked with the split name and
          return the file to save that part of the split in. The dictionary
          must have the keys "train", "holdout" and "super".
        dbdir (str): the root *splits* directory for the database.
        ran_seed (int or float): the random seed for the splits (i.e. the
          controllers random seed). It is recorded in the id file.
        dbfile (str): the _dbfile for a legacy database.
        recalc (int): when non-zero, re-split the data; existing split files
          are renamed with the uuid of the previous split. This parameter
          decreases as rewrites proceed down the stack. To re-calculate
          lower-level h5 files, increase this value.
        nonsplit (AtomsList): a list of atoms to include in the training
          set "as-is" because they cannot be split (they only have meaning
          together).
    """
    from matdb.utility import dbcat

    assert nonsplit is None or isinstance(nonsplit, AtomsList)
    # dbcat receives the legacy database file as its only source, if any.
    sources = [dbfile] if dbfile is not None else []

    for name, train_perc in splits.items():
        train_file = targets["train"](name)
        holdout_file = targets["holdout"](name)
        super_file = targets["super"](name)
        idfile = path.join(dbdir, "{0}-ids.pkl".format(name))

        if (path.isfile(train_file) and path.isfile(holdout_file)
                and path.isfile(super_file)):
            if recalc <= 0:
                # This split is already on disk; move on to the next one
                # instead of abandoning the remaining splits.
                continue

            # Keep the previous files around under a name tagged with the
            # uuid of the split that produced them. If the id file is gone
            # the uuid is unknown, so a fresh one is used as the tag.
            have_ids = path.isfile(idfile)
            if have_ids:
                with open(idfile, 'rb') as f:
                    old_uuid = load(f)["uuid"]
            else:
                old_uuid = str(uuid4())
            for fname in [train_file, holdout_file, super_file]:
                new_name = fname.replace(
                    name, "{0}_{1}".format(name, old_uuid))
                rename(fname, new_name)
            if have_ids:
                remove(idfile)

        #Compile a list of all the sub-configurations we can include in the
        #training.
        if not isinstance(atlist, AtomsList):
            subconfs = AtomsList(atlist)
        else:
            subconfs = atlist

        if path.isfile(idfile):
            with open(idfile, 'rb') as f:
                data = load(f)
            subconfs = data["subconfs"]
            ids = data["ids"]
            Ntrain = data["Ntrain"]
            Nhold = data["Nhold"]
            Ntot = data["Ntot"]
            Nsuper = data["Nsuper"]
        else:
            Ntot = len(subconfs)
            Ntrain = int(np.ceil(Ntot * train_perc))
            ids = np.arange(Ntot)
            Nhold = int(np.ceil((Ntot - Ntrain) * train_perc))
            Nsuper = Ntot - Ntrain - Nhold
            # NOTE(review): the shuffle uses numpy's global random state;
            # `ran_seed` is only recorded below. Confirm the caller seeds it.
            np.random.shuffle(ids)

            #We need to save these ids so that we don't mess up the statistics on
            #the training and validation sets.
            data = {
                "uuid": str(uuid4()),
                "subconfs": subconfs,
                "ids": ids,
                "Ntrain": Ntrain,
                "Nhold": Nhold,
                "Ntot": Ntot,
                "Nsuper": Nsuper,
                "ran_seed": ran_seed
            }
            with open(idfile, 'wb') as f:
                dump(data, f)

        #Only write the minimum necessary files. Use dbcat to create the
        #database version and configuration information. There is duplication
        #here because we also store the ids again. We retain the pkl file above
        #so that we can recreate *exactly* the same split again later.
        if not path.isfile(train_file):
            tids = ids[0:Ntrain]
            #Make sure that we have some atoms to write in the first place!
            if len(tids) > 0:
                altrain = subconfs[tids]
            else:
                altrain = AtomsList()
            #Add the unsplittable configurations to the training set as-is.
            Nunsplit = 0
            if nonsplit is not None:
                altrain.extend(nonsplit)
                Nunsplit = len(nonsplit)
            altrain.write(train_file)
            dbcat(sources,
                  train_file,
                  docat=False,
                  ids=tids,
                  N=Ntrain + Nunsplit)

        # The holdout and super ranges are sliced with explicit positive
        # bounds: a negative-offset slice such as `ids[-Nsuper:]` selects
        # *everything* when Nsuper is zero.
        hold_end = Ntrain + Nhold
        if not path.isfile(holdout_file):
            hids = ids[Ntrain:hold_end]
            alhold = subconfs[hids] if len(hids) > 0 else AtomsList()
            alhold.write(holdout_file)
            dbcat(sources, holdout_file, docat=False, ids=hids, N=Nhold)
        if not path.isfile(super_file):
            sids = ids[hold_end:]
            alsuper = subconfs[sids] if len(sids) > 0 else AtomsList()
            alsuper.write(super_file)
            dbcat(sources, super_file, docat=False, ids=sids, N=Nsuper)
Beispiel #10
0
    def _create_dbfull(self, folder, pattern, energy, force, virial, config_type):
        """Builds the single combined database file `self._dbfull`.

        Every file in `folder` matching `pattern` is checked with
        `_atoms_conform`. Files that need parameter renames or a force rename
        are rewritten into `self.root` (with `.xyz` replaced by `.h5` in the
        name); all files are then concatenated into `self._dbfull`.

        Args:
            folder (str): directory that is searched for database files.
            pattern (str): glob pattern evaluated inside `folder`.
            energy, force, virial: names passed on to `_atoms_conform`;
              `force` is also the property that is moved to `ref_force`.
            config_type: when not `None`, the value assigned to the
              `config_type` parameter of each atoms object that needs it.
        """
        from matdb.utility import chdir, dbcat
        from glob import glob
        from tqdm import tqdm
        from os import path

        #NB! Atoms objects must not be opened inside the `chdir` context
        #manager; per the original authors this breaks the memory sharing
        #with fortran. Only the glob happens inside it.
        with chdir(folder):
            self.dbfiles = glob(pattern)

        rewrites = []
        for dbfile in self.dbfiles:
            #Check whether this file already uses the expected energy, force
            #and virial names.
            dbpath = path.join(folder, dbfile)
            params, doforce = _atoms_conform(dbpath, energy, force, virial)
            if len(params) == 0 and not doforce:
                continue

            msg.std("Conforming database file {}.".format(dbpath))
            al = AtomsList(dbpath)
            outpath = path.join(self.root, dbfile.replace(".xyz",".h5"))
            for ai in tqdm(al):
                for target, source in params.items():
                    if target == "config_type" and config_type is not None:
                        ai.params[target] = config_type
                        continue

                    ai.add_param(target,ai.params[source])
                    del ai.params[source]
                    if source in ai.info: #pragma: no cover
                        #Safeguard only: a correctly behaving atoms object
                        #should never leave the old name behind.
                        msg.warn("The atoms object didn't properly "
                                 "update the parameters of the legacy "
                                 "atoms object.")
                        del ai.info[source]

                if doforce:
                    ai.add_property("ref_force",ai.properties[force])
                    del ai.properties[force]

            al.write(outpath)

            #Remember that a conformed copy of this file now exists.
            rewrites.append(dbfile)

            dbcat([dbpath], outpath, docat=False, renames=params,
                  doforce=doforce)

        #Gather the atoms of every database file into one list, reading the
        #conformed copy wherever one was written above.
        all_atoms = AtomsList()
        for dbfile in self.dbfiles:
            if dbfile in rewrites:
                source_file = path.join(self.root, dbfile.replace(".xyz",".h5"))
            else:
                source_file = path.join(folder, dbfile)
            all_atoms.extend(AtomsList(source_file))

        all_atoms.write(self._dbfull)

        #Finally, create the config file.
        with chdir(folder):
            dbcat(self.dbfiles, self._dbfull, config_type=self.config_type, docat=False)