Beispiel #1
0
    def train(
        self,
        inputs,
        targets,
        data=None,
        optimizer=(None, None),
        epochs=100,
        regularization=None,
        convergence=None,
        lossfxn=None,
        device="cpu",
        batch_size=None,
        lr_scheduler=None,
        independent_loss=True,
        loss_weights=None,
    ):
        """Train the merged models.

        Parameters
        ----------
        inputs : list
            One entry per model. An entry that is a bound method is kept as
            is (and later replicated once per target batch); any other entry
            is split into batches with ``get_chunks``.
        targets : list
            One entry per model with the expected values that the model has
            to learn, aka y. Each entry is split into batches.
        data : object
            Data object created from the handler. Its ``atoms_per_image``
            attribute is read.
        optimizer : tuple
            The optimizer is a tuple with the structure:
                >>> ('adam', {'lr': float, 'weight_decay': float})

        epochs : int
            Number of full training cycles. Only used as stopping criterion
            when ``convergence`` is None.
        regularization : float
            This is the L2 regularization. It is not the same as weight decay.
            Not used in this method body.
        convergence : dict
            Instead of using epochs, users can set a convergence criterion.
            The "rmse" value may be a single number (applied to every model)
            or a list with one value per model.
                >>> convergence = {"rmse": [0.04, 0.02]}
        lossfxn : list
            One loss function object per model. When None, a list of None
            with one entry per model is used.
        device : str
            Calculation can be run in the cpu or cuda (gpu). Only stored as
            ``self.device`` here.
        batch_size : int
            Number of data points per batch to use for training. Default is None.
        lr_scheduler : tuple
            Tuple with structure: scheduler's name and a dictionary with keyword
            arguments.

            >>> lr_scheduler = ('ReduceLROnPlateau',
                                {'mode': 'min', 'patience': 10})
        independent_loss : bool
            Whether or not models' weight are optimized independently.
        loss_weights : list
            How much the loss of model(i) contributes to the total loss.
            Defaults to equal weights.

        Raises
        ------
        ValueError
            If ``convergence["rmse"]`` is a list whose length differs from the
            number of models.
        TypeError
            If ``batch_size`` is neither None nor an int.
        """

        self.epochs = epochs

        # Convergence criterion. It is optional (default None), so it is only
        # normalized when given; a shallow copy keeps the caller's dictionary
        # untouched.
        if convergence is not None:
            convergence = dict(convergence)
            if isinstance(convergence["rmse"], (float, int)):
                convergence["rmse"] = np.array(
                    [convergence["rmse"] for _ in self.models])
            elif isinstance(convergence["rmse"], list):
                if len(convergence["rmse"]) != len(self.models):
                    raise ValueError(
                        "Your convergence list is not the same length of the number of models"
                    )
                convergence["rmse"] = np.array(convergence["rmse"])

        # Resolve the loss functions before anything measures or iterates
        # them, so that the documented default (None) does not crash.
        if lossfxn is None:
            lossfxn = [None for _ in self.models]
        self.lossfxn = lossfxn

        self.device = device

        logger.info(" ")
        logging.info("Model Merger")
        logging.info("============")
        now = datetime.datetime.now()
        logger.info("Module accessed on {}.".format(
            now.strftime("%Y-%m-%d %H:%M:%S")))
        logging.info("Merging the following models:")

        for model in self.models:
            logging.info("    - {}.".format(model.name()))

        logging.info("Loss functions:")

        if loss_weights is None:
            self.loss_weights = [1.0 / len(lossfxn) for _ in lossfxn]
        else:
            self.loss_weights = loss_weights

        for index, l in enumerate(lossfxn):
            logging.info("    - Name: {}; Weight: {}.".format(
                getattr(l, "__name__", l), self.loss_weights[index]))
        logging.info("Convergence criterion: {}.".format(convergence))

        # If no batch_size provided then the whole training set length is the batch.
        # NOTE(review): `inputs` is iterated as a sequence below, yet
        # `.values()` is called on it here -- confirm the expected type.
        if batch_size is None:
            batch_size = len(inputs.values())

        if not isinstance(batch_size, int):
            raise TypeError(
                "batch_size must be an int or None, got {}.".format(
                    type(batch_size).__name__))

        chunks = []
        for inputs_ in inputs:
            if inspect.ismethod(inputs_):
                # Bound methods are not data; they are expanded later.
                chunks.append(inputs_)
            else:
                chunks.append(
                    list(get_chunks(inputs_, batch_size, svm=False)))

        targets = [
            list(get_chunks(target, batch_size, svm=False))
            for target in targets
        ]
        atoms_per_image = list(
            get_chunks(data.atoms_per_image, batch_size, svm=False))

        # Population of extra Attributes needed by the models, and further data
        # preprocessing

        for index, loss in enumerate(lossfxn):
            # getfullargspec replaces inspect.getargspec, which no longer
            # exists in Python >= 3.11.
            if loss is not None and "latent" in inspect.getfullargspec(
                    loss).args:
                train = dynamic_import("train",
                                       "ml4chem.atomistic.models",
                                       alt_name="autoencoders")
                self.inputs_chunk_vals = train.get_inputs_chunks(chunks[index])
            else:
                # NOTE(review): this resets the attribute for every loss
                # without a "latent" argument, so the last loss wins.
                self.inputs_chunk_vals = None

        parameters = []
        for index, model in enumerate(self.models):
            parameters += model.parameters()
            if model.name() == "PytorchPotentials":
                # These models require targets as tensors
                self.atoms_per_image = torch.tensor(atoms_per_image,
                                                    requires_grad=False,
                                                    dtype=torch.float)
                targets[index] = [
                    torch.tensor(batch, requires_grad=False)
                    for batch in targets[index]
                ]
            elif model.name() in ModelMerger.autoencoders:
                targets[index] = lod_to_list(targets[index])

        # Data scattering
        client = dask.distributed.get_client()

        # Targets are kept local (not scattered); only a new outer list is
        # created.
        self.targets = [target for target in targets]

        self.chunks = []

        for i, chunk in enumerate(chunks):
            if inspect.ismethod(chunk) is False:
                self.chunks.append(client.scatter(chunk))
            else:
                # This list comprehension is useful to have the same number of
                # functions as the same number of chunks without users' input.
                chunk = [chunk for _ in range(len(self.targets[i]))]
                self.chunks.append(chunk)

        del chunks

        logger.info(" ")
        logging.info("Batch Information")
        logging.info("-----------------")
        logging.info("Number of batches:")
        for index, c in enumerate(self.chunks):
            logging.info("    - Model {}, {}.".format(index, len(c)))
        logging.info("Batch size: {} elements per batch.\n".format(batch_size))

        # Define optimizer

        self.optimizer_name, self.optimizer = get_optimizer(
            optimizer, parameters)

        if lr_scheduler is not None:
            self.scheduler = get_lr_scheduler(self.optimizer, lr_scheduler)

        logger.info(" ")
        logger.info("Starting training...")
        logger.info(" ")

        logger.info("{:6s} {:19s} {:12s} {:8s}".format("Epoch", "Time Stamp",
                                                       "Loss", "RMSE (ave)"))
        logger.info("{:6s} {:19s} {:12s} {:8s}".format("------",
                                                       "-------------------",
                                                       "------------",
                                                       "--------------"))

        converged = False
        epoch = 0

        if independent_loss is False:
            # Convert list of chunks from [[a, c], [b, d]] to [[a, b], [c, d]]
            self.chunks = list(map(list, zip(*self.chunks)))

        # Snapshot of the second model's weights, compared against the final
        # weights once convergence is reached (diagnostic output only).
        # NOTE(review): hard-coded index 1 requires at least two models.
        old_state_dict = {}

        for key in self.models[1].state_dict():
            old_state_dict[key] = self.models[1].state_dict()[key].clone()

        from ml4chem.atomistic.models.autoencoders import Annealer

        annealer = Annealer()

        while not converged:
            epoch += 1
            self.annealing = annealer.update(epoch)

            self.optimizer.zero_grad()  # clear previous gradients

            if independent_loss:
                losses = []
                outputs = []
                for model_index, model in enumerate(self.models):
                    loss, output = self.closure(model_index,
                                                model,
                                                independent_loss,
                                                name=model.name())
                    losses.append(loss)
                    outputs.append(output)

            else:
                # NOTE(review): `index` is whatever the last loop above left
                # behind (the batch-information loop) -- confirm this is the
                # intended argument.
                loss, outputs = self.closure(index, self.models,
                                             independent_loss)

            rmse = []
            for i, model in enumerate(self.models):
                outputs_ = outputs[i]
                targets_ = self.targets[i]

                if model.name() == "VAE":
                    # VAE usually returns a complex output with mus and sigmas
                    # but we only need mus at this stage.
                    outputs_ = [sublist[0] for sublist in outputs_]
                rmse.append(compute_rmse(outputs_, targets_))
            rmse = np.array(rmse)

            _rmse = np.average(rmse)

            if self.optimizer_name != "LBFGS":
                self.optimizer.step()
            else:
                options = {
                    "closure": self.closure,
                    "current_loss": loss,
                    "max_ls": 10
                }
                self.optimizer.step(options)

            ts = time.time()
            ts = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d "
                                                              "%H:%M:%S")
            logger.info("{:6d} {} {:8e} {:8f}".format(epoch, ts, loss, _rmse))

            # `>=` instead of `==` so that epochs <= 0 cannot loop forever.
            if convergence is None and epoch >= self.epochs:
                converged = True
            elif convergence is not None and (rmse <=
                                              convergence["rmse"]).all():
                converged = True
                new_state_dict = {}

                for key in self.models[1].state_dict():
                    new_state_dict[key] = self.models[1].state_dict(
                    )[key].clone()

                for key in old_state_dict:
                    if not (old_state_dict[key] == new_state_dict[key]).all():
                        print("Diff in {}".format(key))
                    else:
                        print("No diff in {}".format(key))
            # Diagnostic output printed on every epoch.
            print(convergence)
            print(rmse)

        print("Final")
        print(convergence)
        print(rmse)
Beispiel #2
0
    def __init__(
        self,
        inputs,
        targets,
        model=None,
        data=None,
        optimizer=(None, None),
        regularization=None,
        epochs=100,
        convergence=None,
        lossfxn=None,
        device="cpu",
        batch_size=None,
        lr_scheduler=None,
    ):
        """Prepare batches, optimizer and loss function, then start training.

        Parameters
        ----------
        inputs : dict
            Feature space. When ``device == "cuda"`` its values are iterated
            as lists of ``(symbol, vector)`` pairs.
        targets : list
            Expected values; split into batches and converted to a tensor.
        model : object
            Object exposing ``parameters()``; stored as ``self.model``.
        data : object
            Object exposing ``atoms_per_image``.
        optimizer : tuple
            Passed unchanged to ``get_optimizer``.
        regularization : float
            Not used in this method body.
        epochs : int
            Stored as ``self.epochs``.
        convergence : dict
            Stored as ``self.convergence``.
        lossfxn : obj
            Loss function object. Defaults to ``AtomicMSELoss``.
        device : str
            When ``"cuda"``, tensors are moved with ``.cuda()``.
        batch_size : int
            Number of elements per batch. Defaults to
            ``len(inputs.values())``.
        lr_scheduler : tuple
            When not None, passed to ``get_lr_scheduler``.

        Raises
        ------
        TypeError
            If ``batch_size`` is neither None nor an int.
        """

        self.initial_time = time.time()

        atoms_per_image = data.atoms_per_image

        if batch_size is None:
            batch_size = len(inputs.values())

        # Fail early with a clear message; otherwise `chunks` would be
        # undefined further down.
        if not isinstance(batch_size, int):
            raise TypeError(
                "batch_size must be an int or None, got {}.".format(
                    type(batch_size).__name__))

        # Data batches
        chunks = list(get_chunks(inputs, batch_size, svm=False))
        targets = list(get_chunks(targets, batch_size, svm=False))
        atoms_per_image = list(
            get_chunks(atoms_per_image, batch_size, svm=False))

        logger.info(" ")
        logging.info("Batch Information")
        logging.info("-----------------")
        logging.info("Number of batches: {}.".format(len(chunks)))
        logging.info("Batch size: {} elements per batch.".format(batch_size))
        logger.info(" ")

        # NOTE(review): building one tensor from the list of batches assumes
        # all batches have the same length -- confirm for a ragged last batch.
        atoms_per_image = torch.tensor(atoms_per_image,
                                       requires_grad=False,
                                       dtype=torch.float)

        targets = torch.tensor(targets, requires_grad=False)

        if device == "cuda":
            logger.info("Moving data to CUDA...")

            atoms_per_image = atoms_per_image.cuda()
            targets = targets.cuda()
            _inputs = OrderedDict()

            for image_hash, image_features in inputs.items():
                _inputs[image_hash] = []
                for features in image_features:
                    symbol, vector = features
                    _inputs[image_hash].append((symbol, vector.cuda()))

            # NOTE(review): `chunks` was built from the original `inputs`
            # above and `inputs` is not read again in this method, so the
            # moved copies do not reach the batches -- confirm intent.
            inputs = _inputs

            move_time = time.time() - self.initial_time
            h, m, s = convert_elapsed_time(move_time)
            # Single-line literal: a backslash continuation inside the string
            # would embed the indentation in the logged message.
            logger.info(
                "Data moved to GPU in {} hours {} minutes {:.2f} seconds.".
                format(h, m, s))
            logger.info(" ")

        # Define optimizer
        self.optimizer_name, self.optimizer = get_optimizer(
            optimizer, model.parameters())

        if lr_scheduler is not None:
            self.scheduler = get_lr_scheduler(self.optimizer, lr_scheduler)

        logger.info(" ")
        logger.info("Starting training...")
        logger.info(" ")

        logger.info("{:6s} {:19s} {:12s} {:8s} {:8s}".format(
            "Epoch", "Time Stamp", "Loss", "RMSE/img", "RMSE/atom"))
        logger.info("{:6s} {:19s} {:12s} {:8s} {:8s}".format(
            "------", "-------------------", "------------", "--------",
            "---------"))
        self.atoms_per_image = atoms_per_image
        self.convergence = convergence
        self.device = device
        self.epochs = epochs
        self.model = model
        self.lr_scheduler = lr_scheduler

        # Data scattering
        client = dask.distributed.get_client()
        self.chunks = [client.scatter(chunk) for chunk in chunks]
        self.targets = [client.scatter(target) for target in targets]

        if lossfxn is None:
            self.lossfxn = AtomicMSELoss
        else:
            self.lossfxn = lossfxn

        # Let the hunger games begin...
        self.trainer()
Beispiel #3
0
    def __init__(
        self,
        inputs,
        targets,
        model=None,
        data=None,
        optimizer=(None, None),
        regularization=None,
        epochs=100,
        convergence=None,
        lossfxn=None,
        device="cpu",
        batch_size=None,
        lr_scheduler=None,
        **kwargs
    ):
        """Prepare batches, optimizer and loss function, then start training.

        Parameters
        ----------
        inputs : dict
            Feature space. When ``device == "cuda"`` its values are iterated
            as lists of ``(symbol, vector)`` pairs.
        targets : list
            Expected values; split into batches and then converted with
            ``lod_to_list``.
        model : object
            Object exposing ``parameters()``; stored as ``self.model``.
        data : object
            Not used in this method body.
        optimizer : tuple
            Passed unchanged to ``get_optimizer``.
        regularization : float
            Not used in this method body.
        epochs : int
            Stored as ``self.epochs``.
        convergence : dict
            Stored as ``self.convergence``.
        lossfxn : obj
            Loss function object. Defaults to ``MSELoss``. When a custom one
            is given, ``self.inputs_chunk_vals`` is filled from the batches.
        device : str
            When ``"cuda"``, data is moved with ``.cuda()``.
        batch_size : int
            Number of elements per batch. Defaults to
            ``len(inputs.values())``.
        lr_scheduler : tuple
            When not None, passed to ``get_lr_scheduler``.
        **kwargs
            Supported keys are ``anneal`` and ``penalize_latent``; each is
            stored as an attribute and defaults to None. Other keys are
            ignored.

        Raises
        ------
        TypeError
            If ``batch_size`` is neither None nor an int.
        """

        # Always define every supported option so that passing only one of
        # them does not leave the other attribute undefined.
        supported_keys = ["anneal", "penalize_latent"]
        for key in supported_keys:
            setattr(self, key, kwargs.get(key))

        self.initial_time = time.time()

        if batch_size is None:
            batch_size = len(inputs.values())

        # Fail early with a clear message; otherwise `chunks` would be
        # undefined further down.
        if not isinstance(batch_size, int):
            raise TypeError(
                "batch_size must be an int or None, got {}.".format(
                    type(batch_size).__name__)
            )

        chunks = list(get_chunks(inputs, batch_size, svm=False))
        targets_ = list(get_chunks(targets, batch_size, svm=False))

        # This change is needed because the targets are features or
        # positions and they are built as a dictionary.
        targets = lod_to_list(targets_)

        logging.info("Batch size: {} elements per batch.".format(batch_size))

        if device == "cuda":
            logger.info("Moving data to CUDA...")

            # NOTE(review): this requires the object returned by lod_to_list
            # to expose `.cuda()` -- confirm its return type.
            targets = targets.cuda()
            _inputs = OrderedDict()

            for image_hash, image_features in inputs.items():
                _inputs[image_hash] = []
                for features in image_features:
                    symbol, vector = features
                    _inputs[image_hash].append((symbol, vector.cuda()))

            # NOTE(review): `chunks` was built from the original `inputs`
            # above and `inputs` is not read again in this method, so the
            # moved copies do not reach the batches -- confirm intent.
            inputs = _inputs

            move_time = time.time() - self.initial_time
            h, m, s = convert_elapsed_time(move_time)
            # Single-line literal: a backslash continuation inside the string
            # would embed the indentation in the logged message.
            logger.info(
                "Data moved to GPU in {} hours {} minutes {:.2f} seconds.".format(
                    h, m, s
                )
            )
            logger.info(" ")

        # Define optimizer
        self.optimizer_name, self.optimizer = get_optimizer(
            optimizer, model.parameters()
        )
        if lr_scheduler is not None:
            self.scheduler = get_lr_scheduler(self.optimizer, lr_scheduler)

        if lossfxn is None:
            self.lossfxn = MSELoss
            self.inputs_chunk_vals = None

        else:
            logger.info("Using custom loss function...")
            logger.info("")

            self.lossfxn = lossfxn
            self.inputs_chunk_vals = self.get_inputs_chunks(chunks)

        logger.info(" ")
        logger.info("Starting training...")
        logger.info(" ")

        logger.info(
            "{:6s} {:19s} {:12s} {:9s}".format("Epoch", "Time Stamp", "Loss", "Rec Err")
        )
        logger.info(
            "{:6s} {:19s} {:12s} {:9s}".format(
                "------", "-------------------", "------------", "--------"
            )
        )

        # Data scattering
        client = dask.distributed.get_client()
        self.chunks = [client.scatter(chunk) for chunk in chunks]
        self.targets = [client.scatter(target) for target in targets]

        self.device = device
        self.epochs = epochs
        self.model = model
        self.lr_scheduler = lr_scheduler
        self.convergence = convergence

        # Let the hunger game begin...
        self.trainer()
Beispiel #4
0
    def __init__(
        self,
        inputs,
        targets,
        model=None,
        data=None,
        optimizer=(None, None),
        regularization=None,
        epochs=100,
        convergence=None,
        lossfxn=None,
        device="cpu",
        batch_size=None,
        lr_scheduler=None,
        uncertainty=None,
        checkpoint=None,
        test=None,
    ):
        """Prepare batches, optimizer and loss function, then start training.

        Parameters
        ----------
        inputs : dict
            Feature space. When ``device == "cuda"`` its values are iterated
            as lists of ``(symbol, vector)`` pairs.
        targets : list
            Expected values; split into batches, one tensor per batch.
        model : object
            Object exposing ``parameters()``; stored as ``self.model``.
        data : object
            Object exposing ``atoms_per_image``.
        optimizer : tuple
            Passed unchanged to ``get_optimizer``.
        regularization : float
            Not used in this method body.
        epochs : int
            Stored as ``self.epochs``.
        convergence : dict
            Stored as ``self.convergence``.
        lossfxn : obj
            Loss function object. Defaults to ``AtomicMSELoss``.
        device : str
            When ``"cuda"``, tensors are moved with ``.cuda()``.
        batch_size : int
            Number of elements per batch. Defaults to
            ``len(inputs.values())``.
        lr_scheduler : tuple
            When not None, passed to ``get_lr_scheduler``.
        uncertainty : list
            Optional values that are batched like the targets, converted to
            float tensors and scattered; stored as ``self.uncertainty``.
        checkpoint : object
            Stored as ``self.checkpoint``.
        test : object
            Stored as ``self.test``.

        Raises
        ------
        TypeError
            If ``batch_size`` is neither None nor an int.
        """

        self.initial_time = time.time()

        if lossfxn is None:
            lossfxn = AtomicMSELoss

        logger.info("")
        logger.info("Training")
        logger.info("========")
        logger.info(f"Convergence criteria: {convergence}")
        logger.info(f"Loss function: {lossfxn.__name__}")
        if uncertainty is not None:
            logger.info("Options:")
            logger.info(f"    - Uncertainty penalization: {pformat(uncertainty)}")
        logger.info("")

        atoms_per_image = data.atoms_per_image

        if batch_size is None:
            batch_size = len(inputs.values())

        # Fail early with a clear message; otherwise `chunks` would be
        # undefined further down.
        if not isinstance(batch_size, int):
            raise TypeError(
                f"batch_size must be an int or None, got {type(batch_size).__name__}."
            )

        # Data batches
        chunks = list(get_chunks(inputs, batch_size, svm=False))
        targets = list(get_chunks(targets, batch_size, svm=False))
        atoms_per_image = list(get_chunks(atoms_per_image, batch_size, svm=False))

        # Identity check: `!= None` is evaluated element-wise by array-like
        # objects and is then ambiguous in a boolean context.
        if uncertainty is not None:
            uncertainty = list(get_chunks(uncertainty, batch_size, svm=False))
            uncertainty = [
                torch.tensor(u, requires_grad=False, dtype=torch.float)
                for u in uncertainty
            ]

        logger.info("")
        logging.info("Batch Information")
        logging.info("-----------------")
        logging.info("Number of batches: {}.".format(len(chunks)))
        logging.info("Batch size: {} elements per batch.".format(batch_size))
        logger.info(" ")

        # One tensor per batch, so batches may differ in length.
        atoms_per_image = [
            torch.tensor(n_atoms, requires_grad=False, dtype=torch.float)
            for n_atoms in atoms_per_image
        ]

        targets = [torch.tensor(t, requires_grad=False) for t in targets]

        if device == "cuda":
            logger.info("Moving data to CUDA...")

            # Both are Python lists of tensors at this point, so every
            # element has to be moved individually.
            atoms_per_image = [n_atoms.cuda() for n_atoms in atoms_per_image]
            targets = [t.cuda() for t in targets]
            _inputs = OrderedDict()

            for image_hash, image_features in inputs.items():
                _inputs[image_hash] = []
                for features in image_features:
                    symbol, vector = features
                    _inputs[image_hash].append((symbol, vector.cuda()))

            # NOTE(review): `chunks` was built from the original `inputs`
            # above and `inputs` is not read again in this method, so the
            # moved copies do not reach the batches -- confirm intent.
            inputs = _inputs

            move_time = time.time() - self.initial_time
            h, m, s = convert_elapsed_time(move_time)
            # Single-line literal: a backslash continuation inside the string
            # would embed the indentation in the logged message.
            logger.info(
                "Data moved to GPU in {} hours {} minutes {:.2f} seconds.".format(
                    h, m, s
                )
            )
            logger.info(" ")

        # Define optimizer
        self.optimizer_name, self.optimizer = get_optimizer(
            optimizer, model.parameters()
        )

        if lr_scheduler is not None:
            self.scheduler = get_lr_scheduler(self.optimizer, lr_scheduler)

        self.atoms_per_image = atoms_per_image
        self.convergence = convergence
        self.device = device
        self.epochs = epochs
        self.model = model
        self.lr_scheduler = lr_scheduler
        self.lossfxn = lossfxn
        self.checkpoint = checkpoint
        self.test = test

        # Data scattering
        client = dask.distributed.get_client()
        self.chunks = [client.scatter(chunk) for chunk in chunks]
        self.targets = [client.scatter(target) for target in targets]

        if uncertainty is not None:
            self.uncertainty = [client.scatter(u) for u in uncertainty]
        else:
            self.uncertainty = uncertainty

        # Let the hunger games begin...
        self.trainer()
Beispiel #5
0
    def train(self,
              inputs,
              targets,
              data=None,
              optimizer=(None, None),
              regularization=None,
              epochs=100,
              convergence=None,
              lossfxn=None,
              device="cpu",
              batch_size=None,
              lr_scheduler=None,
              independent_loss=True,
              loss_weights=None):
        """Train the merged models.

        Parameters
        ----------
        inputs : list
            One entry per model. An entry that is a bound method is kept as
            is (and later replicated once per target batch); any other entry
            is split into batches with ``get_chunks``.
        targets : list
            One entry per model with the expected values. Each entry is split
            into batches.
        data : object
            Object whose ``atoms_per_image`` attribute is read.
        optimizer : tuple
            Passed unchanged to ``get_optimizer``.
        regularization : float
            Not used in this method body.
        epochs : int
            Number of training cycles. Only used as stopping criterion when
            ``convergence`` is None.
        convergence : dict
            When given, training stops once every model's RMSE is less than
            or equal to ``convergence["rmse"]``.
        lossfxn : list
            One loss function object per model. When None, a list of None
            with one entry per model is used.
        device : str
            Only stored as ``self.device`` here.
        batch_size : int
            Number of data points per batch. Default is None.
        lr_scheduler : tuple
            When not None, passed to ``get_lr_scheduler``.
        independent_loss : bool
            Whether the closure is evaluated once per model (True) or once
            for all models (False).
        loss_weights : list
            Weight of each model's loss. Defaults to equal weights.

        Raises
        ------
        TypeError
            If ``batch_size`` is neither None nor an int.
        """

        # The stopping criterion below reads self.epochs, so the argument has
        # to be stored first.
        self.epochs = epochs

        # Resolve the loss functions before anything measures or iterates
        # them, so that the documented default (None) does not crash.
        if lossfxn is None:
            lossfxn = [None for _ in self.models]
        self.lossfxn = lossfxn

        self.device = device

        logger.info(" ")
        logging.info("Model Merger")
        logging.info("============")
        logging.info("Merging the following models:")

        for model in self.models:
            logging.info("    - {}.".format(model.name()))

        logging.info("Loss functions:")

        if loss_weights is None:
            self.loss_weights = [1. / len(lossfxn) for _ in lossfxn]
        else:
            self.loss_weights = loss_weights

        for l in lossfxn:
            logging.info("    - {}.".format(getattr(l, "__name__", l)))

        # If no batch_size provided then the whole training set length is the batch.
        # NOTE(review): `inputs` is iterated as a sequence below, yet
        # `.values()` is called on it here -- confirm the expected type.
        if batch_size is None:
            batch_size = len(inputs.values())

        if not isinstance(batch_size, int):
            raise TypeError(
                "batch_size must be an int or None, got {}.".format(
                    type(batch_size).__name__))

        chunks = []
        for inputs_ in inputs:
            if inspect.ismethod(inputs_):
                # Bound methods are not data; they are expanded later.
                chunks.append(inputs_)
            else:
                chunks.append(
                    list(get_chunks(inputs_, batch_size, svm=False)))

        targets = [
            list(get_chunks(target, batch_size, svm=False))
            for target in targets
        ]
        atoms_per_image = list(
            get_chunks(data.atoms_per_image, batch_size, svm=False))

        # Population of extra Attributes needed by the models, and further data
        # preprocessing

        for index, loss in enumerate(lossfxn):
            # getfullargspec replaces inspect.getargspec, which no longer
            # exists in Python >= 3.11.
            if loss is not None and "latent" in inspect.getfullargspec(
                    loss).args:
                train = dynamic_import("train",
                                       "ml4chem.models",
                                       alt_name="autoencoders")
                self.inputs_chunk_vals = train.get_inputs_chunks(chunks[index])

        parameters = []
        for index, model in enumerate(self.models):
            parameters += model.parameters()
            if model.name() == "PytorchPotentials":
                # These models require targets as tensors
                self.atoms_per_image = torch.tensor(atoms_per_image,
                                                    requires_grad=False,
                                                    dtype=torch.float)
                targets[index] = [
                    torch.tensor(batch, requires_grad=False)
                    for batch in targets[index]
                ]
            elif model.name() == "AutoEncoder":
                targets[index] = lod_to_list(targets[index])

        # Data scattering
        client = dask.distributed.get_client()

        # Targets are kept local (not scattered); only a new outer list is
        # created.
        self.targets = [target for target in targets]

        self.chunks = []

        for i, chunk in enumerate(chunks):
            if inspect.ismethod(chunk) is False:
                self.chunks.append(client.scatter(chunk))
            else:
                # This list comprehension is useful to have the same number of
                # functions as the same number of chunks without users' input.
                chunk = [chunk for _ in range(len(self.targets[i]))]
                self.chunks.append(chunk)

        del chunks

        logger.info(" ")
        logging.info("Batch Information")
        logging.info("-----------------")
        logging.info("Number of batches:")
        for index, c in enumerate(self.chunks):
            logging.info('    - Model {}, {}.'.format(index, len(c)))
        logging.info("Batch size: {} elements per batch.\n".format(batch_size))

        # Define optimizer

        self.optimizer_name, self.optimizer = get_optimizer(
            optimizer, parameters)

        if lr_scheduler is not None:
            self.scheduler = get_lr_scheduler(self.optimizer, lr_scheduler)

        logger.info(" ")
        logger.info("Starting training...")
        logger.info(" ")

        logger.info("{:6s} {:19s} {:12s} {:8s}".format("Epoch", "Time Stamp",
                                                       "Loss", "RMSE (ave)"))
        logger.info("{:6s} {:19s} {:12s} {:8s}".format("------",
                                                       "-------------------",
                                                       "------------",
                                                       "--------------"))

        converged = False
        epoch = 0

        if independent_loss is False:
            # Convert list of chunks from [[a, c], [b, d]] to [[a, b], [c, d]]
            self.chunks = list(map(list, zip(*self.chunks)))

        # Snapshot of the second model's weights, compared against the final
        # weights once convergence is reached (diagnostic output only).
        # NOTE(review): hard-coded index 1 requires at least two models.
        old_state_dict = {}

        for key in self.models[1].state_dict():
            old_state_dict[key] = self.models[1].state_dict()[key].clone()

        while not converged:
            epoch += 1

            self.optimizer.zero_grad()  # clear previous gradients

            if independent_loss:
                losses = []
                # Keep every model's outputs: the RMSE loop below indexes
                # `outputs` by model, so overwriting it per model would
                # compare all targets against the last model only.
                outputs = []
                for model_index, model in enumerate(self.models):
                    loss, output = self.closure(model_index,
                                                model,
                                                independent_loss,
                                                name=model.name())
                    losses.append(loss)
                    outputs.append(output)

            else:
                # NOTE(review): `index` is whatever the last loop above left
                # behind (the batch-information loop) -- confirm this is the
                # intended argument.
                loss, outputs = self.closure(index, self.models,
                                             independent_loss)

            rmse = []
            for i, model in enumerate(self.models):
                rmse.append(compute_rmse(outputs[i], self.targets[i]))

            _rmse = np.average(rmse)

            if self.optimizer_name != "LBFGS":
                self.optimizer.step()
            else:
                options = {
                    "closure": self.closure,
                    "current_loss": loss,
                    "max_ls": 10
                }
                self.optimizer.step(options)

            ts = time.time()
            ts = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d "
                                                              "%H:%M:%S")
            logger.info("{:6d} {} {:8e} {:8f}".format(epoch, ts, loss, _rmse))

            # `>=` instead of `==` so that epochs <= 0 cannot loop forever.
            if convergence is None and epoch >= self.epochs:
                converged = True
            elif convergence is not None and all(i <= convergence["rmse"]
                                                 for i in rmse):
                converged = True
                new_state_dict = {}

                for key in self.models[1].state_dict():
                    new_state_dict[key] = self.models[1].state_dict(
                    )[key].clone()

                for key in old_state_dict:
                    if not (old_state_dict[key] == new_state_dict[key]).all():
                        print('Diff in {}'.format(key))
                    else:
                        print('No diff in {}'.format(key))
Beispiel #6
0
    def calculate(self, images=None, purpose="training", data=None, svm=False):
        """Calculate the features per atom in an atoms objects

        Parameters
        ----------
        image : dict
            Hashed images using the Data class.
        purpose : str
            The supported purposes are: 'training', 'inference'.
        data : obj
            data object
        svm : bool
            Whether or not these features are going to be used for kernel
            methods.

        Returns
        -------
        feature_space : dict
            A dictionary with key hash and value as a list with the following
            structure: {'hash': [('H', [vector]]}
        reference_space : dict
            A reference space useful for SVM models.
        """

        client = dask.distributed.get_client()
        logger.info(" ")
        logger.info("Featurization")
        logger.info("=============")
        now = datetime.datetime.now()
        logger.info("Module accessed on {}.".format(
            now.strftime("%Y-%m-%d %H:%M:%S")))
        logger.info(f"Module name: {self.name()}.")

        # FIXME the block below should become a function.
        if os.path.isfile(self.filename) and self.overwrite is False:
            logger.warning(f"Loading features from {self.filename}.")
            logger.info(" ")
            svm_keys = [b"feature_space", b"reference_space"]
            data = load(self.filename)

            data_hashes = list(data.keys())
            image_hashes = list(images.keys())

            if image_hashes == data_hashes:
                # Check if both lists are the same.
                return data
            elif any(i in image_hashes for i in data_hashes):
                # Check if any of the elem
                _data = {}
                for hash in image_hashes:
                    _data[hash] = data[hash]
                return _data

            if svm_keys == list(data.keys()):
                feature_space = data[svm_keys[0]]
                reference_space = data[svm_keys[1]]
                return feature_space, reference_space

        initial_time = time.time()

        # Verify that we know the unique element symbols
        if data.unique_element_symbols is None:
            logger.info(f"Getting unique element symbols for {purpose}")

            unique_element_symbols = data.get_unique_element_symbols(
                images, purpose=purpose)

            unique_element_symbols = unique_element_symbols[purpose]

            logger.info(f"Unique chemical elements: {unique_element_symbols}")

        elif isinstance(data.unique_element_symbols, dict):
            unique_element_symbols = data.unique_element_symbols[purpose]

            logger.info(f"Unique chemical elements: {unique_element_symbols}")

        # we make the features
        self.GP = self.custom.get("GP", None)

        if self.GP is None:
            custom = self.custom.get("user_input", None)
            self.GP = self.make_symmetry_functions(
                unique_element_symbols,
                custom=custom,
                angular_type=self.angular_type)
            self.custom.update({"GP": self.GP})
        else:
            logger.info(
                "Using parameters from file to create symmetry functions...\n")

        self.print_features_params(self.GP)

        symbol = data.unique_element_symbols[purpose][0]
        sample = np.zeros(len(self.GP[symbol]))

        self.dimension = len(sample)

        preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
        preprocessor.set(purpose=purpose)

        # We start populating computations to get atomic features.
        logger.info("")
        logger.info(
            "Embarrassingly parallel computation of atomic features...")

        stacked_features = []
        atoms_index_map = [
        ]  # This list is used to reconstruct images from atoms.

        if self.batch_size is None:
            self.batch_size = data.get_total_number_atoms()

        chunks = get_chunks(images, self.batch_size, svm=svm)

        ini = end = 0
        for chunk in chunks:
            images_ = OrderedDict(chunk)
            intermediate = []

            for image in images_.items():
                _, image = image
                end = ini + len(image)
                atoms_index_map.append(list(range(ini, end)))
                ini = end
                for atom in image:
                    index = atom.index
                    symbol = atom.symbol

                    cutoff_keys = ["radial", "angular"]
                    n_symbols, neighborpositions = {}, {}

                    if isinstance(self.cutoff, dict):
                        for cutoff_key in cutoff_keys:
                            nl = get_neighborlist(
                                image, cutoff=self.cutoff[cutoff_key])
                            # n_indices: neighbor indices for central atom_i.
                            # n_offsets: neighbor offsets for central atom_i.
                            n_indices, n_offsets = nl[atom.index]

                            n_symbols_ = np.array(
                                image.get_chemical_symbols())[n_indices]
                            n_symbols[cutoff_key] = n_symbols_

                            neighborpositions_ = image.positions[
                                n_indices] + np.dot(n_offsets,
                                                    image.get_cell())
                            neighborpositions[cutoff_key] = neighborpositions_
                    else:
                        for cutoff_key in cutoff_keys:
                            nl = get_neighborlist(image, cutoff=self.cutoff)
                            # n_indices: neighbor indices for central atom_i.
                            # n_offsets: neighbor offsets for central atom_i.
                            n_indices, n_offsets = nl[atom.index]

                            n_symbols_ = np.array(
                                image.get_chemical_symbols())[n_indices]
                            n_symbols[cutoff_key] = n_symbols_

                            neighborpositions_ = image.positions[
                                n_indices] + np.dot(n_offsets,
                                                    image.get_cell())
                            neighborpositions[cutoff_key] = neighborpositions_

                    afp = self.get_atomic_features(
                        atom,
                        index,
                        symbol,
                        n_symbols,
                        neighborpositions,
                        image_molecule=image,
                        weighted=self.weighted,
                        n_indices=n_indices,
                    )

                    intermediate.append(afp)

            intermediate = client.persist(intermediate,
                                          scheduler=self.scheduler)
            stacked_features += intermediate
            del intermediate

        scheduler_time = time.time() - initial_time

        dask.distributed.wait(stacked_features)

        h, m, s = convert_elapsed_time(scheduler_time)
        logger.info("... finished in {} hours {} minutes {:.2f}"
                    " seconds.".format(h, m, s))

        logger.info("")

        if self.preprocessor is not None:

            scaled_feature_space = []

            # To take advantage of dask_ml we need to convert our numpy array
            # into a dask array.
            logger.info("Converting features to dask array...")
            stacked_features = [
                da.from_delayed(lazy, dtype=float, shape=sample.shape)
                for lazy in stacked_features
            ]
            layout = {0: tuple(len(i) for i in atoms_index_map), 1: -1}
            # stacked_features = dask.array.stack(stacked_features, axis=0).rechunk(layout)
            stacked_features = da.stack(stacked_features,
                                        axis=0).rechunk(layout)

            logger.info("Shape of array is {} and chunks {}.".format(
                stacked_features.shape, stacked_features.chunks))

            # Note that dask_ml by default convert the output of .fit
            # in a concrete value.
            if purpose == "training":
                stacked_features = preprocessor.fit(stacked_features,
                                                    scheduler=self.scheduler)
            else:
                stacked_features = preprocessor.transform(stacked_features)

            atoms_index_map = [
                client.scatter(indices) for indices in atoms_index_map
            ]
            # stacked_features = [client.scatter(features) for features in stacked_features]
            stacked_features = client.scatter(stacked_features, broadcast=True)

            logger.info("Stacking features using atoms index map...")

            for indices in atoms_index_map:
                features = client.submit(self.stack_features,
                                         *(indices, stacked_features))

                # features = self.stack_features(indices, stacked_features)

                scaled_feature_space.append(features)

        else:
            scaled_feature_space = []
            atoms_index_map = [
                client.scatter(chunk) for chunk in atoms_index_map
            ]
            stacked_features = client.scatter(stacked_features, broadcast=True)

            for indices in atoms_index_map:
                features = client.submit(self.stack_features,
                                         *(indices, stacked_features))
                scaled_feature_space.append(features)

            scaled_feature_space = client.gather(scaled_feature_space)

        # Clean
        del stacked_features

        # Restack images
        feature_space = []

        if svm and purpose == "training":
            logger.info("Building array with reference space.")
            reference_space = []

            for i, image in enumerate(images.items()):
                restacked = client.submit(
                    self.restack_image, *(i, image, scaled_feature_space, svm))

                # image = (hash, ase_image) -> tuple
                for atom in image[1]:
                    restacked_atom = client.submit(
                        self.restack_atom, *(i, atom, scaled_feature_space))
                    reference_space.append(restacked_atom)

                feature_space.append(restacked)

            reference_space = client.gather(reference_space)

        elif svm is False and purpose == "training":
            for i, image in enumerate(images.items()):
                restacked = client.submit(
                    self.restack_image, *(i, image, scaled_feature_space, svm))
                feature_space.append(restacked)

        else:
            try:
                for i, image in enumerate(images.items()):
                    restacked = client.submit(
                        self.restack_image,
                        *(i, image, scaled_feature_space, svm))
                    feature_space.append(restacked)

            except UnboundLocalError:
                # scaled_feature_space does not exist.
                for i, image in enumerate(images.items()):
                    restacked = client.submit(self.restack_image,
                                              *(i, image, feature_space, svm))
                    feature_space.append(restacked)

        feature_space = client.gather(feature_space)
        feature_space = OrderedDict(feature_space)

        fp_time = time.time() - initial_time

        h, m, s = convert_elapsed_time(fp_time)

        logger.info("Featurization finished in {} hours {} minutes {:.2f}"
                    " seconds.".format(h, m, s))

        if svm and purpose == "training":
            client.restart()  # Reclaims memory aggressively
            preprocessor.save_to_file(preprocessor, self.save_preprocessor)

            if self.filename is not None:
                logger.info(f"features saved to {self.filename}.")
                data = {"feature_space": feature_space}
                data.update({"reference_space": reference_space})
                dump(data, filename=self.filename)
                self.feature_space = feature_space
                self.reference_space = reference_space

            return self.feature_space, self.reference_space

        elif svm is False and purpose == "training":
            client.restart()  # Reclaims memory aggressively
            preprocessor.save_to_file(preprocessor, self.save_preprocessor)

            if self.filename is not None:
                logger.info(f"features saved to {self.filename}.")
                dump(feature_space, filename=self.filename)
                self.feature_space = feature_space

            return self.feature_space
        else:
            self.feature_space = feature_space
            return self.feature_space
Beispiel #7
0
    def calculate(self, images=None, purpose="training", data=None, svm=False):
        """Calculate the features per atom in an atoms objects

        Parameters
        ----------
        images : dict
            Hashed images using the Data class.
        purpose : str
            The supported purposes are: 'training', 'inference'.
        data : obj
            Data object. This method reads ``unique_element_symbols`` and
            calls ``get_unique_element_symbols()`` and
            ``get_total_number_atoms()`` on it.
        svm : bool
            Whether or not these features are going to be used for kernel
            methods.

        Returns
        -------
        feature_space : dict
            A dictionary with key hash and value as a list with the following
            structure: {'hash': [('H', [vector]]}
        reference_space : list
            A reference space useful for SVM models. Only returned (as the
            second element of a tuple) when ``svm`` is truthy and ``purpose``
            is 'training', or when a cached file holding both spaces is
            loaded.

        Raises
        ------
        NotImplementedError
            If ``self.preprocessor`` is not None.

        Notes
        -----
        When ``self.filename`` exists and ``self.overwrite`` is False, cached
        features are returned whenever they cover all requested images;
        otherwise the features are recomputed.
        """

        client = dask.distributed.get_client()
        logger.info(" ")
        logger.info("Featurization")
        logger.info("=============")
        now = datetime.datetime.now()
        logger.info("Module accessed on {}.".format(
            now.strftime("%Y-%m-%d %H:%M:%S")))

        # FIXME the block below should become a function.
        # self.filename may be None (the save step below allows it) and
        # os.path.isfile(None) raises TypeError, so it is checked first.
        if (self.filename is not None and os.path.isfile(self.filename)
                and self.overwrite is False):
            logger.warning("Loading features from {}.".format(self.filename))
            logger.info(" ")
            # NOTE(review): keys are byte strings, presumably because of how
            # load() deserializes the file -- confirm against dump()/load().
            svm_keys = [b"feature_space", b"reference_space"]

            # Kept in its own name: the ``data`` argument is still needed
            # below if the cache cannot be used.
            cached = load(self.filename)
            cached_hashes = list(cached.keys())
            image_hashes = list(images.keys())

            if image_hashes == cached_hashes:
                # Both lists are the same.
                return cached

            if image_hashes and all(key in cached for key in image_hashes):
                # Every requested image is cached: return just that subset.
                return {key: cached[key] for key in image_hashes}

            if svm_keys == cached_hashes:
                return cached[svm_keys[0]], cached[svm_keys[1]]

        initial_time = time.time()

        # Verify that we know the unique element symbols
        if data.unique_element_symbols is None:
            logger.info(
                "Getting unique element symbols for {}".format(purpose))

            unique_element_symbols = data.get_unique_element_symbols(
                images, purpose=purpose)

            unique_element_symbols = unique_element_symbols[purpose]

            logger.info(
                "Unique chemical elements: {}".format(unique_element_symbols))

        elif isinstance(data.unique_element_symbols, dict):
            unique_element_symbols = data.unique_element_symbols[purpose]

            logger.info(
                "Unique chemical elements: {}".format(unique_element_symbols))

        # we make the features
        preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
        preprocessor.set(purpose=purpose)

        if self.preprocessor is not None:
            # Fail before any work is submitted to the cluster.
            raise NotImplementedError(
                "Preprocessing of these features is not implemented.")

        # We start populating computations to get atomic features.
        logger.info("")
        logger.info(
            "Embarrassingly parallel computation of atomic features...")

        stacked_features = []
        # This list is used to reconstruct images from atoms.
        atoms_symbols_map = []

        if self.batch_size is None:
            self.batch_size = data.get_total_number_atoms()

        for chunk in get_chunks(images, self.batch_size, svm=svm):
            intermediate = []

            for image in OrderedDict(chunk).values():
                atoms_symbols_map.append(image.get_chemical_symbols())
                # Use .create() class method from dscribe.
                intermediate.append(dask.delayed(self.create)(image))

            stacked_features += client.compute(intermediate,
                                               scheduler=self.scheduler)

        logger.info("")

        atoms_symbols_map = [
            client.scatter(symbols) for symbols in atoms_symbols_map
        ]
        stacked_features = client.scatter(stacked_features, broadcast=True)

        scaled_feature_space = client.gather([
            client.submit(self.stack_features, symbols, image_index,
                          stacked_features)
            for image_index, symbols in enumerate(atoms_symbols_map)
        ])

        # Clean
        del stacked_features

        # Restack images. The same call is used for every purpose/svm
        # combination.
        feature_space = client.gather([
            client.submit(self.restack_image, i, image, scaled_feature_space,
                          svm) for i, image in enumerate(images.items())
        ])

        if svm and purpose == "training":
            # FIXME This might need to be improved
            logger.info("Building array with reference space.")
            # Each gathered item is a (hash, features) pair; the reference
            # space is the concatenation of all per-image feature lists.
            reference_space = list(
                itertools.chain.from_iterable(
                    features for _, features in feature_space))
            logger.info("Finished reference space.")

        feature_space = OrderedDict(feature_space)

        fp_time = time.time() - initial_time

        h, m, s = convert_elapsed_time(fp_time)

        logger.info("Featurization finished in {} hours {} minutes {:.2f}"
                    " seconds.".format(h, m, s))

        # Always expose the fresh results, whether or not they are saved.
        self.feature_space = feature_space

        if svm and purpose == "training":
            self.reference_space = reference_space
            client.restart()  # Reclaims memory aggressively
            preprocessor.save_to_file(preprocessor, self.save_preprocessor)

            if self.filename is not None:
                logger.info("features saved to {}.".format(self.filename))
                dump(
                    {
                        "feature_space": feature_space,
                        "reference_space": reference_space,
                    },
                    filename=self.filename)

            return self.feature_space, self.reference_space

        if svm is False and purpose == "training":
            client.restart()  # Reclaims memory aggressively
            preprocessor.save_to_file(preprocessor, self.save_preprocessor)

            if self.filename is not None:
                logger.info("features saved to {}.".format(self.filename))
                dump(feature_space, filename=self.filename)

        return self.feature_space
Beispiel #8
0
    def prepare_model(self,
                      feature_space,
                      reference_features,
                      data=None,
                      purpose="training"):
        """Prepare the Kernel Ridge Regression model

        Parameters
        ----------
        feature_space : dict
            A dictionary with hash, fingerprint structure.
        reference_features : dict
            A dictionary with raveled tuples of symbol, atomic fingerprint.
        data : object
            DataSet object created from the handler. Not used by this method.
        purpose : str
            Purpose of this model: 'training', 'inference'.


        Notes
        -----
        This method builds the atomic kernel matrices and the LT vectors needed
        to apply the atomic decomposition Ansatz. The results are stored in
        ``self.K`` (reshaped to ``(dim, dim)`` with
        ``dim = len(reference_features)``) and ``self.LT``.
        """
        if purpose == "training":
            logger.info("Model Training")
            logger.info("Model name: {}.".format(self.name()))
            logger.info("Kernel parameters:")
            logger.info("    - Kernel function: {}.".format(self.kernel))
            logger.info("    - Sigma: {}.".format(self.sigma))
            logger.info("    - Lamda: {}.".format(self.lamda))

        dim = len(reference_features)

        # Atomic kernel matrices
        initial_time = time.time()

        logger.info("Computing Kernel Matrix...")
        # We start populating computations with delayed functions to
        # operate with dask's scheduler
        logger.warning("    Adding calculations to scheduler...")

        computations = self.get_kernel_matrix(feature_space,
                                              reference_features)

        scheduler_time = time.time() - initial_time
        h, m, s = convert_elapsed_time(scheduler_time)
        logger.info("    {} kernel evaluations added in {} hours {} minutes "
                    "{:.2f} seconds.".format(len(computations), h, m, s))

        batched = self.batch_size is not None

        if batched:
            computations = list(get_chunks(computations, self.batch_size))
            logger.info(
                "    The calculations were batched in groups of {}.".format(
                    self.batch_size))

        # We compute the calculations with dask and the result is converted
        # to numpy array.
        logger.info("    Evaluating atomic similarities...")

        if batched:
            # Collect results in one flat list. Appending one tuple per chunk
            # gives a ragged nested list when the last chunk is shorter,
            # which cannot be reshaped to (dim, dim).
            kernel_matrix = []
            for chunk in computations:
                kernel_matrix.extend(
                    dask.compute(*chunk, scheduler=self.scheduler))
        else:
            kernel_matrix = dask.compute(*computations,
                                         scheduler=self.scheduler)

        self.K = np.array(kernel_matrix).reshape(dim, dim)

        build_time = time.time() - initial_time
        h, m, s = convert_elapsed_time(build_time)
        logger.info("Kernel matrix built in {} hours {} minutes {:.2f} "
                    "seconds.".format(h, m, s))

        # LT Vectors
        # We build the LT matrix needed for ADA
        logger.info("Building LT matrix")
        lt_initial_time = time.time()

        # One LT computation per entry of the feature space; only the
        # position of the entry is passed to get_lt().
        computations = [
            self.get_lt(index) for index in range(len(feature_space))
        ]

        self.LT = np.array(
            dask.compute(*computations, scheduler=self.scheduler))

        # Timed from its own start so that the figure reported below does
        # not include the kernel matrix build.
        lt_time = time.time() - lt_initial_time
        h, m, s = convert_elapsed_time(lt_time)
        logger.info(
            "LT matrix built in {} hours {} minutes {:.2f} seconds.".format(
                h, m, s))
Beispiel #9
0
    def get_kernel_matrix(self, feature_space, reference_features, purpose):
        """Get kernel matrix values

        Parameters
        ----------
        feature_space : dict, list
            Dictionary with hash and features, or a list of
            (symbol, atomic features) pairs.
        reference_features : list, dict
            Reference feature space as a list of (symbol, atomic features)
            pairs. If a dictionary is passed, its first value is used.
        purpose : str
            Purpose of this kernel matrix. Accepted arguments are 'training',
            and 'inference'.

        Returns
        -------
        kernel_matrix
            Flat list with kernel matrix values, in the order they were
            evaluated.


        Notes
        -----
        This class method expects the feature_space to be an OrderedDict and
        reference_features to be a list, but it turns out that for computing
        variances, it might be the case the feature_space is also a list.

        For a dictionary feature_space and purpose 'training', the atom
        number ``n`` (counted across all chunks) is only evaluated against
        reference entries ``n`` onwards. One list is appended to
        ``self.fingerprint_map`` per image, and when purpose is 'training'
        the LT matrix is also built and stored in ``self.LT``.
        """

        call = {"exponential": exponential, "laplacian": laplacian, "rbf": rbf}
        kernel_function = call[self.kernel]

        initial_time = time.time()
        if isinstance(reference_features, dict):
            # This is the case when the reference_features are a
            # dictionary, too. If that's true we have to convert it to a list.
            reference_features = list(reference_features.values())[0]

        chunks = list(get_chunks(feature_space, self.batch_size))

        logger.info(
            "    The calculations are distributed in {} batches of {} atoms.".
            format(len(chunks), self.batch_size))

        # The container types do not change while iterating over chunks.
        references_are_list = isinstance(reference_features, list)
        dict_case = isinstance(feature_space, dict) and references_are_list
        list_case = isinstance(feature_space, list) and references_are_list

        # Number of atoms processed so far in training; also the first
        # reference index the next atom is compared against.
        counter = 0
        kernel_matrix = []

        for c, chunk in enumerate(chunks):
            chunk_initial_time = time.time()
            logger.info(
                "        Computing kernel functions for chunk {}...".format(c))
            intermediates = []

            if dict_case:
                if not isinstance(chunk, dict):
                    chunk = OrderedDict(chunk)

                reference_length = len(reference_features)

                for _feature_space in chunk.values():
                    f_map = []
                    for i_symbol, i_afp in _feature_space:
                        i_symbol = decode(i_symbol)
                        f_map.append(1)

                        if purpose == "training":
                            for j in range(counter, reference_length):
                                j_symbol, j_afp = reference_features[j]
                                # Decode as the other branches do, so both
                                # symbols are compared in the same form.
                                j_symbol = decode(j_symbol)

                                kernel = kernel_function(
                                    i_afp, j_afp, i_symbol, j_symbol,
                                    self.sigma)

                                intermediates.append(kernel)
                            counter += 1
                        else:
                            for j_symbol, j_afp in reference_features:
                                j_symbol = decode(j_symbol)
                                kernel = kernel_function(
                                    i_afp, j_afp, i_symbol, j_symbol,
                                    self.sigma)
                                intermediates.append(kernel)
                    self.fingerprint_map.append(f_map)

            elif list_case:
                for i_symbol, i_afp in chunk:
                    i_symbol = decode(i_symbol)
                    for j_symbol, j_afp in reference_features:
                        j_symbol = decode(j_symbol)

                        kernel = kernel_function(i_afp, j_afp, i_symbol,
                                                 j_symbol, self.sigma)
                        intermediates.append(kernel)

            # Compute stuff from above
            kernel_matrix += dask.compute(intermediates,
                                          scheduler=self.scheduler)[0]

            # Report per chunk. Outside the loop this would only cover the
            # last chunk and fail when there are no chunks at all.
            chunk_final_time = time.time() - chunk_initial_time
            h, m, s = convert_elapsed_time(chunk_final_time)
            logger.info("          ...finished in {} hours {} minutes {:.2f} "
                        "seconds.".format(h, m, s))

        build_time = time.time() - initial_time
        h, m, s = convert_elapsed_time(build_time)
        logger.info("Kernel matrix built in {} hours {} minutes {:.2f} "
                    "seconds.".format(h, m, s))

        # LT Vectors
        # We build the LT matrix needed for ADA
        if purpose == "training":
            lt_initial_time = time.time()
            self.LT = []
            logger.info("Building LT matrix")

            # One LT computation per entry of the feature space; only the
            # position of the entry is passed to get_lt().
            computations = [
                self.get_lt(index) for index in range(len(feature_space))
            ]

            computations = list(get_chunks(computations, self.batch_size))
            logger.info(
                "    The calculations are distributed in {} batches of {} molecules."
                .format(len(computations), self.batch_size))
            for lt_chunk in computations:
                self.LT += dask.compute(*lt_chunk, scheduler=self.scheduler)

            self.LT = np.array(self.LT)

            # Timed from its own start so that the figure reported below
            # does not include the kernel matrix build.
            lt_time = time.time() - lt_initial_time
            h, m, s = convert_elapsed_time(lt_time)
            logger.info(
                "LT matrix built in {} hours {} minutes {:.2f} seconds.".
                format(h, m, s))

        return kernel_matrix