Beispiel #1
0
    def trainer(self):
        """Run the training class

        Repeats until convergence: evaluate the loss over all scattered
        data chunks, step the optimizer, and report per-image and
        per-atom RMSE for the epoch.  Convergence is either a fixed
        epoch count (``self.epochs``) or an energy-RMSE threshold taken
        from ``self.convergence["energy"]``.
        """

        converged = False
        _loss = []  # loss history, one entry per epoch
        _rmse = []  # per-image RMSE history, one entry per epoch
        epoch = 0

        while not converged:
            epoch += 1

            self.optimizer.zero_grad()  # clear previous gradients
            # Forward pass + backward pass over every chunk; returns the
            # accumulated loss and the stacked model outputs.
            loss, outputs_ = train.closure(
                self.chunks,
                self.targets,
                self.model,
                self.lossfxn,
                self.atoms_per_image,
                self.device,
            )
            # We step the optimizer
            if self.optimizer_name != "LBFGS":
                self.optimizer.step()
            else:
                # LBFGS re-evaluates the loss during its line search, so
                # it needs a closure plus the current loss value.
                # NOTE(review): the loss above comes from train.closure
                # but this passes self.closure — confirm both exist and
                # compute the same quantity.
                options = {
                    "closure": self.closure,
                    "current_loss": loss,
                    "max_ls": 10
                }
                self.optimizer.step(options)

            # RMSE per image and per/atom
            client = dask.distributed.get_client()

            # Both RMSE computations are submitted first and resolved
            # afterwards so they can run concurrently on the cluster.
            rmse = client.submit(compute_rmse, *(outputs_, self.targets))
            atoms_per_image = self.atoms_per_image.view(1, -1)
            rmse_atom = client.submit(
                compute_rmse, *(outputs_, self.targets, atoms_per_image))
            rmse = rmse.result()
            rmse_atom = rmse_atom.result()

            _loss.append(loss.item())
            _rmse.append(rmse)

            # In the case that lr_scheduler is not None
            if self.lr_scheduler is not None:
                self.scheduler.step(loss)

            ts = time.time()
            ts = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d "
                                                              "%H:%M:%S")
            logger.info("{:6d} {} {:8e} {:8f} {:8f}".format(
                epoch, ts, loss, rmse, rmse_atom))

            # Stop on epoch budget when no criterion was given, otherwise
            # on the energy-RMSE threshold.
            if self.convergence is None and epoch == self.epochs:
                converged = True
            elif self.convergence is not None and rmse < self.convergence[
                    "energy"]:
                converged = True

        training_time = time.time() - self.initial_time

        h, m, s = convert_elapsed_time(training_time)
        logger.info(
            "Training finished in {} hours {} minutes {:.2f} seconds.".format(
                h, m, s))
Beispiel #2
0
    def calculate_features(self,
                           images=None,
                           purpose="training",
                           data=None,
                           svm=False):
        """Calculate the features per atom in an atoms objects

        Parameters
        ----------
        images : dict
            Hashed images using the DataSet class.
        purpose : str
            The supported purposes are: 'training', 'inference'.
        data : obj
            data object
        svm : bool
            Whether or not these features are going to be used for kernel
            methods.

        Returns
        -------
        feature_space : dict
            A dictionary with key hash and value as a list with the following
            structure: {'hash': [('H', [vector]]}
        reference_space : dict
            A reference space useful for SVM models.
        """

        logger.info(" ")
        logger.info("Fingerprinting")
        logger.info("==============")

        # FIXME the block below should become a function.
        # Cache lookup: reuse previously computed features when the file
        # exists and overwriting is disabled.
        if os.path.isfile(self.filename) and self.overwrite is False:
            logger.warning("Loading features from {}.".format(self.filename))
            logger.info(" ")
            svm_keys = [b"feature_space", b"reference_space"]
            data = load(self.filename)

            data_hashes = list(data.keys())
            image_hashes = list(images.keys())

            if image_hashes == data_hashes:
                # Check if both lists are the same.
                return data
            elif any(i in image_hashes for i in data_hashes):
                # Check if any of the elem
                # NOTE(review): this only requires *some* overlap, yet the
                # loop below indexes data[hash] for every image hash — a
                # hash present in images but absent from the cache would
                # raise KeyError. Confirm callers guarantee full coverage.
                _data = {}
                for hash in image_hashes:
                    _data[hash] = data[hash]
                return _data

            # Cached file holds an SVM-style payload instead of plain
            # per-hash features.
            if svm_keys == list(data.keys()):
                feature_space = data[svm_keys[0]]
                reference_space = data[svm_keys[1]]
                return feature_space, reference_space

        initial_time = time.time()

        # Verify that we know the unique element symbols
        # NOTE(review): unique_element_symbols is only bound inside this
        # branch but is used below when self.GP is None — if
        # data.unique_element_symbols is already populated, that path would
        # raise UnboundLocalError. Confirm the invariant.
        if data.unique_element_symbols is None:
            logger.info(
                "Getting unique element symbols for {}".format(purpose))

            unique_element_symbols = data.get_unique_element_symbols(
                images, purpose=purpose)

            unique_element_symbols = unique_element_symbols[purpose]

            logger.info(
                "Unique chemical elements: {}".format(unique_element_symbols))

        # we make the features
        # GP: symmetry-function parameters; either user supplied or built
        # here and cached back into self.custom.
        self.GP = self.custom.get("GP", None)

        if self.GP is None:
            custom = self.custom.get("user_input", None)
            self.GP = self.make_symmetry_functions(
                unique_element_symbols,
                custom=custom,
                angular_type=self.angular_type)
            self.custom.update({"GP": self.GP})
        else:
            logger.info(
                'Using parameters from file to create symmetry functions...\n')

        self.print_fingerprint_params(self.GP)

        preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
        preprocessor.set(purpose=purpose)

        # We start populating computations to get atomic fingerprints.
        logger.info("")
        logger.info("Adding atomic feature calculations to scheduler...")

        ini = end = 0

        computations = []
        atoms_index_map = [
        ]  # This list is used to reconstruct images from atoms.

        for image in images.items():
            key, image = image
            # Record which flat atom indices belong to this image so the
            # per-atom results can be regrouped per image later on.
            end = ini + len(image)
            atoms_index_map.append(list(range(ini, end)))
            ini = end
            for atom in image:
                index = atom.index
                symbol = atom.symbol
                # NOTE(review): the neighborlist is rebuilt for every atom
                # of the same image — presumably cacheable per image.
                nl = get_neighborlist(image, cutoff=self.cutoff)
                # n_indices: neighbor indices for central atom_i.
                # n_offsets: neighbor offsets for central atom_i.
                n_indices, n_offsets = nl[atom.index]

                n_symbols = np.array(image.get_chemical_symbols())[n_indices]
                # Apply periodic offsets to obtain absolute neighbor
                # positions.
                neighborpositions = image.positions[n_indices] + np.dot(
                    n_offsets, image.get_cell())

                afp = self.get_atomic_fingerprint(
                    atom,
                    index,
                    symbol,
                    n_symbols,
                    neighborpositions,
                    self.preprocessor,
                    image_molecule=image,
                    weighted=self.weighted,
                    n_indices=n_indices,
                )

                computations.append(afp)

        scheduler_time = time.time() - initial_time

        h, m, s = convert_elapsed_time(scheduler_time)
        logger.info("... finished in {} hours {} minutes {:.2f}"
                    " seconds.".format(h, m, s))

        # In this block we compute the fingerprints.
        logger.info("")
        logger.info("Computing fingerprints...")

        stacked_features = dask.compute(*computations,
                                        scheduler=self.scheduler)

        if self.preprocessor is not None:
            stacked_features = np.array(stacked_features)

        # Clean
        del computations

        if purpose == "training":
            # To take advantage of dask_ml we need to convert our numpy array
            # into a dask array.
            client = dask.distributed.get_client()

            if self.preprocessor is not None:
                # Preprocess the flat atom-feature matrix, then regroup
                # the scaled rows into per-image futures.
                scaled_feature_space = []
                dim = stacked_features.shape
                stacked_features = dask.array.from_array(stacked_features,
                                                         chunks=dim)
                stacked_features = preprocessor.fit(stacked_features,
                                                    scheduler=self.scheduler)
                atoms_index_map = [
                    client.scatter(chunk) for chunk in atoms_index_map
                ]

                for indices in atoms_index_map:
                    features = client.submit(self.stack_features,
                                             *(indices, stacked_features))
                    scaled_feature_space.append(features)

                # More data processing depending on the method used.

            else:
                # No preprocessing: regroup the raw features per image.
                feature_space = []
                atoms_index_map = [
                    client.scatter(chunk) for chunk in atoms_index_map
                ]

                for indices in atoms_index_map:
                    features = client.submit(self.stack_features,
                                             *(indices, stacked_features))
                    feature_space.append(features)

            del stacked_features
            computations = []

            if svm:
                # For kernel methods we also build a flat per-atom
                # reference space alongside the per-image features.
                reference_space = []

                for i, image in enumerate(images.items()):
                    computations.append(
                        self.restack_image(
                            i,
                            image,
                            scaled_feature_space=scaled_feature_space,
                            svm=svm))

                    # image = (hash, ase_image) -> tuple
                    for atom in image[1]:
                        reference_space.append(
                            self.restack_atom(i, atom, scaled_feature_space))

                reference_space = dask.compute(*reference_space,
                                               scheduler=self.scheduler)
            else:
                # scaled_feature_space only exists when a preprocessor was
                # configured; fall back to the raw feature_space otherwise.
                try:
                    for i, image in enumerate(images.items()):
                        computations.append(
                            self.restack_image(
                                i,
                                image,
                                scaled_feature_space=scaled_feature_space,
                                svm=svm,
                            ))

                except UnboundLocalError:
                    # scaled_feature_space does not exist.
                    for i, image in enumerate(images.items()):
                        computations.append(
                            self.restack_image(i,
                                               image,
                                               feature_space=feature_space,
                                               svm=svm))

            feature_space = dask.compute(*computations,
                                         scheduler=self.scheduler)
            feature_space = OrderedDict(feature_space)
            del computations

            preprocessor.save_to_file(preprocessor, self.save_preprocessor)

            fp_time = time.time() - initial_time

            h, m, s = convert_elapsed_time(fp_time)
            logger.info("Fingerprinting finished in {} hours {} minutes {:.2f}"
                        " seconds.".format(h, m, s))

            if svm:
                if self.filename is not None:
                    logger.info("Fingerprints saved to {}.".format(
                        self.filename))
                    data = {"feature_space": feature_space}
                    data.update({"reference_space": reference_space})
                    dump(data, filename=self.filename)
                return feature_space, reference_space
            else:
                if self.filename is not None:
                    logger.info("Fingerprints saved to {}.".format(
                        self.filename))
                    dump(feature_space, filename=self.filename)
                return feature_space

        elif purpose == "inference":
            feature_space = OrderedDict()
            # Reuse the preprocessor fitted at training time.
            scaled_feature_space = preprocessor.transform(stacked_features)

            # TODO this has to be parallelized.
            for key, image in images.items():
                if key not in feature_space.keys():
                    feature_space[key] = []
                for index, atom in enumerate(image):
                    symbol = atom.symbol

                    if svm:
                        scaled = scaled_feature_space[index]
                        # TODO change this to something more elegant later
                        try:
                            self.reference_space
                        except AttributeError:
                            # If self.reference does not exist it means that
                            # reference_space is being loaded by Messagepack.
                            symbol = symbol.encode("utf-8")
                    else:
                        scaled = torch.tensor(
                            scaled_feature_space[index],
                            requires_grad=False,
                            dtype=torch.float,
                        )

                    feature_space[key].append((symbol, scaled))

            fp_time = time.time() - initial_time

            h, m, s = convert_elapsed_time(fp_time)

            logger.info("Fingerprinting finished in {} hours {} minutes {:.2f}"
                        " seconds.".format(h, m, s))

            return feature_space
Beispiel #3
0
    def __init__(
        self,
        inputs,
        targets,
        model=None,
        data=None,
        optimizer=(None, None),
        regularization=None,
        epochs=100,
        convergence=None,
        lossfxn=None,
        device="cpu",
        batch_size=None,
        lr_scheduler=None,
    ):
        """Prepare batched data, optimizer and scheduler, then train.

        Parameters
        ----------
        inputs : dict
            Hashed feature space, one entry per image.
        targets : list
            Target values, one per image.
        model : obj
            Model exposing ``parameters()`` for the optimizer.
        data : obj
            Data object; ``data.atoms_per_image`` is read here.
        optimizer : tuple
            (name, kwargs) pair consumed by ``get_optimizer``.
        regularization : float
            Kept for interface compatibility; not used in this method.
        epochs : int
            Maximum number of epochs when no convergence criterion is set.
        convergence : dict
            Convergence criteria, e.g. ``{"energy": 1e-3}``.
        lossfxn : callable
            Loss function; defaults to ``AtomicMSELoss``.
        device : str
            Either "cpu" or "cuda".
        batch_size : int
            Images per batch; defaults to the full data set.
        lr_scheduler : tuple
            Scheduler spec consumed by ``get_lr_scheduler``.
        """

        self.initial_time = time.time()

        atoms_per_image = data.atoms_per_image

        if batch_size is None:
            # One single batch containing every image.
            batch_size = len(inputs.values())

        if isinstance(batch_size, int):
            # Data batches
            chunks = list(get_chunks(inputs, batch_size, svm=False))
            targets = list(get_chunks(targets, batch_size, svm=False))
            atoms_per_image = list(
                get_chunks(atoms_per_image, batch_size, svm=False))

        logger.info(" ")
        # Fix: log through the module-level `logger` (as the rest of this
        # method does) instead of the root logger via logging.info.
        logger.info("Batch Information")
        logger.info("-----------------")
        logger.info("Number of batches: {}.".format(len(chunks)))
        logger.info("Batch size: {} elements per batch.".format(batch_size))
        logger.info(" ")

        atoms_per_image = torch.tensor(atoms_per_image,
                                       requires_grad=False,
                                       dtype=torch.float)

        targets = torch.tensor(targets, requires_grad=False)

        if device == "cuda":
            logger.info("Moving data to CUDA...")

            atoms_per_image = atoms_per_image.cuda()
            targets = targets.cuda()
            _inputs = OrderedDict()

            # Feature vectors are stored per image as (symbol, tensor)
            # pairs; move each tensor to the GPU.
            for hash, f in inputs.items():
                _inputs[hash] = []
                for features in f:
                    symbol, vector = features
                    _inputs[hash].append((symbol, vector.cuda()))

            inputs = _inputs

            move_time = time.time() - self.initial_time
            h, m, s = convert_elapsed_time(move_time)
            logger.info("Data moved to GPU in {} hours {} minutes {:.2f} \
                         seconds.".format(h, m, s))
            logger.info(" ")

        # Define optimizer
        self.optimizer_name, self.optimizer = get_optimizer(
            optimizer, model.parameters())

        if lr_scheduler is not None:
            self.scheduler = get_lr_scheduler(self.optimizer, lr_scheduler)

        logger.info(" ")
        logger.info("Starting training...")
        logger.info(" ")

        logger.info("{:6s} {:19s} {:12s} {:8s} {:8s}".format(
            "Epoch", "Time Stamp", "Loss", "RMSE/img", "RMSE/atom"))
        logger.info("{:6s} {:19s} {:12s} {:8s} {:8s}".format(
            "------", "-------------------", "------------", "--------",
            "---------"))
        self.atoms_per_image = atoms_per_image
        self.convergence = convergence
        self.device = device
        self.epochs = epochs
        self.model = model
        self.lr_scheduler = lr_scheduler

        # Data scattering
        # Scatter batches to the dask workers up front so the training
        # loop submits futures instead of shipping data every epoch.
        client = dask.distributed.get_client()
        self.chunks = [client.scatter(chunk) for chunk in chunks]
        self.targets = [client.scatter(target) for target in targets]

        if lossfxn is None:
            self.lossfxn = AtomicMSELoss
        else:
            self.lossfxn = lossfxn

        # Let the hunger games begin...
        self.trainer()
Beispiel #4
0
    def __init__(
        self,
        inputs,
        targets,
        model=None,
        data=None,
        optimizer=(None, None),
        regularization=None,
        epochs=100,
        convergence=None,
        lossfxn=None,
        device="cpu",
        batch_size=None,
        lr_scheduler=None,
        uncertainty=None,
        checkpoint=None,
        test=None,
    ):
        """Prepare batched data, optimizer and scheduler, then train.

        Parameters
        ----------
        inputs : dict
            Hashed feature space, one entry per image.
        targets : list
            Target values, one per image.
        model : obj
            Model exposing ``parameters()`` for the optimizer.
        data : obj
            Data object; ``data.atoms_per_image`` is read here.
        optimizer : tuple
            (name, kwargs) pair consumed by ``get_optimizer``.
        regularization : float
            Kept for interface compatibility; not used in this method.
        epochs : int
            Maximum number of epochs when no convergence criterion is set.
        convergence : dict
            Convergence criteria, e.g. ``{"energy": 1e-3}``.
        lossfxn : callable
            Loss function; defaults to ``AtomicMSELoss``.
        device : str
            Either "cpu" or "cuda".
        batch_size : int
            Images per batch; defaults to the full data set.
        lr_scheduler : tuple
            Scheduler spec consumed by ``get_lr_scheduler``.
        uncertainty : list
            Per-image uncertainties used to penalize the loss, or None.
        checkpoint : dict
            Keyword arguments for ``checkpoint_save``, or None.
        test : dict
            Optional test set with "features", "targets" and "data" keys.
        """

        self.initial_time = time.time()

        if lossfxn is None:
            lossfxn = AtomicMSELoss

        logger.info("")
        logger.info("Training")
        logger.info("========")
        logger.info(f"Convergence criteria: {convergence}")
        logger.info(f"Loss function: {lossfxn.__name__}")
        if uncertainty is not None:
            logger.info("Options:")
            logger.info(f"    - Uncertainty penalization: {pformat(uncertainty)}")
        logger.info("")

        atoms_per_image = data.atoms_per_image

        if batch_size is None:
            # One single batch containing every image.
            batch_size = len(inputs.values())

        if isinstance(batch_size, int):
            # Data batches
            chunks = list(get_chunks(inputs, batch_size, svm=False))
            targets = list(get_chunks(targets, batch_size, svm=False))
            atoms_per_image = list(get_chunks(atoms_per_image, batch_size, svm=False))

            # Idiom fix: compare against None with `is not None`.
            if uncertainty is not None:
                uncertainty = list(get_chunks(uncertainty, batch_size, svm=False))
                uncertainty = [
                    torch.tensor(u, requires_grad=False, dtype=torch.float)
                    for u in uncertainty
                ]

        logger.info("")
        # Fix: log through the module-level `logger` (as the rest of this
        # method does) instead of the root logger via logging.info.
        logger.info("Batch Information")
        logger.info("-----------------")
        logger.info("Number of batches: {}.".format(len(chunks)))
        logger.info("Batch size: {} elements per batch.".format(batch_size))
        logger.info(" ")

        atoms_per_image = [
            torch.tensor(n_atoms, requires_grad=False, dtype=torch.float)
            for n_atoms in atoms_per_image
        ]

        targets = [torch.tensor(t, requires_grad=False) for t in targets]

        if device == "cuda":
            logger.info("Moving data to CUDA...")

            # Bug fix: atoms_per_image and targets are *lists* of tensors
            # at this point (built just above), so .cuda() must be applied
            # per tensor — calling it on the list raised AttributeError.
            atoms_per_image = [t.cuda() for t in atoms_per_image]
            targets = [t.cuda() for t in targets]
            _inputs = OrderedDict()

            # Feature vectors are stored per image as (symbol, tensor)
            # pairs; move each tensor to the GPU.
            for hash, f in inputs.items():
                _inputs[hash] = []
                for features in f:
                    symbol, vector = features
                    _inputs[hash].append((symbol, vector.cuda()))

            inputs = _inputs

            move_time = time.time() - self.initial_time
            h, m, s = convert_elapsed_time(move_time)
            logger.info(
                "Data moved to GPU in {} hours {} minutes {:.2f} \
                         seconds.".format(
                    h, m, s
                )
            )
            logger.info(" ")

        # Define optimizer
        self.optimizer_name, self.optimizer = get_optimizer(
            optimizer, model.parameters()
        )

        if lr_scheduler is not None:
            self.scheduler = get_lr_scheduler(self.optimizer, lr_scheduler)

        self.atoms_per_image = atoms_per_image
        self.convergence = convergence
        self.device = device
        self.epochs = epochs
        self.model = model
        self.lr_scheduler = lr_scheduler
        self.lossfxn = lossfxn
        self.checkpoint = checkpoint
        self.test = test

        # Data scattering
        # Scatter batches to the dask workers up front so the training
        # loop submits futures instead of shipping data every epoch.
        client = dask.distributed.get_client()
        self.chunks = [client.scatter(chunk) for chunk in chunks]
        self.targets = [client.scatter(target) for target in targets]

        if uncertainty is not None:
            self.uncertainty = [client.scatter(u) for u in uncertainty]
        else:
            self.uncertainty = uncertainty

        # Let the hunger games begin...
        self.trainer()
Beispiel #5
0
    def trainer(self):
        """Run the training class

        Repeats until convergence: evaluate the (optionally
        uncertainty-penalized) loss over all scattered chunks, step the
        optimizer, and log per-image and per-atom errors for the epoch.
        When ``self.test`` is set, test-set errors are also computed and
        logged each epoch.  Convergence is either a fixed epoch count
        (``self.epochs``) or an energy-error threshold taken from
        ``self.convergence["energy"]``.
        """

        logger.info(" ")
        logger.info("Starting training...\n")

        # Table header: extra columns (marked "(t)") are added when a
        # test set is provided.
        if self.test is None:
            logger.info(
                "{:6s} {:19s} {:12s} {:12s} {:8s}".format(
                    "Epoch", "Time Stamp", "Loss", "Error/img", "Error/atom"
                )
            )
            logger.info(
                "{:6s} {:19s} {:12s} {:8s} {:8s}".format(
                    "------",
                    "-------------------",
                    "------------",
                    "------------",
                    "------------",
                )
            )

        else:
            test_features = self.test.get("features", None)
            test_targets = self.test.get("targets", None)
            test_data = self.test.get("data", None)

            logger.info(
                "{:6s} {:19s} {:12s} {:12s} {:12s} {:12s} {:16s}".format(
                    "Epoch",
                    "Time Stamp",
                    "Loss",
                    "Error/img",
                    "Error/atom",
                    "Error/img (t)",
                    "Error/atom (t)",
                )
            )
            logger.info(
                "{:6s} {:19s} {:12s} {:8s} {:8s} {:8s} {:8s}".format(
                    "------",
                    "-------------------",
                    "------------",
                    "------------",
                    "------------",
                    "------------",
                    "------------",
                )
            )

        converged = False
        _loss = []  # loss history, one entry per epoch
        _rmse = []  # per-image error history, one entry per epoch
        epoch = 0

        # Fetched once; reused for every RMSE future submitted below.
        client = dask.distributed.get_client()

        while not converged:
            epoch += 1

            self.optimizer.zero_grad()  # clear previous gradients
            # Forward pass + backward pass over every chunk; returns the
            # accumulated loss and the stacked model outputs.
            loss, outputs_ = train.closure(
                self.chunks,
                self.targets,
                self.uncertainty,
                self.model,
                self.lossfxn,
                self.atoms_per_image,
                self.device,
            )
            # We step the optimizer
            if self.optimizer_name != "LBFGS":
                self.optimizer.step()
            else:
                # LBFGS re-evaluates the loss during its line search, so
                # it needs a closure plus the current loss value.
                # NOTE(review): the loss above comes from train.closure
                # but this passes self.closure — confirm both exist and
                # compute the same quantity.
                options = {"closure": self.closure, "current_loss": loss, "max_ls": 10}
                self.optimizer.step(options)

            # RMSE per image and per/atom

            rmse = client.submit(compute_rmse, *(outputs_, self.targets))
            atoms_per_image = torch.cat(self.atoms_per_image)

            rmse_atom = client.submit(
                compute_rmse, *(outputs_, self.targets, atoms_per_image)
            )
            rmse = rmse.result()
            rmse_atom = rmse_atom.result()
            _loss.append(loss.item())
            _rmse.append(rmse)
            # In the case that lr_scheduler is not None
            if self.lr_scheduler is not None:
                self.scheduler.step(loss)
                print("Epoch {} lr {}".format(epoch, get_lr(self.optimizer)))

            ts = time.time()
            ts = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d " "%H:%M:%S")

            if self.test is None:
                logger.info(
                    "{:6d} {} {:8e} {:4e} {:4e}".format(
                        epoch, ts, loss.detach(), rmse, rmse_atom
                    )
                )
            else:
                # Evaluate on the held-out set; detach so no graph is kept.
                test_model = self.model.eval()
                test_predictions = test_model(test_features).detach()
                rmse_test = client.submit(
                    compute_rmse, *(test_predictions, test_targets)
                )

                atoms_per_image_test = torch.tensor(
                    test_data.atoms_per_image, requires_grad=False
                )
                rmse_atom_test = client.submit(
                    compute_rmse,
                    *(test_predictions, test_targets, atoms_per_image_test),
                )

                rmse_test = rmse_test.result()
                rmse_atom_test = rmse_atom_test.result()

                logger.info(
                    "{:6d} {} {:8e} {:4e} {:4e} {:4e} {:4e}".format(
                        epoch,
                        ts,
                        loss.detach(),
                        rmse,
                        rmse_atom,
                        rmse_test,
                        rmse_atom_test,
                    )
                )

            # Optionally persist the model each epoch.
            if self.checkpoint is not None:
                self.checkpoint_save(epoch, self.model, **self.checkpoint)

            # Stop on epoch budget when no criterion was given, otherwise
            # on the energy-error threshold.
            if self.convergence is None and epoch == self.epochs:
                converged = True
            elif self.convergence is not None and rmse < self.convergence["energy"]:
                converged = True

        training_time = time.time() - self.initial_time

        h, m, s = convert_elapsed_time(training_time)
        logger.info(
            "Training finished in {} hours {} minutes {:.2f} seconds.".format(h, m, s)
        )
Beispiel #6
0
    def trainer(self):
        """Run the training class

        Repeats until convergence: evaluate the loss over all scattered
        chunks (optionally with annealing and latent-space penalization),
        step the optimizer, and log the per-image RMSE for the epoch.
        Convergence is an RMSE threshold from ``self.convergence["rmse"]``
        and/or a fixed epoch count (``self.epochs``).
        """

        converged = False
        _loss = []  # loss history, one entry per epoch
        _rmse = []  # per-image RMSE history, one entry per epoch
        epoch = 0

        # Loop-invariant hoist: fetch the dask client once instead of
        # once per epoch.
        client = dask.distributed.get_client()

        annealer = Annealer()
        while not converged:
            epoch += 1

            if self.anneal:
                annealing = annealer.update(epoch)
                print(annealing)
            else:
                annealing = None

            self.optimizer.zero_grad()  # clear previous gradients

            args = {
                "chunks": self.chunks,
                "targets": self.targets,
                "model": self.model,
                "lossfxn": self.lossfxn,
                "device": self.device,
                "inputs_chunk_vals": self.inputs_chunk_vals,
                "annealing": annealing,
            }

            if self.penalize_latent:
                args.update({"penalize_latent": self.penalize_latent})

            # Forward pass + backward pass over every chunk; returns the
            # accumulated loss and the stacked model outputs.
            loss, outputs_ = train.closure(**args)

            if self.optimizer_name != "LBFGS":
                self.optimizer.step()
            else:
                # LBFGS re-evaluates the loss during its line search, so
                # it needs the closure plus the current loss value.
                self.optimizer.extra_arguments = args
                options = {"closure": train.closure, "current_loss": loss, "max_ls": 10}
                self.optimizer.step(options)

            # RMSE per image (the dead `rmse = []` placeholder was
            # removed; rmse is always overwritten by the future below).
            rmse = client.submit(compute_rmse, *(outputs_, self.targets))
            rmse = rmse.result()

            _loss.append(loss.item())
            _rmse.append(rmse)

            if self.lr_scheduler is not None:
                self.scheduler.step(loss)

            ts = time.time()
            ts = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d " "%H:%M:%S")
            logger.info("{:6d} {} {:8e} {:8f}".format(epoch, ts, loss, rmse))

            # Stop on the RMSE threshold when given; in every case the
            # epoch budget is a hard stop.
            if self.convergence is not None and rmse < self.convergence["rmse"]:
                converged = True

            elif self.convergence is not None and epoch == self.epochs:
                converged = True

            elif self.convergence is None and epoch == self.epochs:
                converged = True

            # elif cycles == stop:
            #   converged = True

        training_time = time.time() - self.initial_time

        h, m, s = convert_elapsed_time(training_time)
        logger.info(
            "Training finished in {} hours {} minutes {:.2f} seconds.".format(h, m, s)
        )
Beispiel #7
0
    def calculate_features(self,
                           images=None,
                           purpose="training",
                           data=None,
                           svm=False):
        """Return features per atom in an atoms objects

        Parameters
        ----------
        image : dict
            Hashed images using the DataSet class.
        purpose : str
            The supported purposes are: 'training', 'inference'.
        data : obj
            data object
        svm : bool
            Whether or not these features are going to be used for kernel
            methods.

        Returns
        -------
        feature_space : dict
            A dictionary with key hash and value as a list with the following
            structure: {'hash': [('H', [vector]]}
        """

        logger.info(" ")
        logger.info("Fingerprinting")
        logger.info("==============")

        if os.path.isfile(self.filename) and self.overwrite is False:
            logger.warning("Loading features from {}.".format(self.filename))
            logger.info(" ")
            svm_keys = [b"feature_space", b"reference_space"]
            data = load(self.filename)

            if svm_keys == list(data.keys()):
                feature_space = data[svm_keys[0]]
                reference_space = data[svm_keys[1]]
                return feature_space, reference_space
            else:
                return data

        initial_time = time.time()

        # Verify that we know the unique element symbols
        if data.unique_element_symbols is None:
            logger.info(
                "Getting unique element symbols for {}".format(purpose))

            unique_element_symbols = data.get_unique_element_symbols(
                images, purpose=purpose)

            unique_element_symbols = unique_element_symbols[purpose]

            logger.info(
                "Unique chemical elements: {}".format(unique_element_symbols))

        preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
        preprocessor.set(purpose=purpose)

        # We start populating computations with delayed functions to operate
        # with dask's scheduler. These computations get cartesian coordinates.
        computations = []

        for image in images.items():
            key, image = image

            feature_vectors = []

            computations.append(feature_vectors)

            for atom in image:
                if self.preprocessor is not None:
                    # In this case we will preprocess data and need numpy
                    # arrays to operate with sklearn.
                    afp = self.get_atomic_features(atom, svm=True)
                    feature_vectors.append(afp[1])
                else:
                    afp = self.get_atomic_features(atom, svm=svm)
                    feature_vectors.append(afp)

        # In this block we compute the delayed functions in computations.
        feature_space = dask.compute(*computations, scheduler=self.scheduler)

        hashes = list(images.keys())

        if self.preprocessor is not None and purpose == "training":
            feature_space = np.array(feature_space)
            dim = feature_space.shape

            if len(dim) > 1:
                d1, d2, d3 = dim
                feature_space = feature_space.reshape(d1 * d2, d3)
                feature_space = preprocessor.fit(feature_space,
                                                 scheduler=self.scheduler)
                feature_space = feature_space.reshape(d1, d2, d3)
            else:
                atoms_index_map = []
                stack = []

                d1 = ini = end = 0

                for i in feature_space:
                    end = ini + len(i)
                    atoms_map = list(range(ini, end))
                    atoms_index_map.append(atoms_map)
                    ini = end

                    for j in i:
                        stack.append(j)
                        d1 += 1

                feature_space = np.array(stack)

                d2 = len(stack[0])
                del stack

            # More data processing depending on the method used.
            computations = []

            if svm:
                reference_space = []

                for i, image in enumerate(images.items()):
                    computations.append(
                        self.restack_image(i, image, feature_space, svm=svm))

                    # image = (hash, ase_image) -> tuple
                    for atom in image[1]:
                        reference_space.append(
                            self.restack_atom(i, atom, feature_space))

                reference_space = dask.compute(*reference_space,
                                               scheduler=self.scheduler)
            else:
                for i, image in enumerate(images.items()):
                    computations.append(
                        self.restack_image(i, image, feature_space, svm=svm))

            feature_space = dask.compute(*computations,
                                         scheduler=self.scheduler)

            feature_space = OrderedDict(feature_space)

            # Save preprocessor.
            preprocessor.save_to_file(preprocessor, self.save_preprocessor)

        elif self.preprocessor is not None and purpose == "inference":
            # We take stacked features and preprocess them
            stacked_features = np.array(feature_space)
            d1, d2, d3 = stacked_features.shape
            stacked_features = stacked_features.reshape(d1 * d2, d3)
            feature_space = OrderedDict()
            scaled_feature_space = preprocessor.transform(stacked_features)

            # Once preprocessed, they are wrapped as a dictionary.
            # TODO this has to be parallelized.
            for key, image in images.items():
                if key not in feature_space.keys():
                    feature_space[key] = []
                for index, atom in enumerate(image):
                    symbol = atom.symbol

                    if svm:
                        scaled = scaled_feature_space[index]
                        # TODO change this to something more elegant later
                        try:
                            self.reference_space
                        except AttributeError:
                            # If self.reference does not exist it means that
                            # reference_space is being loaded by Messagepack.
                            symbol = symbol.encode("utf-8")
                    else:
                        scaled = torch.tensor(
                            scaled_feature_space[index],
                            requires_grad=False,
                            dtype=torch.float,
                        )

                    feature_space[key].append((symbol, scaled))
        else:

            feature_space = OrderedDict(zip(hashes, feature_space))

        fp_time = time.time() - initial_time

        h, m, s = convert_elapsed_time(fp_time)

        logger.info("Fingerprinting finished in {} hours {} minutes {:.2f} "
                    "seconds.\n".format(h, m, s))

        if svm:
            data = {"feature_space": feature_space}
            dump(data, filename=self.filename)
        else:
            dump(feature_space, filename=self.filename)

        return feature_space
Beispiel #8
0
    def __init__(
        self,
        inputs,
        targets,
        model=None,
        data=None,
        optimizer=(None, None),
        regularization=None,
        epochs=100,
        convergence=None,
        lossfxn=None,
        device="cpu",
        batch_size=None,
        lr_scheduler=None,
        **kwargs
    ):
        """Set up batching, optimizer, and dask scattering, then train.

        Parameters
        ----------
        inputs : dict
            Hashed feature space: {hash: [(symbol, vector), ...]}.
        targets : dict or list
            Targets (features or positions) built as a dictionary.
        model : obj
            Model to train; its ``.parameters()`` are handed to the
            optimizer.
        data : obj
            Data object (not used in this constructor; kept for interface
            compatibility).
        optimizer : tuple
            (name, kwargs) pair consumed by ``get_optimizer``.
        regularization : float
            Regularization term (not used in this constructor; kept for
            interface compatibility).
        epochs : int
            Number of epochs to run when no convergence criterion is set.
        convergence : dict
            Convergence criteria, e.g. {"energy": 1e-3}.
        lossfxn : callable
            Custom loss function; defaults to ``MSELoss``.
        device : str
            Either 'cpu' or 'cuda'.
        batch_size : int
            Elements per batch; defaults to the full data set.
        lr_scheduler : tuple
            Learning-rate scheduler specification for
            ``get_lr_scheduler``.
        **kwargs
            Supported extras: 'anneal', 'penalize_latent'.
        """

        supported_keys = ["anneal", "penalize_latent"]

        # Default every supported option to None first, then apply user
        # overrides. (Previously defaults were only assigned when kwargs
        # was empty, so passing a single supported key left the other
        # attribute undefined and any later access raised AttributeError.)
        for key in supported_keys:
            setattr(self, key, None)

        for key, value in kwargs.items():
            if key in supported_keys:
                setattr(self, key, value)

        self.initial_time = time.time()

        if batch_size is None:
            batch_size = len(inputs.values())

        if isinstance(batch_size, int):
            # Split inputs and targets into equally-sized batches.
            # NOTE(review): if batch_size is neither None nor int,
            # targets_ is never bound and the code below fails — confirm
            # callers always pass an int or None.
            chunks = list(get_chunks(inputs, batch_size, svm=False))
            targets_ = list(get_chunks(targets, batch_size, svm=False))

        del targets

        # Targets are features or positions built as a dictionary; flatten
        # the list of dictionaries into a plain list for the loss function.
        targets = lod_to_list(targets_)

        # Use the module-level logger (this line previously called the
        # root ``logging.info``, bypassing the module's logger config).
        logger.info("Batch size: {} elements per batch.".format(batch_size))

        if device == "cuda":
            logger.info("Moving data to CUDA...")

            targets = targets.cuda()
            _inputs = OrderedDict()

            # Move every (symbol, vector) feature tuple to the GPU.
            for hash, f in inputs.items():
                _inputs[hash] = []
                for features in f:
                    symbol, vector = features
                    _inputs[hash].append((symbol, vector.cuda()))

            inputs = _inputs

            move_time = time.time() - self.initial_time
            h, m, s = convert_elapsed_time(move_time)
            logger.info(
                "Data moved to GPU in {} hours {} minutes {:.2f} \
                         seconds.".format(
                    h, m, s
                )
            )
            logger.info(" ")

        # Define optimizer
        self.optimizer_name, self.optimizer = get_optimizer(
            optimizer, model.parameters()
        )
        if lr_scheduler is not None:
            self.scheduler = get_lr_scheduler(self.optimizer, lr_scheduler)

        if lossfxn is None:
            self.lossfxn = MSELoss
            self.inputs_chunk_vals = None

        else:
            logger.info("Using custom loss function...")
            logger.info("")

            self.lossfxn = lossfxn
            self.inputs_chunk_vals = self.get_inputs_chunks(chunks)

        logger.info(" ")
        logger.info("Starting training...")
        logger.info(" ")

        logger.info(
            "{:6s} {:19s} {:12s} {:9s}".format("Epoch", "Time Stamp", "Loss", "Rec Err")
        )
        logger.info(
            "{:6s} {:19s} {:12s} {:9s}".format(
                "------", "-------------------", "------------", "--------"
            )
        )

        # Scatter batches to the dask workers once, up front, so the
        # training loop only passes futures around.
        client = dask.distributed.get_client()
        self.chunks = [client.scatter(chunk) for chunk in chunks]
        self.targets = [client.scatter(target) for target in targets]

        self.device = device
        self.epochs = epochs
        self.model = model
        self.lr_scheduler = lr_scheduler
        self.convergence = convergence

        # Let the hunger game begin...
        self.trainer()
Beispiel #9
0
    def calculate(self, images=None, purpose="training", data=None, svm=False):
        """Calculate the features per atom in an atoms object

        Parameters
        ----------
        images : dict
            Hashed images using the Data class.
        purpose : str
            The supported purposes are: 'training', 'inference'.
        data : obj
            data object
        svm : bool
            Whether or not these features are going to be used for kernel
            methods.

        Returns
        -------
        feature_space : dict
            A dictionary with key hash and value as a list with the following
            structure: {'hash': [('H', [vector]]}
        reference_space : dict
            A reference space useful for SVM models.

        Notes
        -----
        All heavy computation is dispatched through a dask distributed
        client; an existing client must be available via
        ``dask.distributed.get_client()``.
        """

        client = dask.distributed.get_client()
        logger.info(" ")
        logger.info("Featurization")
        logger.info("=============")
        now = datetime.datetime.now()
        logger.info("Module accessed on {}.".format(
            now.strftime("%Y-%m-%d %H:%M:%S")))
        logger.info(f"Module name: {self.name()}.")

        # FIXME the block below should become a function.
        # Cache lookup: if a feature file already exists and overwriting is
        # disabled, serve features from disk instead of recomputing.
        if os.path.isfile(self.filename) and self.overwrite is False:
            logger.warning(f"Loading features from {self.filename}.")
            logger.info(" ")
            # Keys are bytes because the file is (presumably) written with
            # msgpack-style serialization — TODO confirm.
            svm_keys = [b"feature_space", b"reference_space"]
            data = load(self.filename)

            data_hashes = list(data.keys())
            image_hashes = list(images.keys())

            if image_hashes == data_hashes:
                # Cached hashes match the requested images exactly.
                return data
            elif any(i in image_hashes for i in data_hashes):
                # Partial overlap between cache and request: return only the
                # requested subset.
                # NOTE(review): this indexes data[hash] for every requested
                # hash, which raises KeyError if a requested hash is missing
                # from the cache — confirm callers guarantee full coverage.
                _data = {}
                for hash in image_hashes:
                    _data[hash] = data[hash]
                return _data

            if svm_keys == list(data.keys()):
                # Cache stores an SVM-style payload: feature space plus
                # reference space.
                feature_space = data[svm_keys[0]]
                reference_space = data[svm_keys[1]]
                return feature_space, reference_space

        initial_time = time.time()

        # Verify that we know the unique element symbols
        if data.unique_element_symbols is None:
            logger.info(f"Getting unique element symbols for {purpose}")

            unique_element_symbols = data.get_unique_element_symbols(
                images, purpose=purpose)

            unique_element_symbols = unique_element_symbols[purpose]

            logger.info(f"Unique chemical elements: {unique_element_symbols}")

        elif isinstance(data.unique_element_symbols, dict):
            unique_element_symbols = data.unique_element_symbols[purpose]

            logger.info(f"Unique chemical elements: {unique_element_symbols}")

        # we make the features
        # Reuse symmetry-function parameters (GP) if already present in
        # self.custom; otherwise build them from user input.
        self.GP = self.custom.get("GP", None)

        if self.GP is None:
            custom = self.custom.get("user_input", None)
            self.GP = self.make_symmetry_functions(
                unique_element_symbols,
                custom=custom,
                angular_type=self.angular_type)
            self.custom.update({"GP": self.GP})
        else:
            logger.info(
                "Using parameters from file to create symmetry functions...\n")

        self.print_features_params(self.GP)

        # The feature-vector length equals the number of symmetry functions
        # of (any) one element; `sample` also supplies the shape for the
        # delayed -> dask-array conversion below.
        symbol = data.unique_element_symbols[purpose][0]
        sample = np.zeros(len(self.GP[symbol]))

        self.dimension = len(sample)

        preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
        preprocessor.set(purpose=purpose)

        # We start populating computations to get atomic features.
        logger.info("")
        logger.info(
            "Embarrassingly parallel computation of atomic features...")

        stacked_features = []
        atoms_index_map = [
        ]  # Per-image lists of flat atom indices, used later to rebuild
        # per-image feature lists from the flat per-atom stack.

        if self.batch_size is None:
            self.batch_size = data.get_total_number_atoms()

        chunks = get_chunks(images, self.batch_size, svm=svm)

        # ini/end track the running flat index of atoms across all images.
        ini = end = 0
        for chunk in chunks:
            images_ = OrderedDict(chunk)
            intermediate = []

            for image in images_.items():
                _, image = image
                end = ini + len(image)
                atoms_index_map.append(list(range(ini, end)))
                ini = end
                for atom in image:
                    index = atom.index
                    symbol = atom.symbol

                    # Neighbor data is collected per cutoff type; the cutoff
                    # may be a single value or a dict keyed by type.
                    cutoff_keys = ["radial", "angular"]
                    n_symbols, neighborpositions = {}, {}

                    if isinstance(self.cutoff, dict):
                        for cutoff_key in cutoff_keys:
                            nl = get_neighborlist(
                                image, cutoff=self.cutoff[cutoff_key])
                            # n_indices: neighbor indices for central atom_i.
                            # n_offsets: neighbor offsets for central atom_i.
                            n_indices, n_offsets = nl[atom.index]

                            n_symbols_ = np.array(
                                image.get_chemical_symbols())[n_indices]
                            n_symbols[cutoff_key] = n_symbols_

                            # Neighbor positions include the periodic-cell
                            # offset contribution.
                            neighborpositions_ = image.positions[
                                n_indices] + np.dot(n_offsets,
                                                    image.get_cell())
                            neighborpositions[cutoff_key] = neighborpositions_
                    else:
                        # Scalar cutoff: the same cutoff is used for both
                        # radial and angular neighbor lists.
                        for cutoff_key in cutoff_keys:
                            nl = get_neighborlist(image, cutoff=self.cutoff)
                            # n_indices: neighbor indices for central atom_i.
                            # n_offsets: neighbor offsets for central atom_i.
                            n_indices, n_offsets = nl[atom.index]

                            n_symbols_ = np.array(
                                image.get_chemical_symbols())[n_indices]
                            n_symbols[cutoff_key] = n_symbols_

                            neighborpositions_ = image.positions[
                                n_indices] + np.dot(n_offsets,
                                                    image.get_cell())
                            neighborpositions[cutoff_key] = neighborpositions_

                    # afp is (presumably) a delayed atomic fingerprint; it is
                    # persisted on the cluster below — TODO confirm
                    # get_atomic_features returns a dask delayed object.
                    afp = self.get_atomic_features(
                        atom,
                        index,
                        symbol,
                        n_symbols,
                        neighborpositions,
                        image_molecule=image,
                        weighted=self.weighted,
                        n_indices=n_indices,
                    )

                    intermediate.append(afp)

            # Persist this chunk's tasks on the workers and keep the futures.
            intermediate = client.persist(intermediate,
                                          scheduler=self.scheduler)
            stacked_features += intermediate
            del intermediate

        # Time spent building/submitting the task graph (measured before the
        # wait below).
        scheduler_time = time.time() - initial_time

        # Block until all per-atom feature computations finish.
        dask.distributed.wait(stacked_features)

        h, m, s = convert_elapsed_time(scheduler_time)
        logger.info("... finished in {} hours {} minutes {:.2f}"
                    " seconds.".format(h, m, s))

        logger.info("")

        if self.preprocessor is not None:

            scaled_feature_space = []

            # To take advantage of dask_ml we need to convert our numpy array
            # into a dask array.
            logger.info("Converting features to dask array...")
            stacked_features = [
                da.from_delayed(lazy, dtype=float, shape=sample.shape)
                for lazy in stacked_features
            ]
            # Chunk axis 0 per image (one chunk per image's atoms); axis 1
            # stays whole.
            layout = {0: tuple(len(i) for i in atoms_index_map), 1: -1}
            # stacked_features = dask.array.stack(stacked_features, axis=0).rechunk(layout)
            stacked_features = da.stack(stacked_features,
                                        axis=0).rechunk(layout)

            logger.info("Shape of array is {} and chunks {}.".format(
                stacked_features.shape, stacked_features.chunks))

            # Note that dask_ml by default convert the output of .fit
            # in a concrete value.
            if purpose == "training":
                stacked_features = preprocessor.fit(stacked_features,
                                                    scheduler=self.scheduler)
            else:
                stacked_features = preprocessor.transform(stacked_features)

            # Scatter the index map and broadcast the (scaled) feature stack
            # so stack_features tasks read worker-local copies.
            atoms_index_map = [
                client.scatter(indices) for indices in atoms_index_map
            ]
            # stacked_features = [client.scatter(features) for features in stacked_features]
            stacked_features = client.scatter(stacked_features, broadcast=True)

            logger.info("Stacking features using atoms index map...")

            for indices in atoms_index_map:
                features = client.submit(self.stack_features,
                                         *(indices, stacked_features))

                # features = self.stack_features(indices, stacked_features)

                scaled_feature_space.append(features)

        else:
            # No preprocessing: regroup the raw per-atom features per image.
            scaled_feature_space = []
            atoms_index_map = [
                client.scatter(chunk) for chunk in atoms_index_map
            ]
            stacked_features = client.scatter(stacked_features, broadcast=True)

            for indices in atoms_index_map:
                features = client.submit(self.stack_features,
                                         *(indices, stacked_features))
                scaled_feature_space.append(features)

            scaled_feature_space = client.gather(scaled_feature_space)

        # Clean
        del stacked_features

        # Restack images: rebuild {hash: [(symbol, vector), ...]} entries.
        feature_space = []

        if svm and purpose == "training":
            logger.info("Building array with reference space.")
            reference_space = []

            for i, image in enumerate(images.items()):
                restacked = client.submit(
                    self.restack_image, *(i, image, scaled_feature_space, svm))

                # image = (hash, ase_image) -> tuple
                for atom in image[1]:
                    restacked_atom = client.submit(
                        self.restack_atom, *(i, atom, scaled_feature_space))
                    reference_space.append(restacked_atom)

                feature_space.append(restacked)

            reference_space = client.gather(reference_space)

        elif svm is False and purpose == "training":
            for i, image in enumerate(images.items()):
                restacked = client.submit(
                    self.restack_image, *(i, image, scaled_feature_space, svm))
                feature_space.append(restacked)

        else:
            # Inference path.
            try:
                for i, image in enumerate(images.items()):
                    restacked = client.submit(
                        self.restack_image,
                        *(i, image, scaled_feature_space, svm))
                    feature_space.append(restacked)

            except UnboundLocalError:
                # scaled_feature_space does not exist.
                for i, image in enumerate(images.items()):
                    restacked = client.submit(self.restack_image,
                                              *(i, image, feature_space, svm))
                    feature_space.append(restacked)

        feature_space = client.gather(feature_space)
        feature_space = OrderedDict(feature_space)

        fp_time = time.time() - initial_time

        h, m, s = convert_elapsed_time(fp_time)

        logger.info("Featurization finished in {} hours {} minutes {:.2f}"
                    " seconds.".format(h, m, s))

        if svm and purpose == "training":
            client.restart()  # Reclaims memory aggressively
            preprocessor.save_to_file(preprocessor, self.save_preprocessor)

            if self.filename is not None:
                logger.info(f"features saved to {self.filename}.")
                data = {"feature_space": feature_space}
                data.update({"reference_space": reference_space})
                dump(data, filename=self.filename)
                self.feature_space = feature_space
                self.reference_space = reference_space

            return self.feature_space, self.reference_space

        elif svm is False and purpose == "training":
            client.restart()  # Reclaims memory aggressively
            preprocessor.save_to_file(preprocessor, self.save_preprocessor)

            if self.filename is not None:
                logger.info(f"features saved to {self.filename}.")
                dump(feature_space, filename=self.filename)
                self.feature_space = feature_space

            return self.feature_space
        else:
            self.feature_space = feature_space
            return self.feature_space
Beispiel #10
0
    def calculate(self, images=None, purpose="training", data=None, svm=False):
        """Calculate the features per atom in an atoms object

        Parameters
        ----------
        images : dict
            Hashed images using the Data class.
        purpose : str
            The supported purposes are: 'training', 'inference'.
        data : obj
            data object
        svm : bool
            Whether or not these features are going to be used for kernel
            methods.

        Returns
        -------
        feature_space : dict
            A dictionary with key hash and value as a list with the following
            structure: {'hash': [('H', [vector]]}
        reference_space : dict
            A reference space useful for SVM models.

        Notes
        -----
        Per-image features are produced via ``self.create`` (a dscribe-style
        descriptor) wrapped in dask delayed objects; a dask distributed
        client must already be running.
        """

        client = dask.distributed.get_client()
        logger.info(" ")
        logger.info("Featurization")
        logger.info("=============")
        now = datetime.datetime.now()
        logger.info("Module accessed on {}.".format(
            now.strftime("%Y-%m-%d %H:%M:%S")))

        # FIXME the block below should become a function.
        # Cache lookup: serve features from disk when the file exists and
        # overwriting is disabled.
        if os.path.isfile(self.filename) and self.overwrite is False:
            logger.warning("Loading features from {}.".format(self.filename))
            logger.info(" ")
            # Keys are bytes because the file is (presumably) serialized
            # with msgpack — TODO confirm.
            svm_keys = [b"feature_space", b"reference_space"]
            data = load(self.filename)

            data_hashes = list(data.keys())
            image_hashes = list(images.keys())

            if image_hashes == data_hashes:
                # Cached hashes match the requested images exactly.
                return data
            elif any(i in image_hashes for i in data_hashes):
                # Partial overlap: return only the requested subset.
                # NOTE(review): raises KeyError if a requested hash is not
                # cached — confirm callers guarantee coverage.
                _data = {}
                for hash in image_hashes:
                    _data[hash] = data[hash]
                return _data

            if svm_keys == list(data.keys()):
                # SVM-style payload: feature space plus reference space.
                feature_space = data[svm_keys[0]]
                reference_space = data[svm_keys[1]]
                return feature_space, reference_space

        initial_time = time.time()

        # Verify that we know the unique element symbols
        if data.unique_element_symbols is None:
            logger.info(
                "Getting unique element symbols for {}".format(purpose))

            unique_element_symbols = data.get_unique_element_symbols(
                images, purpose=purpose)

            unique_element_symbols = unique_element_symbols[purpose]

            logger.info(
                "Unique chemical elements: {}".format(unique_element_symbols))

        elif isinstance(data.unique_element_symbols, dict):
            unique_element_symbols = data.unique_element_symbols[purpose]

            logger.info(
                "Unique chemical elements: {}".format(unique_element_symbols))

        # we make the features
        preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
        preprocessor.set(purpose=purpose)

        # We start populating computations to get atomic features.
        logger.info("")
        logger.info(
            "Embarrassingly parallel computation of atomic features...")

        stacked_features = []
        atoms_symbols_map = [
        ]  # Per-image chemical-symbol lists, used to reconstruct images
        # from the flat per-atom feature stack.

        if self.batch_size is None:
            self.batch_size = data.get_total_number_atoms()

        chunks = get_chunks(images, self.batch_size, svm=svm)

        for chunk in chunks:
            images_ = OrderedDict(chunk)
            intermediate = []

            for image in images_.items():
                key, image = image
                atoms_symbols_map.append(image.get_chemical_symbols())
                # Use .create() class method from dscribe.
                _features = dask.delayed(self.create)(image)
                intermediate.append(_features)

            # Submit this chunk's delayed tasks; futures are accumulated.
            intermediate = client.compute(intermediate,
                                          scheduler=self.scheduler)
            stacked_features += intermediate
            del intermediate

        # scheduler_time = time.time() - initial_time

        # dask.distributed.wait(stacked_features)

        logger.info("")

        if self.preprocessor is not None:
            # Preprocessing is not supported by this featurizer yet.
            raise NotImplementedError

        else:
            # Regroup the flat per-atom features per image using the
            # symbols map; both are scattered so tasks read worker-local
            # copies.
            scaled_feature_space = []
            atoms_symbols_map = [
                client.scatter(chunk) for chunk in atoms_symbols_map
            ]
            stacked_features = client.scatter(stacked_features, broadcast=True)

            for image_index, symbols in enumerate(atoms_symbols_map):
                features = client.submit(
                    self.stack_features,
                    *(symbols, image_index, stacked_features))
                scaled_feature_space.append(features)

            scaled_feature_space = client.gather(scaled_feature_space)

        # Clean
        del stacked_features

        # Restack images: rebuild {hash: [(symbol, vector), ...]} entries.
        feature_space = []

        if svm and purpose == "training":

            for i, image in enumerate(images.items()):
                restacked = client.submit(
                    self.restack_image, *(i, image, scaled_feature_space, svm))

                feature_space.append(restacked)

        elif svm is False and purpose == "training":
            for i, image in enumerate(images.items()):
                restacked = client.submit(
                    self.restack_image, *(i, image, scaled_feature_space, svm))
                feature_space.append(restacked)

        else:
            # Inference path.
            try:
                for i, image in enumerate(images.items()):
                    restacked = client.submit(
                        self.restack_image,
                        *(i, image, scaled_feature_space, svm))
                    feature_space.append(restacked)

            except UnboundLocalError:
                # scaled_feature_space does not exist.
                for i, image in enumerate(images.items()):
                    restacked = client.submit(self.restack_image,
                                              *(i, image, feature_space, svm))
                    feature_space.append(restacked)

        feature_space = client.gather(feature_space)

        if svm and purpose == "training":
            # FIXME This might need to be improved
            # Flatten all per-image atomic features into one reference list
            # for kernel methods.
            logger.info("Building array with reference space.")
            hashes, reference_space = list(zip(*feature_space))
            del hashes
            reference_space = list(
                itertools.chain.from_iterable(reference_space))
            logger.info("Finished reference space.")

        feature_space = OrderedDict(feature_space)

        fp_time = time.time() - initial_time

        h, m, s = convert_elapsed_time(fp_time)

        logger.info("Featurization finished in {} hours {} minutes {:.2f}"
                    " seconds.".format(h, m, s))

        if svm and purpose == "training":
            client.restart()  # Reclaims memory aggressively
            preprocessor.save_to_file(preprocessor, self.save_preprocessor)

            if self.filename is not None:
                logger.info("features saved to {}.".format(self.filename))
                data = {"feature_space": feature_space}
                data.update({"reference_space": reference_space})
                dump(data, filename=self.filename)
                self.feature_space = feature_space
                self.reference_space = reference_space

            return self.feature_space, self.reference_space

        elif svm is False and purpose == "training":
            client.restart()  # Reclaims memory aggressively
            preprocessor.save_to_file(preprocessor, self.save_preprocessor)

            if self.filename is not None:
                logger.info("features saved to {}.".format(self.filename))
                dump(feature_space, filename=self.filename)
                self.feature_space = feature_space

            return self.feature_space
        else:
            self.feature_space = feature_space
            return self.feature_space
Beispiel #11
0
    def prepare_model(self,
                      feature_space,
                      reference_features,
                      data=None,
                      purpose="training"):
        """Prepare the Kernel Ridge Regression model

        Parameters
        ----------
        feature_space : dict
            A dictionary with hash, fingerprint structure.
        reference_features : dict
            A dictionary with raveled tuples of symbol, atomic fingerprint.
        data : object
            DataSet object created from the handler.
        purpose : str
            Purpose of this model: 'training', 'inference'.


        Notes
        -----
        This method builds the atomic kernel matrices (stored in ``self.K``)
        and the LT vectors (stored in ``self.LT``) needed to apply the
        atomic decomposition Ansatz.
        """
        if purpose == "training":
            logger.info("Model Training")
            logger.info("Model name: {}.".format(self.name()))
            logger.info("Kernel parameters:")
            logger.info("    - Kernel function: {}.".format(self.kernel))
            logger.info("    - Sigma: {}.".format(self.sigma))
            logger.info("    - Lamda: {}.".format(self.lamda))

        # The kernel matrix is square with one row/column per reference
        # feature.
        dim = len(reference_features)

        # --- Atomic kernel matrices -------------------------------------
        initial_time = time.time()

        logger.info("Computing Kernel Matrix...")
        # We start populating computations with delayed functions to
        # operate with dask's scheduler
        logger.warning("    Adding calculations to scheduler...")

        computations = self.get_kernel_matrix(feature_space,
                                              reference_features)

        scheduler_time = time.time() - initial_time
        h, m, s = convert_elapsed_time(scheduler_time)
        logger.info("    {} kernel evaluations added in {} hours {} minutes "
                    "{:.2f} seconds.".format(len(computations), h, m, s))

        if self.batch_size is not None:
            # Batch the delayed evaluations to bound scheduler memory.
            computations = list(get_chunks(computations, self.batch_size))
            logger.info(
                "    The calculations were batched in groups of {}.".format(
                    self.batch_size))

        # We compute the calculations with dask and the result is converted
        # to numpy array.
        logger.info("    Evaluating atomic similarities...")

        if self.batch_size is None:
            kernel_matrix = dask.compute(*computations,
                                         scheduler=self.scheduler)
        else:
            kernel_matrix = []
            for chunk in computations:
                kernel_matrix.append(
                    dask.compute(*chunk, scheduler=self.scheduler))

        self.K = np.array(kernel_matrix).reshape(dim, dim)

        build_time = time.time() - initial_time
        h, m, s = convert_elapsed_time(build_time)
        logger.info("Kernel matrix built in {} hours {} minutes {:.2f} "
                    "seconds.".format(h, m, s))

        # --- LT Vectors -------------------------------------------------
        # We build the LT matrix needed for the atomic decomposition Ansatz.
        logger.info("Building LT matrix")
        lt_initial_time = time.time()
        computations = []
        # Iterate by position only. (The original loop iterated
        # enumerate(feature_space.items()) and shadowed the feature_space
        # parameter with each (hash, features) tuple.)
        for index in range(len(feature_space)):
            computations.append(self.get_lt(index))

        self.LT = np.array((dask.compute(*computations,
                                         scheduler=self.scheduler)))

        # Time only the LT build; previously this was measured from
        # initial_time and wrongly included the kernel-matrix build.
        lt_time = time.time() - lt_initial_time
        h, m, s = convert_elapsed_time(lt_time)
        logger.info(
            "LT matrix built in {} hours {} minutes {:.2f} seconds.".format(
                h, m, s))
Beispiel #12
0
    def get_kernel_matrix(self, feature_space, reference_features, purpose):
        """Get kernel matrix delayed computations

        Builds the list of delayed kernel evaluations between every atom in
        ``feature_space`` and every atom in ``reference_features``, computes
        them in batches with dask, and (for training) also builds the LT
        matrix needed for ADA.

        Parameters
        ----------
        feature_space : dict, list
            Dictionary with hash and features, or a list.
        reference_features : dict, list
            Reference feature space. When a dict is passed, its single value
            is taken as the list of references.
        purpose : str
            Purpose of this kernel matrix. Accepted arguments are 'training',
            and 'inference'.

        Returns
        -------
        kernel_matrix
            List with kernel matrix values.


        Notes
        -----
        This class method expects the feature_space to be an OrderedDict and
        reference_space but it turns out that for computing variances, it
        might be the case the feature_space is also a list.
        """

        # Dispatch table: kernel name -> kernel function.
        call = {"exponential": exponential, "laplacian": laplacian, "rbf": rbf}

        initial_time = time.time()
        if isinstance(reference_features, dict):
            # This is the case when the reference_features are a
            # dictionary, too. If that's true we have to convert it to a list.
            reference_features = list(reference_features.values())[0]

        chunks = list(get_chunks(feature_space, self.batch_size))

        logger.info(
            "    The calculations are distributed in {} batches of {} atoms.".
            format(len(chunks), self.batch_size))

        counter = 0
        kernel_matrix = []

        for c, chunk in enumerate(chunks):
            chunk_initial_time = time.time()
            logger.info(
                "        Computing kernel functions for chunk {}...".format(c))
            intermediates = []

            if isinstance(feature_space, dict) and isinstance(
                    reference_features, list):
                if not isinstance(chunk, dict):
                    chunk = OrderedDict(chunk)

                # FIX: renamed from "reference_lenght" (typo).
                reference_length = len(reference_features)

                # FIX: loop variable renamed from "hash", which shadowed the
                # builtin of the same name.
                for key, _feature_space in chunk.items():
                    f_map = []
                    for i_symbol, i_afp in _feature_space:
                        i_symbol = decode(i_symbol)
                        f_map.append(1)

                        if purpose == "training":
                            # Training only needs the upper triangle: the
                            # running counter skips pairs already covered by
                            # previous rows.
                            for j in range(counter, reference_length):
                                j_symbol, j_afp = reference_features[j]

                                kernel = call[self.kernel](i_afp, j_afp,
                                                           i_symbol, j_symbol,
                                                           self.sigma)

                                intermediates.append(kernel)
                            counter += 1
                        else:
                            for j_symbol, j_afp in reference_features:
                                j_symbol = decode(j_symbol)
                                kernel = call[self.kernel](i_afp, j_afp,
                                                           i_symbol, j_symbol,
                                                           self.sigma)
                                intermediates.append(kernel)
                    self.fingerprint_map.append(f_map)

            elif isinstance(feature_space, list) and isinstance(
                    reference_features, list):
                for i_symbol, i_afp in chunk:
                    for j_symbol, j_afp in reference_features:
                        i_symbol = decode(i_symbol)
                        j_symbol = decode(j_symbol)

                        kernel = call[self.kernel](i_afp, j_afp, i_symbol,
                                                   j_symbol, self.sigma)
                        intermediates.append(kernel)

            # Evaluate the delayed kernel computations for this chunk.
            kernel_matrix += dask.compute(intermediates,
                                          scheduler=self.scheduler)[0]
            del intermediates

            # FIX: this timing block used to sit AFTER the loop, so only the
            # last chunk was ever measured and the per-chunk "finished"
            # message printed once. It now reports every chunk.
            chunk_final_time = time.time() - chunk_initial_time
            h, m, s = convert_elapsed_time(chunk_final_time)
            logger.info("          ...finished in {} hours {} minutes {:.2f} "
                        "seconds.".format(h, m, s))

        del reference_features

        build_time = time.time() - initial_time
        h, m, s = convert_elapsed_time(build_time)
        logger.info("Kernel matrix built in {} hours {} minutes {:.2f} "
                    "seconds.".format(h, m, s))
        """
        LT Vectors
        """
        # We build the LT matrix needed for ADA
        if purpose == "training":
            self.LT = []
            logger.info("Building LT matrix")
            # FIX: the old loop enumerated feature_space.items() and rebound
            # "feature_space" in the process; only the index is actually used.
            computations = [
                self.get_lt(index) for index in range(len(feature_space))
            ]

            computations = list(get_chunks(computations, self.batch_size))
            logger.info(
                "    The calculations are distributed in {} batches of {} molecules."
                .format(len(computations), self.batch_size))
            for batch in computations:
                self.LT += dask.compute(*batch, scheduler=self.scheduler)

            self.LT = np.array(self.LT)
            del computations
            lt_time = time.time() - initial_time
            h, m, s = convert_elapsed_time(lt_time)
            logger.info(
                "LT matrix built in {} hours {} minutes {:.2f} seconds.".
                format(h, m, s))

        return kernel_matrix