Code example #1
File: potentials.py  Project: eligardella/ml4chem
    def load(Cls, model=None, params=None, preprocessor=None, **kwargs):
        """Load a model

        Parameters
        ----------
        model : str
            The path to load the model from the .ml4c file for inference.
        params : str
            The path to load the .params file with the user's inputs.
        preprocessor : str
            The path to load the file with the sklearn preprocessor object.
        """
        kwargs["ml4chem_path"] = model
        kwargs["preprocessor"] = preprocessor

        with open(params) as ml4chem_params:
            ml4chem_params = json.load(ml4chem_params)
            model_type = ml4chem_params["model"].get("type")

            if model_type == "svm":
                model_params = ml4chem_params["model"]
                del model_params["name"]  # delete unneeded key, value
                del model_params["type"]  # delete unneeded key, value
                from ml4chem.models.kernelridge import KernelRidge

                weights = load(model)
                # TODO remove after de/serialization is fixed.
                weights = {
                    key.decode("utf-8"): value
                    for key, value in weights.items()
                }
                model_params.update({"weights": weights})
                model = KernelRidge(**model_params)
            else:
                # Instantiate the model class
                model_params = ml4chem_params["model"]
                del model_params["name"]  # delete unneeded key, value
                del model_params["type"]  # delete unneeded key, value
                from ml4chem.models.neuralnetwork import NeuralNetwork

                model = NeuralNetwork(**model_params)

        # Instantiation of fingerprint class
        fingerprint_params = ml4chem_params.get("fingerprints", None)

        if fingerprint_params is None:
            fingerprints = fingerprint_params
        else:
            name = fingerprint_params.get("name")
            del fingerprint_params["name"]

            fingerprints = dynamic_import(name, "ml4chem.fingerprints")
            fingerprints = fingerprints(**fingerprint_params)

        calc = Cls(fingerprints=fingerprints, model=model, **kwargs)

        return calc
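
The load class method above is typically used to rebuild a calculator for inference. The following is a hedged usage sketch; the file names, the import path of Potentials, and the trajectory are placeholders rather than values taken from this page.

# Hedged usage sketch: paths and the import location of Potentials are
# assumptions; Potentials.load itself is the class method shown above.
from ase.io import Trajectory
from ml4chem.potentials import Potentials  # import path may differ between ml4chem versions

calc = Potentials.load(
    model="cu_training.ml4c",           # serialized model for inference (.ml4c)
    params="cu_training.params",        # JSON file with the training inputs
    preprocessor="cu_training.scaler",  # saved sklearn preprocessor, if any
)

atoms = Trajectory("cu_training.traj")[0]
atoms.set_calculator(calc)
print(atoms.get_potential_energy())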
Code example #2
File: cu_inference.py  Project: vishankkumar/ml4chem
def autoencode():
    # Load the latent space saved during training
    latent_space = load("cu_training.latent")

    latent_load = []
    for e in list(latent_space.values()):
        for symbol, features in e:
            latent_load.append(features)

    latent_load = np.array(latent_load).flatten()

    images = Trajectory("cu_training.traj")
    purpose = "training"

    # Arguments for fingerprinting the images
    normalized = True

    data_handler = Data(images, purpose=purpose)
    images, energies = data_handler.get_data(purpose=purpose)

    preprocessor = ("MinMaxScaler", {"feature_range": (-1, 1)})

    features = (
        "Gaussian",
        {
            "cutoff": 6.5,
            "normalized": normalized,
            "preprocessor": preprocessor,
            "save_preprocessor": "inference.scaler",
        },
    )
    encoder = {"model": "ml4chem.ml4c", "params": "ml4chem.params"}

    features = LatentFeatures(
        features=features,
        encoder=encoder,
        preprocessor=None,
        save_preprocessor="latent_space_min_max.scaler",
    )

    features = features.calculate(images,
                                  purpose=purpose,
                                  data=data_handler,
                                  svm=True)

    latent_svm = []
    for e in list(features.values()):
        for symbol, features in e:
            latent_svm.append(features)

    latent_svm = np.array(latent_svm).flatten()

    assert np.allclose(latent_load, latent_svm)
Code example #3
def autoencode():
    # Load the latent space saved during training
    latent_space = load("cu_training.latent")
    print("Latent space from file")
    print(latent_space)

    images = Trajectory("cu_training.traj")
    purpose = "training"

    # Arguments for fingerprinting the images
    normalized = True

    data_handler = DataSet(images, purpose=purpose)
    images, energies = data_handler.get_images(purpose=purpose)

    fingerprints = (
        "Gaussian",
        {
            "cutoff": 6.5,
            "normalized": normalized,
            "save_preprocessor": "inference.scaler",
        },
    )
    encoder = {"model": "model.ml4c", "params": "model.params"}
    preprocessor = ("MinMaxScaler", {"feature_range": (-1, 1)})

    fingerprints = LatentFeatures(
        features=fingerprints,
        encoder=encoder,
        preprocessor=preprocessor,
        save_preprocessor="latent_space_min_max.scaler",
    )
    fingerprints = fingerprints.calculate_features(images,
                                                   purpose=purpose,
                                                   data=data_handler,
                                                   svm=False)

    print("Latent space from LatentFeatures class")
    print(fingerprints)
Code example #4
File: potentials.py  Project: eligardella/ml4chem
    def calculate(self, atoms, properties, system_changes):
        """Calculate things

        Parameters
        ----------
        atoms : object, list
            List of images in ASE format.
        properties : list
            List of properties to be computed, e.g. ["energy"].
        system_changes : list
            List of changes in the system since the last calculation (ASE
            calculator convention).
        """
        purpose = "inference"
        Calculator.calculate(self, atoms, properties, system_changes)
        model_name = self.model.name()

        # We convert the atoms into atomic fingerprints
        data_handler = DataSet([atoms], purpose=purpose)
        atoms = data_handler.get_data(purpose=purpose)

        # We copy the loaded fingerprint class
        fingerprints = copy.deepcopy(self.fingerprints)
        kwargs = {"data": data_handler, "purpose": purpose}

        if model_name in Potentials.svm_models:
            kwargs.update({"svm": True})

        if fingerprints.name() == "LatentFeatures":
            fingerprints = fingerprints.calculate_features(atoms, **kwargs)
        else:
            fingerprints.preprocessor = self.preprocessor
            fingerprints = fingerprints.calculate_features(atoms, **kwargs)

        if "energy" in properties:
            logger.info("Computing energy...")
            if model_name in Potentials.svm_models:

                try:
                    reference_space = load(self.reference_space)
                except Exception:
                    raise RuntimeError("This is not a database...")

                energy = self.model.get_potential_energy(
                    fingerprints, reference_space)

            else:
                input_dimension = len(list(fingerprints.values())[0][0][-1])
                model = copy.deepcopy(self.model)
                model.prepare_model(input_dimension,
                                    data=data_handler,
                                    purpose=purpose)
                try:
                    model.load_state_dict(torch.load(self.ml4chem_path),
                                          strict=True)
                except RuntimeError:
                    logger.warning(
                        'Your image does not contain some of the atom types '
                        'present in the loaded model.\n'
                    )
                    model.load_state_dict(torch.load(self.ml4chem_path),
                                          strict=False)
                model.eval()
                energy = model(fingerprints).item()

            # Populate ASE's self.results dict
            self.results["energy"] = energy
Code example #5
    def calculate_features(self,
                           images=None,
                           purpose="training",
                           data=None,
                           svm=False):
        """Calculate the features per atom in an atoms objects

        Parameters
        ----------
        images : dict
            Hashed images using the DataSet class.
        purpose : str
            The supported purposes are: 'training', 'inference'.
        data : obj
            data object
        svm : bool
            Whether or not these features are going to be used for kernel
            methods.

        Returns
        -------
        feature_space : dict
            A dictionary with key hash and value as a list with the following
            structure: {'hash': [('H', [vector])]}
        reference_space : dict
            A reference space useful for SVM models.
        """

        logger.info(" ")
        logger.info("Fingerprinting")
        logger.info("==============")

        # FIXME the block below should become a function.
        if os.path.isfile(self.filename) and self.overwrite is False:
            logger.warning("Loading features from {}.".format(self.filename))
            logger.info(" ")
            svm_keys = [b"feature_space", b"reference_space"]
            data = load(self.filename)

            data_hashes = list(data.keys())
            image_hashes = list(images.keys())

            if image_hashes == data_hashes:
                # Check if both lists are the same.
                return data
            elif any(i in image_hashes for i in data_hashes):
                # Check whether any of the stored hashes match the images.
                _data = {}
                for hash in image_hashes:
                    _data[hash] = data[hash]
                return _data

            if svm_keys == list(data.keys()):
                feature_space = data[svm_keys[0]]
                reference_space = data[svm_keys[1]]
                return feature_space, reference_space

        initial_time = time.time()

        # Verify that we know the unique element symbols
        if data.unique_element_symbols is None:
            logger.info(
                "Getting unique element symbols for {}".format(purpose))

            unique_element_symbols = data.get_unique_element_symbols(
                images, purpose=purpose)

            unique_element_symbols = unique_element_symbols[purpose]

            logger.info(
                "Unique chemical elements: {}".format(unique_element_symbols))
        else:
            # Symbols were computed before; reuse them so unique_element_symbols
            # is always defined for make_symmetry_functions below.
            unique_element_symbols = data.unique_element_symbols[purpose]

        # we make the features
        self.GP = self.custom.get("GP", None)

        if self.GP is None:
            custom = self.custom.get("user_input", None)
            self.GP = self.make_symmetry_functions(
                unique_element_symbols,
                custom=custom,
                angular_type=self.angular_type)
            self.custom.update({"GP": self.GP})
        else:
            logger.info(
                'Using parameters from file to create symmetry functions...\n')

        self.print_fingerprint_params(self.GP)

        preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
        preprocessor.set(purpose=purpose)

        # We start populating computations to get atomic fingerprints.
        logger.info("")
        logger.info("Adding atomic feature calculations to scheduler...")

        ini = end = 0

        computations = []
        atoms_index_map = [
        ]  # This list is used to reconstruct images from atoms.

        for image in images.items():
            key, image = image
            end = ini + len(image)
            atoms_index_map.append(list(range(ini, end)))
            ini = end
            for atom in image:
                index = atom.index
                symbol = atom.symbol
                nl = get_neighborlist(image, cutoff=self.cutoff)
                # n_indices: neighbor indices for central atom_i.
                # n_offsets: neighbor offsets for central atom_i.
                n_indices, n_offsets = nl[atom.index]

                n_symbols = np.array(image.get_chemical_symbols())[n_indices]
                neighborpositions = image.positions[n_indices] + np.dot(
                    n_offsets, image.get_cell())

                afp = self.get_atomic_fingerprint(
                    atom,
                    index,
                    symbol,
                    n_symbols,
                    neighborpositions,
                    self.preprocessor,
                    image_molecule=image,
                    weighted=self.weighted,
                    n_indices=n_indices,
                )

                computations.append(afp)

        scheduler_time = time.time() - initial_time

        h, m, s = convert_elapsed_time(scheduler_time)
        logger.info("... finished in {} hours {} minutes {:.2f}"
                    " seconds.".format(h, m, s))

        # In this block we compute the fingerprints.
        logger.info("")
        logger.info("Computing fingerprints...")

        stacked_features = dask.compute(*computations,
                                        scheduler=self.scheduler)

        if self.preprocessor is not None:
            stacked_features = np.array(stacked_features)

        # Clean
        del computations

        if purpose == "training":
            # To take advantage of dask_ml we need to convert our numpy array
            # into a dask array.
            client = dask.distributed.get_client()

            if self.preprocessor is not None:
                scaled_feature_space = []
                dim = stacked_features.shape
                stacked_features = dask.array.from_array(stacked_features,
                                                         chunks=dim)
                stacked_features = preprocessor.fit(stacked_features,
                                                    scheduler=self.scheduler)
                atoms_index_map = [
                    client.scatter(chunk) for chunk in atoms_index_map
                ]

                for indices in atoms_index_map:
                    features = client.submit(self.stack_features,
                                             *(indices, stacked_features))
                    scaled_feature_space.append(features)

                # More data processing depending on the method used.

            else:
                feature_space = []
                atoms_index_map = [
                    client.scatter(chunk) for chunk in atoms_index_map
                ]

                for indices in atoms_index_map:
                    features = client.submit(self.stack_features,
                                             *(indices, stacked_features))
                    feature_space.append(features)

            del stacked_features
            computations = []

            if svm:
                reference_space = []

                for i, image in enumerate(images.items()):
                    computations.append(
                        self.restack_image(
                            i,
                            image,
                            scaled_feature_space=scaled_feature_space,
                            svm=svm))

                    # image = (hash, ase_image) -> tuple
                    for atom in image[1]:
                        reference_space.append(
                            self.restack_atom(i, atom, scaled_feature_space))

                reference_space = dask.compute(*reference_space,
                                               scheduler=self.scheduler)
            else:
                try:
                    for i, image in enumerate(images.items()):
                        computations.append(
                            self.restack_image(
                                i,
                                image,
                                scaled_feature_space=scaled_feature_space,
                                svm=svm,
                            ))

                except UnboundLocalError:
                    # scaled_feature_space does not exist.
                    for i, image in enumerate(images.items()):
                        computations.append(
                            self.restack_image(i,
                                               image,
                                               feature_space=feature_space,
                                               svm=svm))

            feature_space = dask.compute(*computations,
                                         scheduler=self.scheduler)
            feature_space = OrderedDict(feature_space)
            del computations

            preprocessor.save_to_file(preprocessor, self.save_preprocessor)

            fp_time = time.time() - initial_time

            h, m, s = convert_elapsed_time(fp_time)
            logger.info("Fingerprinting finished in {} hours {} minutes {:.2f}"
                        " seconds.".format(h, m, s))

            if svm:
                if self.filename is not None:
                    logger.info("Fingerprints saved to {}.".format(
                        self.filename))
                    data = {"feature_space": feature_space}
                    data.update({"reference_space": reference_space})
                    dump(data, filename=self.filename)
                return feature_space, reference_space
            else:
                if self.filename is not None:
                    logger.info("Fingerprints saved to {}.".format(
                        self.filename))
                    dump(feature_space, filename=self.filename)
                return feature_space

        elif purpose == "inference":
            feature_space = OrderedDict()
            scaled_feature_space = preprocessor.transform(stacked_features)

            # TODO this has to be parallelized.
            for key, image in images.items():
                if key not in feature_space.keys():
                    feature_space[key] = []
                for index, atom in enumerate(image):
                    symbol = atom.symbol

                    if svm:
                        scaled = scaled_feature_space[index]
                        # TODO change this to something more elegant later
                        try:
                            self.reference_space
                        except AttributeError:
                            # If self.reference does not exist it means that
                            # reference_space is being loaded by Messagepack.
                            symbol = symbol.encode("utf-8")
                    else:
                        scaled = torch.tensor(
                            scaled_feature_space[index],
                            requires_grad=False,
                            dtype=torch.float,
                        )

                    feature_space[key].append((symbol, scaled))

            fp_time = time.time() - initial_time

            h, m, s = convert_elapsed_time(fp_time)

            logger.info("Fingerprinting finished in {} hours {} minutes {:.2f}"
                        " seconds.".format(h, m, s))

            return feature_space
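
The featurization above follows a common dask pattern: append lazy computations to a list, then evaluate them all with dask.compute on the configured scheduler. The following self-contained sketch illustrates that pattern with a toy function; it is an illustration only, not ml4chem code.

import dask
from dask import delayed

@delayed
def featurize(x):
    # Stand-in for a per-atom fingerprint computation.
    return x * x

computations = [featurize(i) for i in range(4)]
stacked_features = dask.compute(*computations, scheduler="threads")
print(stacked_features)  # (0, 1, 4, 9)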
Code example #6
def plot_atomic_features(
    latent_space,
    method="PCA",
    dimensions=2,
    backend="seaborn",
    data_only=False,
    preprocessor=None,
    backend_kwargs=None,
    **kwargs,
):
    """Plot high dimensional atomic feature vectors

    This function can take a feature space dictionary, or a database file
    and plot the atomic features using PCA or t-SNE.

    $ ml4chem --plot tsne --file path.db

    Parameters
    ----------
    latent_space : dict or str
        Dictionary of atomic features, or path to a database file.
    method : str, optional
        Dimensionality reduction method to be employed, by default "PCA".
        Supported are: "PCA" and "TSNE".
    dimensions : int, optional
        Number of dimensions to reduce the high dimensional atomic feature
        vectors, by default 2.
    backend : str, optional
        Select the backend to plot features. Supported are "plotly" and
        "seaborn", by default "seaborn".
    preprocessor : obj
        One of the preprocessors supported by sklearn e.g.: StandardScaler(),
        Normalizer().
    backend_kwargs : dict
        Dictionary with extra keyword arguments forwarded to the
        dimensionality reduction method (PCA or TSNE), for options that
        cannot be set with the default keyword arguments of the
        plot_atomic_features function.

        For more information see:
            - https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
            - https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
    data_only : bool
        If set to True, this function returns only the data as a pandas
        DataFrame (together with the fitted dimensionality reduction object).
    """
    if backend_kwargs is None:
        backend_kwargs = {}

    method = method.lower()
    backend = backend.lower()
    dot_size = kwargs.get("dot_size", 2)

    supported_methods = ["pca", "tsne"]

    if method not in supported_methods:
        raise NotImplementedError(
            f"Method {method} is not supported. Choose one of {supported_methods}."
        )

    if backend == "seaborn":
        # This hack is needed because it seems plotly import overwrite
        # everything.
        import matplotlib.pyplot as plt

    axis = ["x", "y", "z"]

    if dimensions > 3:
        raise NotImplementedError("Only plots in 2 or 3 dimensions are supported.")
    elif dimensions == 2:
        axis.pop(-1)

    if isinstance(latent_space, str):
        latent_space = load(latent_space)

    full_ls = []
    full_symbols = []

    # This conditional is needed if you are passing an atomic feature database.
    if b"feature_space" in latent_space.keys():
        latent_space = latent_space[b"feature_space"]

    for hash, feature_space in latent_space.items():
        for symbol, feature_vector in feature_space:
            try:
                symbol = symbol.decode("utf-8")
            except AttributeError:
                pass

            if not isinstance(feature_vector, np.ndarray):
                feature_vector = feature_vector.numpy()

            full_symbols.append(symbol)
            full_ls.append(feature_vector)

    if method == "pca":
        from sklearn.decomposition import PCA

        labels = {str(axis[i]): "PCA-{}".format(i + 1) for i in range(len(axis))}

        dim_reduction = PCA(n_components=dimensions, **backend_kwargs)

        if preprocessor is not None:
            logger.info(
                f"Creating pipeline with preprocessor {preprocessor.__class__.__name__}..."
            )
            dim_reduction = make_pipeline(preprocessor, dim_reduction)

        pca_result = dim_reduction.fit_transform(full_ls)

        to_pandas = []

        entry = []
        for i, element in enumerate(pca_result):
            entry = [full_symbols[i]]
            for d in range(dimensions):
                entry.append(element[d])
            to_pandas.append(entry)

        columns = ["Symbol"]
        args = {}

        for key in axis:
            columns.append(labels[key])
            args[key] = labels[key]

        df = pd.DataFrame(to_pandas, columns=columns)

        if dimensions == 3 and backend == "plotly":
            args["color"] = "Symbol"
            plt = px.scatter_3d(df, **args)
            plt.update_traces(marker=dict(size=dot_size))
        elif dimensions == 2 and backend == "plotly":
            args["color"] = "Symbol"
            plt = px.scatter(df, **args)
            plt.update_traces(marker=dict(size=dot_size))
        elif dimensions == 3 and backend == "seaborn":
            raise NotImplementedError(
                "The seaborn backend only supports 2D visualization.")
        elif dimensions == 2 and backend == "seaborn":
            sns.scatterplot(**labels, data=df, hue="Symbol")

    elif method == "tsne":
        from sklearn import manifold

        labels = {str(axis[i]): "t-SNE-{}".format(i + 1) for i in range(len(axis))}

        dim_reduction = manifold.TSNE(n_components=dimensions, **backend_kwargs)

        if preprocessor is not None:
            logger.info(
                f"Creating pipeline with preprocessor {preprocessor.__class__.__name__}..."
            )
            dim_reduction = make_pipeline(preprocessor, dim_reduction)

        tsne_result = dim_reduction.fit_transform(full_ls)

        to_pandas = []

        entry = []
        for i, element in enumerate(tsne_result):
            entry = [full_symbols[i]]
            for d in range(dimensions):
                entry.append(element[d])
            to_pandas.append(entry)

        columns = ["Symbol"]
        args = {}

        for key in axis:
            columns.append(labels[key])
            args[key] = labels[key]

        df = pd.DataFrame(to_pandas, columns=columns)

        if dimensions == 3 and backend == "plotly":
            args["color"] = "Symbol"
            plt = px.scatter_3d(df, **args)
            plt.update_traces(marker=dict(size=dot_size))
        elif dimensions == 2 and backend == "plotly":
            args["color"] = "Symbol"
            plt = px.scatter(df, **args)
            plt.update_traces(marker=dict(size=dot_size))
        elif dimensions == 3 and backend == "seaborn":
            raise NotImplementedError(
                "The seaborn backend only supports 2D visualization.")
        elif dimensions == 2 and backend == "seaborn":
            sns.scatterplot(**labels, data=df, hue="Symbol")

    if data_only:
        return df, dim_reduction

    else:
        try:
            plt.show()
        except Exception:
            # Displaying the figure is best effort (it may fail, e.g., on
            # headless systems).
            pass

        return plt, df, dim_reduction
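
A hedged call of plot_atomic_features as defined above follows; "features.db" is a placeholder database path, and the extra arguments are only examples of values accepted by backend_kwargs (forwarded to sklearn's TSNE) and dot_size.

from sklearn.preprocessing import StandardScaler

df, dim_reduction = plot_atomic_features(
    "features.db",                      # placeholder ml4chem feature database
    method="tsne",
    dimensions=2,
    backend="plotly",
    preprocessor=StandardScaler(),
    backend_kwargs={"perplexity": 30},  # forwarded to sklearn's TSNE
    dot_size=4,
    data_only=True,
)
print(df.head())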
Code example #7
File: potentials.py  Project: vishankkumar/ml4chem
    def load(Cls, model=None, params=None, preprocessor=None, **kwargs):
        """Load ML4Chem models

        Parameters
        ----------
        model : str
            The path to load the model from the .ml4c file for inference.
        params : str
            The path to load the .params file with the user's inputs.
        preprocessor : str
            The path to load the file with the sklearn preprocessor object.
        """
        kwargs["ml4chem_path"] = model
        kwargs["preprocessor"] = preprocessor

        with open(params, "rb") as ml4chem_params:
            ml4chem_params = json.load(ml4chem_params)
            model_type = ml4chem_params["model"].get("type")

            model_params = ml4chem_params["model"]
            class_name = model_params["class_name"]
            module_name = Potentials.module_names[model_params["name"]]

            model_class = dynamic_import(class_name,
                                         "ml4chem.atomistic.models",
                                         alt_name=module_name)

            delete = ["name", "type", "class_name"]
            for param in delete:
                # delete unneeded (key, value) pairs.
                del model_params[param]

            if model_type == "svm":

                weights = load(model)
                # TODO remove after de/serialization is fixed.
                try:
                    weights = {
                        key.decode("utf-8"): value
                        for key, value in weights.items()
                    }
                except AttributeError:
                    weights = {key: value for key, value in weights.items()}

                model_params.update({"weights": weights})
                model = model_class(**model_params)
            else:
                # Instantiate the model class
                model = model_class(**model_params)

        # Instantiation of fingerprint class
        fingerprint_params = ml4chem_params.get("features", None)

        if fingerprint_params is None:
            features = None
        else:
            if "kwargs" in fingerprint_params.keys():
                update_dict_with = fingerprint_params.pop("kwargs")
                fingerprint_params.update(update_dict_with)

            if fingerprint_params is None:
                features = fingerprint_params
            else:
                name = fingerprint_params.get("name")
                del fingerprint_params["name"]

                features = dynamic_import(name, "ml4chem.atomistic.features")
                features = features(**fingerprint_params)

        calc = Cls(features=features, model=model, **kwargs)

        return calc
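
Both load implementations on this page rely on a dynamic_import helper that is not shown here. The sketch below illustrates, with importlib, what a helper with that signature typically does; it is an assumption, not ml4chem's actual implementation.

# Minimal illustration of a dynamic_import helper with the signature used above.
# This is an assumption about its behaviour, not ml4chem code.
import importlib


def dynamic_import(name, package, alt_name=None):
    """Return the attribute ``name`` from ``package`` (or ``package.alt_name``)."""
    module_path = "{}.{}".format(package, alt_name) if alt_name else package
    module = importlib.import_module(module_path)
    return getattr(module, name)


# For example, importing a class from a library submodule:
KernelRidge = dynamic_import("KernelRidge", "sklearn", alt_name="kernel_ridge")
print(KernelRidge)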
Code example #8
File: cartesian.py  Project: eligardella/ml4chem
    def calculate_features(self,
                           images=None,
                           purpose="training",
                           data=None,
                           svm=False):
        """Return features per atom in an atoms objects

        Parameters
        ----------
        images : dict
            Hashed images using the DataSet class.
        purpose : str
            The supported purposes are: 'training', 'inference'.
        data : obj
            data object
        svm : bool
            Whether or not these features are going to be used for kernel
            methods.

        Returns
        -------
        feature_space : dict
            A dictionary with key hash and value as a list with the following
            structure: {'hash': [('H', [vector])]}
        """

        logger.info(" ")
        logger.info("Fingerprinting")
        logger.info("==============")

        if os.path.isfile(self.filename) and self.overwrite is False:
            logger.warning("Loading features from {}.".format(self.filename))
            logger.info(" ")
            svm_keys = [b"feature_space", b"reference_space"]
            data = load(self.filename)

            if svm_keys == list(data.keys()):
                feature_space = data[svm_keys[0]]
                reference_space = data[svm_keys[1]]
                return feature_space, reference_space
            else:
                return data

        initial_time = time.time()

        # Verify that we know the unique element symbols
        if data.unique_element_symbols is None:
            logger.info(
                "Getting unique element symbols for {}".format(purpose))

            unique_element_symbols = data.get_unique_element_symbols(
                images, purpose=purpose)

            unique_element_symbols = unique_element_symbols[purpose]

            logger.info(
                "Unique chemical elements: {}".format(unique_element_symbols))

        preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
        preprocessor.set(purpose=purpose)

        # We start populating computations with delayed functions to operate
        # with dask's scheduler. These computations get cartesian coordinates.
        computations = []

        for image in images.items():
            key, image = image

            feature_vectors = []

            computations.append(feature_vectors)

            for atom in image:
                if self.preprocessor is not None:
                    # In this case we will preprocess data and need numpy
                    # arrays to operate with sklearn.
                    afp = self.get_atomic_features(atom, svm=True)
                    feature_vectors.append(afp[1])
                else:
                    afp = self.get_atomic_features(atom, svm=svm)
                    feature_vectors.append(afp)

        # In this block we compute the delayed functions in computations.
        feature_space = dask.compute(*computations, scheduler=self.scheduler)

        hashes = list(images.keys())

        if self.preprocessor is not None and purpose == "training":
            feature_space = np.array(feature_space)
            dim = feature_space.shape

            if len(dim) > 1:
                d1, d2, d3 = dim
                feature_space = feature_space.reshape(d1 * d2, d3)
                feature_space = preprocessor.fit(feature_space,
                                                 scheduler=self.scheduler)
                feature_space = feature_space.reshape(d1, d2, d3)
            else:
                atoms_index_map = []
                stack = []

                d1 = ini = end = 0

                for i in feature_space:
                    end = ini + len(i)
                    atoms_map = list(range(ini, end))
                    atoms_index_map.append(atoms_map)
                    ini = end

                    for j in i:
                        stack.append(j)
                        d1 += 1

                feature_space = np.array(stack)

                d2 = len(stack[0])
                del stack

            # More data processing depending on the method used.
            computations = []

            if svm:
                reference_space = []

                for i, image in enumerate(images.items()):
                    computations.append(
                        self.restack_image(i, image, feature_space, svm=svm))

                    # image = (hash, ase_image) -> tuple
                    for atom in image[1]:
                        reference_space.append(
                            self.restack_atom(i, atom, feature_space))

                reference_space = dask.compute(*reference_space,
                                               scheduler=self.scheduler)
            else:
                for i, image in enumerate(images.items()):
                    computations.append(
                        self.restack_image(i, image, feature_space, svm=svm))

            feature_space = dask.compute(*computations,
                                         scheduler=self.scheduler)

            feature_space = OrderedDict(feature_space)

            # Save preprocessor.
            preprocessor.save_to_file(preprocessor, self.save_preprocessor)

        elif self.preprocessor is not None and purpose == "inference":
            # We take stacked features and preprocess them
            stacked_features = np.array(feature_space)
            d1, d2, d3 = stacked_features.shape
            stacked_features = stacked_features.reshape(d1 * d2, d3)
            feature_space = OrderedDict()
            scaled_feature_space = preprocessor.transform(stacked_features)

            # Once preprocessed, they are wrapped as a dictionary.
            # TODO this has to be parallelized.
            for key, image in images.items():
                if key not in feature_space.keys():
                    feature_space[key] = []
                for index, atom in enumerate(image):
                    symbol = atom.symbol

                    if svm:
                        scaled = scaled_feature_space[index]
                        # TODO change this to something more elegant later
                        try:
                            self.reference_space
                        except AttributeError:
                            # If self.reference does not exist it means that
                            # reference_space is being loaded by Messagepack.
                            symbol = symbol.encode("utf-8")
                    else:
                        scaled = torch.tensor(
                            scaled_feature_space[index],
                            requires_grad=False,
                            dtype=torch.float,
                        )

                    feature_space[key].append((symbol, scaled))
        else:

            feature_space = OrderedDict(zip(hashes, feature_space))

        fp_time = time.time() - initial_time

        h, m, s = convert_elapsed_time(fp_time)

        logger.info("Fingerprinting finished in {} hours {} minutes {:.2f} "
                    "seconds.\n".format(h, m, s))

        if svm:
            data = {"feature_space": feature_space}
            dump(data, filename=self.filename)
        else:
            dump(feature_space, filename=self.filename)

        return feature_space
Code example #9
    def calculate(self, images=None, purpose="training", data=None, svm=False):
        """Calculate the features per atom in an atoms objects

        Parameters
        ----------
        images : dict
            Hashed images using the Data class.
        purpose : str
            The supported purposes are: 'training', 'inference'.
        data : obj
            data object
        svm : bool
            Whether or not these features are going to be used for kernel
            methods.

        Returns
        -------
        feature_space : dict
            A dictionary with key hash and value as a list with the following
            structure: {'hash': [('H', [vector])]}
        reference_space : dict
            A reference space useful for SVM models.
        """

        client = dask.distributed.get_client()
        logger.info(" ")
        logger.info("Featurization")
        logger.info("=============")
        now = datetime.datetime.now()
        logger.info("Module accessed on {}.".format(
            now.strftime("%Y-%m-%d %H:%M:%S")))
        logger.info(f"Module name: {self.name()}.")

        # FIXME the block below should become a function.
        if os.path.isfile(self.filename) and self.overwrite is False:
            logger.warning(f"Loading features from {self.filename}.")
            logger.info(" ")
            svm_keys = [b"feature_space", b"reference_space"]
            data = load(self.filename)

            data_hashes = list(data.keys())
            image_hashes = list(images.keys())

            if image_hashes == data_hashes:
                # Check if both lists are the same.
                return data
            elif any(i in image_hashes for i in data_hashes):
                # Check whether any of the stored hashes match the images.
                _data = {}
                for hash in image_hashes:
                    _data[hash] = data[hash]
                return _data

            if svm_keys == list(data.keys()):
                feature_space = data[svm_keys[0]]
                reference_space = data[svm_keys[1]]
                return feature_space, reference_space

        initial_time = time.time()

        # Verify that we know the unique element symbols
        if data.unique_element_symbols is None:
            logger.info(f"Getting unique element symbols for {purpose}")

            unique_element_symbols = data.get_unique_element_symbols(
                images, purpose=purpose)

            unique_element_symbols = unique_element_symbols[purpose]

            logger.info(f"Unique chemical elements: {unique_element_symbols}")

        elif isinstance(data.unique_element_symbols, dict):
            unique_element_symbols = data.unique_element_symbols[purpose]

            logger.info(f"Unique chemical elements: {unique_element_symbols}")

        # we make the features
        self.GP = self.custom.get("GP", None)

        if self.GP is None:
            custom = self.custom.get("user_input", None)
            self.GP = self.make_symmetry_functions(
                unique_element_symbols,
                custom=custom,
                angular_type=self.angular_type)
            self.custom.update({"GP": self.GP})
        else:
            logger.info(
                "Using parameters from file to create symmetry functions...\n")

        self.print_features_params(self.GP)

        symbol = data.unique_element_symbols[purpose][0]
        sample = np.zeros(len(self.GP[symbol]))

        self.dimension = len(sample)

        preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
        preprocessor.set(purpose=purpose)

        # We start populating computations to get atomic features.
        logger.info("")
        logger.info(
            "Embarrassingly parallel computation of atomic features...")

        stacked_features = []
        atoms_index_map = [
        ]  # This list is used to reconstruct images from atoms.

        if self.batch_size is None:
            self.batch_size = data.get_total_number_atoms()

        chunks = get_chunks(images, self.batch_size, svm=svm)

        ini = end = 0
        for chunk in chunks:
            images_ = OrderedDict(chunk)
            intermediate = []

            for image in images_.items():
                _, image = image
                end = ini + len(image)
                atoms_index_map.append(list(range(ini, end)))
                ini = end
                for atom in image:
                    index = atom.index
                    symbol = atom.symbol

                    cutoff_keys = ["radial", "angular"]
                    n_symbols, neighborpositions = {}, {}

                    if isinstance(self.cutoff, dict):
                        for cutoff_key in cutoff_keys:
                            nl = get_neighborlist(
                                image, cutoff=self.cutoff[cutoff_key])
                            # n_indices: neighbor indices for central atom_i.
                            # n_offsets: neighbor offsets for central atom_i.
                            n_indices, n_offsets = nl[atom.index]

                            n_symbols_ = np.array(
                                image.get_chemical_symbols())[n_indices]
                            n_symbols[cutoff_key] = n_symbols_

                            neighborpositions_ = image.positions[
                                n_indices] + np.dot(n_offsets,
                                                    image.get_cell())
                            neighborpositions[cutoff_key] = neighborpositions_
                    else:
                        for cutoff_key in cutoff_keys:
                            nl = get_neighborlist(image, cutoff=self.cutoff)
                            # n_indices: neighbor indices for central atom_i.
                            # n_offsets: neighbor offsets for central atom_i.
                            n_indices, n_offsets = nl[atom.index]

                            n_symbols_ = np.array(
                                image.get_chemical_symbols())[n_indices]
                            n_symbols[cutoff_key] = n_symbols_

                            neighborpositions_ = image.positions[
                                n_indices] + np.dot(n_offsets,
                                                    image.get_cell())
                            neighborpositions[cutoff_key] = neighborpositions_

                    afp = self.get_atomic_features(
                        atom,
                        index,
                        symbol,
                        n_symbols,
                        neighborpositions,
                        image_molecule=image,
                        weighted=self.weighted,
                        n_indices=n_indices,
                    )

                    intermediate.append(afp)

            intermediate = client.persist(intermediate,
                                          scheduler=self.scheduler)
            stacked_features += intermediate
            del intermediate

        scheduler_time = time.time() - initial_time

        dask.distributed.wait(stacked_features)

        h, m, s = convert_elapsed_time(scheduler_time)
        logger.info("... finished in {} hours {} minutes {:.2f}"
                    " seconds.".format(h, m, s))

        logger.info("")

        if self.preprocessor is not None:

            scaled_feature_space = []

            # To take advantage of dask_ml we need to convert our numpy array
            # into a dask array.
            logger.info("Converting features to dask array...")
            stacked_features = [
                da.from_delayed(lazy, dtype=float, shape=sample.shape)
                for lazy in stacked_features
            ]
            layout = {0: tuple(len(i) for i in atoms_index_map), 1: -1}
            # stacked_features = dask.array.stack(stacked_features, axis=0).rechunk(layout)
            stacked_features = da.stack(stacked_features,
                                        axis=0).rechunk(layout)

            logger.info("Shape of array is {} and chunks {}.".format(
                stacked_features.shape, stacked_features.chunks))

            # Note that dask_ml by default convert the output of .fit
            # in a concrete value.
            if purpose == "training":
                stacked_features = preprocessor.fit(stacked_features,
                                                    scheduler=self.scheduler)
            else:
                stacked_features = preprocessor.transform(stacked_features)

            atoms_index_map = [
                client.scatter(indices) for indices in atoms_index_map
            ]
            # stacked_features = [client.scatter(features) for features in stacked_features]
            stacked_features = client.scatter(stacked_features, broadcast=True)

            logger.info("Stacking features using atoms index map...")

            for indices in atoms_index_map:
                features = client.submit(self.stack_features,
                                         *(indices, stacked_features))

                # features = self.stack_features(indices, stacked_features)

                scaled_feature_space.append(features)

        else:
            scaled_feature_space = []
            atoms_index_map = [
                client.scatter(chunk) for chunk in atoms_index_map
            ]
            stacked_features = client.scatter(stacked_features, broadcast=True)

            for indices in atoms_index_map:
                features = client.submit(self.stack_features,
                                         *(indices, stacked_features))
                scaled_feature_space.append(features)

            scaled_feature_space = client.gather(scaled_feature_space)

        # Clean
        del stacked_features

        # Restack images
        feature_space = []

        if svm and purpose == "training":
            logger.info("Building array with reference space.")
            reference_space = []

            for i, image in enumerate(images.items()):
                restacked = client.submit(
                    self.restack_image, *(i, image, scaled_feature_space, svm))

                # image = (hash, ase_image) -> tuple
                for atom in image[1]:
                    restacked_atom = client.submit(
                        self.restack_atom, *(i, atom, scaled_feature_space))
                    reference_space.append(restacked_atom)

                feature_space.append(restacked)

            reference_space = client.gather(reference_space)

        elif svm is False and purpose == "training":
            for i, image in enumerate(images.items()):
                restacked = client.submit(
                    self.restack_image, *(i, image, scaled_feature_space, svm))
                feature_space.append(restacked)

        else:
            try:
                for i, image in enumerate(images.items()):
                    restacked = client.submit(
                        self.restack_image,
                        *(i, image, scaled_feature_space, svm))
                    feature_space.append(restacked)

            except UnboundLocalError:
                # scaled_feature_space does not exist.
                for i, image in enumerate(images.items()):
                    restacked = client.submit(self.restack_image,
                                              *(i, image, feature_space, svm))
                    feature_space.append(restacked)

        feature_space = client.gather(feature_space)
        feature_space = OrderedDict(feature_space)

        fp_time = time.time() - initial_time

        h, m, s = convert_elapsed_time(fp_time)

        logger.info("Featurization finished in {} hours {} minutes {:.2f}"
                    " seconds.".format(h, m, s))

        if svm and purpose == "training":
            client.restart()  # Reclaims memory aggressively
            preprocessor.save_to_file(preprocessor, self.save_preprocessor)

            if self.filename is not None:
                logger.info(f"features saved to {self.filename}.")
                data = {"feature_space": feature_space}
                data.update({"reference_space": reference_space})
                dump(data, filename=self.filename)
                self.feature_space = feature_space
                self.reference_space = reference_space

            return self.feature_space, self.reference_space

        elif svm is False and purpose == "training":
            client.restart()  # Reclaims memory aggressively
            preprocessor.save_to_file(preprocessor, self.save_preprocessor)

            if self.filename is not None:
                logger.info(f"features saved to {self.filename}.")
                dump(feature_space, filename=self.filename)
                self.feature_space = feature_space

            return self.feature_space
        else:
            self.feature_space = feature_space
            return self.feature_space
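
The scatter/submit/gather calls above follow the standard dask.distributed workflow: broadcast the stacked feature array to the workers, submit one stacking task per image, and gather the results. The self-contained sketch below reproduces that workflow with synthetic data; the stack_features name mirrors the method above, but the code is an illustration only.

import numpy as np
from dask.distributed import Client


def stack_features(indices, features):
    # Rebuild one image's per-atom features from the flat feature array.
    return features[indices]


if __name__ == "__main__":
    client = Client(processes=False)  # small local, in-process cluster
    features = client.scatter(np.arange(12.0).reshape(6, 2), broadcast=True)
    atoms_index_map = [[0, 1, 2], [3, 4, 5]]
    futures = [client.submit(stack_features, indices, features)
               for indices in atoms_index_map]
    print(client.gather(futures))
    client.close()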
Code example #10
File: coulombmatrix.py  Project: vishankkumar/ml4chem
    def calculate(self, images=None, purpose="training", data=None, svm=False):
        """Calculate the features per atom in an atoms objects

        Parameters
        ----------
        images : dict
            Hashed images using the Data class.
        purpose : str
            The supported purposes are: 'training', 'inference'.
        data : obj
            data object
        svm : bool
            Whether or not these features are going to be used for kernel
            methods.

        Returns
        -------
        feature_space : dict
            A dictionary with key hash and value as a list with the following
            structure: {'hash': [('H', [vector])]}
        reference_space : dict
            A reference space useful for SVM models.
        """

        client = dask.distributed.get_client()
        logger.info(" ")
        logger.info("Featurization")
        logger.info("=============")
        now = datetime.datetime.now()
        logger.info("Module accessed on {}.".format(
            now.strftime("%Y-%m-%d %H:%M:%S")))

        # FIXME the block below should become a function.
        if os.path.isfile(self.filename) and self.overwrite is False:
            logger.warning("Loading features from {}.".format(self.filename))
            logger.info(" ")
            svm_keys = [b"feature_space", b"reference_space"]
            data = load(self.filename)

            data_hashes = list(data.keys())
            image_hashes = list(images.keys())

            if image_hashes == data_hashes:
                # Check if both lists are the same.
                return data
            elif any(i in image_hashes for i in data_hashes):
                # Check whether any of the stored hashes match the images.
                _data = {}
                for hash in image_hashes:
                    _data[hash] = data[hash]
                return _data

            if svm_keys == list(data.keys()):
                feature_space = data[svm_keys[0]]
                reference_space = data[svm_keys[1]]
                return feature_space, reference_space

        initial_time = time.time()

        # Verify that we know the unique element symbols
        if data.unique_element_symbols is None:
            logger.info(
                "Getting unique element symbols for {}".format(purpose))

            unique_element_symbols = data.get_unique_element_symbols(
                images, purpose=purpose)

            unique_element_symbols = unique_element_symbols[purpose]

            logger.info(
                "Unique chemical elements: {}".format(unique_element_symbols))

        elif isinstance(data.unique_element_symbols, dict):
            unique_element_symbols = data.unique_element_symbols[purpose]

            logger.info(
                "Unique chemical elements: {}".format(unique_element_symbols))

        # we make the features
        preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
        preprocessor.set(purpose=purpose)

        # We start populating computations to get atomic features.
        logger.info("")
        logger.info(
            "Embarrassingly parallel computation of atomic features...")

        stacked_features = []
        atoms_symbols_map = [
        ]  # This list is used to reconstruct images from atoms.

        if self.batch_size is None:
            self.batch_size = data.get_total_number_atoms()

        chunks = get_chunks(images, self.batch_size, svm=svm)

        for chunk in chunks:
            images_ = OrderedDict(chunk)
            intermediate = []

            for image in images_.items():
                key, image = image
                atoms_symbols_map.append(image.get_chemical_symbols())
                # Use .create() class method from dscribe.
                _features = dask.delayed(self.create)(image)
                intermediate.append(_features)

            intermediate = client.compute(intermediate,
                                          scheduler=self.scheduler)
            stacked_features += intermediate
            del intermediate

        # scheduler_time = time.time() - initial_time

        # dask.distributed.wait(stacked_features)

        logger.info("")

        if self.preprocessor is not None:
            raise NotImplementedError

        else:
            scaled_feature_space = []
            atoms_symbols_map = [
                client.scatter(chunk) for chunk in atoms_symbols_map
            ]
            stacked_features = client.scatter(stacked_features, broadcast=True)

            for image_index, symbols in enumerate(atoms_symbols_map):
                features = client.submit(
                    self.stack_features,
                    *(symbols, image_index, stacked_features))
                scaled_feature_space.append(features)

            scaled_feature_space = client.gather(scaled_feature_space)

        # Clean
        del stacked_features

        # Restack images
        feature_space = []

        if svm and purpose == "training":

            for i, image in enumerate(images.items()):
                restacked = client.submit(
                    self.restack_image, *(i, image, scaled_feature_space, svm))

                feature_space.append(restacked)

        elif svm is False and purpose == "training":
            for i, image in enumerate(images.items()):
                restacked = client.submit(
                    self.restack_image, *(i, image, scaled_feature_space, svm))
                feature_space.append(restacked)

        else:
            try:
                for i, image in enumerate(images.items()):
                    restacked = client.submit(
                        self.restack_image,
                        *(i, image, scaled_feature_space, svm))
                    feature_space.append(restacked)

            except UnboundLocalError:
                # scaled_feature_space does not exist.
                for i, image in enumerate(images.items()):
                    restacked = client.submit(self.restack_image,
                                              *(i, image, feature_space, svm))
                    feature_space.append(restacked)

        feature_space = client.gather(feature_space)

        if svm and purpose == "training":
            # FIXME This might need to be improved
            logger.info("Building array with reference space.")
            hashes, reference_space = list(zip(*feature_space))
            del hashes
            reference_space = list(
                itertools.chain.from_iterable(reference_space))
            logger.info("Finished reference space.")

        feature_space = OrderedDict(feature_space)

        fp_time = time.time() - initial_time

        h, m, s = convert_elapsed_time(fp_time)

        logger.info("Featurization finished in {} hours {} minutes {:.2f}"
                    " seconds.".format(h, m, s))

        if svm and purpose == "training":
            client.restart()  # Reclaims memory aggressively
            preprocessor.save_to_file(preprocessor, self.save_preprocessor)

            if self.filename is not None:
                logger.info("features saved to {}.".format(self.filename))
                data = {"feature_space": feature_space}
                data.update({"reference_space": reference_space})
                dump(data, filename=self.filename)
                self.feature_space = feature_space
                self.reference_space = reference_space

            return self.feature_space, self.reference_space

        elif svm is False and purpose == "training":
            client.restart()  # Reclaims memory aggressively
            preprocessor.save_to_file(preprocessor, self.save_preprocessor)

            if self.filename is not None:
                logger.info("features saved to {}.".format(self.filename))
                dump(feature_space, filename=self.filename)
                self.feature_space = feature_space

            return self.feature_space
        else:
            self.feature_space = feature_space
            return self.feature_space
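
The chunk loop above wraps dscribe's .create() method in dask.delayed. For reference, this is what a bare CoulombMatrix .create() call looks like in dscribe (assuming the dscribe and ASE packages are installed); it is a standalone illustration, not ml4chem code.

from ase.build import molecule
from dscribe.descriptors import CoulombMatrix

water = molecule("H2O")
cm = CoulombMatrix(n_atoms_max=3)
features = cm.create(water)  # per-molecule Coulomb matrix features
print(features.shape)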
Code example #11
File: visualization.py  Project: eligardella/ml4chem
def plot_atomic_features(latent_space, method="PCA", dimensions=2):
    """Plot high dimensional atomic feature vectors

    This function can take a feature space dictionary, or a database file
    and plot the atomic features using PCA or t-SNE.

    $ mlchem --plot tsne --file path.db

    Parameters
    ----------
    latent_space : dict or str
        Dictionary of atomic features, or path to a database file.
    method : str, optional
        Dimensionality reduction method to be employed, by default "PCA".
        Supported are: "PCA" and "TSNE".
    dimensions : int, optional
        Number of dimensions to reduce the high dimensional atomic feature
        vectors, by default 2.
    """

    method = method.lower()
    if isinstance(latent_space, str):
        latent_space = load(latent_space)

    full_ls = []
    full_symbols = []

    # This conditional is needed if you are passing an atomic feature database.
    if b"feature_space" in latent_space.keys():
        latent_space = latent_space[b"feature_space"]

    for hash, feature_space in latent_space.items():
        for symbol, feature_vector in feature_space:
            try:
                symbol = symbol.decode("utf-8")
            except AttributeError:
                pass

            if not isinstance(feature_vector, np.ndarray):
                feature_vector = feature_vector.numpy()

            full_symbols.append(symbol)
            full_ls.append(feature_vector)

    if method == "pca":
        from sklearn.decomposition import PCA

        labels = {"x": "PCA-1", "y": "PCA-2"}
        pca = PCA(n_components=dimensions)
        pca_result = pca.fit_transform(full_ls)

        to_pandas = []

        for i, element in enumerate(pca_result):
            to_pandas.append([full_symbols[i], element[0], element[1]])

        columns = ["Symbol", "PCA-1", "PCA-2"]

        df = pd.DataFrame(to_pandas, columns=columns)
        sns.scatterplot(**labels, data=df, hue="Symbol")

    elif method == "tsne":
        from sklearn import manifold

        labels = {"x": "t-SNE-1", "y": "t-SNE-2"}

        tsne = manifold.TSNE(n_components=dimensions)

        tsne_result = tsne.fit_transform(full_ls)

        to_pandas = []

        for i, element in enumerate(tsne_result):
            to_pandas.append([full_symbols[i], element[0], element[1]])

        columns = ["Symbol", "t-SNE-1", "t-SNE-2"]

        df = pd.DataFrame(to_pandas, columns=columns)
        sns.scatterplot(**labels, data=df, hue="Symbol")

    plt.show()
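
To make the PCA branch above concrete in isolation, here is a self-contained sketch of the PCA -> pandas -> seaborn flow with synthetic per-atom feature vectors standing in for an ml4chem latent space.

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Synthetic "atomic feature vectors": two element types with offset clusters.
rng = np.random.default_rng(0)
full_symbols = ["Cu"] * 50 + ["O"] * 50
full_ls = np.vstack([rng.normal(0.0, 1.0, (50, 8)), rng.normal(3.0, 1.0, (50, 8))])

# Reduce to two components and plot, colored by chemical symbol.
pca_result = PCA(n_components=2).fit_transform(full_ls)
df = pd.DataFrame(
    {"Symbol": full_symbols, "PCA-1": pca_result[:, 0], "PCA-2": pca_result[:, 1]}
)
sns.scatterplot(x="PCA-1", y="PCA-2", data=df, hue="Symbol")
plt.show()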