Exemple #1
0
 def featurize_mol(self, coords, mol, max_num_atoms):
   """Featurize one molecule into padded coordinates, neighbors and atomic numbers.

   Parameters
   ----------
   coords: array-like
     Atomic coordinates of the molecule. They are passed unpadded to
     `compute_neighbor_list` and then padded to `(max_num_atoms, 3)`.
   mol: object
     Molecule exposing `GetAtoms()` (presumably an RDKit Mol -- confirm
     against the caller).
   max_num_atoms: int
     Size the atomic-number array and the coordinates are padded to.

   Returns
   -------
   tuple
     `(coords, neighbor_list, z)`: the padded coordinates, the result of
     `compute_neighbor_list`, and the padded output of `get_Z_matrix`.
   """
   atom_count = len(mol.GetAtoms())
   logging.info("Featurizing molecule of size: %d", atom_count)

   # The neighbor list is computed on the raw coordinates, before padding.
   neighbors = compute_neighbor_list(
       coords, self.neighbor_cutoff, self.max_num_neighbors, None)

   atomic_numbers = pad_array(
       self.get_Z_matrix(mol, max_num_atoms), max_num_atoms)
   padded_coords = pad_array(coords, (max_num_atoms, 3))
   return padded_coords, neighbors, atomic_numbers
    def _featurize(self, mol: RDKitMol) -> np.ndarray:
        """
        Compute the eigenvalue spectrum of each Coulomb matrix of a molecule.

        One matrix is produced per entry returned by `self.coulomb_matrix`.
        The eigenvalues of each matrix are ordered by decreasing absolute
        value and then padded with `pad_array` using `self.max_atoms`.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
          RDKit Mol object

        Returns
        -------
        np.ndarray
          Stacked, padded eigenvalues, one row per Coulomb matrix.
          The default shape is `(num_confs, max_atoms)`.
          When there is exactly one row, the leading axis is dropped and
          the shape is `(max_atoms,)`.
        """
        padded_spectra = []
        for matrix in self.coulomb_matrix(mol):
            # Only the eigenvalues are used; the eigenvectors are discarded.
            eigenvalues, _ = np.linalg.eig(matrix)
            # argsort is ascending, so reverse it to get descending |value|.
            descending = np.argsort(np.abs(eigenvalues))[::-1]
            padded_spectra.append(
                pad_array(eigenvalues[descending], self.max_atoms))

        features = np.asarray(padded_spectra)
        if features.shape[0] != 1:
            return features
        # `(1, max_atoms)` -> `(max_atoms,)`
        return np.squeeze(features, axis=0)
Exemple #3
0
    def _featurize(self, struct: PymatgenStructure) -> np.ndarray:
        """
        Compute sine Coulomb matrix features for a pymatgen structure.

        The matrix comes from `self.scm.featurize`. With `self.flatten` set,
        the first entry of the eigenvalue array is copied into a zero vector
        of length `self.max_atoms`; otherwise the matrix itself is padded
        with `pad_array`.

        Parameters
        ----------
        struct: pymatgen.Structure
          A periodic crystal composed of a lattice and a sequence of atomic
          sites with 3D coordinates and elements.

        Returns
        -------
        features: np.ndarray
          2D sine Coulomb matrix with shape (max_atoms, max_atoms),
          or 1D matrix eigenvalues with shape (max_atoms,).
        """
        # Full N x N sine Coulomb matrix from the wrapped featurizer.
        sine_mat = self.scm.featurize(struct)

        if not self.flatten:
            return np.asarray(pad_array(sine_mat, self.max_atoms))

        eigenvalues, _ = np.linalg.eig(sine_mat)
        padded = np.zeros(self.max_atoms)
        # NOTE(review): indexing [0] presumably strips a leading axis added by
        # the wrapped featurizer -- confirm against its return shape.
        leading = eigenvalues[0]
        padded[:len(leading)] = leading
        return np.asarray(padded)
Exemple #4
0
    def _featurize(self, datapoint: RDKitMol, **kwargs) -> np.ndarray:
        """Stack atomic numbers and atomic coordinates into one padded array.

        Parameters
        ----------
        datapoint: rdkit.Chem.rdchem.Mol
          RDKit Mol object
        **kwargs
          Only the legacy keyword `mol` is recognised. When present it
          replaces `datapoint` and a `DeprecationWarning` is emitted.

        Returns
        -------
        np.ndarray
          Atomic numbers in the first column followed by the output of
          `self.coordfeat._featurize`, padded with `pad_array` to
          `(max_atoms, 4)`.

        Raises
        ------
        AssertionError
          If the number of atoms differs from the number of coordinate rows.
        """
        if 'mol' in kwargs:
            # Imported locally so this method does not depend on a module-level import.
            import warnings

            datapoint = kwargs.get("mol")
            # Warn instead of raising: raising here made the assignment above
            # dead code and broke every caller still using the legacy keyword.
            warnings.warn(
                'Mol is being phased out as a parameter, please pass "datapoint" instead.',
                DeprecationWarning,
                stacklevel=2)

        coordinates = self.coordfeat._featurize(datapoint)
        atom_numbers = np.array(
            [atom.GetAtomicNum() for atom in datapoint.GetAtoms()])
        # Turn the 1D vector into a column so it can be concatenated with the
        # coordinate columns.
        atom_numbers = np.expand_dims(atom_numbers, axis=1)
        assert atom_numbers.shape[0] == coordinates.shape[0]
        features = np.concatenate([atom_numbers, coordinates], axis=1)
        return pad_array(features, (self.max_atoms, 4))
 def get_Z_matrix(self, mol, max_atoms):
     """Return the atomic numbers of `mol`, padded to `max_atoms`.

     Parameters
     ----------
     mol: object
       Molecule exposing `GetAtoms()`, whose items expose `GetAtomicNum()`.
     max_atoms: int
       Largest permitted number of atoms; also the padding size passed to
       `pad_array`.

     Returns
     -------
     The result of `pad_array` applied to the array of atomic numbers.

     Raises
     ------
     ValueError
       If the molecule has more than `max_atoms` atoms.
     """
     if len(mol.GetAtoms()) > max_atoms:
         message = ("A molecule is larger than permitted by max_atoms. "
                    "Increase max_atoms and try again.")
         raise ValueError(message)

     atomic_numbers = np.array(
         [atom.GetAtomicNum() for atom in mol.GetAtoms()])
     return pad_array(atomic_numbers, max_atoms)
    def coulomb_matrix(self, mol: RDKitMol) -> np.ndarray:
        """
        Build one Coulomb matrix (or several randomized ones) per conformer.

        If the molecule has no conformers, hydrogens are added and a
        conformer is embedded with ETKDG. Hydrogens are then removed when
        `self.remove_hydrogens` is set. Each matrix is padded with
        `pad_array` using `self.max_atoms`.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
          RDKit Mol object

        Returns
        -------
        np.ndarray
          The coulomb matrices of the given molecule

        Raises
        ------
        ImportError
          If RDKit is not installed.
        """
        try:
            from rdkit import Chem
            from rdkit.Chem import AllChem
        except ModuleNotFoundError:
            raise ImportError("This class requires RDKit to be installed.")

        # A molecule without conformers needs 3D coordinates generated first.
        conformer_count = len(mol.GetConformers())
        if conformer_count == 0:
            mol = Chem.AddHs(mol)
            AllChem.EmbedMolecule(mol, AllChem.ETKDG())

        if self.remove_hydrogens:
            mol = Chem.RemoveHs(mol)

        n_atoms = mol.GetNumAtoms()
        charges = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
        diagonal = range(n_atoms)

        matrices = []
        for conformer in mol.GetConformers():
            distances = self.get_interatomic_distances(conformer)
            # Off-diagonal terms: Z_i * Z_j / d_ij.
            coulomb = np.outer(charges, charges) / distances
            # Diagonal terms are overwritten with 0.5 * Z_i ** 2.4.
            coulomb[diagonal, diagonal] = 0.5 * np.array(charges)**2.4
            if self.randomize:
                matrices.extend(
                    pad_array(sample, self.max_atoms)
                    for sample in self.randomize_coulomb_matrix(coulomb))
            else:
                matrices.append(pad_array(coulomb, self.max_atoms))
        return np.asarray(matrices)
    def _featurize(self, datapoint: PymatgenStructure, **kwargs) -> np.ndarray:
        """
        Calculate sine Coulomb matrix from pymatgen structure.

        The matminer `SineCoulombMatrix` featurizer is created on first use
        and cached on `self.scm`. With `self.flatten` set, the first entry of
        the eigenvalue array is copied into a zero vector of length
        `self.max_atoms`; otherwise the matrix is padded with `pad_array`.

        Parameters
        ----------
        datapoint: pymatgen.core.Structure
          A periodic crystal composed of a lattice and a sequence of atomic
          sites with 3D coordinates and elements.
        **kwargs
          Only the legacy keyword `struct` is recognised. It is used when
          `datapoint` is None, and a `DeprecationWarning` is emitted.

        Returns
        -------
        features: np.ndarray
          2D sine Coulomb matrix with shape (max_atoms, max_atoms),
          or 1D matrix eigenvalues with shape (max_atoms,).

        Raises
        ------
        ImportError
          If matminer is not installed.
        """
        if 'struct' in kwargs and datapoint is None:
            # Imported locally so this method does not depend on a module-level import.
            import warnings

            datapoint = kwargs.get("struct")
            # Warn instead of raising: raising here made the assignment above
            # dead code and broke every caller still using the legacy keyword.
            warnings.warn(
                'Struct is being phased out as a parameter, please pass "datapoint" instead.',
                DeprecationWarning,
                stacklevel=2)

        if self.scm is None:
            try:
                from matminer.featurizers.structure import SineCoulombMatrix as SCM
                self.scm = SCM(flatten=False)
            except ModuleNotFoundError:
                raise ImportError(
                    "This class requires matminer to be installed.")

        # Get full N x N SCM
        sine_mat = self.scm.featurize(datapoint)

        if self.flatten:
            eigs, _ = np.linalg.eig(sine_mat)
            zeros = np.zeros(self.max_atoms)
            # NOTE(review): eigs[0] presumably strips a leading axis added by
            # the matminer featurizer -- confirm against its return shape.
            zeros[:len(eigs[0])] = eigs[0]
            features = zeros
        else:
            features = pad_array(sine_mat, self.max_atoms)

        features = np.asarray(features)

        return features
Exemple #8
0
  def _featurize(self, mol: RDKitMol) -> np.ndarray:
    """Stack atomic numbers and atomic coordinates into one padded array.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
      RDKit Mol object

    Returns
    -------
    np.ndarray
      Atomic numbers in the first column followed by the output of
      `self.coordfeat._featurize`, padded with `pad_array` to
      `(max_atoms, 4)`.
    """
    xyz = self.coordfeat._featurize(mol)
    atomic_numbers = np.array([a.GetAtomicNum() for a in mol.GetAtoms()])
    # Column vector so it can be joined with the coordinate columns.
    atomic_column = np.expand_dims(atomic_numbers, axis=1)
    assert atomic_column.shape[0] == xyz.shape[0]
    stacked = np.concatenate([atomic_column, xyz], axis=1)
    target_shape = (self.max_atoms, 4)
    return pad_array(stacked, target_shape)
Exemple #9
0
  def _featurize(self, datapoint: RDKitMol, **kwargs) -> np.ndarray:
    """
    Calculate eigenvalues of Coulomb matrix for molecules. Eigenvalues
    are returned sorted by absolute value in descending order and padded
    by max_atoms.

    Parameters
    ----------
    datapoint: rdkit.Chem.rdchem.Mol
      RDKit Mol object
    **kwargs
      Only the legacy keyword `mol` is recognised. When present it
      replaces `datapoint` and a `DeprecationWarning` is emitted.

    Returns
    -------
    np.ndarray
      The eigenvalues of Coulomb matrix for molecules.
      The default shape is `(num_confs, max_atoms)`.
      If num_confs == 1, the shape is `(max_atoms,)`.
    """
    if 'mol' in kwargs:
      # Imported locally so this method does not depend on a module-level import.
      import warnings

      datapoint = kwargs.get("mol")
      # Warn instead of raising: raising here made the assignment above
      # dead code and broke every caller still using the legacy keyword.
      warnings.warn(
          'Mol is being phased out as a parameter, please pass "datapoint" instead.',
          DeprecationWarning,
          stacklevel=2)

    cmat = self.coulomb_matrix(datapoint)
    features_list = []
    for matrix in cmat:
      # Only the eigenvalues are needed; the eigenvectors are discarded.
      w, _ = np.linalg.eig(matrix)
      # argsort is ascending, so reverse it to get descending |eigenvalue|.
      sortidx = np.argsort(np.abs(w))[::-1]
      features_list.append(pad_array(w[sortidx], self.max_atoms))
    features = np.asarray(features_list)
    if features.shape[0] == 1:
      # `(1, max_atoms)` -> `(max_atoms,)`
      features = np.squeeze(features, axis=0)
    return features
    def _featurize(self, struct: PymatgenStructure) -> np.ndarray:
        """
        Compute sine Coulomb matrix features for a pymatgen structure.

        The matminer `SineCoulombMatrix` featurizer is created on first use
        and cached on `self.scm`. With `self.flatten` set, the first entry of
        the eigenvalue array is copied into a zero vector of length
        `self.max_atoms`; otherwise the matrix is padded with `pad_array`.

        Parameters
        ----------
        struct: pymatgen.core.Structure
          A periodic crystal composed of a lattice and a sequence of atomic
          sites with 3D coordinates and elements.

        Returns
        -------
        features: np.ndarray
          2D sine Coulomb matrix with shape (max_atoms, max_atoms),
          or 1D matrix eigenvalues with shape (max_atoms,).

        Raises
        ------
        ImportError
          If matminer is not installed.
        """
        # Create the matminer featurizer lazily, the first time it is needed.
        if self.scm is None:
            try:
                from matminer.featurizers.structure import SineCoulombMatrix as SCM
                self.scm = SCM(flatten=False)
            except ModuleNotFoundError:
                raise ImportError(
                    "This class requires matminer to be installed.")

        # Full N x N sine Coulomb matrix from the wrapped featurizer.
        sine_mat = self.scm.featurize(struct)

        if not self.flatten:
            return np.asarray(pad_array(sine_mat, self.max_atoms))

        eigenvalues, _ = np.linalg.eig(sine_mat)
        padded = np.zeros(self.max_atoms)
        # NOTE(review): indexing [0] presumably strips a leading axis added by
        # the matminer featurizer -- confirm against its return shape.
        leading = eigenvalues[0]
        padded[:len(leading)] = leading
        return np.asarray(padded)