Beispiel #1
0
  def coulomb_matrix(self, mol):
    """
    Generate Coulomb matrices for each conformer of the given molecule.

    For atomic numbers ``Z`` and the distance matrix ``d`` returned by
    ``self.get_interatomic_distances`` for a conformer, the off-diagonal
    entries are ``Z_i * Z_j / d_ij`` and the diagonal entries are
    ``0.5 * Z_i ** 2.4``.

    Parameters
    ----------
    mol : RDKit Mol
        Molecule. Hydrogens are stripped first when
        ``self.remove_hydrogens`` is set.

    Returns
    -------
    np.ndarray
        One entry per conformer, or one entry per matrix yielded by
        ``self.randomize_coulomb_matrix`` when ``self.randomize`` is set.
        Every matrix is passed through ``pad_array`` with ``self.max_atoms``.
    """
    if self.remove_hydrogens:
      mol = Chem.RemoveHs(mol)
    n_atoms = mol.GetNumAtoms()
    z = np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()], dtype=float)

    # Everything that depends only on the atoms (not on the geometry) is
    # computed once, outside the conformer loop.
    pair_charges = np.outer(z, z)
    self_terms = 0.5 * z ** 2.4
    diag = np.arange(n_atoms)
    # Strict upper triangle (i < j): the diagonal is never divided by the
    # zero self-distance.
    rows, cols = np.triu_indices(n_atoms, k=1)

    rval = []
    for conf in mol.GetConformers():
      d = np.asarray(self.get_interatomic_distances(conf))
      m = np.zeros((n_atoms, n_atoms))
      m[rows, cols] = pair_charges[rows, cols] / d[rows, cols]
      m[cols, rows] = m[rows, cols]  # mirror to keep the matrix symmetric
      m[diag, diag] = self_terms
      if self.randomize:
        for random_m in self.randomize_coulomb_matrix(m):
          rval.append(pad_array(random_m, self.max_atoms))
      else:
        rval.append(pad_array(m, self.max_atoms))
    return np.asarray(rval)
Beispiel #2
0
 def featurize_mol(self, coords, mol, max_num_atoms):
     """
     Build the (coordinates, neighbor list, atomic numbers) triple for a molecule.

     Parameters
     ----------
     coords
         Atomic coordinates; passed to ``compute_neighbor_list`` and then
         through ``pad_array`` with target shape ``(max_num_atoms, 3)``.
     mol
         Molecule handed to ``self.get_Z_matrix``.
     max_num_atoms
         Padding size used for both the coordinates and the atomic numbers.

     Returns
     -------
     tuple
         ``(padded_coords, neighbor_list, padded_atomic_numbers)``.
     """
     # The neighbor list is computed on the unpadded coordinates.
     nbr_list = compute_neighbor_list(
         coords, self.neighbor_cutoff, self.max_num_neighbors, None)
     atomic_numbers = pad_array(
         self.get_Z_matrix(mol, max_num_atoms), max_num_atoms)
     padded_coords = pad_array(coords, (max_num_atoms, 3))
     return padded_coords, nbr_list, atomic_numbers
    def coulomb_matrix(self, mol):
        """
        Generate Coulomb matrices for each conformer of the given molecule.

        For atomic numbers ``Z`` and the distance matrix ``d`` returned by
        ``self.get_interatomic_distances`` for a conformer, the off-diagonal
        entries are ``Z_i * Z_j / d_ij`` and the diagonal entries are
        ``0.5 * Z_i ** 2.4``.

        Parameters
        ----------
        mol : RDKit Mol
            Molecule. Hydrogens are stripped first when
            ``self.remove_hydrogens`` is set.

        Returns
        -------
        np.ndarray
            One entry per conformer, or one entry per matrix yielded by
            ``self.randomize_coulomb_matrix`` when ``self.randomize`` is set.
            Every matrix is passed through ``pad_array`` with
            ``self.max_atoms``.
        """
        if self.remove_hydrogens:
            mol = Chem.RemoveHs(mol)
        n_atoms = mol.GetNumAtoms()
        z = np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()],
                     dtype=float)

        # Everything that depends only on the atoms (not on the geometry) is
        # computed once, outside the conformer loop.
        pair_charges = np.outer(z, z)
        self_terms = 0.5 * z**2.4
        diag = np.arange(n_atoms)
        # Strict upper triangle (i < j): the diagonal is never divided by the
        # zero self-distance.
        rows, cols = np.triu_indices(n_atoms, k=1)

        rval = []
        for conf in mol.GetConformers():
            d = np.asarray(self.get_interatomic_distances(conf))
            m = np.zeros((n_atoms, n_atoms))
            m[rows, cols] = pair_charges[rows, cols] / d[rows, cols]
            m[cols, rows] = m[rows, cols]  # mirror to keep it symmetric
            m[diag, diag] = self_terms
            if self.randomize:
                for random_m in self.randomize_coulomb_matrix(m):
                    rval.append(pad_array(random_m, self.max_atoms))
            else:
                rval.append(pad_array(m, self.max_atoms))
        return np.asarray(rval)
Beispiel #4
0
 def featurize_mol(self, coords, mol, max_num_atoms):
   """
   Build the (coordinates, neighbor list, atomic numbers) triple for a molecule.

   Parameters
   ----------
   coords
       Atomic coordinates; passed to ``compute_neighbor_list`` and then
       through ``pad_array`` with target shape ``(max_num_atoms, 3)``.
   mol
       Molecule handed to ``self.get_Z_matrix``.
   max_num_atoms
       Padding size used for both the coordinates and the atomic numbers.

   Returns
   -------
   tuple
       ``(padded_coords, neighbor_list, padded_atomic_numbers)``.
   """
   # The neighbor list is computed on the unpadded coordinates.
   nbr_list = compute_neighbor_list(
       coords, self.neighbor_cutoff, self.max_num_neighbors, None)
   atomic_numbers = pad_array(
       self.get_Z_matrix(mol, max_num_atoms), max_num_atoms)
   padded_coords = pad_array(coords, (max_num_atoms, 3))
   return padded_coords, nbr_list, atomic_numbers
Beispiel #5
0
    def coulomb_matrix(self, mol):
        """
        Generate Coulomb matrices for each conformer of the given molecule.

        Each matrix is the outer product of the atomic numbers divided
        element-wise by the conformer's interatomic distance matrix, with the
        diagonal then overwritten by ``0.5 * Z ** 2.4``.

        Parameters
        ----------
        mol : RDKit Mol
            Molecule. Hydrogens are stripped first when
            ``self.remove_hydrogens`` is set.

        Returns
        -------
        np.ndarray
            Array built from the padded matrices: one per conformer, or one
            per sample from ``self.randomize_coulomb_matrix`` when
            ``self.randomize`` is set.
        """
        from rdkit import Chem

        molecule = Chem.RemoveHs(mol) if self.remove_hydrogens else mol
        atom_count = molecule.GetNumAtoms()
        charges = [a.GetAtomicNum() for a in molecule.GetAtoms()]

        matrices = []
        for conformer in molecule.GetConformers():
            distances = self.get_interatomic_distances(conformer)
            cm = np.outer(charges, charges) / distances
            # Replace the diagonal with the self-interaction term.
            cm[range(atom_count), range(atom_count)] = (
                0.5 * np.array(charges)**2.4)
            if not self.randomize:
                matrices.append(pad_array(cm, self.max_atoms))
                continue
            for sample in self.randomize_coulomb_matrix(cm):
                matrices.append(pad_array(sample, self.max_atoms))
        return np.asarray(matrices)
 def featurize_mol(self, coords, mol, max_num_atoms):
     """
     Build the (coordinates, neighbor list, atomic numbers) triple for a molecule.

     Logs the number of atoms in ``mol`` at INFO level before featurizing.

     Parameters
     ----------
     coords
         Atomic coordinates; passed to ``compute_neighbor_list`` and then
         through ``pad_array`` with target shape ``(max_num_atoms, 3)``.
     mol
         Molecule handed to ``self.get_Z_matrix``.
     max_num_atoms
         Padding size used for both the coordinates and the atomic numbers.

     Returns
     -------
     tuple
         ``(padded_coords, neighbor_list, padded_atomic_numbers)``.
     """
     atom_total = len(mol.GetAtoms())
     logging.info("Featurizing molecule of size: %d", atom_total)
     # The neighbor list is computed on the unpadded coordinates.
     nbr_list = compute_neighbor_list(
         coords, self.neighbor_cutoff, self.max_num_neighbors, None)
     atomic_numbers = pad_array(
         self.get_Z_matrix(mol, max_num_atoms), max_num_atoms)
     padded_coords = pad_array(coords, (max_num_atoms, 3))
     return padded_coords, nbr_list, atomic_numbers
Beispiel #7
0
  def unpad_randomize_and_flatten(self, cm):
    """
    Strip padding from a Coulomb matrix, shuffle it, and flatten it.

    1. The unpadded size is the number of leading non-zero entries in the
       first row of ``cm``, capped at ``len(cm)``.
    2. Rows and columns are reordered together by sorting the row norms
       after adding Gaussian noise drawn from ``RandomState(self.seed)``.
    3. The result is passed through ``pad_array`` with ``len(cm)`` and its
       upper triangle (diagonal included) is extracted.

    Parameters
    ----------
    cm : np.ndarray
        Padded Coulomb matrix (assumed square -- TODO confirm with callers).

    Returns
    -------
    np.ndarray
        The upper-triangular entries of the shuffled, re-padded matrix.
    """
    padded_size = len(cm)

    # Count leading non-zero entries of the first row; the first zero is
    # treated as the start of the padding.
    real_size = 0
    for entry in cm[0]:
      if real_size == padded_size or not entry != 0.:
        break
      real_size += 1

    core = cm[0:real_size, 0:real_size]

    norms = np.asarray([np.linalg.norm(r) for r in core], dtype=float)
    noise = np.random.RandomState(self.seed).normal(size=norms.size)
    order = np.argsort(norms + noise)

    # Apply the same permutation to rows, then to columns.
    row_shuffled = core[order]
    shuffled = row_shuffled[:, order]

    repadded = pad_array(shuffled, len(cm))
    return repadded[np.triu_indices_from(repadded)]
Beispiel #8
0
    def unpad_randomize_and_flatten(self, cm):
        """
        Strip padding from a Coulomb matrix, shuffle it, and flatten it.

        1. The unpadded size is the number of leading non-zero entries in
           the first row of ``cm``, capped at ``len(cm)``.
        2. Rows and columns are reordered together by sorting the row norms
           after adding Gaussian noise drawn from ``RandomState(self.seed)``.
        3. The result is passed through ``pad_array`` with ``len(cm)`` and
           its upper triangle (diagonal included) is extracted.

        Parameters
        ----------
        cm : np.ndarray
            Padded Coulomb matrix (assumed square -- TODO confirm with
            callers).

        Returns
        -------
        np.ndarray
            The upper-triangular entries of the shuffled, re-padded matrix.
        """
        padded_size = len(cm)

        # Count leading non-zero entries of the first row; the first zero is
        # treated as the start of the padding.
        real_size = 0
        for entry in cm[0]:
            if real_size == padded_size or not entry != 0.:
                break
            real_size += 1

        core = cm[0:real_size, 0:real_size]

        norms = np.asarray([np.linalg.norm(r) for r in core], dtype=float)
        noise = np.random.RandomState(self.seed).normal(size=norms.size)
        order = np.argsort(norms + noise)

        # Apply the same permutation to rows, then to columns.
        row_shuffled = core[order]
        shuffled = row_shuffled[:, order]

        repadded = pad_array(shuffled, len(cm))
        return repadded[np.triu_indices_from(repadded)]
 def get_Z_matrix(self, mol, max_atoms):
     """
     Return the atomic numbers of ``mol`` padded via ``pad_array``.

     Parameters
     ----------
     mol
         Molecule exposing ``GetAtoms()``; each atom exposes
         ``GetAtomicNum()``.
     max_atoms
         Maximum permitted atom count, also used as the padding size.

     Returns
     -------
     The result of ``pad_array`` applied to the array of atomic numbers.

     Raises
     ------
     ValueError
         If the molecule has more than ``max_atoms`` atoms.
     """
     atom_total = len(mol.GetAtoms())
     if atom_total > max_atoms:
         raise ValueError(
             f"A molecule (#atoms = {atom_total}) is larger than permitted by max_atoms. "
             "Increase max_atoms and try again.")
     atomic_numbers = np.array([a.GetAtomicNum() for a in mol.GetAtoms()])
     return pad_array(atomic_numbers, max_atoms)
  def _featurize(self, struct):
    """
    Calculate sine Coulomb matrix from pymatgen structure.

    Parameters
    ----------
    struct : dict
      Json-serializable dictionary representation of pymatgen.core.structure
      https://pymatgen.org/pymatgen.core.structure.html

    Returns
    -------
    features: np.ndarray
      When ``self.flatten`` is set: eigenvalues of the sine Coulomb matrix
      written into the front of a zero vector of length ``self.max_atoms``.
      Otherwise: the sine Coulomb matrix passed through ``pad_array`` with
      ``self.max_atoms``.

    """

    from pymatgen import Structure
    from matminer.featurizers.structure import SineCoulombMatrix as SCM

    structure = Structure.from_dict(struct)

    # Request the unflattened matrix from matminer; any flattening is done
    # below.
    sine_mat = SCM(flatten=False).featurize(structure)

    if not self.flatten:
      return np.asarray(pad_array(sine_mat, self.max_atoms))

    eigenvalues, _ = np.linalg.eig(sine_mat)
    padded = np.zeros((self.max_atoms,))
    padded[:len(eigenvalues)] = eigenvalues
    return np.asarray(padded)
    def _featurize(self, struct: "pymatgen.Structure"):
        """
        Calculate sine Coulomb matrix from pymatgen structure.

        Parameters
        ----------
        struct : pymatgen.Structure
          A periodic crystal composed of a lattice and a sequence of atomic
          sites with 3D coordinates and elements.

        Returns
        -------
        features: np.ndarray
          When ``self.flatten`` is set: a single-row array of shape
          ``(1, max_atoms)`` holding the matrix eigenvalues followed by zero
          padding. Otherwise: the sine Coulomb matrix passed through
          ``pad_array`` with ``self.max_atoms``.

        Raises
        ------
        ValueError
          If matminer is not installed, or if the structure yields more
          eigenvalues than ``self.max_atoms``.

        """

        try:
            from matminer.featurizers.structure import SineCoulombMatrix as SCM
        except ModuleNotFoundError as err:
            raise ValueError(
                "This class requires matminer to be installed.") from err

        # Get full N x N SCM
        scm = SCM(flatten=False)
        sine_mat = scm.featurize(struct)

        if self.flatten:
            eigs, _ = np.linalg.eig(sine_mat)
            # NOTE(review): matminer may wrap the matrix in a list, so the
            # eigenvalues can come back as (N,) or (1, N); flatten both cases.
            eigs = np.ravel(eigs)
            if eigs.size > self.max_atoms:
                raise ValueError(
                    "Structure has more eigenvalues than max_atoms allows.")
            features = np.zeros((1, self.max_atoms))
            # Write along the column axis of the single row; slicing the row
            # axis (the old code) only worked when eigs.size == max_atoms.
            features[0, :eigs.size] = eigs
        else:
            features = pad_array(sine_mat, self.max_atoms)

        features = np.asarray(features)

        return features
Beispiel #12
0
  def _featurize(self, mol):
    """
    Calculate eigenvalues of Coulomb matrix for molecules. Eigenvalues
    are returned sorted by absolute value in descending order and padded
    by max_atoms.

    Parameters
    ----------
    mol : RDKit Mol
        Molecule.

    Returns
    -------
    np.ndarray
        One padded eigenvalue vector per matrix returned by
        ``self.coulomb_matrix(mol)``.
    """
    spectra = []
    for matrix in self.coulomb_matrix(mol):
      # Only the eigenvalues are needed; the eigenvectors are discarded.
      eigenvalues = np.linalg.eig(matrix)[0]
      # argsort is ascending, so reverse it for largest-magnitude first.
      descending = np.argsort(np.abs(eigenvalues))[::-1]
      spectra.append(pad_array(eigenvalues[descending], self.max_atoms))
    return np.asarray(spectra)
    def _featurize(self, mol):
        """
        Calculate eigenvalues of Coulomb matrix for molecules. Eigenvalues
        are returned sorted by absolute value in descending order and padded
        by max_atoms.

        Parameters
        ----------
        mol : RDKit Mol
            Molecule.

        Returns
        -------
        np.ndarray
            One padded eigenvalue vector per matrix returned by
            ``self.coulomb_matrix(mol)``.
        """
        spectra = []
        for matrix in self.coulomb_matrix(mol):
            # Only the eigenvalues are needed; the eigenvectors are discarded.
            eigenvalues = np.linalg.eig(matrix)[0]
            # argsort is ascending, so reverse it for largest-magnitude first.
            descending = np.argsort(np.abs(eigenvalues))[::-1]
            spectra.append(pad_array(eigenvalues[descending], self.max_atoms))
        return np.asarray(spectra)
Beispiel #14
0
    def _featurize(self, mol):
        """Compute neighbor list.

        Atoms are binned into grid cells, and each atom is compared only
        against atoms in the cells listed for its own cell by
        ``compute_neighbor_cell_map``. When ``self.boxsize`` is not None the
        displacement between two atoms is wrapped per component (minimum
        image) before its length is taken.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.mol
          Molecule

        Returns
        -------
        tuple
          ``(coords, neighbor_list, Z)``. ``coords`` and ``Z`` (atomic
          numbers) are passed through ``pad_array`` using
          ``self.max_num_atoms``. ``neighbor_list`` maps each atom index to a
          list of neighbor indices closer than ``self.neighbor_cutoff``,
          sorted by increasing distance and truncated to
          ``self.max_num_neighbors``.

        """
        N = mol.GetNumAtoms()
        coords = get_coords(mol)

        x_bins, y_bins, z_bins = get_cells(coords, self.neighbor_cutoff)

        # Associate each atom with cell it belongs to. O(N)
        cell_to_atoms, atom_to_cell = put_atoms_in_cells(
            coords, x_bins, y_bins, z_bins)

        # Associate each cell with its neighbor cells. Per the original
        # author's note this assumes periodic boundary conditions and wraps
        # around. O(constant)
        N_x, N_y, N_z = len(x_bins), len(y_bins), len(z_bins)
        neighbor_cell_map = compute_neighbor_cell_map(N_x, N_y, N_z)

        # For each atom, loop through all atoms in its cell and neighboring
        # cells. Accept as neighbors only those within threshold. This
        # computation should be O(Nm), where m is the number of atoms within
        # a set of neighboring-cells.
        periodic = self.boxsize is not None
        neighbor_list = {}
        for atom in range(N):
            # A set, so an atom reached through a repeated neighbor cell is
            # only recorded once.
            candidates = set()
            for neighbor_cell in neighbor_cell_map[atom_to_cell[atom]]:
                for neighbor_atom in cell_to_atoms[neighbor_cell]:
                    if neighbor_atom == atom:
                        continue
                    delta = coords[atom] - coords[neighbor_atom]
                    if periodic:
                        # Wrap each component separately; wrapping the scalar
                        # norm would give negative "distances" whenever it
                        # exceeds half the box size.
                        delta = delta - self.boxsize * np.round(
                            delta / self.boxsize)
                    dist = np.linalg.norm(delta)
                    if dist < self.neighbor_cutoff:
                        candidates.add((neighbor_atom, dist))
            # Sort neighbors by distance, then pick up to max_num_neighbors.
            closest = sorted(candidates, key=lambda elt: elt[1])
            neighbor_list[atom] = [
                nbr for (nbr, _) in closest[:self.max_num_neighbors]
            ]
        Z = pad_array(
            np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()]),
            self.max_num_atoms)
        coords = pad_array(coords, (self.max_num_atoms, 3))
        return (coords, neighbor_list, Z)
  def _featurize(self, mol):
    """Compute neighbor list.

    Atoms are binned into grid cells, and each atom is compared only
    against atoms in the cells listed for its own cell by
    ``compute_neighbor_cell_map``. When ``self.boxsize`` is not None the
    displacement between two atoms is wrapped per component (minimum image)
    before its length is taken.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.mol
      Molecule

    Returns
    -------
    tuple
      ``(coords, neighbor_list, Z)``. ``coords`` and ``Z`` (atomic numbers)
      are passed through ``pad_array`` using ``self.max_num_atoms``.
      ``neighbor_list`` maps each atom index to a list of neighbor indices
      closer than ``self.neighbor_cutoff``, sorted by increasing distance and
      truncated to ``self.max_num_neighbors``.

    """
    N = mol.GetNumAtoms()
    coords = get_coords(mol)

    x_bins, y_bins, z_bins = get_cells(coords, self.neighbor_cutoff)

    # Associate each atom with cell it belongs to. O(N)
    cell_to_atoms, atom_to_cell = put_atoms_in_cells(coords, x_bins, y_bins,
                                                     z_bins)

    # Associate each cell with its neighbor cells. Per the original author's
    # note this assumes periodic boundary conditions and wraps around.
    # O(constant)
    N_x, N_y, N_z = len(x_bins), len(y_bins), len(z_bins)
    neighbor_cell_map = compute_neighbor_cell_map(N_x, N_y, N_z)

    # For each atom, loop through all atoms in its cell and neighboring cells.
    # Accept as neighbors only those within threshold. This computation should
    # be O(Nm), where m is the number of atoms within a set of
    # neighboring-cells.
    periodic = self.boxsize is not None
    neighbor_list = {}
    for atom in range(N):
      # A set, so an atom reached through a repeated neighbor cell is only
      # recorded once.
      candidates = set()
      for neighbor_cell in neighbor_cell_map[atom_to_cell[atom]]:
        for neighbor_atom in cell_to_atoms[neighbor_cell]:
          if neighbor_atom == atom:
            continue
          delta = coords[atom] - coords[neighbor_atom]
          if periodic:
            # Wrap each component separately; wrapping the scalar norm would
            # give negative "distances" whenever it exceeds half the box size.
            delta = delta - self.boxsize * np.round(delta / self.boxsize)
          dist = np.linalg.norm(delta)
          if dist < self.neighbor_cutoff:
            candidates.add((neighbor_atom, dist))
      # Sort neighbors by distance, then pick up to max_num_neighbors.
      closest = sorted(candidates, key=lambda elt: elt[1])
      neighbor_list[atom] = [
          nbr for (nbr, _) in closest[:self.max_num_neighbors]
      ]
    Z = pad_array(
        np.array([atom.GetAtomicNum()
                  for atom in mol.GetAtoms()]), self.max_num_atoms)
    coords = pad_array(coords, (self.max_num_atoms, 3))
    return (coords, neighbor_list, Z)
Beispiel #16
0
 def get_Z_matrix(self, mol, max_atoms):
     """
     Return the atomic numbers of ``mol`` padded via ``pad_array``.

     Parameters
     ----------
     mol
         Molecule exposing ``GetAtoms()``; each atom exposes
         ``GetAtomicNum()``.
     max_atoms
         Padding size forwarded to ``pad_array``.
     """
     atomic_numbers = np.array([a.GetAtomicNum() for a in mol.GetAtoms()])
     return pad_array(atomic_numbers, max_atoms)
Beispiel #17
0
 def get_Z_matrix(self, mol, max_atoms):
   """
   Return the atomic numbers of ``mol`` padded via ``pad_array``.

   Parameters
   ----------
   mol
       Molecule exposing ``GetAtoms()``; each atom exposes
       ``GetAtomicNum()``.
   max_atoms
       Padding size forwarded to ``pad_array``.
   """
   atomic_numbers = np.array([a.GetAtomicNum() for a in mol.GetAtoms()])
   return pad_array(atomic_numbers, max_atoms)