Esempio n. 1
0
def mol_to_graph(mol: RDKitMol):
    """Convert an RDKit Mol into an undirected NetworkX graph.

    Every atom becomes a node keyed by its atom index, and every bond
    becomes an edge between the indices of its begin and end atoms. No atom
    or bond attributes are attached to the graph.

    Parameters
    ----------
    mol: RDKit Mol
        The molecule to convert into a graph.

    Returns
    -------
    graph: networkx.Graph
        Graph whose nodes are atom indices and whose edges are bonds.

    Raises
    ------
    ValueError
        If NetworkX is not installed.

    Note
    ----
    This function requires NetworkX to be installed.
    """
    try:
        import networkx as nx
    except ModuleNotFoundError as err:
        # ValueError is kept (rather than ImportError) so existing callers
        # that catch it keep working; the original cause is chained.
        raise ValueError(
            "This function requires NetworkX to be installed.") from err

    graph = nx.Graph()
    # Add every atom up front so atoms without bonds still appear as nodes.
    graph.add_nodes_from(range(mol.GetNumAtoms()))
    # Iterate the bonds once instead of rebuilding and indexing the bond
    # sequence twice per bond.
    for bond in mol.GetBonds():
        graph.add_edge(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())
    return graph
    def _featurize(self, mol: RDKitMol) -> GraphData:
        """Calculate molecule graph features from RDKit mol object.

        Atoms become nodes and every bond contributes two directed edges
        (begin -> end and end -> begin). When ``self.use_partial_charge`` is
        set and the first atom has no ``_GasteigerCharge`` property,
        Gasteiger charges are computed on ``mol`` before featurizing.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
            RDKit mol object.

        Returns
        -------
        graph: GraphData
            A molecule graph with node features, a ``(2, num_edges)`` edge
            index and, when ``self.use_edges`` is set, edge features
            (otherwise ``edge_features`` is ``None``).

        Raises
        ------
        ImportError
            If partial charges must be computed and RDKit cannot be imported.
        """
        if self.use_partial_charge:
            try:
                mol.GetAtomWithIdx(0).GetProp('_GasteigerCharge')
            except Exception:
                # The lookup failed, so partial charges were not computed yet.
                # `Exception` (not a bare except) keeps KeyboardInterrupt and
                # SystemExit propagating.
                try:
                    from rdkit.Chem import AllChem
                except ModuleNotFoundError as err:
                    raise ImportError(
                        "This class requires RDKit to be installed.") from err
                AllChem.ComputeGasteigerCharges(mol)

        # construct atom (node) feature
        h_bond_infos = construct_hydrogen_bonding_info(mol)
        atom_features = np.asarray(
            [
                _construct_atom_feature(atom, h_bond_infos, self.use_chirality,
                                        self.use_partial_charge)
                for atom in mol.GetAtoms()
            ],
            dtype=float,
        )

        # construct edge (bond) index and, optionally, edge (bond) features
        # in a single pass over the bonds
        src, dest = [], []
        edge_rows = [] if self.use_edges else None
        for bond in mol.GetBonds():
            # add both directions so the edge list describes a directed graph
            start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            src += [start, end]
            dest += [end, start]
            if edge_rows is not None:
                # one feature row per direction, matching the edge index order
                edge_rows += 2 * [_construct_bond_feature(bond)]

        bond_features = None  # default when edge features are disabled
        if edge_rows is not None:
            bond_features = np.asarray(edge_rows, dtype=float)

        return GraphData(node_features=atom_features,
                         edge_index=np.asarray([src, dest], dtype=int),
                         edge_features=bond_features)
    def _featurize(self, mol: RDKitMol) -> GraphData:
        """Calculate molecule graph features from RDKit mol object.

        Atoms become nodes and every bond contributes two directed edges
        (begin -> end and end -> begin), each with its own bond-feature row.
        When ``self.add_self_edges`` is set, one ``i -> i`` edge with an
        all-zero feature row is appended for every atom. If the first atom
        has no ``_GasteigerCharge`` property, Gasteiger charges are computed
        on ``mol`` first.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
            RDKit mol object.

        Returns
        -------
        graph: GraphData
            A molecule graph with node features, a ``(2, num_edges)`` edge
            index and edge features.
        """
        from rdkit import Chem
        from rdkit.Chem import AllChem

        # construct atom and bond features
        try:
            mol.GetAtomWithIdx(0).GetProp('_GasteigerCharge')
        except Exception:
            # The lookup failed, so partial charges were not computed yet.
            # `Exception` (not a bare except) keeps KeyboardInterrupt and
            # SystemExit propagating.
            AllChem.ComputeGasteigerCharges(mol)

        h_bond_infos = construct_hydrogen_bonding_info(mol)
        sssr = Chem.GetSymmSSSR(mol)

        # construct atom (node) feature
        # Builtin `float` replaces the `np.float` alias, which was removed
        # from NumPy and referred to the same type.
        atom_features = np.array(
            [
                _construct_atom_feature(atom, h_bond_infos, sssr)
                for atom in mol.GetAtoms()
            ],
            dtype=float,
        )

        # construct edge (bond) information
        src, dest, bond_features = [], [], []
        for bond in mol.GetBonds():
            # add edge list considering a directed graph
            start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            src += [start, end]
            dest += [end, start]
            bond_features += 2 * [_construct_bond_feature(bond)]

        if self.add_self_edges:
            num_atoms = mol.GetNumAtoms()
            src += range(num_atoms)
            dest += range(num_atoms)
            # add dummy (all-zero) edge features for the self edges
            # NOTE(review): the feature length is taken from the first real
            # bond, so a molecule without bonds raises IndexError here --
            # confirm whether such inputs need to be supported.
            bond_fea_length = len(bond_features[0])
            bond_features += num_atoms * [[0] * bond_fea_length]

        return GraphData(node_features=atom_features,
                         edge_index=np.array([src, dest], dtype=int),
                         edge_features=np.array(bond_features, dtype=float))
Esempio n. 4
0
    def _featurize(self, datapoint: RDKitMol,
                   **kwargs) -> Optional[GraphMatrix]:
        """Calculate adjacency matrix and node features for an RDKit mol.

        Only bond types (via ``self.bond_encoder``) and atomic numbers (via
        ``self.atom_encoder``) are encoded, so chirality and charges are not
        represented. Both outputs are padded to ``self.max_atom_count``.
        When ``self.kekulize`` is set, ``Chem.Kekulize`` is applied to the
        passed molecule first.

        Parameters
        ----------
        datapoint: rdkit.Chem.rdchem.Mol
            RDKit mol object.
        **kwargs
            ``mol`` is accepted as a deprecated alias for ``datapoint``; it
            takes precedence and triggers a ``DeprecationWarning``.

        Returns
        -------
        graph: GraphMatrix or None
            ``GraphMatrix(A, X)`` where ``A`` is the symmetric float32
            matrix of encoded bond types and ``X`` the int32 vector of
            encoded atoms. ``None`` if any atom's row of ``A`` does not sum
            to a positive value (e.g. an atom without bonds).

        Raises
        ------
        ImportError
            If RDKit is not installed.
        """
        import warnings

        try:
            from rdkit import Chem
        except ModuleNotFoundError:
            raise ImportError("This method requires RDKit to be installed.")
        if 'mol' in kwargs:
            datapoint = kwargs.get("mol")
            # Warn instead of raising: raising made the assignment above dead
            # code and aborted featurization for the deprecated keyword.
            warnings.warn(
                'Mol is being phased out as a parameter, please pass "datapoint" instead.',
                DeprecationWarning,
                stacklevel=2,
            )

        if self.kekulize:
            Chem.Kekulize(datapoint)

        A = np.zeros(shape=(self.max_atom_count, self.max_atom_count),
                     dtype=np.float32)
        bonds = datapoint.GetBonds()

        begin = [b.GetBeginAtomIdx() for b in bonds]
        end = [b.GetEndAtomIdx() for b in bonds]
        bond_type = [self.bond_encoder[b.GetBondType()] for b in bonds]

        # Fill both triangles so the adjacency matrix is symmetric.
        A[begin, end] = bond_type
        A[end, begin] = bond_type

        num_atoms = datapoint.GetNumAtoms()
        # Per-atom sum of encoded bond types over the real (unpadded) atoms.
        degree = np.sum(A[:num_atoms, :num_atoms], axis=-1)
        X = np.array(
            [
                self.atom_encoder[atom.GetAtomicNum()]
                for atom in datapoint.GetAtoms()
            ] + [0] * (self.max_atom_count - num_atoms),
            dtype=np.int32,
        )
        graph = GraphMatrix(A, X)

        return graph if (degree > 0).all() else None
Esempio n. 5
0
    def _featurize(self, mol: RDKitMol) -> Optional[GraphMatrix]:
        """Build the padded adjacency matrix and node vector for a molecule.

        Only bond types (via ``self.bond_encoder``) and atomic numbers (via
        ``self.atom_encoder``) are encoded, so chirality and charges are not
        represented. When ``self.kekulize`` is set, ``Chem.Kekulize`` is
        applied to ``mol`` first.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
            RDKit mol object.

        Returns
        -------
        graph: GraphMatrix or None
            ``GraphMatrix(A, X)`` padded to ``self.max_atom_count``, or
            ``None`` if any atom's row of the adjacency matrix does not sum
            to a positive value.

        Raises
        ------
        ImportError
            If RDKit is not installed.
        """
        try:
            from rdkit import Chem
        except ModuleNotFoundError:
            raise ImportError("This method requires RDKit to be installed.")

        if self.kekulize:
            Chem.Kekulize(mol)

        size = self.max_atom_count
        adjacency = np.zeros(shape=(size, size), dtype=np.float32)

        # Collect endpoints and encoded bond types in one pass over the bonds.
        rows, cols, codes = [], [], []
        for bond in mol.GetBonds():
            rows.append(bond.GetBeginAtomIdx())
            cols.append(bond.GetEndAtomIdx())
            codes.append(self.bond_encoder[bond.GetBondType()])

        # Write both (i, j) and (j, i) so the matrix is symmetric.
        adjacency[rows, cols] = codes
        adjacency[cols, rows] = codes

        n_atoms = mol.GetNumAtoms()
        row_sums = adjacency[:n_atoms, :n_atoms].sum(axis=-1)

        # Encoded atoms followed by zero padding up to max_atom_count.
        node_codes = [
            self.atom_encoder[atom.GetAtomicNum()] for atom in mol.GetAtoms()
        ]
        node_codes.extend([0] * (size - n_atoms))
        nodes = np.array(node_codes, dtype=np.int32)

        graph = GraphMatrix(adjacency, nodes)
        if (row_sums > 0).all():
            return graph
        return None
    def _featurize(self, mol: RDKitMol) -> GraphMatrix:
        """Compute the adjacency matrix and node features of an RDKit mol.

        Bond types are encoded with ``self.bond_encoder`` and atomic numbers
        with ``self.atom_encoder``; both outputs are padded to
        ``self.max_atom_count``. When ``self.kekulize`` is set,
        ``Chem.Kekulize`` is applied to ``mol`` first.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
          RDKit mol object.
        Returns
        -------
        graph: GraphMatrix
          ``GraphMatrix(A, X)``; ``None`` is returned instead when any
          atom's row of ``A`` does not sum to a positive value.
        """
        if self.kekulize:
            Chem.Kekulize(mol)

        dim = self.max_atom_count
        adj = np.zeros(shape=(dim, dim), dtype=np.float32)

        bond_list = mol.GetBonds()
        heads = [bond.GetBeginAtomIdx() for bond in bond_list]
        tails = [bond.GetEndAtomIdx() for bond in bond_list]
        encoded_bonds = [
            self.bond_encoder[bond.GetBondType()] for bond in bond_list
        ]

        # Mirror every bond across the diagonal to keep the matrix symmetric.
        adj[heads, tails] = encoded_bonds
        adj[tails, heads] = encoded_bonds

        atom_count = mol.GetNumAtoms()
        real_block = adj[:atom_count, :atom_count]
        per_atom_total = np.sum(real_block, axis=-1)

        encoded_atoms = [
            self.atom_encoder[atom.GetAtomicNum()] for atom in mol.GetAtoms()
        ]
        zero_padding = [0] * (dim - atom_count)
        node_vector = np.array(encoded_atoms + zero_padding, dtype=np.int32)

        result = GraphMatrix(adj, node_vector)
        every_atom_bonded = (per_atom_total > 0).all()
        if not every_atom_bonded:
            return None
        return result
Esempio n. 7
0
    def _featurize(self, datapoint: RDKitMol, **kwargs) -> np.ndarray:
        """Featurize a single molecule into an image.

        2D coordinates are computed on a copy of the molecule; bonds are
        drawn as sampled line segments into channel 0 and atom properties
        are written at the atom positions.

        Parameters
        ----------
        datapoint: rdkit.Chem.rdchem.Mol
            RDKit Mol object
        **kwargs
            ``mol`` is accepted as a deprecated alias for ``datapoint``; it
            takes precedence and triggers a ``DeprecationWarning``.

        Returns
        -------
        np.ndarray
            A 3D image array of shape `(img_size, img_size, 1)` when
            ``self.img_spec == "std"`` and `(img_size, img_size, 4)`
            otherwise. An empty array is returned if the SMILES is longer
            than `max_len`, or (non-"std" spec only) if any Gasteiger
            partial charge is NaN.

        Raises
        ------
        ImportError
            If RDKit is not installed.
        IndexError
            If the molecule does not fit into the image.
        """
        import warnings

        try:
            from rdkit import Chem
            from rdkit.Chem import AllChem
        except ModuleNotFoundError:
            raise ImportError("This class requires RDKit to be installed.")
        if 'mol' in kwargs:
            datapoint = kwargs.get("mol")
            # Warn instead of raising: raising made the assignment above dead
            # code and aborted featurization for the deprecated keyword.
            warnings.warn(
                'Mol is being phased out as a parameter, please pass "datapoint" instead.',
                DeprecationWarning,
                stacklevel=2,
            )

        smile = Chem.MolToSmiles(datapoint)
        if len(smile) > self.max_len:
            return np.array([])

        # Charges and coordinates are computed on a copy, not on the input.
        cmol = Chem.Mol(datapoint.ToBinary())
        cmol.ComputeGasteigerCharges()
        AllChem.Compute2DCoords(cmol)
        atom_coords = cmol.GetConformer(0).GetPositions()

        if self.img_spec == "std":
            # Setup image
            img = np.zeros((self.img_size, self.img_size, 1))
            # Compute bond properties (every bond is drawn with value 2.0)
            bond_props = np.array(
                [[2.0, bond.GetBeginAtomIdx(),
                  bond.GetEndAtomIdx()] for bond in datapoint.GetBonds()])
            # Compute atom properties
            atom_props = np.array([[atom.GetAtomicNum()]
                                   for atom in cmol.GetAtoms()])

            atom_props = atom_props.astype(np.float32)

        else:
            # Setup image
            img = np.zeros((self.img_size, self.img_size, 4))
            # Compute bond properties
            bond_props = np.array([[
                bond.GetBondTypeAsDouble(),
                bond.GetBeginAtomIdx(),
                bond.GetEndAtomIdx()
            ] for bond in datapoint.GetBonds()])
            # Compute atom properties
            atom_props = np.array([[
                atom.GetAtomicNum(),
                atom.GetProp("_GasteigerCharge"),
                atom.GetExplicitValence(),
                atom.GetHybridization().real,
            ] for atom in cmol.GetAtoms()])

            atom_props = atom_props.astype(np.float32)

            partial_charges = atom_props[:, 1]
            if np.any(np.isnan(partial_charges)):
                return np.array([])

        # Force a (num_bonds, 3) layout so a molecule without bonds yields a
        # (0, 3) array instead of a 1-D one that breaks the column indexing.
        bond_props = bond_props.astype(np.float32).reshape(-1, 3)

        frac = np.linspace(0, 1, int(1 / self.res * 2))
        # Reshape done for proper broadcast
        frac = frac.reshape(-1, 1, 1)

        bond_begin_idxs = bond_props[:, 1].astype(int)
        bond_end_idxs = bond_props[:, 2].astype(int)

        # Reshapes, and axes manipulations to facilitate vector processing.
        begin_coords = atom_coords[bond_begin_idxs]
        begin_coords = np.expand_dims(begin_coords.T, axis=0)
        end_coords = atom_coords[bond_end_idxs]
        end_coords = np.expand_dims(end_coords.T, axis=0)

        # Draw a line between the two atoms.
        # The coordinates of this line, are indicated in line_coords
        line_coords = frac * begin_coords + (1 - frac) * end_coords
        # Turn the line coordinates into image positions
        bond_line_idxs = np.ceil(
            (line_coords[:, 0] + self.embed) / self.res).astype(int)
        bond_line_idys = np.ceil(
            (line_coords[:, 1] + self.embed) / self.res).astype(int)
        # Turn atomic coordinates into image positions
        atom_idxs = np.round(
            (atom_coords[:, 0] + self.embed) / self.res).astype(int)
        atom_idys = np.round(
            (atom_coords[:, 1] + self.embed) / self.res).astype(int)

        try:
            # Set the bond line coordinates to the bond property used.
            img[bond_line_idxs, bond_line_idys, 0] = bond_props[:, 0]

            # Set the atom positions in image to different atomic properties in channels
            img[atom_idxs, atom_idys, :] = atom_props

        except IndexError as err:
            # With fixed res and img_size some molecules (e.g. long chains) may not fit.
            raise IndexError(
                "The molecule does not fit into the image. Consider increasing img_size or res of the SmilesToImage featurizer."
            ) from err
        return img
Esempio n. 8
0
    def _featurize(self, mol: RDKitMol) -> np.ndarray:
        """Featurize a single molecule into an image.

        2D coordinates are computed on a copy of the molecule; bonds are
        drawn as sampled line segments into channel 0 and atom properties
        are written at the atom positions.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
            RDKit Mol object

        Returns
        -------
        np.ndarray
            A 3D image array of shape `(img_size, img_size, 1)` when
            ``self.img_spec == "std"`` and `(img_size, img_size, 4)`
            otherwise. An empty array is returned if the SMILES is longer
            than `max_len`, or (non-"std" spec only) if any Gasteiger
            partial charge is NaN.

        Raises
        ------
        IndexError
            If the molecule does not fit into the image.
        """
        from rdkit import Chem
        from rdkit.Chem import AllChem

        smile = Chem.MolToSmiles(mol)
        if len(smile) > self.max_len:
            return np.array([])

        # Charges and coordinates are computed on a copy, not on the input.
        cmol = Chem.Mol(mol.ToBinary())
        cmol.ComputeGasteigerCharges()
        AllChem.Compute2DCoords(cmol)
        atom_coords = cmol.GetConformer(0).GetPositions()

        if self.img_spec == "std":
            # Setup image
            img = np.zeros((self.img_size, self.img_size, 1))
            # Compute bond properties (every bond is drawn with value 2.0)
            bond_props = np.array(
                [[2.0, bond.GetBeginAtomIdx(),
                  bond.GetEndAtomIdx()] for bond in mol.GetBonds()])
            # Compute atom properties
            atom_props = np.array([[atom.GetAtomicNum()]
                                   for atom in cmol.GetAtoms()])

            atom_props = atom_props.astype(np.float32)

        else:
            # Setup image
            img = np.zeros((self.img_size, self.img_size, 4))
            # Compute bond properties
            bond_props = np.array([[
                bond.GetBondTypeAsDouble(),
                bond.GetBeginAtomIdx(),
                bond.GetEndAtomIdx()
            ] for bond in mol.GetBonds()])
            # Compute atom properties
            atom_props = np.array([[
                atom.GetAtomicNum(),
                atom.GetProp("_GasteigerCharge"),
                atom.GetExplicitValence(),
                atom.GetHybridization().real,
            ] for atom in cmol.GetAtoms()])

            atom_props = atom_props.astype(np.float32)

            partial_charges = atom_props[:, 1]
            if np.any(np.isnan(partial_charges)):
                return np.array([])

        # Force a (num_bonds, 3) layout so a molecule without bonds yields a
        # (0, 3) array instead of a 1-D one that breaks the column indexing.
        bond_props = bond_props.astype(np.float32).reshape(-1, 3)

        frac = np.linspace(0, 1, int(1 / self.res * 2))
        # Reshape done for proper broadcast
        frac = frac.reshape(-1, 1, 1)

        bond_begin_idxs = bond_props[:, 1].astype(int)
        bond_end_idxs = bond_props[:, 2].astype(int)

        # Reshapes, and axes manipulations to facilitate vector processing.
        begin_coords = atom_coords[bond_begin_idxs]
        begin_coords = np.expand_dims(begin_coords.T, axis=0)
        end_coords = atom_coords[bond_end_idxs]
        end_coords = np.expand_dims(end_coords.T, axis=0)

        # Draw a line between the two atoms.
        # The coordinates of this line, are indicated in line_coords
        line_coords = frac * begin_coords + (1 - frac) * end_coords
        # Turn the line coordinates into image positions
        bond_line_idxs = np.ceil(
            (line_coords[:, 0] + self.embed) / self.res).astype(int)
        bond_line_idys = np.ceil(
            (line_coords[:, 1] + self.embed) / self.res).astype(int)
        # Turn atomic coordinates into image positions
        atom_idxs = np.round(
            (atom_coords[:, 0] + self.embed) / self.res).astype(int)
        atom_idys = np.round(
            (atom_coords[:, 1] + self.embed) / self.res).astype(int)

        try:
            # Set the bond line coordinates to the bond property used.
            img[bond_line_idxs, bond_line_idys, 0] = bond_props[:, 0]

            # Set the atom positions in image to different atomic properties in channels
            img[atom_idxs, atom_idys, :] = atom_props
        except IndexError as err:
            # With fixed res and img_size some molecules (e.g. long chains)
            # may not fit; re-raise the same exception type with an
            # actionable message.
            raise IndexError(
                "The molecule does not fit into the image. Consider increasing img_size or res of the SmilesToImage featurizer."
            ) from err
        return img
Esempio n. 9
0
    def _featurize(self, datapoint: RDKitMol, **kwargs) -> GraphData:
        """Calculate molecule graph features from RDKit mol object.

        Atoms become nodes and every bond contributes two directed edges
        (begin -> end and end -> begin). When ``self.use_partial_charge`` is
        set and the first atom has no ``_GasteigerCharge`` property,
        Gasteiger charges are computed on the molecule before featurizing.

        Parameters
        ----------
        datapoint: rdkit.Chem.rdchem.Mol
            RDKit mol object.
        **kwargs
            ``mol`` is accepted as a deprecated alias for ``datapoint``; it
            takes precedence and triggers a ``DeprecationWarning``.

        Returns
        -------
        graph: GraphData
            A molecule graph with node features, a ``(2, num_edges)`` edge
            index and, when ``self.use_edges`` is set, edge features
            (otherwise ``edge_features`` is ``None``).

        Raises
        ------
        AssertionError
            If the molecule has fewer than two atoms.
        ImportError
            If partial charges must be computed and RDKit cannot be imported.
        """
        import warnings

        if 'mol' in kwargs:
            datapoint = kwargs.get("mol")
            # Warn instead of raising: raising made the assignment above dead
            # code and aborted featurization for the deprecated keyword.
            warnings.warn(
                'Mol is being phased out as a parameter, please pass "datapoint" instead.',
                DeprecationWarning,
                stacklevel=2,
            )

        # Validate the molecule that is actually featurized, i.e. after the
        # deprecated `mol` keyword has been resolved. The assert is kept so
        # callers still see AssertionError.
        assert datapoint.GetNumAtoms(
        ) > 1, "More than one atom should be present in the molecule for this featurizer to work."

        if self.use_partial_charge:
            try:
                datapoint.GetAtomWithIdx(0).GetProp('_GasteigerCharge')
            except Exception:
                # The lookup failed, so partial charges were not computed yet.
                # `Exception` (not a bare except) keeps KeyboardInterrupt and
                # SystemExit propagating.
                try:
                    from rdkit.Chem import AllChem
                except ModuleNotFoundError as err:
                    raise ImportError(
                        "This class requires RDKit to be installed.") from err
                AllChem.ComputeGasteigerCharges(datapoint)

        # construct atom (node) feature
        h_bond_infos = construct_hydrogen_bonding_info(datapoint)
        atom_features = np.asarray(
            [
                _construct_atom_feature(atom, h_bond_infos, self.use_chirality,
                                        self.use_partial_charge)
                for atom in datapoint.GetAtoms()
            ],
            dtype=float,
        )

        # construct edge (bond) index and, optionally, edge (bond) features
        # in a single pass over the bonds
        src, dest = [], []
        edge_rows = [] if self.use_edges else None
        for bond in datapoint.GetBonds():
            # add both directions so the edge list describes a directed graph
            start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            src += [start, end]
            dest += [end, start]
            if edge_rows is not None:
                # one feature row per direction, matching the edge index order
                edge_rows += 2 * [_construct_bond_feature(bond)]

        bond_features = None  # default when edge features are disabled
        if edge_rows is not None:
            bond_features = np.asarray(edge_rows, dtype=float)

        return GraphData(node_features=atom_features,
                         edge_index=np.asarray([src, dest], dtype=int),
                         edge_features=bond_features)