Example #1
0
def smile2graph(smile,
                add_self_loop=False,
                atom_featurizer=CanonicalAtomFeaturizer(),
                bond_featurizer=None):
    """Build a DGLGraph from a SMILES string.

    The SMILES is parsed with RDKit and the atoms are renumbered with the
    ranking returned by ``rdmolfiles.CanonicalRankAtoms`` before the graph is
    assembled. Node **i** of the returned graph therefore corresponds to
    ``mol.GetAtomWithIdx(i)`` of the *renumbered* molecule.

    Bond **i** of the renumbered molecule, i.e. ``mol.GetBondWithIdx(i)``,
    contributes edges **2i** and **2i+1**: the first goes from **u** to **v**
    and the second from **v** to **u**, where **u** is
    ``bond.GetBeginAtomIdx()`` and **v** is ``bond.GetEndAtomIdx()``.

    When self loops are requested they are appended after all bond edges, one
    per node, in the order returned by ``g.nodes()``.

    Parameters
    ----------
    smile : str
        SMILES string of the molecule.
    add_self_loop : bool
        Whether to add a self loop for every node. Default to False.
    atom_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for atoms; its result is merged into ``g.ndata``.
        Default to CanonicalAtomFeaturizer(). Pass None to skip.
    bond_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for bonds; its result is merged into ``g.edata``.
        Default to None (no bond features).

    Returns
    -------
    g : DGLGraph
        Graph built from the canonically renumbered molecule.
    """
    mol = Chem.MolFromSmiles(smile)
    # Renumber so the node order does not depend on how the SMILES was written.
    mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))

    g = DGLGraph()
    g.add_nodes(mol.GetNumAtoms())

    # Collect (begin, end) atom indices for every bond, in bond-index order.
    bonds = [mol.GetBondWithIdx(k) for k in range(mol.GetNumBonds())]
    endpoints = [(b.GetBeginAtomIdx(), b.GetEndAtomIdx()) for b in bonds]
    # Each bond yields two directed edges: u -> v followed by v -> u.
    src_list = [node for u, v in endpoints for node in (u, v)]
    dst_list = [node for u, v in endpoints for node in (v, u)]
    g.add_edges(src_list, dst_list)

    if add_self_loop:
        all_nodes = g.nodes()
        g.add_edges(all_nodes, all_nodes)

    # Featurization
    if atom_featurizer is not None:
        g.ndata.update(atom_featurizer(mol))
    if bond_featurizer is not None:
        g.edata.update(bond_featurizer(mol))

    return g
Example #2
0
def featurize_smiles_np(arr, featurizer, log_every_N=1000, verbose=True):
    """Featurize every SMILES string stored in a numpy array.

    Each entry of ``arr`` is parsed with RDKit; successfully parsed molecules
    are renumbered into RDKit's canonical atom ranking and then handed to
    ``featurizer.featurize`` as a one-element list. Unparseable entries are
    passed on as ``None``. Results whose ``size`` is 0 are dropped before the
    remaining features are stacked, squeezed and flattened.

    Parameters
    ----------
    arr : numpy.ndarray
        Array of SMILES strings.
    featurizer : object
        Object exposing ``featurize(list_of_mols)``.
    log_every_N : int
        A progress message is emitted for every ``log_every_N``-th entry.
    verbose : bool
        Forwarded to ``log``.

    Returns
    -------
    numpy.ndarray
        One-dimensional array of the retained features.
    """
    from rdkit import Chem
    from rdkit.Chem import rdmolfiles
    from rdkit.Chem import rdmolops

    features = []
    for position, smiles in enumerate(arr.tolist()):
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            # Canonical ranking makes the atom order independent of the input
            # SMILES spelling.
            mol = rdmolops.RenumberAtoms(
                mol, rdmolfiles.CanonicalRankAtoms(mol))
        if position % log_every_N == 0:
            log("Featurizing sample %d" % position, verbose)
        features.append(featurizer.featurize([mol]))

    # Keep only featurizations that actually produced data.
    kept = [feat for feat in features if feat.size > 0]
    flattened = np.squeeze(np.array(kept))
    return flattened.reshape(-1, )
Example #3
0
def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
    """Featurize the SMILES strings stored in one column of a dataframe.

    Every value of ``df[field]`` is parsed with RDKit, canonically renumbered
    when parsing succeeds, and passed to ``featurizer.featurize`` as a
    one-element list. Rows whose featurization has ``size`` 0 are removed from
    the returned feature array and flagged as invalid in the returned mask.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding the SMILES strings.
    featurizer : object
        Object exposing ``featurize(list_of_mols)``.
    field : str
        Name of the column of ``df`` that holds the SMILES strings.
    log_every_N : int
        A progress message is emitted for every ``log_every_N``-th row.
    verbose : bool
        Forwarded to ``log``.

    Returns
    -------
    features : numpy.ndarray
        Stacked features of the valid rows with axis 1 squeezed out.
    valid_inds : numpy.ndarray of bool
        Mask over all rows; True where featurization produced data.
    """

    def _canonical_mol(smiles):
        # TODO (ytz) this is a bandage solution to reorder the atoms so
        # that they're always in the same canonical order. Presumably this
        # should be correctly implemented in the future for graph mols.
        parsed = Chem.MolFromSmiles(smiles)
        if parsed:
            ranking = rdmolfiles.CanonicalRankAtoms(parsed)
            parsed = rdmolops.RenumberAtoms(parsed, ranking)
        return parsed

    features = []
    for row, smiles in enumerate(df[field].tolist()):
        mol = _canonical_mol(smiles)
        if row % log_every_N == 0:
            log("Featurizing sample %d" % row, verbose)
        features.append(featurizer.featurize([mol]))

    valid_inds = np.array([feat.size > 0 for feat in features], dtype=bool)
    kept = [features[k] for k in np.flatnonzero(valid_inds)]
    return np.squeeze(np.array(kept), axis=1), valid_inds
Example #4
0
def smiles2adjoin(smiles, explicit_hydrogens=True, canonical_atom_order=False):
    """Convert a SMILES string into atom symbols and an adjacency matrix.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule. If RDKit cannot parse it, the string
        is passed through ``obsmitosmile`` and parsed again.
    explicit_hydrogens : bool
        If True hydrogens are added with ``Chem.AddHs``; otherwise they are
        removed with ``Chem.RemoveHs``.
    canonical_atom_order : bool
        If True the atoms are renumbered with RDKit's canonical ranking.

    Returns
    -------
    atoms_list : list of str
        Element symbol of every atom, in atom-index order.
    adjoin_matrix : numpy.ndarray of shape (num_atoms, num_atoms)
        Symmetric matrix with 1.0 on the diagonal and for every bonded pair.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Second attempt after converting the string with obsmitosmile.
        print('error')
        mol = Chem.MolFromSmiles(obsmitosmile(smiles))
        assert mol is not None, smiles + ' is not valid '

    mol = Chem.AddHs(mol) if explicit_hydrogens else Chem.RemoveHs(mol)

    if canonical_atom_order:
        ranking = rdmolfiles.CanonicalRankAtoms(mol)
        mol = rdmolops.RenumberAtoms(mol, ranking)

    num_atoms = mol.GetNumAtoms()
    atoms_list = [
        mol.GetAtomWithIdx(idx).GetSymbol() for idx in range(num_atoms)
    ]

    # Identity start means every atom is adjacent to itself.
    adjoin_matrix = np.eye(num_atoms)
    for bond_idx in range(mol.GetNumBonds()):
        bond = mol.GetBondWithIdx(bond_idx)
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        adjoin_matrix[begin, end] = adjoin_matrix[end, begin] = 1.0

    return atoms_list, adjoin_matrix
Example #5
0
def mol_to_graph(mol, graph_constructor, atom_featurizer, bond_featurizer):
    """Turn an RDKit molecule into a featurized DGLGraph.

    The molecule is first renumbered with RDKit's canonical atom ranking; the
    graph and all features are computed from the renumbered molecule.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    graph_constructor : callable
        Takes an RDKit molecule as input and returns a DGLGraph
    atom_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for atoms; its result is merged into ``g.ndata``.
        Skipped when None.
    bond_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for bonds; its result is merged into ``g.edata``.
        Skipped when None.

    Returns
    -------
    g : DGLGraph
        Converted DGLGraph for the molecule
    """
    mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))
    g = graph_constructor(mol)

    # Apply each optional featurizer to its matching data store on the graph.
    for featurize, store_name in ((atom_featurizer, 'ndata'),
                                  (bond_featurizer, 'edata')):
        if featurize is not None:
            getattr(g, store_name).update(featurize(mol))

    return g
Example #6
0
    def featurize(self, molecules, log_every_n=1000) -> np.ndarray:
        """Calculate features for molecules.

        Parameters
        ----------
        molecules: rdkit.Chem.rdchem.Mol / SMILES string / iterable
            RDKit Mol, or SMILES string or iterable sequence of RDKit
            mols/SMILES strings.
        log_every_n: int, default 1000
            Logging messages reported every `log_every_n` samples.

        Returns
        -------
        features: np.ndarray
            A numpy array containing a featurized representation of
            `molecules`. A datapoint that fails to featurize contributes an
            empty array; if that makes the per-datapoint shapes differ, a
            one-dimensional object array of the individual results is
            returned.

        Raises
        ------
        ImportError
            If RDKit is not installed.
        """
        try:
            from rdkit import Chem
            from rdkit.Chem import rdmolfiles
            from rdkit.Chem import rdmolops
            from rdkit.Chem.rdchem import Mol
        except ModuleNotFoundError as err:
            raise ImportError(
                "This class requires RDKit to be installed.") from err

        # Special case handling of single molecule
        if isinstance(molecules, (str, Mol)):
            molecules = [molecules]
        else:
            # Convert iterables to list
            molecules = list(molecules)

        features = []
        for i, datapoint in enumerate(molecules):
            if i % log_every_n == 0:
                logger.info("Featurizing datapoint %i", i)

            mol = datapoint
            try:
                if isinstance(mol, str):
                    # mol must be a RDKit Mol object, so parse a SMILES
                    mol = Chem.MolFromSmiles(mol)
                    # Renumber into RDKit's canonical ranking so the atom
                    # order does not depend on how the SMILES was written.
                    new_order = rdmolfiles.CanonicalRankAtoms(mol)
                    mol = rdmolops.RenumberAtoms(mol, new_order)

                features.append(self._featurize(mol))
            except Exception as e:
                # Report the datapoint as it was supplied. `mol` may have been
                # replaced by None after a failed parse, which would hide the
                # offending SMILES from the log.
                if isinstance(datapoint, Mol):
                    shown = Chem.MolToSmiles(datapoint)
                else:
                    shown = datapoint
                logger.warning(
                    "Failed to featurize datapoint %d, %s. Appending empty array",
                    i, shown)
                logger.warning("Exception message: %s", e)
                features.append(np.array([]))

        try:
            return np.asarray(features)
        except ValueError:
            # Recent NumPy refuses to build an array from differently shaped
            # entries (e.g. real features next to an empty failure marker).
            # Fall back to a 1-D object array holding each result.
            ragged = np.empty(len(features), dtype=object)
            for k, feat in enumerate(features):
                ragged[k] = feat
            return ragged
def fingerprint_features(smile_string, radius=2, size=2048):
    """Return the Morgan fingerprint bit vector of a SMILES string.

    The molecule is parsed, renumbered with RDKit's canonical atom ranking and
    fingerprinted with chirality and bond types taken into account and
    feature invariants disabled.

    Parameters
    ----------
    smile_string : str
        SMILES string of the molecule.
    radius : int
        Radius passed to ``GetMorganFingerprintAsBitVect``.
    size : int
        Number of bits of the fingerprint (``nBits``).

    Returns
    -------
    The object returned by ``rdMolDescriptors.GetMorganFingerprintAsBitVect``.
    """
    parsed = MolFromSmiles(smile_string)
    canonical = rdmolops.RenumberAtoms(
        parsed, rdmolfiles.CanonicalRankAtoms(parsed))
    fp_options = dict(nBits=size,
                      useChirality=True,
                      useBondTypes=True,
                      useFeatures=False)
    return rdMolDescriptors.GetMorganFingerprintAsBitVect(
        canonical, radius, **fp_options)
Example #8
0
    def load_reaction_data(self, file_path):
        """Load reaction data from the raw file.

        Each non-blank line must hold two whitespace-separated fields: the
        atom-mapped reaction and its graph edits. Blank lines and lines whose
        reactants RDKit cannot parse are skipped. The atoms of each reactant
        molecule are renumbered so that the atom with map number ``k`` ends
        up at index ``k - 1``.

        Parameters
        ----------
        file_path : str
            Path to read the file.

        Returns
        -------
        all_mols : list of rdkit.Chem.rdchem.Mol
            RDKit molecule instances
        all_reactions : list of str
            Reactions
        all_graph_edits : list of str
            Graph edits in the reactions.
        """
        all_mols = []
        all_reactions = []
        all_graph_edits = []
        with open(file_path, 'r') as f:
            for line_no, line in enumerate(f):
                if line_no % 10000 == 0:
                    print('Processing line {:d}'.format(line_no))
                # Each line represents a reaction and the corresponding graph edits
                #
                # reaction example:
                # [CH3:14][OH:15].[NH2:12][NH2:13].[OH2:11].[n:1]1[n:2][cH:3][c:4]
                # ([C:7]([O:9][CH3:8])=[O:10])[cH:5][cH:6]1>>[n:1]1[n:2][cH:3][c:4]
                # ([C:7](=[O:9])[NH:12][NH2:13])[cH:5][cH:6]1
                # The reactants are on the left-hand-side of the reaction and the product
                # is on the right-hand-side of the reaction. The numbers represent atom mapping.
                #
                # graph_edits example:
                # 23-33-1.0;23-25-0.0
                # For a triplet a-b-c, a and b are the atoms that form or lose the bond.
                # c specifies the particular change, 0.0 for losing a bond, 1.0, 2.0, 3.0 and
                # 1.5 separately for forming a single, double, triple or aromatic bond.
                record = line.strip("\r\n ")
                if not record:
                    # A blank (e.g. trailing) line carries no reaction.
                    continue
                reaction, graph_edits = record.split()
                reactants = reaction.split('>')[0]
                mol = Chem.MolFromSmiles(reactants)
                if mol is None:
                    continue

                # Reorder atoms according to the order specified in the atom map.
                # A separate loop variable keeps the line counter intact.
                num_atoms = mol.GetNumAtoms()
                atom_map_order = [-1] * num_atoms
                for atom_idx in range(num_atoms):
                    atom = mol.GetAtomWithIdx(atom_idx)
                    atom_map_order[atom.GetIntProp('molAtomMapNumber') - 1] = atom_idx
                mol = rdmolops.RenumberAtoms(mol, atom_map_order)
                all_mols.append(mol)
                all_reactions.append(reaction)
                all_graph_edits.append(graph_edits)

        return all_mols, all_reactions, all_graph_edits
Example #9
0
def mol_to_graph(mol, graph_constructor, node_featurizer, edge_featurizer,
                 canonical_atom_order, explicit_hydrogens):
    """Turn an RDKit molecule into a featurized DGLGraph.

    This is the generic entry point: ``graph_constructor`` decides the graph
    topology, so any ``DGLGraph`` can be built from an RDKit molecule.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    graph_constructor : callable
        Takes an RDKit molecule as input and returns a DGLGraph
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule; its result is
        merged into ``g.ndata``. Skipped when None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule; its result is
        merged into ``g.edata``. Skipped when None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph.
        Hydrogens are added when True and removed when False.

    Returns
    -------
    g : DGLGraph
        Converted DGLGraph for the molecule

    See Also
    --------
    mol_to_bigraph
    mol_to_complete_graph
    mol_to_nearest_neighbor_graph
    """
    # Normalise the hydrogen representation before anything else.
    adjust_hydrogens = Chem.AddHs if explicit_hydrogens else Chem.RemoveHs
    mol = adjust_hydrogens(mol)

    if canonical_atom_order:
        ranking = rdmolfiles.CanonicalRankAtoms(mol)
        mol = rdmolops.RenumberAtoms(mol, ranking)

    g = graph_constructor(mol)

    for featurize, store_name in ((node_featurizer, 'ndata'),
                                  (edge_featurizer, 'edata')):
        if featurize is not None:
            getattr(g, store_name).update(featurize(mol))

    return g
Example #10
0
def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
    """Featurize the SMILES strings stored in one column of a dataframe.

    Two paths exist. When the featurizer's class ``__qualname__`` contains
    ``'Comet'``, the column is preprocessed in bulk with ``preprocess_df`` and
    featurized chunk by chunk through ``featurizer._featurize``. Otherwise
    each SMILES is parsed with RDKit, canonically renumbered when parsing
    succeeds, and passed to ``featurizer.featurize`` as a one-element list.

    In both paths results whose ``size`` is 0 are dropped from the returned
    features and flagged as invalid in the returned mask.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding the SMILES strings.
    featurizer : object
        Featurizer object (see above for the two supported kinds).
    field : str
        Name of the column of ``df`` that holds the SMILES strings.
    log_every_N : int
        Non-Comet path only: a progress message is emitted for every
        ``log_every_N``-th row.
    verbose : bool
        Non-Comet path only: forwarded to ``log``.

    Returns
    -------
    features : numpy.ndarray
        Stacked valid features; axis 1 is squeezed out on the non-Comet path.
    valid_inds : numpy.ndarray of bool
        Mask over all computed results; True where the result is non-empty.
    """
    sample_elems = df[field].tolist()

    features = []
    from rdkit import Chem
    from rdkit.Chem import rdmolfiles
    from rdkit.Chem import rdmolops

    is_comet = 'Comet' in str(featurizer.__class__.__qualname__)

    if is_comet:
        mols = preprocess_df(sample_elems, NUM_WORKERS)
        n_chunks = len(mols) // BATCH_SIZE + 1
        for chunk in np.array_split(mols, n_chunks):
            # presumably each item is an (X, A, L) triple -- confirm against
            # preprocess_df
            X, A, L = list(zip(*chunk))
            X = np.array(X, dtype=np.uint8)
            A = np.array(A, dtype=np.float32)
            L = np.array(L, dtype=np.uint8)
            # Trim to the last entry's length; NOTE(review): looks like this
            # relies on the chunk being ordered by length -- confirm.
            max_len = L[-1]
            X = X[:, :max_len, :]
            A = A[:, :max_len, :max_len]
            features.extend(featurizer._featurize((X, A)))
    else:
        for row, smiles in enumerate(sample_elems):
            mol = Chem.MolFromSmiles(smiles)
            # TODO (ytz) this is a bandage solution to reorder the atoms so
            # that they're always in the same canonical order. Presumably this
            # should be correctly implemented in the future for graph mols.
            if mol:
                mol = rdmolops.RenumberAtoms(
                    mol, rdmolfiles.CanonicalRankAtoms(mol))
            if row % log_every_N == 0:
                log("Featurizing sample %d" % row, verbose)
            features.append(featurizer.featurize([mol]))

    valid_inds = np.array([feat.size > 0 for feat in features], dtype=bool)
    kept = [features[k] for k in np.flatnonzero(valid_inds)]
    stacked = np.array(kept)
    if is_comet:
        return stacked, valid_inds
    return np.squeeze(stacked, axis=1), valid_inds
Example #11
0
def fingerprint_features(smile_string, radius=2, size=256):
    """Return the Morgan fingerprint of a SMILES string as a numpy array.

    The molecule is parsed, renumbered with RDKit's canonical atom ranking and
    fingerprinted with chirality and bond types taken into account and
    feature invariants disabled. The bit vector is then copied into a numpy
    array via ``DataStructs.ConvertToNumpyArray``.

    Parameters
    ----------
    smile_string : str
        SMILES string of the molecule.
    radius : int
        Radius passed to ``GetMorganFingerprintAsBitVect``.
    size : int
        Number of bits of the fingerprint (``nBits``).

    Returns
    -------
    arr : numpy.ndarray
        Array (created with dtype ``int8``) filled from the fingerprint.
    """
    parsed = MolFromSmiles(smile_string)
    ranking = rdmolfiles.CanonicalRankAtoms(parsed)
    canonical = rdmolops.RenumberAtoms(parsed, ranking)

    # Start from an empty buffer; ConvertToNumpyArray fills it in place.
    arr = np.zeros((0,), dtype=np.int8)
    bit_vector = rdMolDescriptors.GetMorganFingerprintAsBitVect(
        canonical,
        radius,
        nBits=size,
        useChirality=True,
        useBondTypes=True,
        useFeatures=False)
    DataStructs.ConvertToNumpyArray(bit_vector, arr)
    return arr
Example #12
0
def build_graph_from_molecule(mol, use_master_atom=False):
    """Build node features and an adjacency list from an RDKit molecule.

    Param:
        mol - rdkit.Chem.rdchem.Mol
        use_master_atom - bool. When True an extra "master" node is appended
                          whose features are the mean of all atom features;
                          every atom gets the master node as a neighbour.
    Output:
        nodes - np.ndarray with one row per node (atoms, then the optional
                master node), built by stacking ``encode_atom`` results
        canon_adj_list - list. index corresponds to the index of node
                         and canon_adj_list[index] holds the indices of the
                         nodes that node is connected to.
    Raises:
        TypeError if ``mol`` is not an rdkit.Chem.rdchem.Mol
    """
    if not isinstance(mol, Chem.rdchem.Mol):
        raise TypeError("'mol' must be rdkit.Chem.rdchem.Mol obj")

    # Same approach as deepchem.data.data_loader featurize_smiles_df:
    # TODO (ytz) this is a bandage solution to reorder the atoms so
    # that they're always in the same canonical order. Presumably this
    # should be correctly implemented in the future for graph mols.
    mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))

    # Encode every atom and order the encodings by atom index.
    indexed = sorted(((atom.GetIdx(), encode_atom(atom))
                      for atom in mol.GetAtoms()),
                     key=lambda pair: pair[0])
    _, atom_rows = zip(*indexed)
    nodes = np.vstack(atom_rows)

    # Master atom is the "average" of all atoms that is connected to all atom
    # Introduced in https://arxiv.org/pdf/1704.01212.pdf
    if use_master_atom:
        nodes = np.concatenate(
            [nodes, nodes.mean(axis=0, keepdims=True)], axis=0)

    canon_adj_list = [[] for _ in range(len(nodes))]
    for bond in mol.GetBonds():
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        canon_adj_list[begin].append(end)
        canon_adj_list[end].append(begin)

    if use_master_atom:
        # The master node is the last row; link every real atom to it.
        master_idx = len(nodes) - 1
        for neighbours in canon_adj_list[:-1]:
            neighbours.append(master_idx)

    return (nodes, canon_adj_list)
Example #13
0
    def featurize(self, molecules, log_every_n=1000):
        """Calculate features for molecules.

        Parameters
        ----------
        molecules: RDKit Mol / SMILES string / iterable
            RDKit Mol, or SMILES string or iterable sequence of RDKit
            mols/SMILES strings.
        log_every_n: int, default 1000
            An info message is logged for every `log_every_n`-th datapoint.

        Returns
        -------
        A numpy array containing a featurized representation of `molecules`.
        A datapoint that fails to featurize contributes an empty array; if
        that makes the per-datapoint shapes differ, a one-dimensional object
        array of the individual results is returned.

        Raises
        ------
        ValueError
            If RDKit is not installed.
        """
        try:
            from rdkit import Chem
            from rdkit.Chem import rdmolfiles
            from rdkit.Chem import rdmolops
            from rdkit.Chem.rdchem import Mol
        except ModuleNotFoundError as err:
            # ValueError is kept (rather than ImportError) for callers that
            # already catch it.
            raise ValueError(
                "This class requires RDKit to be installed.") from err
        # Special case handling of single molecule
        if isinstance(molecules, (str, Mol)):
            molecules = [molecules]
        else:
            # Convert iterables to list
            molecules = list(molecules)
        features = []
        for i, mol in enumerate(molecules):
            if i % log_every_n == 0:
                logger.info("Featurizing datapoint %i", i)
            try:
                # Process only case of SMILES strings.
                if isinstance(mol, str):
                    # mol must be a SMILES string so parse
                    mol = Chem.MolFromSmiles(mol)
                    # TODO (ytz) this is a bandage solution to reorder the atoms
                    # so that they're always in the same canonical order.
                    # Presumably this should be correctly implemented in the
                    # future for graph mols.
                    if mol:
                        new_order = rdmolfiles.CanonicalRankAtoms(mol)
                        mol = rdmolops.RenumberAtoms(mol, new_order)
                features.append(self._featurize(mol))
            except Exception as e:
                # Catch Exception only, so KeyboardInterrupt/SystemExit still
                # propagate, and pass the index the message asks for.
                logger.warning(
                    "Failed to featurize datapoint %d. Appending empty array",
                    i)
                logger.warning("Exception message: %s", e)
                features.append(np.array([]))

        try:
            return np.asarray(features)
        except ValueError:
            # Recent NumPy refuses to build an array from differently shaped
            # entries (e.g. real features next to an empty failure marker).
            # Fall back to a 1-D object array holding each result.
            ragged = np.empty(len(features), dtype=object)
            for k, feat in enumerate(features):
                ragged[k] = feat
            return ragged
Example #14
0
def featurize_smiles(arr):
    """Featurize an array of SMILES strings with DeepChem's ConvMolFeaturizer.

    Each entry of ``arr`` is parsed with RDKit, canonically renumbered when
    parsing succeeds, and passed to the featurizer as a one-element list
    (``None`` is passed for unparseable entries). Results whose ``size`` is 0
    are dropped from the returned features and flagged in the returned mask.

    Parameters
    ----------
    arr : numpy.ndarray
        Array of SMILES strings.

    Returns
    -------
    features : numpy.ndarray
        One-dimensional array of the retained features.
    valid_inds : numpy.ndarray of bool
        Mask over all entries; True where featurization produced data.
    """
    featurizer = dc.feat.ConvMolFeaturizer()

    features = []
    for smiles in arr.tolist():
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            # Canonical ranking makes the atom order independent of the input
            # SMILES spelling.
            mol = rdmolops.RenumberAtoms(
                mol, rdmolfiles.CanonicalRankAtoms(mol))
        features.append(featurizer([mol]))

    valid_inds = np.array([feat.size > 0 for feat in features], dtype=bool)
    kept = [features[k] for k in np.flatnonzero(valid_inds)]
    flattened = np.squeeze(np.array(kept))
    return flattened.reshape(-1, ), valid_inds
Example #15
0
def _featurize_smiles_df(df, featurizer, field, log_every_n=1000):
    """Featurize the SMILES strings stored in one column of a dataframe.

    Private helper: every value of ``df[field]`` is parsed with RDKit,
    canonically renumbered when parsing succeeds, and passed to
    ``featurizer.featurize`` as a one-element list. Rows whose featurization
    has ``size`` 0 are dropped from the returned features.

    Parameters
    ----------
    df: pd.DataFrame
        DataFrame that holds SMILES strings
    featurizer: Featurizer
        A featurizer object
    field: str
        The name of a column in `df` that holds SMILES strings
    log_every_n: int, optional (default 1000)
        Emit a logging statement every `log_every_n` rows.

    Returns
    -------
    features: np.ndarray
        Stacked features of the valid rows with axis 1 squeezed out.
    valid_inds: np.ndarray of bool
        Mask over all rows; True where featurization produced data.

    Note
    ----
    This function requires RDKit to be installed
    """
    from rdkit import Chem
    from rdkit.Chem import rdmolfiles
    from rdkit.Chem import rdmolops

    def _parse_canonical(smiles):
        # TODO (ytz) this is a bandage solution to reorder the atoms
        # so that they're always in the same canonical order.
        # Presumably this should be correctly implemented in the
        # future for graph mols.
        parsed = Chem.MolFromSmiles(smiles)
        if parsed:
            ranking = rdmolfiles.CanonicalRankAtoms(parsed)
            parsed = rdmolops.RenumberAtoms(parsed, ranking)
        return parsed

    features = []
    for row, smiles in enumerate(df[field].tolist()):
        mol = _parse_canonical(smiles)
        if row % log_every_n == 0:
            logger.info("Featurizing sample %d" % row)
        features.append(featurizer.featurize([mol]))

    valid_inds = np.array([feat.size > 0 for feat in features], dtype=bool)
    kept = [feat for feat in features if feat.size > 0]
    return np.squeeze(np.array(kept), axis=1), valid_inds
Example #16
0
def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
    """Featurize the SMILES strings in one dataframe column into torch tensors.

    Every value of ``df[field]`` is parsed with RDKit, canonically renumbered
    when parsing succeeds, and passed to ``featurizer.featurize`` as a
    one-element list. Rows whose featurization has ``size`` 0 are dropped
    from the returned features and flagged as invalid in the returned mask.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding the SMILES strings.
    featurizer : object
        Object exposing ``featurize(list_of_mols)``.
    field : str
        Name of the column of ``df`` that holds the SMILES strings.
    log_every_N : int
        A progress message is emitted for every ``log_every_N``-th row.
    verbose : bool
        Forwarded to ``log``.

    Returns
    -------
    features : torch.Tensor
        Stacked features of the valid rows with dimension 1 squeezed out.
    valid_inds : torch.Tensor of dtype torch.bool
        Mask over all rows; True where featurization produced data.
    """
    sample_elems = df[field].tolist()

    features = []
    for ind, elem in enumerate(sample_elems):
        mol = Chem.MolFromSmiles(elem)
        if mol:
            # Canonical ranking makes the atom order independent of the input
            # SMILES spelling.
            new_order = rdmolfiles.CanonicalRankAtoms(mol)
            mol = rdmolops.RenumberAtoms(mol, new_order)
        if ind % log_every_N == 0:
            log("Featurizing sample %d" % ind, verbose)
        features.append(featurizer.featurize([mol]))

    # The legacy torch.Tensor constructor takes no dtype argument, so the
    # mask has to be built with the torch.tensor factory.
    valid_flags = [elt.size > 0 for elt in features]
    valid_inds = torch.tensor(valid_flags, dtype=torch.bool)
    features = [
        elt for (is_valid, elt) in zip(valid_flags, features) if is_valid
    ]
    return torch.squeeze(torch.Tensor(features), dim=1), valid_inds
Example #17
0
def mol_to_graph(mol, graph_constructor, node_featurizer, edge_featurizer,
                 canonical_atom_order):
    """Turn an RDKit molecule into a featurized DGLGraph.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    graph_constructor : callable
        Takes an RDKit molecule as input and returns a DGLGraph
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule; its result is
        merged into ``g.ndata``. Skipped when None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule; its result is
        merged into ``g.edata``. Skipped when None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed.

    Returns
    -------
    g : DGLGraph
        Converted DGLGraph for the molecule
    """
    if canonical_atom_order:
        ranking = rdmolfiles.CanonicalRankAtoms(mol)
        mol = rdmolops.RenumberAtoms(mol, ranking)

    g = graph_constructor(mol)

    # Graph and features are both derived from the (possibly renumbered) mol.
    if node_featurizer is not None:
        node_feats = node_featurizer(mol)
        g.ndata.update(node_feats)

    if edge_featurizer is not None:
        edge_feats = edge_featurizer(mol)
        g.edata.update(edge_feats)

    return g
    #Calculate Boltzmann averaged
    k_B = 0.0019872041
    #in kcal mol-1 K-1 as MOE gives energies in kcal mol-1
    T = 298.15
    #Room temperature in K.
    k_BT = k_B * T
    sum_exp_EkBT = 0
    for i in range(0, len(conformers), 1):
        sum_exp_EkBT += math.exp(-energies[i] / k_BT)

    writer = Chem.SDWriter(args.output_file)
    #Loop over the conformer molecules and renumber them
    for i in range(0, len(conformers)):
        m = rdmolops.RenumberAtoms(
            conformers[i],
            ssm)  #renumber molecules to match atom numbering in RefMol

        #calculate distances of restraints and indicate if both average restraints =<5Angstrom in the conformer
        restraint_1_dist = (Chem.rdMolTransforms.GetBondLength(
            m.GetConformer(), atom_dict[2],
            atom_dict[38]) + Chem.rdMolTransforms.GetBondLength(
                m.GetConformer(), atom_dict[4], atom_dict[38])) / 2
        restraint_2_dist = (Chem.rdMolTransforms.GetBondLength(
            m.GetConformer(), atom_dict[1],
            atom_dict[28]) + Chem.rdMolTransforms.GetBondLength(
                m.GetConformer(), atom_dict[5], atom_dict[28])) / 2

        m.SetProp("restraint_1_dist", str(restraint_1_dist))
        m.SetProp("restraint_2_dist", str(restraint_2_dist))
        if ((restraint_1_dist <= args.cutoff)
Example #19
0
def update_ti_atoms(mol_list, off_list):
    """Reorder and rename the atoms of two molecules, matched atoms first.

    ``compare_mols`` is run on the two atom lists in ``off_list``; the atoms
    of ``off_list[0]`` at the returned indices form the reference set
    (presumably the common substructure of the pair -- confirm against
    ``compare_mols``). For each molecule, atoms that ``compare_atom`` matches
    against a reference atom are flagged ``core = True``, all others
    ``core = False``. Every atom is then renamed to its element symbol
    followed by a per-element running counter (starting at 1), core atoms
    first. Finally both the RDKit molecule copy and the atom list are
    reordered so that core atoms precede the remaining atoms.

    The atom objects in ``off_list`` are modified in place (``core`` and
    ``name``); the input RDKit molecules are copied before renumbering.

    Parameters
    ----------
    mol_list : sequence of two rdkit.Chem.rdchem.Mol
    off_list : sequence of two indexable atom collections, aligned with
        ``mol_list``; atoms expose ``element``, ``name`` and ``core``.

    Returns
    -------
    out_mols : list of rdkit.Chem.rdchem.Mol
        Renumbered copies of the input molecules.
    out_off : list of list
        The atom objects of each molecule in the same new order.
    """
    assert len(mol_list) == 2
    assert len(off_list) == 2

    periodic = {
        '6': 'C',
        '1': 'H',
        '8': 'O',
        '7': 'N',
        '17': 'Cl',
        '9': 'F',
        '16': 'S',
        '35': 'Br',
        '15': 'P',
        '53': 'I'
    }

    matches = compare_mols(off_list[0], off_list[1])
    MCS_atoms_amber = [off_list[0][idx] for idx in matches]

    out_mols = []
    out_off = []
    for mol, mol_amber in zip(mol_list, off_list):
        # Per-element counters used to build unique atom names.
        ele_count = dict.fromkeys((6, 1, 8, 7, 17, 9, 16, 35, 15, 53), 1)

        def _rename(atom):
            # Name = element symbol + running count for that element.
            symbol = periodic[str(atom.element)]
            atomic_num = int(atom.element)
            atom.name = symbol + str(ele_count[atomic_num])
            ele_count[atomic_num] += 1

        mol_copy = Chem.Mol(mol)
        n_atoms = len(mol.GetAtoms())

        # Indices matching a reference atom, in reference-atom order.
        write_core = []
        for ref_atom in MCS_atoms_amber:
            for j in range(n_atoms):
                if compare_atom(ref_atom,
                                mol_amber[j]) and j not in write_core:
                    write_core.append(j)

        # Everything else, in ascending index order.
        write_last = [j for j in range(n_atoms) if j not in write_core]

        for j in range(n_atoms):
            mol_amber[j].core = j in write_core

        for j in write_core:
            _rename(mol_amber[j])

        for j in range(n_atoms):
            if not mol_amber[j].core:
                _rename(mol_amber[j])

        new_order = write_core + write_last

        # return a re-ordered mol
        out_mols.append(rdmolops.RenumberAtoms(mol_copy, new_order))

        # return the matching re-ordered amber atom list
        out_off.append([mol_amber[j] for j in new_order])

    return out_mols, out_off
Example #20
0
def mol_to_nearest_neighbor_graph(mol,
                                  coordinates,
                                  neighbor_cutoff,
                                  max_num_neighbors=None,
                                  p_distance=2,
                                  add_self_loop=False,
                                  node_featurizer=None,
                                  edge_featurizer=None,
                                  canonical_atom_order=True,
                                  keep_dists=False,
                                  dist_field='dist',
                                  explicit_hydrogens=False,
                                  num_virtual_nodes=0):
    """Convert an RDKit molecule into a nearest neighbor graph and featurize for it.

    Different from bigraph and complete graph, the nearest neighbor graph
    may not be symmetric since i is the closest neighbor of j does not
    necessarily suggest the other way.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than neighbor_cutoff,
        they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski (:math:`l_p`)
        distance. When ``p_distance = 1``, Minkowski distance is equivalent to
        Manhattan distance. When ``p_distance = 2``, Minkowski distance is
        equivalent to the standard Euclidean distance. Default to 2.
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed. Default
        to True.
    keep_dists : bool
        Whether to store the distance between neighboring atoms in ``edata`` of the
        constructed DGLGraphs. Default to False.
    dist_field : str
        Field for storing distance between neighboring atoms in ``edata``. This comes
        into effect only when ``keep_dists=True``. Default to ``'dist'``.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If True,
        it will call rdkit.Chem.AddHs(mol). Default to False.
    num_virtual_nodes : int
        The number of virtual nodes to add. The virtual nodes will be connected to
        all real nodes with virtual edges. If the returned graph has any node/edge
        feature, an additional column of binary values will be used for each feature
        to indicate the identity of virtual node/edges (1 for virtual, 0 for real).
        The features of the virtual nodes/edges will be zero vectors except for the
        additional column. Default to 0.

    Returns
    -------
    DGLGraph or None
        Nearest neighbor DGLGraph for the molecule if :attr:`mol` is valid and None otherwise.

    Raises
    ------
    AssertionError
        If the number of atoms differs from ``coordinates.shape[0]``, or if
        ``keep_dists`` is True and :attr:`dist_field` is already a key of ``edata``
        (for example because the edge featurizer produced it).

    Examples
    --------
    >>> from dgllife.utils import mol_to_nearest_neighbor_graph
    >>> from rdkit import Chem
    >>> from rdkit.Chem import AllChem

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25)
    >>> print(g)
    DGLGraph(num_nodes=23, num_edges=6,
             ndata_schemes={}
             edata_schemes={})

    Quite often we will want to use the distance between end atoms of edges, this can be
    achieved with

    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25, keep_dists=True)
    >>> print(g.edata['dist'])
    tensor([[1.2024],
            [1.2024],
            [1.2270],
            [1.2270],
            [1.2259],
            [1.2259]])

    By default, we do not explicitly represent hydrogens as nodes, which can be done as follows.

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> mol = Chem.AddHs(mol)
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25,
    ...                                   explicit_hydrogens=True)
    >>> print(g)
    DGLGraph(num_nodes=41, num_edges=42,
             ndata_schemes={}
             edata_schemes={})

    See Also
    --------
    get_mol_3d_coordinates
    k_nearest_neighbors
    smiles_to_nearest_neighbor_graph
    """
    if mol is None:
        print('Invalid mol found')
        return None

    if explicit_hydrogens:
        mol = Chem.AddHs(mol)

    # The coordinates must cover every atom, including any hydrogens just added.
    num_atoms = mol.GetNumAtoms()
    num_coords = coordinates.shape[0]
    assert num_atoms == num_coords, \
        'Expect the number of atoms to match the first dimension of coordinates, ' \
        'got {:d} and {:d}'.format(num_atoms, num_coords)

    if canonical_atom_order:
        # NOTE(review): only ``mol`` is renumbered here; ``coordinates`` is passed to
        # k_nearest_neighbors in the caller's row order while the featurizers below
        # see the renumbered molecule. Confirm that callers expect this pairing.
        new_order = rdmolfiles.CanonicalRankAtoms(mol)
        mol = rdmolops.RenumberAtoms(mol, new_order)

    srcs, dsts, dists = k_nearest_neighbors(
        coordinates=coordinates,
        neighbor_cutoff=neighbor_cutoff,
        max_num_neighbors=max_num_neighbors,
        p_distance=p_distance,
        self_loops=add_self_loop)
    g = dgl.graph(([], []), idtype=torch.int32)

    # Add nodes first since some nodes may be completely isolated
    g.add_nodes(num_atoms)

    # Add edges
    g.add_edges(srcs, dsts)

    if node_featurizer is not None:
        g.ndata.update(node_featurizer(mol))

    if edge_featurizer is not None:
        g.edata.update(edge_featurizer(mol))

    if keep_dists:
        assert dist_field not in g.edata, \
            'Expect {} to be reserved for distance between neighboring atoms.'.format(
                dist_field)
        g.edata[dist_field] = torch.tensor(dists).float().reshape(-1, 1)

    if num_virtual_nodes > 0:
        # Remember how many real nodes/edges exist so that the virtual ones,
        # which are appended after them, can be addressed by a positive offset.
        num_real_nodes = g.num_nodes()
        num_real_edges = g.num_edges()
        real_nodes = list(range(num_real_nodes))
        g.add_nodes(num_virtual_nodes)

        # Change Topology: connect every virtual node to every real node
        # in both directions.
        virtual_src = []
        virtual_dst = []
        for count in range(num_virtual_nodes):
            virtual_node = num_real_nodes + count
            virtual_node_copy = [virtual_node] * num_real_nodes
            virtual_src.extend(real_nodes)
            virtual_src.extend(virtual_node_copy)
            virtual_dst.extend(virtual_node_copy)
            virtual_dst.extend(real_nodes)
        g.add_edges(virtual_src, virtual_dst)

        # Append the indicator column and set it to 1 on the virtual rows only,
        # as described in the docstring.
        for nk, nv in g.ndata.items():
            nv = torch.cat([nv, torch.zeros(g.num_nodes(), 1)], dim=1)
            nv[num_real_nodes:, -1] = 1
            g.ndata[nk] = nv

        for ek, ev in g.edata.items():
            ev = torch.cat([ev, torch.zeros(g.num_edges(), 1)], dim=1)
            ev[num_real_edges:, -1] = 1
            g.edata[ek] = ev

    return g
Example #21
0
def mol_to_graph(mol,
                 graph_constructor,
                 node_featurizer,
                 edge_featurizer,
                 canonical_atom_order,
                 explicit_hydrogens,
                 num_virtual_nodes=0):
    """Build a featurized DGLGraph from an RDKit molecule.

    The topology comes entirely from :attr:`graph_constructor`, so this function
    can be used to construct any arbitrary ``DGLGraph`` from an RDKit molecule
    instance.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    graph_constructor : callable
        Takes an RDKit molecule as input and returns a DGLGraph
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to
        update ndata for a DGLGraph. Skipped when None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to
        update edata for a DGLGraph. Skipped when None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If True,
        it will call rdkit.Chem.AddHs(mol).
    num_virtual_nodes : int
        The number of virtual nodes to add. The virtual nodes will be connected to
        all real nodes with virtual edges. If the returned graph has any node/edge
        feature, an additional column of binary values will be used for each feature
        to indicate the identity of virtual node/edges. The features of the virtual
        nodes/edges will be zero vectors except for the additional column. Default to 0.

    Returns
    -------
    DGLGraph or None
        Converted DGLGraph for the molecule if :attr:`mol` is valid and None otherwise.

    See Also
    --------
    mol_to_bigraph
    mol_to_complete_graph
    mol_to_nearest_neighbor_graph
    """
    if mol is None:
        print('Invalid mol found')
        return None

    # Optionally materialize hydrogens as nodes, then optionally renumber atoms.
    if explicit_hydrogens:
        mol = Chem.AddHs(mol)
    if canonical_atom_order:
        mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))

    g = graph_constructor(mol)

    if node_featurizer is not None:
        g.ndata.update(node_featurizer(mol))
    if edge_featurizer is not None:
        g.edata.update(edge_featurizer(mol))

    # Nothing more to do unless virtual nodes were requested.
    if not (num_virtual_nodes > 0):
        return g

    n_real = g.num_nodes()
    real_ids = list(range(n_real))
    g.add_nodes(num_virtual_nodes)

    # Each virtual node gets real -> virtual edges followed by virtual -> real edges.
    src_ids = []
    dst_ids = []
    for virtual_id in range(n_real, n_real + num_virtual_nodes):
        repeated = [virtual_id] * n_real
        src_ids += real_ids + repeated
        dst_ids += repeated + real_ids
    g.add_edges(src_ids, dst_ids)

    # Append a trailing indicator column that is 1 on the rows of virtual nodes.
    for key, feat in g.ndata.items():
        flag = torch.zeros(g.num_nodes(), 1)
        flag[-num_virtual_nodes:] = 1
        g.ndata[key] = torch.cat([feat, flag], dim=1)

    # Same for edges: the virtual edges are the last ones added.
    n_virtual_edges = num_virtual_nodes * n_real * 2
    for key, feat in g.edata.items():
        flag = torch.zeros(g.num_edges(), 1)
        flag[-n_virtual_edges:] = 1
        g.edata[key] = torch.cat([feat, flag], dim=1)

    return g
Example #22
0
def mol_to_nearest_neighbor_graph(mol,
                                  coordinates,
                                  neighbor_cutoff,
                                  max_num_neighbors=None,
                                  p_distance=2,
                                  add_self_loop=False,
                                  node_featurizer=None,
                                  edge_featurizer=None,
                                  canonical_atom_order=True,
                                  keep_dists=False,
                                  dist_field='dist'):
    """Convert an RDKit molecule into a nearest neighbor graph and featurize for it.

    Different from bigraph and complete graph, the nearest neighbor graph
    may not be symmetric since i is the closest neighbor of j does not
    necessarily suggest the other way.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than neighbor_cutoff,
        they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski (:math:`l_p`)
        distance. When ``p_distance = 1``, Minkowski distance is equivalent to
        Manhattan distance. When ``p_distance = 2``, Minkowski distance is
        equivalent to the standard Euclidean distance. Default to 2.
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed. Default
        to True.
    keep_dists : bool
        Whether to store the distance between neighboring atoms in ``edata`` of the
        constructed DGLGraphs. Default to False.
    dist_field : str
        Field for storing distance between neighboring atoms in ``edata``. This comes
        into effect only when ``keep_dists=True``. Default to ``'dist'``.

    Returns
    -------
    DGLGraph
        Nearest neighbor DGLGraph for the molecule

    Raises
    ------
    AssertionError
        If ``keep_dists`` is True and :attr:`dist_field` is already a key of
        ``edata`` (for example because the edge featurizer produced it).
    """
    if canonical_atom_order:
        # NOTE(review): only ``mol`` is renumbered here; ``coordinates`` is passed to
        # k_nearest_neighbors in the caller's row order while the featurizers below
        # see the renumbered molecule. Confirm that callers expect this pairing.
        new_order = rdmolfiles.CanonicalRankAtoms(mol)
        mol = rdmolops.RenumberAtoms(mol, new_order)

    srcs, dsts, dists = k_nearest_neighbors(
        coordinates=coordinates,
        neighbor_cutoff=neighbor_cutoff,
        max_num_neighbors=max_num_neighbors,
        p_distance=p_distance,
        self_loops=add_self_loop)
    g = DGLGraph()

    # Add nodes first since some nodes may be completely isolated
    num_atoms = mol.GetNumAtoms()
    g.add_nodes(num_atoms)

    # Add edges
    g.add_edges(srcs, dsts)

    if node_featurizer is not None:
        g.ndata.update(node_featurizer(mol))

    if edge_featurizer is not None:
        g.edata.update(edge_featurizer(mol))

    if keep_dists:
        # The message names the conflicting field so the failure is actionable.
        assert dist_field not in g.edata, \
            'Expect {} to be reserved for distance between neighboring atoms.'.format(
                dist_field)
        g.edata[dist_field] = torch.tensor(dists).float().reshape(-1, 1)

    return g
Example #23
0
def mol_to_nearest_neighbor_graph(mol,
                                  coordinates,
                                  neighbor_cutoff,
                                  max_num_neighbors=None,
                                  p_distance=2,
                                  add_self_loop=False,
                                  node_featurizer=None,
                                  edge_featurizer=None,
                                  canonical_atom_order=True,
                                  keep_dists=False,
                                  dist_field='dist',
                                  explicit_hydrogens=False):
    """Convert an RDKit molecule into a nearest neighbor graph and featurize for it.

    Different from bigraph and complete graph, the nearest neighbor graph
    may not be symmetric since i is the closest neighbor of j does not
    necessarily suggest the other way.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than neighbor_cutoff,
        they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski (:math:`l_p`)
        distance. When ``p_distance = 1``, Minkowski distance is equivalent to
        Manhattan distance. When ``p_distance = 2``, Minkowski distance is
        equivalent to the standard Euclidean distance. Default to 2.
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed. Default
        to True.
    keep_dists : bool
        Whether to store the distance between neighboring atoms in ``edata`` of the
        constructed DGLGraphs. Default to False.
    dist_field : str
        Field for storing distance between neighboring atoms in ``edata``. This comes
        into effect only when ``keep_dists=True``. Default to ``'dist'``.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. If True,
        rdkit.Chem.AddHs(mol) is called; otherwise rdkit.Chem.RemoveHs(mol) is
        called. Default to False.

    Returns
    -------
    g : DGLGraph
        Nearest neighbor DGLGraph for the molecule

    Raises
    ------
    AssertionError
        If the number of atoms (after adding/removing hydrogens) differs from
        ``coordinates.shape[0]``, or if ``keep_dists`` is True and
        :attr:`dist_field` is already a key of ``edata``.

    Examples
    --------
    >>> from dgllife.utils import mol_to_nearest_neighbor_graph
    >>> from rdkit import Chem
    >>> from rdkit.Chem import AllChem

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25)
    >>> print(g)
    DGLGraph(num_nodes=23, num_edges=6,
             ndata_schemes={}
             edata_schemes={})

    Quite often we will want to use the distance between end atoms of edges, this can be
    achieved with

    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25, keep_dists=True)
    >>> print(g.edata['dist'])
    tensor([[1.2024],
            [1.2024],
            [1.2270],
            [1.2270],
            [1.2259],
            [1.2259]])

    By default, we do not explicitly represent hydrogens as nodes, which can be done as follows.

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> mol = Chem.AddHs(mol)
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25,
    ...                                   explicit_hydrogens=True)
    >>> print(g)
    DGLGraph(num_nodes=41, num_edges=42,
             ndata_schemes={}
             edata_schemes={})

    See Also
    --------
    get_mol_3d_coordinates
    k_nearest_neighbors
    smiles_to_nearest_neighbor_graph
    """
    if explicit_hydrogens:
        mol = Chem.AddHs(mol)
    else:
        mol = Chem.RemoveHs(mol)

    # The coordinates must match the atom count after hydrogens were added/removed.
    num_atoms = mol.GetNumAtoms()
    num_coords = coordinates.shape[0]
    assert num_atoms == num_coords, \
        'Expect the number of atoms to match the first dimension of coordinates, ' \
        'got {:d} and {:d}'.format(num_atoms, num_coords)

    if canonical_atom_order:
        # NOTE(review): only ``mol`` is renumbered here; ``coordinates`` is passed to
        # k_nearest_neighbors in the caller's row order while the featurizers below
        # see the renumbered molecule. Confirm that callers expect this pairing.
        new_order = rdmolfiles.CanonicalRankAtoms(mol)
        mol = rdmolops.RenumberAtoms(mol, new_order)

    srcs, dsts, dists = k_nearest_neighbors(
        coordinates=coordinates,
        neighbor_cutoff=neighbor_cutoff,
        max_num_neighbors=max_num_neighbors,
        p_distance=p_distance,
        self_loops=add_self_loop)
    g = DGLGraph()

    # Add nodes first since some nodes may be completely isolated
    g.add_nodes(num_atoms)

    # Add edges
    g.add_edges(srcs, dsts)

    if node_featurizer is not None:
        g.ndata.update(node_featurizer(mol))

    if edge_featurizer is not None:
        g.edata.update(edge_featurizer(mol))

    if keep_dists:
        # The message names the conflicting field so the failure is actionable.
        assert dist_field not in g.edata, \
            'Expect {} to be reserved for distance between neighboring atoms.'.format(
                dist_field)
        g.edata[dist_field] = torch.tensor(dists).float().reshape(-1, 1)

    return g