Beispiel #1
0
def construct_RGCN_bigraph_from_smiles(smiles):
    """Build a bidirected DGL graph for the molecule described by ``smiles``.

    Every atom becomes one node and every bond becomes two directed edges
    (begin -> end and end -> begin) that carry the same edge type.

    Node data
        "atom"   : tensor built from ``atom_features(atom).tolist()`` per atom.
    Edge data
        "etype"  : tensor built from ``etype_features(bond)``, one entry per
                   directed edge.
        "normal" : for each directed edge, the share of all directed edges
                   that have an equal edge type, rounded to one decimal.

    Returns the populated graph.
    """
    graph = DGLGraph()

    # Nodes: one per atom, with the featurised atoms stored under "atom".
    mol = MolFromSmiles(smiles)
    graph.add_nodes(mol.GetNumAtoms())
    graph.ndata["atom"] = torch.tensor(
        [atom_features(atom).tolist() for atom in mol.GetAtoms()]
    )

    # Edges: each bond contributes a forward and a backward edge.
    sources, targets, edge_types = [], [], []
    for bond_idx in range(mol.GetNumBonds()):
        bond = mol.GetBondWithIdx(bond_idx)
        bond_type = etype_features(bond)
        begin = bond.GetBeginAtomIdx()
        end = bond.GetEndAtomIdx()
        sources += [begin, end]
        targets += [end, begin]
        edge_types += [bond_type, bond_type]
    graph.add_edges(sources, targets)

    # Relative frequency of each edge's type among all directed edges.
    n_edges = len(edge_types)
    frequencies = [
        round(edge_types.count(edge_type) / n_edges, 1)
        for edge_type in edge_types
    ]

    graph.edata["etype"] = torch.tensor(edge_types)
    graph.edata["normal"] = torch.tensor(frequencies)
    return graph
Beispiel #2
0
def processline(t, step, line):
    """Run one benchmark step on ``line`` and add its size measure to ``lensum``.

    ``t.incr()`` is called first; when it returns a truthy value nothing else
    happens and 1 is returned. Otherwise the step selected by ``step`` runs,
    the module-level ``lensum`` counter is increased and 0 is returned.

    Step 0 only measures the raw line. Every other step first parses ``line``
    with ``MolFromSmiles`` (even when the parsed molecule is not used
    afterwards) and then dispatches on ``step``.

    Raises ValueError for a step number that is not implemented.
    """
    global lensum

    if t.incr():
        return 1

    # All steps except the baseline pay the cost of parsing the molecule.
    if step != 0:
        m = MolFromSmiles(line)

    if step in (0, 100):
        lensum += len(line)
    elif step == 105:
        lensum += len(sha256(line).hexdigest())
    elif step in (110, 120):
        # Write the line to the scratch file; step 120 also forces it to disk.
        with open(tmpname, 'wb+') as f:
            print(line, file=f)
            if step == 120:
                os.fsync(f.fileno())
        lensum += os.stat(tmpname).st_size
    elif step == 210:
        lensum += m.GetNumAtoms()
    elif step == 220:
        lensum += m.GetNumBonds()
    elif step == 300:
        lensum += len(MolToSmiles(m))
    elif step == 400:
        lensum += len(MolToMolBlock(m))
    elif step == 420:
        # Embed with hydrogens, strip them again, then serialise.
        m2 = AddHs(m)
        EmbedMolecule(m2, randomSeed=2020)
        m2 = RemoveHs(m2)
        m2.SetProp("_Name", "test")
        lensum += len(MolToMolBlock(m2))
    elif step == 600:
        lensum += mol2file(m, 'svg')
    elif step == 610:
        lensum += mol2file(m, 'png')
    else:
        raise ValueError("Not implemented step " + str(step))

    return 0
Beispiel #3
0
    def construct_feature_matrices(self, smiles, train=True):
        """ Tokenize the atoms and bonds of the molecule given by `smiles`.

        Each bond is visited once from each atom it is attached to, so the
        bond-level arrays hold one entry per (atom, attached bond) pair. For
        a molecule without bonds the bond-level arrays have length 1 and are
        left at zero, and a warning is logged.

        `train` is written to the `train` attribute of both tokenizers. When
        it is true, `self.max_atoms` and `self.max_bonds` are also raised to
        this molecule's atom and bond counts if those are larger.

        Returns
        dict with entries
        'n_atom' : number of atoms in the molecule
        'n_bond' : number of bonds in the molecule (each bond counted once)
        'bond_indices' : index of the originating bond for every bond entry
        'atom' : (n_atom,) array of atom classes
        'bond' : array of bond classes, one per bond entry
        'connectivity' : (entries, 2) array of source atom, target atom pairs.

        """

        for tokenizer in (self.atom_tokenizer, self.bond_tokenizer):
            tokenizer.train = train

        logger = logging.getLogger(__name__)

        mol = MolFromSmiles(smiles)
        if self.explicit_hs:
            mol = AddHs(mol)

        n_atom = mol.GetNumAtoms()
        real_bond_count = mol.GetNumBonds()

        # Two directed entries per bond; keep at least one slot so the arrays
        # are never empty for an isolated atom.
        n_entries = 2 * real_bond_count
        if n_entries == 0:
            n_entries = 1
            logger.warning(f'Found molecule {smiles} with zero bonds')

        atom_feature_matrix = np.zeros(n_atom, dtype='int')
        bond_feature_matrix = np.zeros(n_entries, dtype='int')
        bond_indices = np.zeros(n_entries, dtype='int')
        connectivity = np.zeros((n_entries, 2), dtype='int')

        cursor = 0
        for position, atom in enumerate(mol.GetAtoms()):
            atom_feature_matrix[position] = self.atom_tokenizer(
                self.atom_features(atom))

            origin = atom.GetIdx()
            for bond in atom.GetBonds():
                begin = bond.GetBeginAtomIdx()
                end = bond.GetEndAtomIdx()

                # The bond is "flipped" when this atom is not its begin atom.
                rev = begin != origin

                bond_feature_matrix[cursor] = self.bond_tokenizer(
                    self.bond_features(bond, flipped=rev))
                bond_indices[cursor] = bond.GetIdx()
                connectivity[cursor] = (end, begin) if rev else (begin, end)

                cursor += 1

        # Remember the largest molecule seen while training.
        if train and n_atom > self.max_atoms:
            self.max_atoms = n_atom
        if train and real_bond_count > self.max_bonds:
            self.max_bonds = real_bond_count

        return {
            'n_atom': n_atom,
            'n_bond': real_bond_count,  # the real number of bonds
            'bond_indices': bond_indices,
            'atom': atom_feature_matrix,
            'bond': bond_feature_matrix,
            'connectivity': connectivity,
        }
def generate_graph(smiles, label=None):
    """Build a ``Graph`` of per-atom and per-bond feature arrays from SMILES.

    Parameters
    ----------
    smiles : SMILES string handed to ``MolFromSmiles``.
    label : optional target value; it is stored as a (1, 1) array.

    Returns
    -------
    ``Graph(smiles, node_feature_arrays, bond_feat, bond_index, label_array)``
    where ``node_feature_arrays`` is the list of per-atom arrays (symbol,
    hybridization, degree, hydrogen count, chirality, aromaticity, charges,
    periodic-table properties, ring membership, acceptor/donor flags),
    ``bond_feat`` holds one row per directed bond and ``bond_index`` the
    matching (source atom, target atom) pairs.

    Raises
    ------
    ValueError
        If the SMILES string cannot be parsed.
    """
    mol = MolFromSmiles(smiles)
    if not mol:
        raise ValueError("Could not parse SMILES string:", smiles)

    SYMBOL = [
        'B', 'C', 'N', 'O', 'F', 'Si', 'P', 'S', 'Cl', 'As', 'Se', 'Br', 'Te',
        'I', 'At', 'other'
    ]
    HYBRIDIZATION = [
        Chem.rdchem.HybridizationType.SP,
        Chem.rdchem.HybridizationType.SP2,
        Chem.rdchem.HybridizationType.SP3,
        Chem.rdchem.HybridizationType.SP3D,
        Chem.rdchem.HybridizationType.SP3D2,
        'other',
    ]

    # NOTE(review): atoms are counted on a hydrogen-stripped copy but indexed
    # on `mol` below; this presumably relies on `mol` carrying no explicit
    # hydrogen atoms after parsing -- confirm.
    num_atom = Chem.RemoveHs(mol).GetNumAtoms()

    symbol = np.zeros((num_atom, 16), np.uint8)
    hybridization = np.zeros((num_atom, 6), np.uint8)
    degree = np.zeros((num_atom, 6), np.uint8)
    num_h = np.zeros((num_atom, 5), np.uint8)
    chirality = np.zeros((num_atom, 3), np.uint8)
    aromatic = np.zeros((num_atom, 1), np.uint8)
    formal_charge = np.zeros((num_atom, 1), np.float32)
    radical_electrons = np.zeros((num_atom, 1), np.float32)

    # Additional ("abundant") features; per the original author they do not
    # bring a substantial change to predictive performance, sometimes even
    # making it worse.
    AtomicWeight = np.zeros((num_atom, 1), np.float32)
    AtomicNumber = np.zeros((num_atom, 1), np.float32)
    Rvdw = np.zeros((num_atom, 1), np.float32)
    RCovalent = np.zeros((num_atom, 1), np.float32)
    DefaultValence = np.zeros((num_atom, 1), np.float32)
    valence = np.zeros((num_atom, 1), np.float32)
    NOuterElecs = np.zeros((num_atom, 1), np.float32)
    ring = np.zeros((num_atom, 7), np.uint8)
    acceptor = np.zeros((num_atom, 1), np.uint8)
    donor = np.zeros((num_atom, 1), np.uint8)

    # Look the periodic table up once instead of once per property per atom.
    periodic_table = Chem.GetPeriodicTable()

    for i in range(num_atom):
        atom = mol.GetAtomWithIdx(i)

        symbol[i] = one_of_k_encoding_unk(atom.GetSymbol(), SYMBOL)
        hybridization[i] = one_of_k_encoding_unk(atom.GetHybridization(),
                                                 HYBRIDIZATION)
        degree[i] = one_of_k_encoding_unk(atom.GetDegree(), [0, 1, 2, 3, 4, 5])
        num_h[i] = one_of_k_encoding_unk(
            atom.GetTotalNumHs(includeNeighbors=True), [0, 1, 2, 3, 4])
        try:
            chirality[i] = one_of_k_encoding_unk(atom.GetProp('_CIPCode'),
                                                 ['R', 'S', 'unknown'])
        except KeyError:
            # Atom has no CIP code assigned: leave the chirality row all-zero.
            chirality[i] = [0, 0, 0]
        aromatic[i] = atom.GetIsAromatic()
        formal_charge[i] = atom.GetFormalCharge()
        radical_electrons[i] = atom.GetNumRadicalElectrons()

        atomic_num = atom.GetAtomicNum()
        AtomicNumber[i] = atomic_num
        AtomicWeight[i] = periodic_table.GetAtomicWeight(atomic_num)
        Rvdw[i] = periodic_table.GetRvdw(atomic_num)  # van der Waals radius
        RCovalent[i] = periodic_table.GetRcovalent(atomic_num)  # covalent radius
        DefaultValence[i] = periodic_table.GetDefaultValence(atomic_num)
        valence[i] = atom.GetExplicitValence()
        NOuterElecs[i] = periodic_table.GetNOuterElecs(atomic_num)
        # Ring membership in general, then for ring sizes 3 through 8.
        ring[i] = [int(atom.IsInRing())] + [
            int(atom.IsInRingSize(size)) for size in (3, 4, 5, 6, 7, 8)
        ]

    # Flag hydrogen-bond donors and acceptors using RDKit's base feature
    # definitions.
    factory = ChemicalFeatures.BuildFeatureFactory(
        os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef'))
    for feat in factory.GetFeaturesForMol(mol):
        family = feat.GetFamily()
        if family == 'Donor':
            for i in feat.GetAtomIds():
                donor[i] = 1
        elif family == 'Acceptor':
            for i in feat.GetAtomIds():
                acceptor[i] = 1

    num_bond = mol.GetNumBonds()
    if num_bond == 0:
        # Keep the bond arrays non-empty for bond-less molecules (CH4, NH3).
        num_bond = 1
    bond_feat = np.zeros((num_bond * 2, 10), np.int16)
    bond_index = np.zeros((num_bond * 2, 2), np.int16)

    BOND_TYPE = [
        Chem.rdchem.BondType.SINGLE,
        Chem.rdchem.BondType.DOUBLE,
        Chem.rdchem.BondType.TRIPLE,
        Chem.rdchem.BondType.AROMATIC,
    ]

    BOND_STEREO = ["STEREONONE", "STEREOANY", "STEREOZ", "STEREOE"]

    # Visit every ordered atom pair so that each bond yields two directed
    # rows, ordered by source atom and then by target atom.
    ij = 0
    for i in range(num_atom):
        for j in range(num_atom):
            if i == j:
                continue
            bond = mol.GetBondBetweenAtoms(i, j)
            if bond is None:
                continue
            bond_index[ij] = [i, j]
            bond_type = one_of_k_encoding(bond.GetBondType(), BOND_TYPE)
            bond_ring = [bond.GetIsConjugated(), bond.IsInRing()]
            bond_stereo = one_of_k_encoding(str(bond.GetStereo()),
                                            BOND_STEREO)
            # Row layout: bond type, [conjugated, in ring], stereo.
            bond_feat[ij] = bond_type + bond_ring + bond_stereo
            ij += 1

    graph = Graph(
        smiles,
        [
            symbol, hybridization, degree, num_h, chirality, aromatic,
            formal_charge, radical_electrons, AtomicWeight, AtomicNumber,
            Rvdw, RCovalent, DefaultValence, valence, NOuterElecs, ring,
            acceptor, donor,
        ],
        bond_feat,
        bond_index,
        np.array(label).reshape((1, 1)),
    )

    return graph
    def parse_smiles_str(self, smiles_str, id, target=None):
        """Convert a SMILES string into a dictionary of NumPy arrays.

        Parameters
        ----------
        smiles_str : SMILES string handed to RDKit's ``MolFromSmiles``.
        id : identifier of the molecule; converted to an ``np.int64`` array
            unless it already is a NumPy array.
        target : optional target value(s); stored as a float32 row vector.

        Returns
        -------
        ``None`` if the SMILES string cannot be parsed, otherwise a dict with
        'node_features' : (n_nodes, num_node_features) float32 array
        'edge_features' : (n_edges, num_edge_features) float32 array
        'adj_mat'       : (2*n_edges, 2) lexicographically sorted pairs of
                          (atom, neighbouring atom) indices
        'inc_mat'       : (2*n_edges, 2) lexicographically sorted pairs of
                          (atom, bond) indices
        'target'        : (1, n_targets) float32 array, (1, 0) if no target
        'id'            : the ID as a NumPy array
        'shape'         : int64 array (n_nodes, n_edges, num_node_features,
                          num_edge_features, n_targets, 1)
        """
        # Use RDKit to parse SMILES string
        mol = MolFromSmiles(smiles_str)
        if not mol:
            return None

        # Represent Hydrogen atoms explicitly (if necessary)
        if self.config['explicit_Hs']:
            mol = Chem.AddHs(mol)

        # Compute number of nodes (atoms) and edges (bonds)
        n_nodes, n_edges = mol.GetNumAtoms(), mol.GetNumBonds()

        # Allocate space for Numpy arrays representing the molecular graph
        node_features = np.zeros((n_nodes, self.num_node_features), dtype=np.float32)
        edge_features = np.zeros((n_edges, self.num_edge_features), dtype=np.float32)
        adj_mat = np.zeros((2*n_edges, 2), dtype=np.int64)  # Adjacency matrix (sparse representation)
        inc_mat = np.zeros((2*n_edges, 2), dtype=np.int64)  # Incidence matrix (sparse representation)

        # Retrieve node (atom) features, if needed
        if self.num_node_features > 0:
            for i, atom in enumerate(mol.GetAtoms()):
                node_features[i] = self.get_node_features(atom)

        # Retrieve edges (bonds)
        for i, bond in enumerate(mol.GetBonds()):
            begin = bond.GetBeginAtom().GetIdx()
            end = bond.GetEndAtom().GetIdx()

            # Fill in the two pairs of indices this edge (bond) contributes to the adjacency matrix
            adj_mat[2*i] = [begin, end]
            adj_mat[2*i+1] = [end, begin]
            # Fill in the two pairs of indices this edge (bond) contributes to the incidence matrix
            inc_mat[2*i] = [begin, i]
            inc_mat[2*i+1] = [end, i]

            # Retrieve edge (bond) features, if needed
            if self.num_edge_features > 0:
                edge_features[i] = self.get_edge_features(bond)

        # Sort the adjacency and incidence matrices lexicographically
        # (primary key: first column, secondary key: second column)
        adj_mat = adj_mat[np.lexsort((adj_mat[:, 1], adj_mat[:, 0]))]
        inc_mat = inc_mat[np.lexsort((inc_mat[:, 1], inc_mat[:, 0]))]

        # Represent molecular graph as a dictionary
        g = {'node_features': node_features, 'edge_features': edge_features, 'adj_mat': adj_mat, 'inc_mat': inc_mat}

        # Add target(s) (if any), making sure they are a NumPy array object with method tobytes()
        if target is not None:
            # Convert scalars to NumPy array
            if not isinstance(target, np.ndarray):
                target = np.array(target, np.float32)

            # Ensure target is of type np.float32
            target = target.astype(np.float32)

            # Flatten targets of rank >= 2
            if target.ndim > 1:
                target = target.flatten()

            # Store target as a (row) 2D NumPy array (for compatibility)
            g['target'] = np.reshape(target, (1, -1))
            n_targets = g['target'].shape[1]
        # If there are no targets, add an empty NumPy array (for compatibility)
        else:
            g['target'] = np.zeros((1, 0), dtype=np.float32)
            n_targets = 0

        # Add ID, making sure it is a NumPy array object with method tobytes().
        # The check must look at `id` itself: testing `target` here would skip
        # the conversion whenever a target is supplied.
        if not isinstance(id, np.ndarray):
            id = np.array(id, np.int64)
        g['id'] = id

        # Finally, add shape information. The last element refers to the number of graphs, and is included for
        # compatibility with batched graphs
        g['shape'] = np.array((n_nodes, n_edges, self.num_node_features, self.num_edge_features, n_targets, 1),
                              np.int64)

        return g