Python MolFromSmiles.GetBondsの例、rdkit.Chem.MolFromSmiles.GetBonds Pythonの例

コード例 #1

0

ファイルを表示

ファイル: data_features.py プロジェクト: zjujdj/DeepChemStable

def graph_from_smiles(smiles):
    """Build a MolGraph with atom, bond and molecule nodes from a SMILES string.

    Every atom node is created with ``atom_features(atom, charge)`` where
    ``charge`` is the atom's Gasteiger partial charge; a NaN or infinite
    charge is replaced by 0.0.  Every bond node is linked to the nodes of its
    two atoms, and a single 'molecule' node is linked to all atom nodes.

    Args:
        smiles: SMILES string handed to ``MolFromSmiles``.

    Returns:
        The populated ``MolGraph``.

    Raises:
        ValueError: if the parsed molecule is falsy (parse failure).
    """
    graph = MolGraph()
    molecule = MolFromSmiles(smiles)
    if not molecule:
        raise ValueError("Could not parse SMILES string:", smiles)

    # The per-atom charge is read back below from the '_GasteigerCharge' prop.
    rdPartialCharges.ComputeGasteigerCharges(molecule)

    node_of_atom = {}
    for rd_atom in molecule.GetAtoms():
        rd_index = rd_atom.GetIdx()
        charge = float(rd_atom.GetProp('_GasteigerCharge'))
        if not np.isfinite(charge):
            charge = 0.0
        node_of_atom[rd_index] = graph.new_node(
            'atom',
            features=atom_features(rd_atom, charge),
            rdkit_ix=rd_index)

    for rd_bond in molecule.GetBonds():
        begin_node = node_of_atom[rd_bond.GetBeginAtom().GetIdx()]
        end_node = node_of_atom[rd_bond.GetEndAtom().GetIdx()]
        bond_node = graph.new_node('bond', features=bond_features(rd_bond))
        bond_node.add_neighbors((begin_node, end_node))
        begin_node.add_neighbors((end_node, ))

    molecule_node = graph.new_node('molecule')
    molecule_node.add_neighbors(graph.nodes['atom'])
    return graph

コード例 #2

0

ファイルを表示

def graph_from_smiles(smiles):
    """Convert a SMILES string into a MolGraph.

    One 'atom' node is created per RDKit atom (features from
    ``atom_features``), one 'bond' node per RDKit bond (features from
    ``bond_features``) linked to its two atom nodes, and one 'molecule'
    node linked to every atom node.

    Args:
        smiles: SMILES string handed to ``MolFromSmiles``.

    Returns:
        The populated ``MolGraph``.

    Raises:
        ValueError: if the parsed molecule is falsy (parse failure).
    """
    graph = MolGraph()
    mol = MolFromSmiles(smiles)
    # NOTE: an alternative parse (sanitize=False followed by a partial
    # Chem.SanitizeMol pass) was tried here at some point and is disabled.
    if not mol:
        raise ValueError("Could not parse SMILES string:", smiles)

    nodes_by_index = {}
    for atom in mol.GetAtoms():
        index = atom.GetIdx()
        nodes_by_index[index] = graph.new_node(
            'atom', features=atom_features(atom), rdkit_ix=index)

    for bond in mol.GetBonds():
        first = nodes_by_index[bond.GetBeginAtom().GetIdx()]
        second = nodes_by_index[bond.GetEndAtom().GetIdx()]
        bond_node = graph.new_node('bond', features=bond_features(bond))
        bond_node.add_neighbors((first, second))
        first.add_neighbors((second, ))

    summary_node = graph.new_node('molecule')
    summary_node.add_neighbors(graph.nodes['atom'])
    return graph

コード例 #3

0

ファイルを表示

def load_from_smiles(smiles):
    """ Load a single molecule graph from its SMILES string.

    Atom, bond and molecule ``Node`` objects are added to a fresh
    ``Molecule``.  Node ids come from ``node_id(smiles, index)``; the
    molecule node uses the SMILES string itself as its id.  For each bond,
    the bond node and both atom nodes are registered as neighbors of one
    another.  Atom nodes are finally sorted with ``sort_by_degree('atom')``.

    Raises:
        ValueError: if the parsed molecule is falsy (parse failure).
    """
    graph = Molecule()
    mol = MolFromSmiles(smiles)
    if not mol:
        raise ValueError("Could not parse SMILES string:", smiles)

    for atom in mol.GetAtoms():
        atom_key = node_id(smiles, atom.GetIdx())
        graph.add_node(Node('atom', atom_key, atom_features(atom)))

    for bond in mol.GetBonds():
        begin_key = node_id(smiles, bond.GetBeginAtom().GetIdx())
        begin_node = graph.get_node('atom', begin_key)
        end_key = node_id(smiles, bond.GetEndAtom().GetIdx())
        end_node = graph.get_node('atom', end_key)

        bond_key = node_id(smiles, bond.GetIdx())
        bond_node = Node('bond', bond_key, bond_features(bond))
        graph.add_node(bond_node)

        bond_node.add_neighbors([begin_node, end_node])
        begin_node.add_neighbors([bond_node, end_node])
        end_node.add_neighbors([bond_node, begin_node])

    molecule_node = Node('molecule', smiles)
    graph.add_node(molecule_node)
    molecule_node.add_neighbors(graph.get_node_list('atom'))

    graph.sort_by_degree('atom')

    return graph

コード例 #4

0

ファイルを表示

def graph_from_smiles(smiles):
    """Build a MolGraph from a SMILES string after assigning stereochemistry.

    The parsed molecule is validated first, then bond stereochemistry,
    atom/bond stereo labels and chiral tags are (re)assigned before features
    are extracted.  One 'atom' node per atom, one 'bond' node per bond
    (linked to its two atom nodes) and one 'molecule' node (linked to all
    atom nodes) are created.

    Args:
        smiles: SMILES string handed to ``MolFromSmiles``.

    Returns:
        The populated ``MolGraph``.

    Raises:
        ValueError: if the parsed molecule is falsy (parse failure).
    """
    graph = MolGraph()
    mol = MolFromSmiles(smiles)
    # Validate before touching the molecule: the stereo helpers below cannot
    # be called on a failed parse, and the caller should see the ValueError.
    if not mol:
        raise ValueError("Could not parse SMILES string:", smiles)

    Chem.DetectBondStereochemistry(mol, -1)
    Chem.AssignStereochemistry(mol, flagPossibleStereoCenters=True, force=True)
    Chem.AssignAtomChiralTagsFromStructure(mol, -1)

    atoms_by_rd_idx = {}
    for atom in mol.GetAtoms():
        new_atom_node = graph.new_node('atom',
                                       features=atom_features(atom),
                                       rdkit_ix=atom.GetIdx())
        atoms_by_rd_idx[atom.GetIdx()] = new_atom_node

    for bond in mol.GetBonds():
        atom1_node = atoms_by_rd_idx[bond.GetBeginAtom().GetIdx()]
        atom2_node = atoms_by_rd_idx[bond.GetEndAtom().GetIdx()]
        new_bond_node = graph.new_node('bond', features=bond_features(bond))
        new_bond_node.add_neighbors((atom1_node, atom2_node))
        atom1_node.add_neighbors((atom2_node, ))

    mol_node = graph.new_node('molecule')
    mol_node.add_neighbors(graph.nodes['atom'])
    return graph

コード例 #5

0

ファイルを表示

def graph_from_smiles(smiles, fp_switch):  # ecfp = false, fcfp = true
    """Build a MolGraph whose atom features concatenate ECFP and FCFP parts.

    Args:
        smiles: container holding the SMILES string.  When its type is
            exactly ``np.ndarray`` the string is taken from ``smiles[0]``;
            otherwise it is taken from ``smiles._data[0][0]``.
        fp_switch: accepted for interface compatibility; this function does
            not read it.

    Returns:
        The populated ``MolGraph``.

    Raises:
        ValueError: if the parsed molecule is falsy (parse failure).
    """
    graph = MolGraph()
    if type(smiles) is not np.ndarray:
        str_smiles = smiles._data[0][0]
    else:
        str_smiles = smiles[0]

    mol = MolFromSmiles(str_smiles)
    if not mol:
        raise ValueError("Could not parse SMILES string:", str_smiles)

    node_of_atom = {}
    fcfp_rows = atom_features_from_fcfp(mol)
    # fcfp_rows is indexed by the position of the atom in iteration order.
    for position, atom in enumerate(mol.GetAtoms()):
        combined = np.r_[atom_features_from_ecfp(atom), fcfp_rows[position]]
        node_of_atom[atom.GetIdx()] = graph.new_node(
            'atom', features=combined, rdkit_ix=atom.GetIdx())

    for bond in mol.GetBonds():
        begin_node = node_of_atom[bond.GetBeginAtom().GetIdx()]
        end_node = node_of_atom[bond.GetEndAtom().GetIdx()]
        bond_node = graph.new_node('bond', features=bond_features(bond))
        bond_node.add_neighbors((begin_node, end_node))
        begin_node.add_neighbors((end_node, ))

    molecule_node = graph.new_node('molecule')
    molecule_node.add_neighbors(graph.nodes['atom'])
    return graph

コード例 #6

0

ファイルを表示

    def Read(self, lgi):
        """
        Import an lgi into a networkx Graph.

        The degree of every symbol of ``lgi`` found in ``self.known`` is
        looked up in ``self.degree``; ``lgi`` is then rewritten into SMILES
        by applying ``self.replacements`` in order and parsed with RDKit.
        The bonds of the parsed molecule become the edges of the graph.

        Returns:
            The graph when ``IsValid(G, D)`` holds, otherwise ``None``.
            ``None`` is also returned when any step fails (e.g. the
            rewritten string cannot be parsed).
        """
        try:
            # Extract the degree
            D = [self.degree[symbol] for symbol in lgi if symbol in self.known]

            # Translate to smiles and import using RDKit
            smi = "%s" % (lgi)
            for src, dst in self.replacements:
                smi = smi.replace(src, dst)
            mol = MolFromSmiles(smi)
            if mol is None:
                return None

            # Define the graph
            G = nx.Graph()
            for bond in mol.GetBonds():
                G.add_edge(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())

            # Done
            return G if IsValid(G, D) else None
        except Exception:
            # Best-effort import: any failure means "no graph".  Unlike a bare
            # ``except:``, KeyboardInterrupt and SystemExit still propagate.
            return None

コード例 #7

0

ファイルを表示

ファイル: mol_graph.py プロジェクト: GUR9000/InnerOuterRNN

def graph_from_smiles(smiles):
    """Create the MolGraph representation of a SMILES string.

    Atom nodes (features from ``atom_features``) are keyed by their RDKit
    atom index; each bond node (features from ``bond_features``) is linked to
    the nodes of its begin and end atoms; a final 'molecule' node is linked
    to every atom node.

    Args:
        smiles: SMILES string handed to ``MolFromSmiles``.

    Returns:
        The populated ``MolGraph``.

    Raises:
        ValueError: if the parsed molecule is falsy (parse failure).
    """
    graph = MolGraph()
    mol = MolFromSmiles(smiles)
    if not mol:
        raise ValueError("Could not parse SMILES string:", smiles)

    atom_nodes = {}

    def node_for(rd_atom):
        # Look up the graph node created for an RDKit atom.
        return atom_nodes[rd_atom.GetIdx()]

    for atom in mol.GetAtoms():
        created = graph.new_node('atom',
                                 features=atom_features(atom),
                                 rdkit_ix=atom.GetIdx())
        atom_nodes[atom.GetIdx()] = created

    for bond in mol.GetBonds():
        begin_node = node_for(bond.GetBeginAtom())
        end_node = node_for(bond.GetEndAtom())
        bond_node = graph.new_node('bond', features=bond_features(bond))
        bond_node.add_neighbors((begin_node, end_node))
        begin_node.add_neighbors((end_node, ))

    whole_molecule = graph.new_node('molecule')
    whole_molecule.add_neighbors(graph.nodes['atom'])
    return graph

コード例 #8

0

ファイルを表示

def graph_from_smiles(smiles):
    """Build a MolGraph with atom, bond and molecule nodes from a SMILES string.

    Args:
        smiles: SMILES string handed to ``MolFromSmiles``.

    Returns:
        The populated ``MolGraph``.

    Raises:
        ValueError: if ``MolFromSmiles`` raises or returns a falsy molecule.
            Both failure modes are reported the same way so callers can skip
            a bad entry instead of having the interpreter terminated.
    """
    graph = MolGraph()
    try:
        mol = MolFromSmiles(smiles)
    except Exception:
        # Previously this printed the input and called quit(), which kills
        # the whole process from library code and (being a bare except) also
        # intercepted KeyboardInterrupt.  Raise the function's usual error.
        raise ValueError("Could not parse SMILES string:", smiles)
    if not mol:
        raise ValueError("Could not parse SMILES string:", smiles)

    atoms_by_rd_idx = {}
    for atom in mol.GetAtoms():
        new_atom_node = graph.new_node('atom',
                                       features=atom_features(atom),
                                       rdkit_ix=atom.GetIdx())
        atoms_by_rd_idx[atom.GetIdx()] = new_atom_node

    for bond in mol.GetBonds():
        atom1_node = atoms_by_rd_idx[bond.GetBeginAtom().GetIdx()]
        atom2_node = atoms_by_rd_idx[bond.GetEndAtom().GetIdx()]
        new_bond_node = graph.new_node('bond', features=bond_features(bond))
        new_bond_node.add_neighbors((atom1_node, atom2_node))
        atom1_node.add_neighbors((atom2_node, ))

    mol_node = graph.new_node('molecule')
    mol_node.add_neighbors(graph.nodes['atom'])
    return graph

コード例 #9

0

ファイルを表示

    def get_feature_lengths(self) -> List[int]:
        """Compute the length of each feature.

        The first bond of the molecule parsed from 'CC' is used as the sample
        passed to ``self._get_feature_lengths``.

        Returns:
            A list of the lengths of each feature.
        """
        ethane = MolFromSmiles('CC')
        sample_bond = ethane.GetBonds()[0]
        return self._get_feature_lengths(sample_bond)

コード例 #10

0

ファイルを表示

def get_max_atom_bond_size(smiles_iterator, explicit_hs=True):
    """ Convenience function to get max_atoms, max_bonds for a set of input
    SMILES.

    Each SMILES is parsed (and passed through ``AddHs`` when ``explicit_hs``
    is true); the largest atom count and largest bond count seen are tracked.

    Returns a dict with 'max_atoms' (largest atom count) and 'max_bonds'
    (twice the largest bond count).
    """
    largest_atom_count = 0
    largest_bond_count = 0

    for smiles in tqdm(smiles_iterator):
        mol = MolFromSmiles(smiles)
        if explicit_hs:
            mol = AddHs(mol)
        largest_atom_count = max(largest_atom_count, len(mol.GetAtoms()))
        largest_bond_count = max(largest_bond_count, len(mol.GetBonds()))

    return {
        'max_atoms': largest_atom_count,
        'max_bonds': largest_bond_count * 2,
    }

コード例 #11

0

ファイルを表示

 def Translate(self, smi, canonical=True):
     """
     Translate a SMILES string into an undirected graph G(V,E) with
     featureless vertices and unweighted edges, e.g. the graph equivalent
     of a saturated hydrocarbon.

     Input:
     smi        SMILES string parsed with RDKit
     canonical  forwarded unchanged to self.Write

     Returns whatever self.Write(degrees, edges, canonical=...) returns.
     """
     parsed = MolFromSmiles(smi)
     # Per-atom degree, in RDKit atom order
     vertex_degrees = [vertex.GetDegree() for vertex in parsed.GetAtoms()]
     # One (begin, end) atom-index pair per bond
     edge_list = [(link.GetBeginAtomIdx(), link.GetEndAtomIdx())
                  for link in parsed.GetBonds()]
     return self.Write(vertex_degrees, edge_list, canonical=canonical)

コード例 #12

0

ファイルを表示

 def process(self, smiles):  # build the graph
     """Turn a SMILES string into a GCNPoint (node count, adjacency, features).

     The graph has one node per atom plus an extra node 0; atom ``i`` of the
     molecule is stored as node ``i + 1``.  The feature matrix gets an
     all-zero first row for node 0 followed by one row per atom.
     """
     mol = MolFromSmiles(smiles)
     num_nodes = mol.GetNumAtoms() + 1
     graph = DGLGraph()
     graph.add_nodes(num_nodes)
     # Self-loop on every node
     graph.add_edges(graph.nodes(), graph.nodes())
     # Edge from every atom node (1 .. num_nodes-1) to node 0
     graph.add_edges(range(1, num_nodes), 0)
     # Each bond is added in both directions, shifted by one for node 0
     for bond in mol.GetBonds():
         begin = bond.GetBeginAtomIdx() + 1
         end = bond.GetEndAtomIdx() + 1
         graph.add_edge(begin, end)
         graph.add_edge(end, begin)
     adjacency = graph.adjacency_matrix(transpose=False).to_dense()
     atom_rows = [atom_feature(atom)[0][None, :] for atom in mol.GetAtoms()]
     atom_matrix = torch.cat(atom_rows)
     zero_row = torch.zeros((1, FEATURE_DIM))
     vec = torch.cat([zero_row, atom_matrix]).to(self.device)
     return GCNPoint(num_nodes, adjacency, vec)

コード例 #13

0

ファイルを表示

    def process(self, smiles):  # build the graph
        """Turn a SMILES string into a GNNPoint (node count, graph, embedding).

        One graph node is created per atom.  Nine integer per-atom
        properties are stored in ``graph.ndata``; their element-wise sum is
        passed to ``self.embed`` to obtain the node vectors.
        """
        mol = MolFromSmiles(smiles)
        n = mol.GetNumAtoms()
        graph = DGLGraph()
        graph.add_nodes(n)
        # Self-loop on every node
        graph.add_edges(graph.nodes(), graph.nodes())
        # Edge from every node 1 .. n-1 to node 0.
        # NOTE(review): n is the plain atom count here, so node 0 is a real
        # atom rather than an extra hub node -- confirm this is intended.
        graph.add_edges(range(1, n), 0)

        def per_atom(extract):
            # Evaluate `extract` on every atom and pack the values in a tensor.
            return torch.tensor([extract(atom) for atom in mol.GetAtoms()])

        graph.ndata["element"] = per_atom(lambda a: ATOM[a.GetAtomicNum()])
        graph.ndata["explicit"] = per_atom(lambda a: a.GetExplicitValence())
        graph.ndata["implicit"] = per_atom(lambda a: a.GetImplicitValence())
        graph.ndata["hybrid"] = per_atom(lambda a: HYBRID[a.GetHybridization()])
        graph.ndata["hcount"] = per_atom(lambda a: a.GetTotalNumHs())
        graph.ndata["degree"] = per_atom(lambda a: a.GetDegree())
        graph.ndata["charge"] = per_atom(lambda a: a.GetFormalCharge() + 2)
        graph.ndata["ring"] = per_atom(lambda a: int(a.IsInRing()))
        graph.ndata["aromatic"] = per_atom(lambda a: int(a.GetIsAromatic()))

        # Each bond is added in both directions
        for bond in mol.GetBonds():
            begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            graph.add_edge(begin, end)
            graph.add_edge(end, begin)

        summed_keys = ("explicit", "implicit", "hybrid", "hcount", "degree",
                       "charge", "ring", "aromatic")
        combined = graph.ndata["element"]
        for key in summed_keys:
            combined = combined + graph.ndata[key]
        vec = self.embed(combined)
        return GNNPoint(n, graph, vec)

コード例 #14

0

ファイルを表示

ファイル: preprocessor.py プロジェクト: akey7/nfp

    def construct_feature_matrices(self, smiles):
        """ Parse a SMILES string and return tokenized atom and bond classes.

        Hydrogens are added with ``AddHs`` when ``self.explicit_hs`` is set.
        Bonds are enumerated per atom, so every bond appears twice (once from
        each end); the second column of ``connectivity`` is the atom at the
        other end.  A molecule without bonds still gets one all-zero bond
        row.

        Returns
        dict with entries
        'n_atom' : number of atoms in the molecule
        'n_bond' : number of bond rows (two per bond, at least one)
        'atom' : (n_atom,) int array of atom classes
        'bond' : (n_bond,) int array of bond classes
        'connectivity' : (n_bond, 2) array of source atom, target atom pairs.

        """
        mol = MolFromSmiles(smiles)
        if self.explicit_hs:
            mol = AddHs(mol)

        n_atom = len(mol.GetAtoms())
        # Two rows per bond; an isolated atom keeps a single zero-filled row,
        # which acts as a self-link on atom 0.
        n_bond = max(2 * len(mol.GetBonds()), 1)

        atom_feature_matrix = np.zeros(n_atom, dtype='int')
        bond_feature_matrix = np.zeros(n_bond, dtype='int')
        connectivity = np.zeros((n_bond, 2), dtype='int')

        atom_seq = mol.GetAtoms()
        atoms = [atom_seq[i] for i in range(n_atom)]

        row = 0  # next free row in the bond arrays
        for n, atom in enumerate(atoms):
            # Atom class
            atom_feature_matrix[n] = self.atom_tokenizer(
                self.atom_features(atom))

            start_index = atom.GetIdx()

            for bond in atom.GetBonds():
                begin_idx = bond.GetBeginAtomIdx()
                end_idx = bond.GetEndAtomIdx()
                # True when the bond is stored pointing at this atom
                flipped = begin_idx != start_index

                # Bond class
                bond_feature_matrix[row] = self.bond_tokenizer(
                    self.bond_features(bond, flipped=flipped))

                # Connectivity, swapped when the bond is traversed backwards
                if flipped:
                    connectivity[row] = (end_idx, begin_idx)
                else:
                    connectivity[row] = (begin_idx, end_idx)

                row += 1

        return {
            'n_atom': n_atom,
            'n_bond': n_bond,
            'atom': atom_feature_matrix,
            'bond': bond_feature_matrix,
            'connectivity': connectivity,
        }

コード例 #15

0

ファイルを表示

ファイル: smiles_parser.py プロジェクト: whoyouwith91/neural_fingerprints_tf

    def parse_smiles_str(self, smiles_str, id, target=None):
        """Parse a SMILES string into a dictionary describing its graph.

        Args:
            smiles_str: SMILES string handed to ``MolFromSmiles``.
            id: identifier of the molecule; converted to an ``np.int64``
                array unless it already is a NumPy array.
            target: optional target value(s).  Converted to a float32 array,
                flattened, and stored as a ``(1, n_targets)`` array.

        Returns:
            ``None`` when the SMILES string cannot be parsed, otherwise a
            dict with keys 'node_features', 'edge_features', 'adj_mat',
            'inc_mat', 'target', 'id' and 'shape'.  'adj_mat' and 'inc_mat'
            hold two rows per bond and are sorted lexicographically.
        """
        # Use RDKit to parse SMILES string
        mol = MolFromSmiles(smiles_str)
        if not mol:
            return None

        # Represent Hydrogen atoms explicitly (if necessary)
        if self.config['explicit_Hs']:
            mol = Chem.AddHs(mol)

        # Compute number of nodes (atoms) and edges (bonds)
        n_nodes, n_edges = mol.GetNumAtoms(), mol.GetNumBonds()

        # Allocate space for Numpy arrays representing the molecular graph
        node_features = np.zeros((n_nodes, self.num_node_features), dtype=np.float32)
        edge_features = np.zeros((n_edges, self.num_edge_features), dtype=np.float32)
        adj_mat = np.zeros((2*n_edges, 2), dtype=np.int64)  # Adjacency matrix (sparse representation)
        inc_mat = np.zeros((2*n_edges, 2), dtype=np.int64)  # Incidence matrix (sparse representation)

        # Retrieve node (atom) features, if needed
        if self.num_node_features > 0:
            for i, atom in enumerate(mol.GetAtoms()):
                node_features[i] = self.get_node_features(atom)

        # Retrieve edges (bonds)
        for i, bond in enumerate(mol.GetBonds()):
            begin_idx = bond.GetBeginAtom().GetIdx()
            end_idx = bond.GetEndAtom().GetIdx()
            # Fill in the two pairs of indices this edge (bond) contributes to the adjacency matrix
            adj_mat[2*i] = [begin_idx, end_idx]
            adj_mat[2*i+1] = [end_idx, begin_idx]
            # Fill in the two pairs of indices this edge (bond) contributes to the incidence matrix
            inc_mat[2*i] = [begin_idx, i]
            inc_mat[2*i+1] = [end_idx, i]

            # Retrieve edge (bond) features, if needed
            if self.num_edge_features > 0:
                edge_features[i] = self.get_edge_features(bond)

        # Sort the adjacency and incidence matrices lexicographically
        adj_mat = adj_mat[np.lexsort((adj_mat[:, 1], adj_mat[:, 0]))]
        inc_mat = inc_mat[np.lexsort((inc_mat[:, 1], inc_mat[:, 0]))]

        # Represent molecular graph as a dictionary
        g = {'node_features': node_features, 'edge_features': edge_features, 'adj_mat': adj_mat, 'inc_mat': inc_mat}

        # Add target(s) (if any), making sure they are a NumPy array object with method tobytes()
        if target is not None:
            # Convert scalars to NumPy array
            if not isinstance(target, np.ndarray):
                target = np.array(target, np.float32)

            # Ensure target is of type np.float32
            target = target.astype(np.float32)

            # Flatten targets of rank >= 2
            if target.ndim > 1:
                target = target.flatten()

            # Store target as a (row) 2D NumPy array (for compatibility)
            g['target'] = np.reshape(target, (1, -1))
            n_targets = g['target'].shape[1]
        # If there are no targets, add an empty NumPy array (for compatibility)
        else:
            g['target'] = np.zeros((1, 0), dtype=np.float32)
            n_targets = 0

        # Add ID, making sure it is a NumPy array object with method tobytes().
        # The check must look at `id` itself: testing `target` here meant the
        # ID was never converted whenever a target was supplied.
        if not isinstance(id, np.ndarray):
            id = np.array(id, np.int64)
        g['id'] = id

        # Finally, add shape information. The last element refers to the number of graphs, and is included for
        # compatibility with batched graphs
        g['shape'] = np.array((n_nodes, n_edges, self.num_node_features, self.num_edge_features, n_targets, 1),
                              np.int64)

        return g

コード例 #16

0

ファイルを表示

def extract_graph(data_path, out_file_path, max_atom_num, label_name=None):
    """Convert a CSV of SMILES strings into padded graph matrices (.npz).

    For every molecule that parses and has at most ``max_atom_num`` atoms,
    three ``max_atom_num``-sized matrices are built: a 0/1 adjacency matrix,
    a pairwise distance matrix computed from RDKit 2D coordinates, and a node
    attribute matrix from ``extract_atom_features`` (including acceptor and
    donor flags).  Molecules that fail to parse or are too large are reported
    and skipped.  Summary statistics are printed before saving.

    Args:
        data_path: CSV file with a 'SMILES' column.
        out_file_path: destination passed to ``np.savez_compressed``.
        max_atom_num: side length used to pad every per-molecule matrix.
        label_name: optional CSV column with labels; when given, the labels
            of the kept rows are saved under the key 'label_name'.

    Returns:
        None.  The result is written to ``out_file_path``.
    """
    import os
    from rdkit import RDConfig
    from rdkit.Chem import ChemicalFeatures
    fdefName = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
    factory = ChemicalFeatures.BuildFeatureFactory(fdefName)

    data_pd = pd.read_csv(data_path)
    smiles_list = data_pd['SMILES'].tolist()

    symbol_candidates = set()
    atom_attribute_dim = num_atom_features()

    node_attribute_matrix_list = []
    # Bond attributes are not extracted here; this list stays empty and is
    # saved as an empty array to keep the output keys unchanged.
    bond_attribute_matrix_list = []
    adjacent_matrix_list = []
    distance_matrix_list = []
    valid_index = []

    # Value ranges observed across the dataset, printed for inspection
    degree_set = set()
    h_num_set = set()
    implicit_valence_set = set()
    charge_set = set()

    for line_idx, smiles in enumerate(smiles_list):
        smiles = smiles.strip()
        mol = MolFromSmiles(smiles)
        if mol is None:
            # Skip unparsable entries the same way oversized ones are skipped
            print('Could not parse SMILES {} at index {}'.format(
                smiles, line_idx))
            continue

        num_atoms = mol.GetNumAtoms()
        if num_atoms > max_atom_num:
            print('Outlier {} has {} atoms'.format(line_idx, num_atoms))
            continue
        valid_index.append(line_idx)

        AllChem.Compute2DCoords(mol)
        conformer = mol.GetConformers()[0]
        feats = factory.GetFeaturesForMol(mol)
        # Materialize as sets: a map() object is a one-shot iterator in
        # Python 3, so repeated `in` tests against it give wrong answers.
        acceptor_atom_ids = {
            feat.GetAtomIds()[0] for feat in feats
            if feat.GetFamily() == 'Acceptor'
        }
        donor_atom_ids = {
            feat.GetAtomIds()[0] for feat in feats
            if feat.GetFamily() == 'Donor'
        }

        adjacent_matrix = np.zeros((max_atom_num, max_atom_num), dtype=int)
        distance_matrix = np.zeros((max_atom_num, max_atom_num))
        node_attribute_matrix = np.zeros((max_atom_num, atom_attribute_dim),
                                         dtype=int)

        for atom in mol.GetAtoms():
            atom_idx = atom.GetIdx()
            symbol_candidates.add(atom.GetSymbol())
            degree_set.add(atom.GetDegree())
            h_num_set.add(atom.GetTotalNumHs())
            implicit_valence_set.add(atom.GetImplicitValence())
            charge_set.add(atom.GetFormalCharge())
            node_attribute_matrix[atom_idx] = extract_atom_features(
                atom,
                is_acceptor=atom_idx in acceptor_atom_ids,
                is_donor=atom_idx in donor_atom_ids)
        node_attribute_matrix_list.append(node_attribute_matrix)

        # Symmetric pairwise distances; the diagonal and padding stay zero
        for idx_i in range(num_atoms):
            for idx_j in range(idx_i + 1, num_atoms):
                distance = get_atom_distance(conformer.GetAtomPosition(idx_i),
                                             conformer.GetAtomPosition(idx_j))
                distance_matrix[idx_i, idx_j] = distance
                distance_matrix[idx_j, idx_i] = distance
        distance_matrix_list.append(distance_matrix)

        # Symmetric 0/1 adjacency
        for bond in mol.GetBonds():
            begin_index = bond.GetBeginAtom().GetIdx()
            end_index = bond.GetEndAtom().GetIdx()
            adjacent_matrix[begin_index, end_index] = 1
            adjacent_matrix[end_index, begin_index] = 1
        adjacent_matrix_list.append(adjacent_matrix)

    adjacent_matrix_list = np.asarray(adjacent_matrix_list)
    distance_matrix_list = np.asarray(distance_matrix_list)
    node_attribute_matrix_list = np.asarray(node_attribute_matrix_list)
    bond_attribute_matrix_list = np.asarray(bond_attribute_matrix_list)
    print('adjacent matrix shape\t', adjacent_matrix_list.shape)
    print('distance matrix shape\t', distance_matrix_list.shape)
    print('node attr matrix shape\t', node_attribute_matrix_list.shape)
    print('bond attr matrix shape\t', bond_attribute_matrix_list.shape)
    print(symbol_candidates)
    print('{} valid out of {}'.format(len(valid_index), len(smiles_list)))

    print('degree set:\t', degree_set)
    print('h num set: \t', h_num_set)
    print('implicit valence set: \t', implicit_valence_set)
    print('charge set:\t', charge_set)

    if label_name is None:
        np.savez_compressed(
            out_file_path,
            adjacent_matrix_list=adjacent_matrix_list,
            distance_matrix_list=distance_matrix_list,
            node_attribute_matrix_list=node_attribute_matrix_list,
            bond_attribute_matrix_list=bond_attribute_matrix_list)
    else:
        # Keep only the labels of the rows that were actually converted
        true_labels = np.array(data_pd[label_name].tolist())
        true_labels = true_labels[np.array(valid_index)]
        # The archive key is the literal string 'label_name', not the value
        # of the argument; kept as-is for compatibility with readers.
        np.savez_compressed(
            out_file_path,
            adjacent_matrix_list=adjacent_matrix_list,
            distance_matrix_list=distance_matrix_list,
            node_attribute_matrix_list=node_attribute_matrix_list,
            bond_attribute_matrix_list=bond_attribute_matrix_list,
            label_name=true_labels)
    print()

コード例 #17

0

ファイルを表示

    def process(self):
        """Build the multi-label drug-pair graph dataset and save it.

        Does nothing when 'Decagon-<datatype>-multi.pt' already exists in
        ``self.processed_dir``.  Otherwise every (smiles1, smiles2) key of
        ``self.json_load[self.datatype]`` is turned into one ``DATA.Data``
        object that holds both molecules as a single graph: atoms of the
        second molecule are indexed after those of the first.  Pairs with an
        unparsable or empty molecule, or with no bonds at all, are skipped.
        The collated result is saved to ``self.processed_paths[0]``.
        """
        if osp.exists(
                os.path.join(self.processed_dir,
                             'Decagon-{}-multi.pt'.format(self.datatype))):
            return

        data_list = []

        # The stored keys are strings; literal_eval turns each one back into
        # the (smiles1, smiles2) tuple that is unpacked in the loop below.
        json_dict = {
            literal_eval(k): v
            for k, v in self.json_load[self.datatype].items()
        }
        total = len(json_dict)

        for idx, (smiles1, smiles2) in enumerate(json_dict):
            printProgress(idx + 1, total,
                          '{} dataset preparation: '.format(self.datatype),
                          ' ', 2, 50)
            mol1 = MolFromSmiles(smiles1)
            mol2 = MolFromSmiles(smiles2)
            label = np.array(json_dict[(smiles1, smiles2)])

            if mol1 is None or mol2 is None:
                # Report the SMILES: the Mol objects carry no useful text
                print("There is a missing drug from the pair (%s,%s)" %
                      (smiles1, smiles2))
                continue

            # >>> Get pairwise graph G1, G2
            c1_size = mol1.GetNumAtoms()
            c2_size = mol2.GetNumAtoms()

            if c1_size == 0 or c2_size == 0:
                print("There is a size error from pair (%s,%s)" %
                      (smiles1, smiles2))
                continue

            features, edges = [], []

            # Atom features of molecule 1 first, then molecule 2
            for atoms in (mol1.GetAtoms(), mol2.GetAtoms()):
                for atom in atoms:
                    feature = atom_features(atom)
                    features.append(feature / sum(feature))  # normalize

            # Bonds of molecule 2 are shifted past the atoms of molecule 1
            for bonds, offset in ((mol1.GetBonds(), 0),
                                  (mol2.GetBonds(), c1_size)):
                for bond in bonds:
                    edges.append([
                        bond.GetBeginAtomIdx() + offset,
                        bond.GetEndAtomIdx() + offset
                    ])

            if len(edges) == 0:
                continue

            # to_directed() yields every undirected edge in both directions
            G = nx.Graph(edges).to_directed()
            edge_index = [[e1, e2] for e1, e2 in G.edges]

            GraphSiameseData = DATA.Data(
                x=torch.Tensor(features),
                edge_index=torch.LongTensor(edge_index).transpose(1, 0),
                y=torch.Tensor(label).view(1, -1))
            GraphSiameseData.__setitem__('c1_size',
                                         torch.LongTensor([c1_size]))
            GraphSiameseData.__setitem__('c2_size',
                                         torch.LongTensor([c2_size]))
            data_list.append(GraphSiameseData)

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]
        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        # check this function
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

コード例 #18

0

ファイルを表示

    def process(self):
        """Build the binary drug-pair graph dataset and save it.

        Does nothing when 'Decagon-<datatype>.pt' already exists in
        ``self.processed_dir``.  Side-effect names (last CSV column of
        ``self.total_data_dir``) are label-encoded, then the 'negative' and
        'positive' CSV files of ``self.dataset_dir`` are read.  Every row
        becomes one ``DATA.Data`` object holding both molecules as a single
        graph (atoms of the second molecule are indexed after those of the
        first), with ``y`` set to 0 for negative and 1 for positive rows and
        a one-hot 'side_effect' vector.  Pairs with an unparsable or empty
        molecule, or with no bonds at all, are skipped.  The collated result
        is saved to ``self.processed_paths[0]``.
        """
        if osp.exists(
                os.path.join(self.processed_dir,
                             'Decagon-{}.pt'.format(self.datatype))):
            return

        data_list = []

        # >>> Obtain One-Hot Encoding for Side-Effects
        target_list = []
        with open(self.total_data_dir, 'r', encoding='utf-8') as f:
            rdr = csv.reader(f)
            for line in rdr:
                target_list.append(line[-1])

        label_encoder = LabelEncoder()
        label_encoder.fit(
            target_list
        )  # Automatically generate integer class ids for side-effects
        label_list = label_encoder.transform(target_list)
        num_classes = len(label_encoder.classes_)

        # Side-effect name -> encoded class index
        target_dict = {}
        for target_idx, targets in enumerate(target_list):
            target_dict[targets] = label_list[target_idx]

        for label_idx, mode in enumerate(['negative', 'positive']):
            # negative will be 0, positive will be 1
            pair_list, se_list = [], []
            with open(osp.join(self.dataset_dir,
                               'Decagon-{}-{}.csv'.format(mode,
                                                          self.datatype)),
                      'r',
                      encoding='utf-8') as f:
                rdr = csv.reader(f)
                for line in rdr:
                    se_list.append(line[-1])
                    pair_list.append(line[:-1])
            one_hot = [0] * num_classes
            total = len(pair_list)

            for idx, (smiles_pair, se) in enumerate(zip(pair_list, se_list)):
                smiles1, smiles2 = smiles_pair
                side_effect = one_hot.copy()
                side_effect[target_dict[se]] = 1

                printProgress(idx + 1, total,
                              '{} dataset preparation: '.format(self.datatype),
                              ' ', 2, 50)
                mol1 = MolFromSmiles(smiles1)
                mol2 = MolFromSmiles(smiles2)
                label = [int(label_idx)]

                if mol1 is None or mol2 is None:
                    # Report the SMILES: the Mol objects carry no useful text
                    print("There is a missing drug from the pair (%s,%s)" %
                          (smiles1, smiles2))
                    continue

                # >>> Get pairwise graph G1, G2
                c1_size = mol1.GetNumAtoms()
                c2_size = mol2.GetNumAtoms()

                if c1_size == 0 or c2_size == 0:
                    print("There is a size error from pair (%s,%s)" %
                          (smiles1, smiles2))
                    continue

                features, edges = [], []

                # Atom features of molecule 1 first, then molecule 2
                for atoms in (mol1.GetAtoms(), mol2.GetAtoms()):
                    for atom in atoms:
                        feature = atom_features(atom)
                        features.append(feature / sum(feature))  # normalize

                # Bonds of molecule 2 are shifted past the atoms of molecule 1
                for bonds, offset in ((mol1.GetBonds(), 0),
                                      (mol2.GetBonds(), c1_size)):
                    for bond in bonds:
                        edges.append([
                            bond.GetBeginAtomIdx() + offset,
                            bond.GetEndAtomIdx() + offset
                        ])

                if len(edges) == 0:
                    continue

                # to_directed() yields every undirected edge in both directions
                G = nx.Graph(edges).to_directed()
                edge_index = [[e1, e2] for e1, e2 in G.edges]

                GraphSiameseData = DATA.Data(
                    x=torch.Tensor(features),
                    edge_index=torch.LongTensor(edge_index).transpose(1, 0),
                    y=torch.Tensor(label).view(-1, 1))
                GraphSiameseData.__setitem__('c1_size',
                                             torch.LongTensor([c1_size]))
                GraphSiameseData.__setitem__('c2_size',
                                             torch.LongTensor([c2_size]))
                GraphSiameseData.__setitem__(
                    'side_effect',
                    torch.Tensor(side_effect).view(1, -1))
                data_list.append(GraphSiameseData)

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]
        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        # check this function
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])