def graph_from_smiles(smiles):
    """Build a ``MolGraph`` from a SMILES string, feeding Gasteiger charges to the atom features.

    One 'atom' node is created per RDKit atom, one 'bond' node per RDKit bond,
    and a single 'molecule' node that is linked to every atom node.

    Args:
        smiles: SMILES string handed to ``MolFromSmiles``.

    Returns:
        The populated ``MolGraph``.

    Raises:
        ValueError: if ``MolFromSmiles`` returns a falsy value for ``smiles``.
    """
    graph = MolGraph()
    mol = MolFromSmiles(smiles)
    if not mol:
        raise ValueError("Could not parse SMILES string:", smiles)

    # After this call each atom carries its charge in the '_GasteigerCharge' property.
    rdPartialCharges.ComputeGasteigerCharges(mol)

    node_for_rd_idx = {}
    for atom in mol.GetAtoms():
        charge = float(atom.GetProp('_GasteigerCharge'))
        # NaN / +-inf charges are replaced by a neutral 0.0.
        if not np.isfinite(charge):
            charge = 0.0
        rd_idx = atom.GetIdx()
        node_for_rd_idx[rd_idx] = graph.new_node(
            'atom', features=atom_features(atom, charge), rdkit_ix=rd_idx)

    for bond in mol.GetBonds():
        begin_node = node_for_rd_idx[bond.GetBeginAtom().GetIdx()]
        end_node = node_for_rd_idx[bond.GetEndAtom().GetIdx()]
        bond_node = graph.new_node('bond', features=bond_features(bond))
        bond_node.add_neighbors((begin_node, end_node))
        # Only the begin atom is given the atom-atom link here; presumably
        # add_neighbors registers the reverse direction -- confirm in MolGraph.
        begin_node.add_neighbors((end_node,))

    molecule_node = graph.new_node('molecule')
    molecule_node.add_neighbors(graph.nodes['atom'])
    return graph
def graph_from_smiles(smiles):
    """Convert a SMILES string into a ``MolGraph`` of atom, bond and molecule nodes.

    Args:
        smiles: SMILES string handed to ``MolFromSmiles``.

    Returns:
        The populated ``MolGraph``: one 'atom' node per atom, one 'bond' node
        per bond and one 'molecule' node linked to all atom nodes.

    Raises:
        ValueError: if ``MolFromSmiles`` returns a falsy value for ``smiles``.
    """
    graph = MolGraph()
    mol = MolFromSmiles(smiles)
    # Disabled alternative (parse without full sanitization), kept for reference:
    # mol = MolFromSmiles(smiles, sanitize=False)
    # mol.UpdatePropertyCache(strict=False)
    # Chem.SanitizeMol(mol, Chem.SanitizeFlags.SANITIZE_FINDRADICALS | Chem.SanitizeFlags.SANITIZE_KEKULIZE | Chem.SanitizeFlags.SANITIZE_SETAROMATICITY | Chem.SanitizeFlags.SANITIZE_SETCONJUGATION | Chem.SanitizeFlags.SANITIZE_SETHYBRIDIZATION | Chem.SanitizeFlags.SANITIZE_SYMMRINGS, catchErrors=True)
    if not mol:
        raise ValueError("Could not parse SMILES string:", smiles)

    node_for_rd_idx = {}
    for atom in mol.GetAtoms():
        rd_idx = atom.GetIdx()
        node_for_rd_idx[rd_idx] = graph.new_node(
            'atom', features=atom_features(atom), rdkit_ix=rd_idx)

    for bond in mol.GetBonds():
        begin_node = node_for_rd_idx[bond.GetBeginAtom().GetIdx()]
        end_node = node_for_rd_idx[bond.GetEndAtom().GetIdx()]
        bond_node = graph.new_node('bond', features=bond_features(bond))
        bond_node.add_neighbors((begin_node, end_node))
        # The atom-atom link is added from the begin atom only.
        begin_node.add_neighbors((end_node,))

    molecule_node = graph.new_node('molecule')
    molecule_node.add_neighbors(graph.nodes['atom'])
    return graph
def load_from_smiles(smiles):
    """Load a single molecule graph from its SMILES string.

    Atom and bond nodes are keyed with ``node_id(smiles, <rdkit index>)``; the
    molecule node is keyed with the SMILES string itself.

    Args:
        smiles: SMILES string handed to ``MolFromSmiles``.

    Returns:
        A ``Molecule`` holding 'atom', 'bond' and 'molecule' nodes, with its
        atom nodes sorted via ``sort_by_degree('atom')``.

    Raises:
        ValueError: if ``MolFromSmiles`` returns a falsy value for ``smiles``.
    """
    graph = Molecule()
    mol = MolFromSmiles(smiles)
    if not mol:
        raise ValueError("Could not parse SMILES string:", smiles)

    for atom in mol.GetAtoms():
        graph.add_node(
            Node('atom', node_id(smiles, atom.GetIdx()), atom_features(atom)))

    for bond in mol.GetBonds():
        begin_node = graph.get_node(
            'atom', node_id(smiles, bond.GetBeginAtom().GetIdx()))
        end_node = graph.get_node(
            'atom', node_id(smiles, bond.GetEndAtom().GetIdx()))
        link_node = Node('bond', node_id(smiles, bond.GetIdx()),
                         bond_features(bond))
        graph.add_node(link_node)
        # Each endpoint neighbours both the bond node and the opposite atom.
        link_node.add_neighbors([begin_node, end_node])
        begin_node.add_neighbors([link_node, end_node])
        end_node.add_neighbors([link_node, begin_node])

    molecule_node = Node('molecule', smiles)
    graph.add_node(molecule_node)
    molecule_node.add_neighbors(graph.get_node_list('atom'))

    graph.sort_by_degree('atom')
    return graph
def graph_from_smiles(smiles):
    """Build a ``MolGraph`` from a SMILES string after (re)assigning stereochemistry.

    Args:
        smiles: SMILES string handed to ``MolFromSmiles``.

    Returns:
        The populated ``MolGraph``: one 'atom' node per atom, one 'bond' node
        per bond and one 'molecule' node linked to all atom nodes.

    Raises:
        ValueError: if ``MolFromSmiles`` returns a falsy value for ``smiles``.
    """
    graph = MolGraph()
    mol = MolFromSmiles(smiles)
    # Validate BEFORE touching the molecule: the stereochemistry helpers below
    # would otherwise fail on a None molecule and hide this clearer error.
    if not mol:
        raise ValueError("Could not parse SMILES string:", smiles)

    Chem.DetectBondStereochemistry(mol, -1)
    Chem.AssignStereochemistry(mol, flagPossibleStereoCenters=True, force=True)
    Chem.AssignAtomChiralTagsFromStructure(mol, -1)

    atoms_by_rd_idx = {}
    for atom in mol.GetAtoms():
        rd_idx = atom.GetIdx()
        atoms_by_rd_idx[rd_idx] = graph.new_node(
            'atom', features=atom_features(atom), rdkit_ix=rd_idx)

    for bond in mol.GetBonds():
        atom1_node = atoms_by_rd_idx[bond.GetBeginAtom().GetIdx()]
        atom2_node = atoms_by_rd_idx[bond.GetEndAtom().GetIdx()]
        new_bond_node = graph.new_node('bond', features=bond_features(bond))
        new_bond_node.add_neighbors((atom1_node, atom2_node))
        # The atom-atom link is added from the begin atom only.
        atom1_node.add_neighbors((atom2_node,))

    mol_node = graph.new_node('molecule')
    mol_node.add_neighbors(graph.nodes['atom'])
    return graph
def graph_from_smiles(smiles, fp_switch):
    """Build a ``MolGraph`` whose atom features concatenate ECFP and FCFP descriptors.

    Args:
        smiles: container holding the SMILES string. When it is exactly a
            ``numpy.ndarray`` the string is read from ``smiles[0]``; any other
            type is read through ``smiles._data[0][0]``.
        fp_switch: original note says "ecfp = false, fcfp = true"; the value is
            not read anywhere in this function.

    Returns:
        The populated ``MolGraph``.

    Raises:
        ValueError: if ``MolFromSmiles`` returns a falsy value.
    """
    graph = MolGraph()

    # Exact type match on purpose (not isinstance): ndarray subclasses take
    # the ``_data`` branch, as in the original check.
    if type(smiles) is type(np.array(1)):
        str_smiles = smiles[0]
    else:
        str_smiles = smiles._data[0][0]

    mol = MolFromSmiles(str_smiles)
    if not mol:
        raise ValueError("Could not parse SMILES string:", str_smiles)

    node_for_rd_idx = {}
    fcfp = atom_features_from_fcfp(mol)
    # fcfp is indexed by iteration position, not by atom.GetIdx().
    for position, atom in enumerate(mol.GetAtoms()):
        combined = np.r_[atom_features_from_ecfp(atom), fcfp[position]]
        atom_node = graph.new_node(
            'atom', features=combined, rdkit_ix=atom.GetIdx())
        node_for_rd_idx[atom.GetIdx()] = atom_node

    for bond in mol.GetBonds():
        begin_node = node_for_rd_idx[bond.GetBeginAtom().GetIdx()]
        end_node = node_for_rd_idx[bond.GetEndAtom().GetIdx()]
        bond_node = graph.new_node('bond', features=bond_features(bond))
        bond_node.add_neighbors((begin_node, end_node))
        begin_node.add_neighbors((end_node,))

    molecule_node = graph.new_node('molecule')
    molecule_node.add_neighbors(graph.nodes['atom'])
    return graph
def Read(self, lgi):
    """Import an lgi string as a ``networkx`` graph.

    The lgi is translated to SMILES by applying every ``(src, dst)`` pair in
    ``self.replacements``, parsed with RDKit, and its bonds become the edges of
    an undirected graph.

    Args:
        lgi: the lgi to import; its elements found in ``self.known`` are
            looked up in ``self.degree`` to form the degree list.

    Returns:
        The graph when ``IsValid(G, D)`` holds; ``None`` when the graph is not
        valid or when any step fails (best-effort import).
    """
    try:
        # Extract the degree
        D = [self.degree[symbol] for symbol in lgi if symbol in self.known]

        # Translate to smiles and import using RDKit
        smi = "%s" % (lgi)
        for src, dst in self.replacements:
            smi = smi.replace(src, dst)
        mol = MolFromSmiles(smi)
        if mol is None:
            # RDKit could not parse the translated string.
            return None

        # Define the graph
        G = nx.Graph()
        for bond in mol.GetBonds():
            G.add_edge(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())

        # Done
        return G if IsValid(G, D) else None
    except Exception:
        # Best-effort: ordinary failures mean "not importable". Unlike a bare
        # ``except:``, KeyboardInterrupt / SystemExit still propagate.
        return None
def graph_from_smiles(smiles):
    """Turn a SMILES string into a ``MolGraph`` with atom, bond and molecule nodes.

    Args:
        smiles: SMILES string handed to ``MolFromSmiles``.

    Returns:
        The populated ``MolGraph``.

    Raises:
        ValueError: if ``MolFromSmiles`` returns a falsy value for ``smiles``.
    """
    graph = MolGraph()
    mol = MolFromSmiles(smiles)
    if not mol:
        raise ValueError("Could not parse SMILES string:", smiles)

    # Maps RDKit atom index -> the graph node created for that atom.
    node_for_rd_idx = {}
    for atom in mol.GetAtoms():
        rd_idx = atom.GetIdx()
        node_for_rd_idx[rd_idx] = graph.new_node(
            'atom', features=atom_features(atom), rdkit_ix=rd_idx)

    for bond in mol.GetBonds():
        begin_node = node_for_rd_idx[bond.GetBeginAtom().GetIdx()]
        end_node = node_for_rd_idx[bond.GetEndAtom().GetIdx()]
        bond_node = graph.new_node('bond', features=bond_features(bond))
        bond_node.add_neighbors((begin_node, end_node))
        # The atom-atom link is added from the begin atom only.
        begin_node.add_neighbors((end_node,))

    molecule_node = graph.new_node('molecule')
    molecule_node.add_neighbors(graph.nodes['atom'])
    return graph
def graph_from_smiles(smiles):
    """Build a ``MolGraph`` from a SMILES string.

    Args:
        smiles: SMILES string handed to ``MolFromSmiles``.

    Returns:
        The populated ``MolGraph``: one 'atom' node per atom, one 'bond' node
        per bond and one 'molecule' node linked to all atom nodes.

    Raises:
        ValueError: if ``MolFromSmiles`` raises or returns a falsy value for
            ``smiles``.
    """
    graph = MolGraph()
    try:
        mol = MolFromSmiles(smiles)
    except Exception as err:
        # Report the failure to the caller instead of printing and calling
        # quit(), which would terminate the whole interpreter from library code.
        raise ValueError("Could not parse SMILES string:", smiles) from err
    if not mol:
        raise ValueError("Could not parse SMILES string:", smiles)

    atoms_by_rd_idx = {}
    for atom in mol.GetAtoms():
        rd_idx = atom.GetIdx()
        atoms_by_rd_idx[rd_idx] = graph.new_node(
            'atom', features=atom_features(atom), rdkit_ix=rd_idx)

    for bond in mol.GetBonds():
        atom1_node = atoms_by_rd_idx[bond.GetBeginAtom().GetIdx()]
        atom2_node = atoms_by_rd_idx[bond.GetEndAtom().GetIdx()]
        new_bond_node = graph.new_node('bond', features=bond_features(bond))
        new_bond_node.add_neighbors((atom1_node, atom2_node))
        # The atom-atom link is added from the begin atom only.
        atom1_node.add_neighbors((atom2_node,))

    mol_node = graph.new_node('molecule')
    mol_node.add_neighbors(graph.nodes['atom'])
    return graph
def get_feature_lengths(self) -> List[int]:
    """Calculate the length of each feature.

    A throwaway two-atom molecule ('CC') supplies a sample bond, which is
    passed to ``self._get_feature_lengths``.

    Returns:
        A list of the lengths of each feature.
    """
    sample_mol = MolFromSmiles('CC')
    sample_bond = sample_mol.GetBonds()[0]
    return self._get_feature_lengths(sample_bond)
def get_max_atom_bond_size(smiles_iterator, explicit_hs=True):
    """Convenience function to get max_atoms, max_bonds for a set of input SMILES.

    Args:
        smiles_iterator: iterable of SMILES strings (wrapped in ``tqdm`` for a
            progress bar).
        explicit_hs: when true, hydrogens are added with ``AddHs`` before
            counting.

    Returns:
        ``dict(max_atoms=..., max_bonds=...)`` where ``max_bonds`` is twice the
        largest bond count seen (presumably one slot per bond direction --
        confirm against the consumer).

    Raises:
        ValueError: if a SMILES string cannot be parsed by ``MolFromSmiles``.
    """
    max_atoms = 0
    max_bonds = 0
    for smiles in tqdm(smiles_iterator):
        mol = MolFromSmiles(smiles)
        if mol is None:
            # Fail with the offending string instead of an opaque error from
            # AddHs / attribute access on None.
            raise ValueError("Could not parse SMILES string:", smiles)
        if explicit_hs:
            mol = AddHs(mol)
        max_atoms = max(max_atoms, len(mol.GetAtoms()))
        max_bonds = max(max_bonds, len(mol.GetBonds()))
    return dict(max_atoms=max_atoms, max_bonds=max_bonds * 2)
def Translate(self, smi, canonical=True):
    """Translate a SMILES string into an undirected graph G(V, E).

    The graph has featureless vertices and unweighted edges, e.g. the graph
    equivalent of a saturated hydrocarbon.

    Args:
        smi: SMILES string parsed with ``MolFromSmiles``.
        canonical: forwarded unchanged to ``self.Write``.

    Returns:
        Whatever ``self.Write(degrees, edges, canonical=canonical)`` returns.
    """
    mol = MolFromSmiles(smi)
    atoms = mol.GetAtoms()
    bonds = mol.GetBonds()
    # Per-atom degree, in RDKit atom order.
    degrees = [a.GetDegree() for a in atoms]
    # One (begin, end) atom-index pair per bond.
    edges = [(b.GetBeginAtomIdx(), b.GetEndAtomIdx()) for b in bonds]
    return self.Write(degrees, edges, canonical=canonical)
def process(self, smiles):
    """Build the graph for one SMILES string and wrap it in a ``GCNPoint``.

    Node 0 is an extra node in addition to the atoms; atom ``i`` of the RDKit
    molecule becomes node ``i + 1``.

    Args:
        smiles: SMILES string parsed with ``MolFromSmiles``.

    Returns:
        ``GCNPoint(n, adj, vec)`` with ``n`` = number of atoms + 1, ``adj`` the
        dense adjacency matrix and ``vec`` the per-node feature rows moved to
        ``self.device``.
    """
    # Build the graph
    mol = MolFromSmiles(smiles)
    n = mol.GetNumAtoms() + 1
    graph = DGLGraph()
    graph.add_nodes(n)
    # Self-loop on every node, then an edge from every atom node to node 0.
    graph.add_edges(graph.nodes(), graph.nodes())
    graph.add_edges(range(1, n), 0)

    # Each bond is inserted in both directions, shifted by one for node 0.
    for bond in mol.GetBonds():
        begin = bond.GetBeginAtomIdx() + 1
        end = bond.GetEndAtomIdx() + 1
        graph.add_edge(begin, end)
        graph.add_edge(end, begin)

    adj = graph.adjacency_matrix(transpose=False).to_dense()

    atom_rows = torch.cat(
        [atom_feature(atom)[0][None, :] for atom in mol.GetAtoms()])
    # Node 0 gets an all-zero feature row of width FEATURE_DIM.
    extra_row = torch.zeros((1, FEATURE_DIM))
    vec = torch.cat([extra_row, atom_rows]).to(self.device)
    return GCNPoint(n, adj, vec)
def process(self, smiles):
    """Build the graph for one SMILES string and wrap it in a ``GNNPoint``.

    Per-atom integer attributes are stored in ``graph.ndata``; their
    element-wise sum is passed through ``self.embed`` to obtain ``vec``.

    Args:
        smiles: SMILES string parsed with ``MolFromSmiles``.

    Returns:
        ``GNNPoint(n, graph, vec)`` with ``n`` the number of atoms.
    """
    # Build the graph
    mol = MolFromSmiles(smiles)
    n = mol.GetNumAtoms()
    graph = DGLGraph()
    graph.add_nodes(n)
    # Self-loop on every node, then an edge from nodes 1..n-1 to node 0.
    graph.add_edges(graph.nodes(), graph.nodes())
    graph.add_edges(range(1, n), 0)

    # (ndata key, per-atom extractor), in the order the fields are stored.
    atom_fields = (
        ("element", lambda atom: ATOM[atom.GetAtomicNum()]),
        ("explicit", lambda atom: atom.GetExplicitValence()),
        ("implicit", lambda atom: atom.GetImplicitValence()),
        ("hybrid", lambda atom: HYBRID[atom.GetHybridization()]),
        ("hcount", lambda atom: atom.GetTotalNumHs()),
        ("degree", lambda atom: atom.GetDegree()),
        # Formal charge is stored shifted by +2.
        ("charge", lambda atom: atom.GetFormalCharge() + 2),
        ("ring", lambda atom: int(atom.IsInRing())),
        ("aromatic", lambda atom: int(atom.GetIsAromatic())),
    )
    for key, extract in atom_fields:
        graph.ndata[key] = torch.tensor(
            [extract(atom) for atom in mol.GetAtoms()])

    # Each bond is inserted in both directions.
    for bond in mol.GetBonds():
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        graph.add_edge(begin, end)
        graph.add_edge(end, begin)

    # Out-of-place additions only: the stored ndata tensors must not be mutated.
    combined = graph.ndata[atom_fields[0][0]]
    for key, _ in atom_fields[1:]:
        combined = combined + graph.ndata[key]
    vec = self.embed(combined)
    return GNNPoint(n, graph, vec)
def construct_feature_matrices(self, smiles):
    """Construct a molecule from a SMILES string and return atom and bond classes.

    Every bond is listed once from each of its two atoms, so ``n_bond`` is
    twice the RDKit bond count (or 1 for a molecule without bonds).

    Args:
        smiles: SMILES string parsed with ``MolFromSmiles``; hydrogens are
            added with ``AddHs`` when ``self.explicit_hs`` is true.

    Returns:
        dict with entries
        'n_atom'       : number of atoms in the molecule
        'n_bond'       : number of bonds in the molecule
        'atom'         : (n_atom,) length list of atom classes
        'bond'         : (n_bond,) list of bond classes
        'connectivity' : (n_bond, 2) array of source atom, target atom pairs.
    """
    mol = MolFromSmiles(smiles)
    if self.explicit_hs:
        mol = AddHs(mol)

    n_atom = len(mol.GetAtoms())
    # If it is an isolated atom, reserve one slot that acts as a self-link
    # (it stays all-zero because no bond ever fills it).
    n_bond = max(2 * len(mol.GetBonds()), 1)

    atom_feature_matrix = np.zeros(n_atom, dtype='int')
    bond_feature_matrix = np.zeros(n_bond, dtype='int')
    connectivity = np.zeros((n_bond, 2), dtype='int')

    atom_seq = mol.GetAtoms()
    slot = 0
    for n in range(n_atom):
        atom = atom_seq[n]

        # Atom classes
        atom_feature_matrix[n] = self.atom_tokenizer(self.atom_features(atom))

        start_index = atom.GetIdx()
        for bond in atom.GetBonds():
            begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            # True when the bond points at (rather than away from) this atom.
            rev = begin != start_index

            # Bond classes
            bond_feature_matrix[slot] = self.bond_tokenizer(
                self.bond_features(bond, flipped=rev))

            # Connectivity always starts at the current atom.
            connectivity[slot] = (end, begin) if rev else (begin, end)
            slot += 1

    return {
        'n_atom': n_atom,
        'n_bond': n_bond,
        'atom': atom_feature_matrix,
        'bond': bond_feature_matrix,
        'connectivity': connectivity,
    }
def parse_smiles_str(self, smiles_str, id, target=None):
    """Parse a SMILES string into a dictionary describing the molecular graph.

    Args:
        smiles_str: SMILES string parsed with ``MolFromSmiles``.
        id: identifier of the molecule; converted to an ``np.int64`` NumPy
            array unless it already is a NumPy array.
        target: optional target value(s); stored as a float32 array of shape
            ``(1, n_targets)``.

    Returns:
        ``None`` if the SMILES string cannot be parsed, otherwise a dict with
        keys 'node_features', 'edge_features', 'adj_mat', 'inc_mat', 'target',
        'id' and 'shape'.
    """
    # Use RDKit to parse SMILES string
    mol = MolFromSmiles(smiles_str)
    if not mol:
        return None

    # Represent Hydrogen atoms explicitly (if necessary)
    if self.config['explicit_Hs']:
        mol = Chem.AddHs(mol)

    # Compute number of nodes (atoms) and edges (bonds)
    n_nodes, n_edges = mol.GetNumAtoms(), mol.GetNumBonds()

    # Allocate space for NumPy arrays representing the molecular graph
    node_features = np.zeros((n_nodes, self.num_node_features), dtype=np.float32)
    edge_features = np.zeros((n_edges, self.num_edge_features), dtype=np.float32)
    adj_mat = np.zeros((2 * n_edges, 2), dtype=np.int64)  # Adjacency matrix (sparse representation)
    inc_mat = np.zeros((2 * n_edges, 2), dtype=np.int64)  # Incidence matrix (sparse representation)

    # Retrieve node (atom) features, if needed
    if self.num_node_features > 0:
        for i, atom in enumerate(mol.GetAtoms()):
            node_features[i] = self.get_node_features(atom)

    # Retrieve edges (bonds)
    for i, bond in enumerate(mol.GetBonds()):
        begin = bond.GetBeginAtom().GetIdx()
        end = bond.GetEndAtom().GetIdx()
        # Two (row, col) pairs per bond: one for each direction
        adj_mat[2 * i] = [begin, end]
        adj_mat[2 * i + 1] = [end, begin]
        # Two (atom, bond) pairs per bond: one for each endpoint
        inc_mat[2 * i] = [begin, i]
        inc_mat[2 * i + 1] = [end, i]
        # Retrieve edge (bond) features, if needed
        if self.num_edge_features > 0:
            edge_features[i] = self.get_edge_features(bond)

    # Sort the adjacency and incidence matrices lexicographically
    adj_mat = adj_mat[np.lexsort((adj_mat[:, 1], adj_mat[:, 0]))]
    inc_mat = inc_mat[np.lexsort((inc_mat[:, 1], inc_mat[:, 0]))]

    # Represent molecular graph as a dictionary
    g = {'node_features': node_features, 'edge_features': edge_features,
         'adj_mat': adj_mat, 'inc_mat': inc_mat}

    # Add target(s) (if any), making sure they are a NumPy array object with method tobytes()
    if target is not None:
        # Convert scalars to NumPy array
        if not isinstance(target, np.ndarray):
            target = np.array(target, np.float32)
        # Ensure target is of type np.float32
        target = target.astype(np.float32)
        # Flatten targets of rank >= 2
        if target.ndim > 1:
            target = target.flatten()
        # Store target as a (row) 2D NumPy array (for compatibility)
        g['target'] = np.reshape(target, (1, -1))
        n_targets = g['target'].shape[1]
    # If there are no targets, add an empty NumPy array (for compatibility)
    else:
        g['target'] = np.zeros((1, 0), dtype=np.float32)
        n_targets = 0

    # Add ID, making sure it is a NumPy array object with method tobytes().
    # The guard must inspect ``id`` itself: testing ``target`` here left the
    # ID unconverted whenever a target was supplied.
    if not isinstance(id, np.ndarray):
        id = np.array(id, np.int64)
    g['id'] = id

    # Finally, add shape information. The last element refers to the number of graphs, and is included for
    # compatibility with batched graphs
    g['shape'] = np.array((n_nodes, n_edges, self.num_node_features,
                           self.num_edge_features, n_targets, 1), np.int64)
    return g
def extract_graph(data_path, out_file_path, max_atom_num, label_name=None):
    """Convert the SMILES column of a CSV file into padded graph matrices and save them.

    For every molecule with at most ``max_atom_num`` atoms, a 2D conformer is
    computed and three ``max_atom_num``-padded matrices are produced:
    adjacency, pairwise atom distance and per-atom attributes. The stacked
    matrices are written with ``np.savez_compressed``.

    Args:
        data_path: CSV file read with pandas; must contain a 'SMILES' column.
        out_file_path: destination passed to ``np.savez_compressed``.
        max_atom_num: padding size; molecules with more atoms are reported and
            skipped.
        label_name: optional column holding the labels. When given, the labels
            of the kept rows are saved under the literal key 'label_name'.

    Returns:
        None. Summary statistics are printed to stdout.
    """
    import os
    from rdkit import RDConfig
    from rdkit.Chem import ChemicalFeatures

    fdefName = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
    factory = ChemicalFeatures.BuildFeatureFactory(fdefName)

    data_pd = pd.read_csv(data_path)
    smiles_list = data_pd['SMILES'].tolist()

    symbol_candidates = set()
    atom_attribute_dim = num_atom_features()
    # NOTE(review): never used below -- bond attributes are not extracted, so
    # bond_attribute_matrix_list is always saved empty.
    bond_attribute_dim = num_bond_features()

    node_attribute_matrix_list = []
    bond_attribute_matrix_list = []
    adjacent_matrix_list = []
    distance_matrix_list = []
    valid_index = []

    # Value sets collected only for the summary printed at the end
    degree_set = set()
    h_num_set = set()
    implicit_valence_set = set()
    charge_set = set()

    for line_idx, smiles in enumerate(smiles_list):
        smiles = smiles.strip()
        mol = MolFromSmiles(smiles)
        AllChem.Compute2DCoords(mol)
        conformer = mol.GetConformers()[0]
        feats = factory.GetFeaturesForMol(mol)
        # Materialise as sets: a lazy ``map`` object is consumed by the first
        # ``in`` tests on Python 3, which silently dropped acceptor/donor
        # flags for later atoms.
        acceptor_atom_ids = {feat.GetAtomIds()[0] for feat in feats
                             if feat.GetFamily() == 'Acceptor'}
        donor_atom_ids = {feat.GetAtomIds()[0] for feat in feats
                          if feat.GetFamily() == 'Donor'}

        adjacent_matrix = np.zeros((max_atom_num, max_atom_num))
        adjacent_matrix = adjacent_matrix.astype(int)
        distance_matrix = np.zeros((max_atom_num, max_atom_num))
        node_attribute_matrix = np.zeros((max_atom_num, atom_attribute_dim))
        node_attribute_matrix = node_attribute_matrix.astype(int)

        if len(mol.GetAtoms()) > max_atom_num:
            print('Outlier {} has {} atoms'.format(line_idx, mol.GetNumAtoms()))
            continue
        # Remember which CSV rows were kept so labels can be aligned later.
        valid_index.append(line_idx)

        for atom in mol.GetAtoms():
            atom_idx = atom.GetIdx()
            symbol_candidates.add(atom.GetSymbol())
            degree_set.add(atom.GetDegree())
            h_num_set.add(atom.GetTotalNumHs())
            implicit_valence_set.add(atom.GetImplicitValence())
            charge_set.add(atom.GetFormalCharge())
            node_attribute_matrix[atom_idx] = extract_atom_features(
                atom,
                is_acceptor=atom_idx in acceptor_atom_ids,
                is_donor=atom_idx in donor_atom_ids)
        node_attribute_matrix_list.append(node_attribute_matrix)

        # Symmetric pairwise distances from the 2D conformer
        for idx_i in range(mol.GetNumAtoms()):
            for idx_j in range(idx_i + 1, mol.GetNumAtoms()):
                distance = get_atom_distance(conformer.GetAtomPosition(idx_i),
                                             conformer.GetAtomPosition(idx_j))
                distance_matrix[idx_i, idx_j] = distance
                distance_matrix[idx_j, idx_i] = distance
        distance_matrix_list.append(distance_matrix)

        # Symmetric 0/1 adjacency
        for bond in mol.GetBonds():
            begin_index = bond.GetBeginAtom().GetIdx()
            end_index = bond.GetEndAtom().GetIdx()
            adjacent_matrix[begin_index, end_index] = 1
            adjacent_matrix[end_index, begin_index] = 1
        adjacent_matrix_list.append(adjacent_matrix)

    adjacent_matrix_list = np.asarray(adjacent_matrix_list)
    distance_matrix_list = np.asarray(distance_matrix_list)
    node_attribute_matrix_list = np.asarray(node_attribute_matrix_list)
    bond_attribute_matrix_list = np.asarray(bond_attribute_matrix_list)

    print('adjacent matrix shape\t', adjacent_matrix_list.shape)
    print('distance matrix shape\t', distance_matrix_list.shape)
    print('node attr matrix shape\t', node_attribute_matrix_list.shape)
    print('bond attr matrix shape\t', bond_attribute_matrix_list.shape)
    print(symbol_candidates)
    print('{} valid out of {}'.format(len(valid_index), len(smiles_list)))
    print('degree set:\t', degree_set)
    print('h num set: \t', h_num_set)
    print('implicit valence set: \t', implicit_valence_set)
    print('charge set:\t', charge_set)

    arrays = dict(
        adjacent_matrix_list=adjacent_matrix_list,
        distance_matrix_list=distance_matrix_list,
        node_attribute_matrix_list=node_attribute_matrix_list,
        bond_attribute_matrix_list=bond_attribute_matrix_list)
    if label_name is not None:
        true_labels = np.array(data_pd[label_name].tolist())
        valid_index = np.array(valid_index)
        # Stored under the literal key 'label_name' (not the column's name),
        # exactly as before, so existing readers keep working.
        arrays['label_name'] = true_labels[valid_index]
    np.savez_compressed(out_file_path, **arrays)

    print()
    return
def process(self):
    """Build and save the multi-label pairwise-graph dataset for ``self.datatype``.

    Does nothing when 'Decagon-<datatype>-multi.pt' already exists in
    ``self.processed_dir``. Otherwise every ``(smiles1, smiles2)`` key of
    ``self.json_load[self.datatype]`` is turned into one ``DATA.Data`` object
    whose node features are the normalised atom features of both molecules and
    whose edges are the bonds of both molecules (indices of the second
    molecule are offset by the atom count of the first). Pairs that cannot be
    parsed, have no atoms, or have no bonds are skipped.
    """
    processed_file = os.path.join(
        self.processed_dir, 'Decagon-{}-multi.pt'.format(self.datatype))
    if osp.exists(processed_file):
        return

    data_list = []

    # >>> Obtain One-Hot Encoding for Side-Effects
    # The JSON keys are string representations of (smiles1, smiles2) tuples.
    json_dict = {
        literal_eval(k): v
        for k, v in self.json_load[self.datatype].items()
    }
    total = len(json_dict)
    progress_prefix = '{} dataset preparation: '.format(self.datatype)

    for idx, ((smiles1, smiles2), raw_label) in enumerate(json_dict.items()):
        printProgress(idx + 1, total, progress_prefix, ' ', 2, 50)
        mol1 = MolFromSmiles(smiles1)
        mol2 = MolFromSmiles(smiles2)
        label = np.array(raw_label)

        if mol1 is None or mol2 is None:
            # Report the SMILES strings: formatting the Mol objects only
            # printed 'None' / an object repr and could not identify the pair.
            print("There is a missing drug from the pair (%s,%s)" %
                  (smiles1, smiles2))
            continue

        # >>> Get pairwise graph G1, G2
        c1_size = mol1.GetNumAtoms()
        c2_size = mol2.GetNumAtoms()
        if c1_size == 0 or c2_size == 0:
            print("There is a size error from pair (%s,%s)" %
                  (smiles1, smiles2))
            continue

        features, edges = [], []
        for mol in (mol1, mol2):
            for atom in mol.GetAtoms():
                feature = atom_features(atom)
                features.append(feature / sum(feature))  # normalize

        for bond in mol1.GetBonds():
            edges.append([bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()])
        # Atoms of the second molecule are numbered after those of the first.
        for bond in mol2.GetBonds():
            edges.append([bond.GetBeginAtomIdx() + c1_size,
                          bond.GetEndAtomIdx() + c1_size])
        if not edges:
            continue

        # to_directed() yields both directions of every undirected edge.
        G = nx.Graph(edges).to_directed()
        edge_index = [[e1, e2] for e1, e2 in G.edges]

        GraphSiameseData = DATA.Data(
            x=torch.Tensor(features),
            edge_index=torch.LongTensor(edge_index).transpose(1, 0),
            y=torch.Tensor(label).view(1, -1))
        GraphSiameseData.__setitem__('c1_size', torch.LongTensor([c1_size]))
        GraphSiameseData.__setitem__('c2_size', torch.LongTensor([c2_size]))
        data_list.append(GraphSiameseData)

    if self.pre_filter is not None:
        data_list = [data for data in data_list if self.pre_filter(data)]
    if self.pre_transform is not None:
        data_list = [self.pre_transform(data) for data in data_list]

    # check this function
    data, slices = self.collate(data_list)
    torch.save((data, slices), self.processed_paths[0])
def process(self):
    """Build and save the binary pairwise-graph dataset for ``self.datatype``.

    Does nothing when 'Decagon-<datatype>.pt' already exists in
    ``self.processed_dir``. Side-effect names (last CSV column of
    ``self.total_data_dir``) are label-encoded; then the 'negative' and
    'positive' CSV files are read, and every drug pair becomes one
    ``DATA.Data`` object with label 0 (negative) or 1 (positive) plus a
    one-hot 'side_effect' vector. Pairs that cannot be parsed, have no atoms,
    or have no bonds are skipped.
    """
    processed_file = os.path.join(
        self.processed_dir, 'Decagon-{}.pt'.format(self.datatype))
    if osp.exists(processed_file):
        return

    data_list = []

    # >>> Obtain One-Hot Encoding for Side-Effects
    with open(self.total_data_dir, 'r', encoding='utf-8') as f:
        target_list = [line[-1] for line in csv.reader(f)]

    label_encoder = LabelEncoder()
    # Automatically generate integer labels for side-effects
    label_encoder.fit(target_list)
    label_list = label_encoder.transform(target_list)
    num_classes = len(label_encoder.classes_)
    # side-effect name -> encoded class index
    target_dict = dict(zip(target_list, label_list))

    progress_prefix = '{} dataset preparation: '.format(self.datatype)

    # negative will be 0, positive will be 1
    for label_idx, mode in enumerate(['negative', 'positive']):
        pair_list, se_list = [], []
        csv_path = osp.join(
            self.dataset_dir,
            'Decagon-{}-{}.csv'.format(mode, self.datatype))
        with open(csv_path, 'r', encoding='utf-8') as f:
            for line in csv.reader(f):
                se_list.append(line[-1])
                pair_list.append(line[:-1])

        total = len(pair_list)
        for idx, (smiles_pair, se) in enumerate(zip(pair_list, se_list)):
            smiles1, smiles2 = smiles_pair
            side_effect = [0] * num_classes
            side_effect[target_dict[se]] = 1

            printProgress(idx + 1, total, progress_prefix, ' ', 2, 50)
            mol1 = MolFromSmiles(smiles1)
            mol2 = MolFromSmiles(smiles2)
            label = [int(label_idx)]

            if mol1 is None or mol2 is None:
                # Report the SMILES strings: formatting the Mol objects only
                # printed 'None' / an object repr and could not identify the pair.
                print("There is a missing drug from the pair (%s,%s)" %
                      (smiles1, smiles2))
                continue

            # >>> Get pairwise graph G1, G2
            c1_size = mol1.GetNumAtoms()
            c2_size = mol2.GetNumAtoms()
            if c1_size == 0 or c2_size == 0:
                print("There is a size error from pair (%s,%s)" %
                      (smiles1, smiles2))
                continue

            features, edges = [], []
            for mol in (mol1, mol2):
                for atom in mol.GetAtoms():
                    feature = atom_features(atom)
                    features.append(feature / sum(feature))  # normalize

            for bond in mol1.GetBonds():
                edges.append([bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()])
            # Atoms of the second molecule are numbered after those of the first.
            for bond in mol2.GetBonds():
                edges.append([bond.GetBeginAtomIdx() + c1_size,
                              bond.GetEndAtomIdx() + c1_size])
            if not edges:
                continue

            # to_directed() yields both directions of every undirected edge.
            G = nx.Graph(edges).to_directed()
            edge_index = [[e1, e2] for e1, e2 in G.edges]

            GraphSiameseData = DATA.Data(
                x=torch.Tensor(features),
                edge_index=torch.LongTensor(edge_index).transpose(1, 0),
                y=torch.Tensor(label).view(-1, 1))
            GraphSiameseData.__setitem__('c1_size',
                                         torch.LongTensor([c1_size]))
            GraphSiameseData.__setitem__('c2_size',
                                         torch.LongTensor([c2_size]))
            GraphSiameseData.__setitem__(
                'side_effect', torch.Tensor(side_effect).view(1, -1))
            data_list.append(GraphSiameseData)

    if self.pre_filter is not None:
        data_list = [data for data in data_list if self.pre_filter(data)]
    if self.pre_transform is not None:
        data_list = [self.pre_transform(data) for data in data_list]

    # check this function
    data, slices = self.collate(data_list)
    torch.save((data, slices), self.processed_paths[0])