def _load(self, mol_to_graph, node_featurizer, edge_featurizer):
    """Populate ``self.graphs``, ``self.labels`` and ``self.smiles``.

    When ``self.load`` is truthy, the cached graphs/labels
    (``<mode>_graphs.bin``) and SMILES strings (``<mode>_smiles.txt``) are
    read back from ``self.file_dir``.  Otherwise ``<mode>.sdf`` and
    ``<mode>_target.csv`` are featurized molecule by molecule and the result
    is written to those same cache files.

    In both branches ``self.labels`` ends up as a single stacked tensor, and
    ``self.set_mean_and_std()`` is called once loading has finished.

    Parameters
    ----------
    mol_to_graph : callable
        Called as ``mol_to_graph(mol, node_featurizer=..., edge_featurizer=...)``
        for each molecule read from the SDF file; the result is appended to
        ``self.graphs``.
    node_featurizer : object
        Forwarded unchanged to ``mol_to_graph``.
    edge_featurizer : object
        Forwarded unchanged to ``mol_to_graph``.
    """
    graph_file = osp.join(self.file_dir,
                          "{}_graphs.bin".format(self.mode))
    smiles_file = osp.join(self.file_dir,
                           "{}_smiles.txt".format(self.mode))

    if self.load:
        self.graphs, label_dict = load_graphs(graph_file)
        self.labels = label_dict['labels']
        with open(smiles_file, 'r') as f:
            self.smiles = [line.strip() for line in f]
    else:
        print('Start preprocessing dataset...')
        property_columns = ['property_{:d}'.format(x) for x in range(12)]
        target_file = pathlib.Path(self.file_dir,
                                   "{}_target.csv".format(self.mode))
        self.target = pd.read_csv(target_file,
                                  index_col=0,
                                  usecols=['gdb_idx'] + property_columns)
        # Re-select so the columns follow property_0..property_11 order.
        self.target = self.target[property_columns]

        self.graphs, self.labels, self.smiles = [], [], []
        supp = Chem.SDMolSupplier(
            osp.join(self.file_dir, self.mode + ".sdf"))
        dataset_size = len(self.target)

        # NOTE(review): zip() stops at the shorter input, so an SDF/CSV
        # length mismatch is silently truncated; the supplier may also yield
        # entries mol_to_graph cannot handle -- confirm the inputs are clean.
        pairs = zip(supp, self.target.iterrows())
        for cnt, (mol, (_, row)) in enumerate(pairs, start=1):
            print('Processing molecule {:d}/{:d}'.format(
                cnt, dataset_size))
            graph = mol_to_graph(mol,
                                 node_featurizer=node_featurizer,
                                 edge_featurizer=edge_featurizer)
            smiles = Chem.MolToSmiles(mol)
            self.smiles.append(smiles)
            self.graphs.append(graph)
            self.labels.append(
                F.tensor(np.array(row.tolist()).astype(np.float32)))

        # Stack once and keep the stacked tensor, so self.labels has the
        # same form here as it has after loading from the cache.
        self.labels = F.stack(self.labels, dim=0)
        save_graphs(graph_file, self.graphs,
                    labels={'labels': self.labels})
        with open(smiles_file, 'w') as f:
            f.writelines(s + '\n' for s in self.smiles)

    self.set_mean_and_std()
    print(len(self.graphs), "loaded!")
def batcher_dev(batch):
    """Collate a list of datapoints into a single batch object.

    Parameters
    ----------
    batch : list
        Sequence of datapoints where ``batch[i][0]`` is the DGLGraph and
        ``batch[i][1]`` is the label of the i-th datapoint.

    Returns
    -------
    AlchemyBatcher
        Holds the batched graph (``graph``) and the labels stacked along
        dimension 0 (``label``).
    """
    # Transpose [(graph, label), ...] into (graphs...), (labels...).
    graph_seq, label_seq = zip(*batch)
    return AlchemyBatcher(
        graph=dgl.batch(graph_seq),
        label=F.stack(label_seq, 0),
    )
def _load_data(self):
    """Load or build the graphs, labels and masks for ``self.split``.

    If ``self.load`` and ``self.preprocessed`` are both truthy, the cached
    ``<split>.bin`` and ``<split>_smiles.txt`` files in ``self.data_path``
    are read.  Otherwise ``<split>.csv`` is parsed, every SMILES string is
    canonicalized and converted to a graph, and the cache files are written.
    Rows whose processing raises are reported with ``print`` and skipped.

    Sets the following attributes:

    * ``self.data_list``   -- list of graphs
    * ``self.smiles_list`` -- NumPy array of canonical SMILES strings
    * ``self.label_list``  -- labels restricted to ``self.tasks``
    * ``self.mask_list``   -- 1 where a label was present, 0 where it was NaN
    * ``self.length_list`` -- int64 tensor with the length of every
      canonical SMILES string (set in both the cached and fresh branches)

    When exactly one task is selected, datapoints without a label for that
    task are dropped from ``data_list``, ``smiles_list`` and ``label_list``,
    and ``mask_list`` becomes all ones.
    """
    graph_file = osp.join(self.data_path, f"{self.split}.bin")
    smiles_file = osp.join(self.data_path, f'{self.split}_smiles.txt')

    if self.load and self.preprocessed:
        self.data_list, label_dict = load_graphs(graph_file)
        all_label_list = label_dict['labels']
        all_mask_list = label_dict['masks']
        with open(smiles_file, 'r') as f:
            smiles_list = [line.strip() for line in f]
    else:
        print('preprocessing data ...')
        data_file = pathlib.Path(self.data_path, f"{self.split}.csv")
        all_data = pd.read_csv(data_file,
                               usecols=['smiles'] + self.all_tasks)
        smiless = all_data['smiles'].values.tolist()
        targets = all_data[self.all_tasks]

        self.data_list = []
        all_label_list, all_mask_list, smiles_list = [], [], []
        for smiles, (_, row) in zip(smiless, targets.iterrows()):
            try:
                mol = Chem.MolFromSmiles(smiles)
                cano_smiles = Chem.MolToSmiles(mol)
                data = smiles_to_bigraph(
                    cano_smiles,
                    node_featurizer=get_node_featurizer(),
                    edge_featurizer=None)
                values = np.array(row.tolist())
                missing = np.isnan(values)
                # mask: 1 for observed labels, 0 for missing (NaN) ones.
                mask = np.ones_like(values)
                mask[missing] = 0
                mask = F.tensor(mask.astype(np.float32))
                # Missing labels are stored as 0; the mask marks them.
                values[missing] = 0
                label = F.tensor(values.astype(np.float32))
            except Exception as e:
                # Best effort: report the failure and skip this row.
                print(e)
            else:
                self.data_list.append(data)
                all_label_list.append(label)
                all_mask_list.append(mask)
                smiles_list.append(cano_smiles)

        all_label_list = F.stack(all_label_list, dim=0)
        all_mask_list = F.stack(all_mask_list, dim=0)
        save_graphs(graph_file,
                    self.data_list,
                    labels={
                        'labels': all_label_list,
                        'masks': all_mask_list
                    })
        with open(smiles_file, 'w') as f:
            f.writelines(s + '\n' for s in smiles_list)

    # Derive the lengths from the canonical SMILES so the attribute exists
    # after a cache load as well as after fresh preprocessing.
    # NOTE(review): as before, length_list is NOT filtered by the
    # single-task selection below -- confirm that is what consumers expect.
    self.length_list = torch.tensor([len(s) for s in smiles_list],
                                    dtype=torch.int64)

    # Keep only the columns that belong to the selected tasks.
    task_columns = [self.all_tasks.index(task) for task in self.tasks]
    self.smiles_list = np.array(smiles_list)
    self.label_list = torch.stack(
        [all_label_list[:, col] for col in task_columns], dim=-1)
    self.mask_list = torch.stack(
        [all_mask_list[:, col] for col in task_columns], dim=-1)

    if len(self.tasks) == 1:
        remain = (self.mask_list == 1.0).squeeze(-1)
        keep = remain.numpy()
        self.label_list = self.label_list[remain]
        self.smiles_list = self.smiles_list[keep]
        # Filter the graph list directly instead of round-tripping the
        # graph objects through a NumPy array.
        self.data_list = [
            graph for graph, kept in zip(self.data_list, keep) if kept
        ]
        self.mask_list = torch.ones_like(self.label_list)
def alchemy_nodes(mol):
    """Compute the input features of every atom in a molecule.

    Atom indices are preserved: row ``u`` of each returned tensor belongs to
    atom ``u`` of ``mol``.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule object; it must carry exactly one conformer.

    Returns
    -------
    atom_feats_dict : dict
        ``'node_type'`` holds the atomic numbers as an int64 tensor and
        ``'n_feat'`` holds the stacked float32 feature vectors.
    """
    feature_def = osp.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
    factory = ChemicalFeatures.BuildFeatureFactory(feature_def)
    chem_feats = factory.GetFeaturesForMol(mol)
    conformers = mol.GetConformers()
    assert len(conformers) == 1

    # Atoms that take part in a 'Donor' / 'Acceptor' feature are flagged
    # with 1; every other atom reads back the default 0.
    donor_flags = defaultdict(int)
    acceptor_flags = defaultdict(int)
    flags_by_family = {'Donor': donor_flags, 'Acceptor': acceptor_flags}
    for feat in chem_feats:
        flags = flags_by_family.get(feat.GetFamily())
        if flags is None:
            continue
        for atom_idx in feat.GetAtomIds():
            flags[atom_idx] = 1

    hybridizations = [
        Chem.rdchem.HybridizationType.SP,
        Chem.rdchem.HybridizationType.SP2,
        Chem.rdchem.HybridizationType.SP3
    ]
    node_types = []
    feat_rows = []
    for atom_idx in range(mol.GetNumAtoms()):
        atom = mol.GetAtomWithIdx(atom_idx)
        atomic_num = atom.GetAtomicNum()
        total_hs = atom.GetTotalNumHs()
        node_types.append(atomic_num)
        # Layout: element one-hot, atomic number, acceptor flag, donor flag,
        # aromaticity, hybridization one-hot, total hydrogen count.
        row = [
            *atom_type_one_hot(atom, ['H', 'C', 'N', 'O', 'F', 'S', 'Cl']),
            atomic_num,
            acceptor_flags[atom_idx],
            donor_flags[atom_idx],
            *atom_is_aromatic(atom),
            *atom_hybridization_one_hot(atom, hybridizations),
            total_hs,
        ]
        feat_rows.append(F.tensor(np.array(row).astype(np.float32)))

    n_feat = F.stack(feat_rows, dim=0)
    node_type = F.tensor(np.array(node_types).astype(np.int64))

    atom_feats_dict = defaultdict(list)
    atom_feats_dict['node_type'] = node_type
    atom_feats_dict['n_feat'] = n_feat
    return atom_feats_dict