def _lookup(self, file_path):
    """Return the SMILES string for the ligand stored in an SDF file.

    Args:
        file_path: Path to the ligand SDF file, passed straight to
            ``fo.read_sdf_to_mol``.

    Returns:
        The result of ``Chem.MolToSmiles`` for the first molecule read.
    """
    molecules = fo.read_sdf_to_mol(
        file_path,
        sanitize=False,
        add_hs=False,
        remove_hs=True,
    )
    # The pdbbind ligand SDF file is assumed to hold exactly one ligand,
    # so only the first entry is considered.
    first_ligand = molecules[0]
    return Chem.MolToSmiles(first_ligand)
def __getitem__(self, index: int):
    """Load the example at position ``index`` from the file list.

    Args:
        index: Position in ``self._file_list``; must satisfy
            ``0 <= index < self._num_examples``.

    Returns:
        A dict with keys ``'atoms'``, ``'id'`` and ``'file_path'``, plus
        ``'bonds'`` when ``self._read_bonds`` is truthy. If
        ``self._transform`` is truthy, its return value for that dict is
        returned instead.

    Raises:
        IndexError: If ``index`` is outside the valid range.
    """
    in_range = 0 <= index < self._num_examples
    if not in_range:
        raise IndexError(index)

    path_str = str(self._file_list[index])

    # Read biopython structure
    structure = fo.read_sdf(
        path_str, sanitize=False, add_hs=False, remove_hs=False)

    # Base item, without bond information.
    atoms_df = fo.bp_to_df(structure)
    item = {
        'atoms': atoms_df,
        'id': structure.id,
        'file_path': path_str,
    }

    # Bonds are optional; they come from re-reading the file as molecules
    # and taking the first one.
    if self._read_bonds:
        molecules = fo.read_sdf_to_mol(
            path_str, sanitize=False, add_hs=False, remove_hs=False)
        item['bonds'] = fo.get_bonds_list_from_mol(molecules[0])

    if self._transform:
        return self._transform(item)
    return item
def __init__(self, csv_file, sdf_file, name='molecules'):
    """Load label columns from a CSV file and geometries from an SDF file.

    For every molecule that could be read, the atom count, atomic numbers,
    conformer coordinates, 1-based position in the SDF file and the
    matching CSV row are appended to parallel lists on the instance.

    Args:
        csv_file (str): CSV file with label data.
        sdf_file (str): SDF file with coordinates.
        name (str, opt.): Name of the dataset. Default: 'molecules'.
            Not used inside this constructor.
    """
    df = pd.read_csv(csv_file)
    column_keys = df.keys()
    self.raw_data = [df[key] for key in column_keys]
    self.raw_mols = dt.read_sdf_to_mol(sdf_file, sanitize=False)

    # Sanity check: both files must describe the same number of molecules.
    assert len(self.raw_mols) == len(self.raw_data[0])

    self.num_atoms = []
    self.charges = []
    self.positions = []
    self.index = []
    self.data = []
    self.data_keys = list(column_keys)

    for position, mol in enumerate(self.raw_mols):
        serial = position + 1  # molecules are reported/indexed 1-based
        if mol is None:
            print('Molecule', serial, 'could not be processed.')
            continue

        atomic_numbers = np.array(
            [atom.GetAtomicNum() for atom in mol.GetAtoms()])
        coordinates = dt.get_coordinates_of_conformer(mol)

        self.num_atoms.append(mol.GetNumAtoms())
        self.index.append(serial)
        self.charges.append(atomic_numbers)
        self.positions.append(coordinates)
        # One value per CSV column, taken from the row of this molecule.
        self.data.append([column[position] for column in self.raw_data])
def process(self):
    """Build one combined pocket/ligand graph per complex and save it.

    Labels are read from ``pdbbind_refined_set_labels.csv`` under
    ``self.root``. Every path in ``self.raw_paths`` that contains
    ``'_pocket'`` is paired with the ``'_ligand'`` path that has the same
    PDB code (as returned by ``fi.get_pdb_code``). The merged graph is
    saved to ``self.processed_dir`` as ``data_<i>.pt``, where ``i`` counts
    pockets in ``raw_paths`` order. Paths that are neither ligand nor
    pocket files are ignored.

    Pairing is done by PDB code rather than by relying on each ligand file
    appearing immediately before its pocket file, so a pocket can no longer
    be combined with the ligand of a different complex.

    Raises:
        ValueError: If a pocket file has no ligand file with the same
            PDB code in ``self.raw_paths``.
    """
    label_file = os.path.join(self.root, 'pdbbind_refined_set_labels.csv')
    label_df = pd.read_csv(label_file)

    # Index ligand files by PDB code up front (no file I/O here) so the
    # result does not depend on the ordering of raw_paths.
    ligand_paths = {
        fi.get_pdb_code(path): path
        for path in self.raw_paths
        if '_ligand' in path
    }

    i = 0
    for raw_path in self.raw_paths:
        # A path matching '_ligand' is never treated as a pocket, which
        # keeps the precedence of the ligand check over the pocket check.
        if '_ligand' in raw_path or '_pocket' not in raw_path:
            continue

        pdb_code = fi.get_pdb_code(raw_path)
        ligand_path = ligand_paths.get(pdb_code)
        if ligand_path is None:
            raise ValueError(
                'No ligand file found for pocket {!r} (PDB code {!r}).'
                .format(raw_path, pdb_code))

        y = torch.FloatTensor([get_label(pdb_code, label_df)])
        mol_graph = graph.mol_to_graph(
            dt.read_sdf_to_mol(ligand_path, add_hs=True)[0])
        prot_graph = graph.prot_df_to_graph(
            dt.bp_to_df(dt.read_any(raw_path, name=pdb_code)))

        node_feats, edge_index, edge_feats, pos = graph.combine_graphs(
            prot_graph, mol_graph, edges_between=True)
        data = Data(node_feats, edge_index, edge_feats, y=y, pos=pos)
        data.pdb = pdb_code
        torch.save(
            data,
            os.path.join(self.processed_dir, 'data_{}.pt'.format(i)))
        i += 1
def __init__(self, csv_file, sdf_file, name='molecules'):
    """Load label columns from a CSV file and molecules from an SDF file.

    Each molecule that could be read is converted with ``dt.mol_to_df``
    and stored together with its 1-based position in the SDF file and the
    matching CSV row.

    Args:
        csv_file (str): CSV file with label data.
        sdf_file (str): SDF file with coordinates.
        name (str, opt.): Name of the dataset. Default: 'molecules'.
            Not used inside this constructor.
    """
    df = pd.read_csv(csv_file)
    column_keys = df.keys()
    self.raw_data = [df[key] for key in column_keys]
    self.raw_mols = dt.read_sdf_to_mol(sdf_file, sanitize=False)

    # Sanity check: both files must describe the same number of molecules.
    assert len(self.raw_mols) == len(self.raw_data[0])

    self.index = []
    self.data = []
    self.data_keys = list(column_keys)
    self.mol_dfs = []

    for position, mol in enumerate(self.raw_mols):
        serial = position + 1  # molecules are reported/indexed 1-based
        if mol is None:
            print('Molecule', serial, 'could not be processed.')
            continue

        self.index.append(serial)
        # One value per CSV column, taken from the row of this molecule.
        self.data.append([column[position] for column in self.raw_data])

        # The 'mol_id' entry of the same CSV row is passed as the
        # structure argument of the conversion.
        mol_frame = dt.mol_to_df(
            mol, add_hs=False, structure=df['mol_id'][position])
        self.mol_dfs.append(mol_frame)
import numpy as np
import pandas as pd

import atom3d.util.formats as dt

# Input/output locations for the raw QM9 files.
in_dir_name = '../../data/qm9/raw'
csv_file = in_dir_name + '/gdb9.sdf.csv'
sdf_file = in_dir_name + '/gdb9.sdf'
out_file = in_dir_name + '/gdb9_with_cv_atom.csv'

df = pd.read_csv(csv_file)
raw_data = [df[col] for col in df.keys()]
raw_mols = dt.read_sdf_to_mol(sdf_file, sanitize=False)

# Collect the atom count of every molecule, keeping one entry per molecule
# so the result stays aligned with the rows of the CSV file.
raw_num_atoms = []
for im, m in enumerate(raw_mols):
    if m is None:
        print('Molecule', im + 1, 'could not be processed.')
        # Keep a NaN placeholder instead of skipping the entry: dropping
        # it would make the array shorter than the data frame and break
        # the column arithmetic below.
        new_numat = np.nan
    else:
        new_numat = m.GetNumAtoms()
    raw_num_atoms.append(new_numat)
raw_num_atoms = np.array(raw_num_atoms, dtype=float)

# Subtract a fixed per-atom contribution from 'cv'.
# NOTE(review): 2.981 is presumably a per-atom reference value for cv;
# confirm its origin and units. Rows of unprocessable molecules become NaN.
df['cv_atom'] = df['cv'] - (raw_num_atoms * 2.981)

print('cv:')
print(' mean:', np.mean(df['cv']))
print(' sdev:', np.std(df['cv']))
print('cv_atom:')
print(' mean:', np.mean(df['cv_atom']))