def __getitem__(self, index: int):
    """Load the SDF example stored at position ``index``.

    Args:
        index: Position in ``self._file_list``; must satisfy
            ``0 <= index < self._num_examples``.

    Returns:
        A dict with keys ``'atoms'``, ``'id'`` and ``'file_path'``, plus
        ``'bonds'`` when ``self._read_bonds`` is truthy. If
        ``self._transform`` is set, its return value is returned instead.

    Raises:
        IndexError: If ``index`` is out of range.
    """
    in_range = 0 <= index < self._num_examples
    if not in_range:
        raise IndexError(index)

    sdf_path = self._file_list[index]

    # Read the biopython structure without sanitizing or touching hydrogens.
    bp_structure = fo.read_sdf(
        str(sdf_path), sanitize=False, add_hs=False, remove_hs=False)

    # Base item: atoms only, no bonds yet.
    item = {}
    item['atoms'] = fo.bp_to_df(bp_structure)
    item['id'] = bp_structure.id
    item['file_path'] = str(sdf_path)

    # Bonds come from a second read of the same file as a molecule list;
    # only the first molecule is used.
    if self._read_bonds:
        molecules = fo.read_sdf_to_mol(
            str(sdf_path), sanitize=False, add_hs=False, remove_hs=False)
        item['bonds'] = fo.get_bonds_list_from_mol(molecules[0])

    if self._transform:
        return self._transform(item)
    return item
def parse_ensemble(name, ensemble):
    """Build a single atoms dataframe for an ensemble.

    Args:
        name: Ensemble identifier, written to the ``'ensemble'`` column.
            When ``ensemble`` is None it is also what gets passed to
            ``dt.read_any`` to load the structure.
        ensemble: None, or a mapping from subunit name to either a
            ``pd.DataFrame`` or something ``dt.read_any`` can load.

    Returns:
        A dataframe with an ``'ensemble'`` column set to ``name``. When
        ``ensemble`` is a mapping, the per-subunit frames are concatenated
        (original row indices are kept) and carry a ``'subunit'`` column.
    """
    if ensemble is None:
        df = dt.bp_to_df(dt.read_any(name))
    else:
        frames = []
        for subunit, f in ensemble.items():
            if isinstance(f, pd.DataFrame):
                # Copy so the caller's dataframe does not silently gain a
                # 'subunit' column as a side effect of this call.
                curr = f.copy()
            else:
                curr = dt.bp_to_df(dt.read_any(f))
            curr['subunit'] = subunit
            frames.append(curr)
        df = pd.concat(frames)
    df['ensemble'] = name
    return df
def __call__(self, x):
    """Augment a mutated-structure item with its original structure and label.

    The item is updated in place and also returned: ``'id'`` loses its file
    extension, ``'atoms'`` is moved to ``'mutated_atoms'``,
    ``'original_atoms'`` is loaded from ``self.orig_file_dir`` and
    ``'label'`` is the string form of ``self.labels.loc[<id>]``.

    Args:
        x: Item dict with at least the keys ``'id'`` and ``'atoms'``.

    Returns:
        The same dict ``x`` after modification.
    """
    stem, _extension = os.path.splitext(x['id'])
    x['id'] = stem

    # Resolve the file of the original structure for this entry.
    original_name = self.mut_orig_mapping[stem]['original']
    original_path = os.path.join(self.orig_file_dir, original_name)
    original_structure = fo.read_any(original_path)

    x['original_atoms'] = fo.bp_to_df(original_structure)
    x['mutated_atoms'] = x.pop('atoms')
    x['label'] = str(self.labels.loc[stem])
    return x
def _pose_to_df(self, pose):
    """
    Turn a pyrosetta pose into the pandas dataframe representation.

    The pose is dumped as PDB text into an in-memory stream, parsed back
    with Biopython's ``PDBParser`` and converted with ``fo.bp_to_df``.
    """
    pose_name = pose.pdb_info().name()

    # Dump the pose into a rosetta string stream instead of a file on disk.
    rosetta_stream = self.pyrosetta.rosetta.std.ostringstream()
    pose.dump_pdb(rosetta_stream)
    pdb_text = io.StringIO(rosetta_stream.str())

    structure = Bio.PDB.PDBParser(QUIET=True).get_structure(pose_name, pdb_text)
    return fo.bp_to_df(structure)
def __getitem__(self, index: int):
    """Load the structure file stored at position ``index``.

    Args:
        index: Position in ``self._file_list``; must satisfy
            ``0 <= index < self._num_examples``.

    Returns:
        A dict with keys ``'atoms'`` (dataframe from ``fo.bp_to_df``),
        ``'id'`` (the ``.name`` of the file path) and ``'file_path'``.
        If ``self._transform`` is set, its return value is returned instead.

    Raises:
        IndexError: If ``index`` is out of range.
    """
    in_range = 0 <= index < self._num_examples
    if not in_range:
        raise IndexError(index)

    path = self._file_list[index]
    atoms = fo.bp_to_df(fo.read_any(path))
    item = {'atoms': atoms, 'id': path.name, 'file_path': str(path)}

    if self._transform:
        return self._transform(item)
    return item
def convert_to_hdf5(input_dir, label_file, hdf_file):
    """Collect structures, ligands and labels into one HDF5 file.

    Writes five keys to ``hdf_file``: ``'proteins'``, ``'pockets'``,
    ``'pdb_codes'``, ``'ligands'`` and ``'labels'``. As a side effect, all
    SDF files found are combined into ``<input_dir>/all_ligands.sdf``.

    Args:
        input_dir: Directory searched for ``cif`` and ``sdf`` files.
        label_file: CSV file with a ``'pdb'`` column.
        hdf_file: Path of the HDF5 file to write.
    """
    cif_files = fi.find_files(input_dir, 'cif')
    proteins = []
    pockets = []
    pdb_codes = []  # one entry per protein file, in the order read

    for f in tqdm(cif_files, desc='reading structures'):
        pdb_code = fi.get_pdb_code(f)
        if '_protein' in f:
            pdb_codes.append(pdb_code)
            proteins.append(dt.bp_to_df(dt.read_any(f)))
        elif '_pocket' in f:
            pockets.append(dt.bp_to_df(dt.read_any(f)))

    print('converting proteins...')
    protein_df = pd.concat(proteins)
    pocket_df = pd.concat(pockets)
    # Keep the plain list under its own name: it is needed below as 1-D
    # index labels, which a one-column DataFrame cannot serve as.
    pdb_codes_df = pd.DataFrame({'pdb': pdb_codes})
    protein_df.to_hdf(hdf_file, key='proteins')
    pocket_df.to_hdf(hdf_file, key='pockets')
    pdb_codes_df.to_hdf(hdf_file, key='pdb_codes')

    print('converting ligands...')
    sdf_files = fi.find_files(input_dir, 'sdf')
    big_sdf = os.path.join(input_dir, 'all_ligands.sdf')
    dt.combine_sdfs(sdf_files, big_sdf)
    lig_df = PandasTools.LoadSDF(big_sdf, molColName='Mol')
    # NOTE(review): this assumes the ligands come out in the same order and
    # number as the protein files above — confirm against find_files.
    lig_df.index = pdb_codes
    lig_df.to_hdf(hdf_file, key='ligands')

    print('converting labels...')
    label_df = pd.read_csv(label_file)
    label_df = label_df.set_index('pdb').reindex(pdb_codes)
    label_df.to_hdf(hdf_file, key='labels')
def __getitem__(self, index: int):
    """Load the XYZ example stored at position ``index``.

    Args:
        index: Position in ``self._file_list``; must satisfy
            ``0 <= index < self._num_examples``.

    Returns:
        A dict with keys ``'atoms'``, ``'id'`` and ``'file_path'``. When
        ``self._gdb`` is truthy it also carries ``'labels'`` and ``'freq'``.
        If ``self._transform`` is set, its return value is returned instead.

    Raises:
        IndexError: If ``index`` is out of range.
    """
    in_range = 0 <= index < self._num_examples
    if not in_range:
        raise IndexError(index)

    path = self._file_list[index]
    parsed = fo.read_xyz(path, gdb=self._gdb)
    if self._gdb:
        # In gdb mode the reader returns a 5-tuple; the last two entries
        # are not used here.
        structure, data, freq, _smiles, _inchi = parsed
    else:
        structure = parsed

    atoms = fo.bp_to_df(structure)
    item = {'atoms': atoms, 'id': structure.id, 'file_path': str(path)}

    if self._gdb:
        item['labels'] = self.data_with_subtracted_thchem_energy(data, atoms)
        item['freq'] = freq

    if self._transform:
        return self._transform(item)
    return item
def process(self):
    """Build one combined pocket/ligand graph per pocket file and save it.

    Labels are read from ``pdbbind_refined_set_labels.csv`` under
    ``self.root``. Each pocket path in ``self.raw_paths`` yields one
    ``Data`` object saved as ``data_<n>.pt`` in ``self.processed_dir``,
    numbered from 0 in the order the pocket files are encountered.
    """
    labels_csv = os.path.join(self.root, 'pdbbind_refined_set_labels.csv')
    labels = pd.read_csv(labels_csv)

    n_saved = 0
    for raw_path in self.raw_paths:
        pdb_code = fi.get_pdb_code(raw_path)
        y = torch.FloatTensor([get_label(pdb_code, labels)])

        if '_ligand' in raw_path:
            # Remember the ligand graph; it is combined with the next
            # pocket that shows up in the loop.
            ligand_mol = dt.read_sdf_to_mol(raw_path, add_hs=True)[0]
            mol_graph = graph.mol_to_graph(ligand_mol)
            continue

        if '_pocket' not in raw_path:
            continue

        # NOTE(review): relies on the matching ligand path having been
        # visited before this pocket path — confirm the order of raw_paths.
        pocket_df = dt.bp_to_df(dt.read_any(raw_path, name=pdb_code))
        prot_graph = graph.prot_df_to_graph(pocket_df)
        node_feats, edge_index, edge_feats, pos = graph.combine_graphs(
            prot_graph, mol_graph, edges_between=True)

        data = Data(node_feats, edge_index, edge_feats, y=y, pos=pos)
        data.pdb = pdb_code

        out_path = os.path.join(
            self.processed_dir, 'data_{}.pt'.format(n_saved))
        torch.save(data, out_path)
        n_saved += 1
def _lookup(self, file_path):
    """Return ``seq.get_chain_sequences`` for the structure at ``file_path``."""
    structure = fo.read_any(file_path)
    atoms = fo.bp_to_df(structure)
    return seq.get_chain_sequences(atoms)