def parse_ensemble(name, ensemble):
    """Parse an ensemble into a single DataFrame, tagging each row with its subunit and ensemble name."""
    if ensemble is None:
        # Single-structure case: `name` is the file to read.
        df = dt.bp_to_df(dt.read_any(name))
    else:
        df = []
        for subunit, f in ensemble.items():
            if isinstance(f, pd.DataFrame):
                curr = f
            else:
                curr = dt.bp_to_df(dt.read_any(f))
            curr['subunit'] = subunit
            df.append(curr)
        df = pd.concat(df)
    df['ensemble'] = name
    return df
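# Usage sketch (illustrative, not part of the original module): `ensemble`
# maps subunit names to structure files or pre-parsed DataFrames; the paths
# below are hypothetical.
if __name__ == '__main__':
    example_ensemble = {
        'subunit_A': '/data/complexes/1abc_A.pdb',  # hypothetical path
        'subunit_B': '/data/complexes/1abc_B.pdb',  # hypothetical path
    }
    df = parse_ensemble('1abc', example_ensemble)
    # Every row now records which subunit it came from and the ensemble name.
    print(df[['ensemble', 'subunit']].drop_duplicates())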
def process_files(input_dir):
    """
    Process all protein (pdb) and ligand (sdf) files in the input directory.

    :param input_dir: directory containing PDBBind data
    :type input_dir: str
    :return structure_dict: dictionary containing each structure, keyed by PDB
        code. Each entry is a dict containing the protein as a Biopython object
        and the ligand as an RDKit Mol object.
    :rtype structure_dict: dict
    """
    structure_dict = {}
    pdb_files = fi.find_files(input_dir, 'pdb')

    for f in tqdm(pdb_files, desc='pdb files'):
        pdb_id = fi.get_pdb_code(f)
        if pdb_id not in structure_dict:
            structure_dict[pdb_id] = {}
        if '_protein' in f:
            prot = ft.read_any(f)
            structure_dict[pdb_id]['protein'] = prot

    lig_files = fi.find_files(input_dir, 'sdf')
    for f in tqdm(lig_files, desc='ligand files'):
        pdb_id = fi.get_pdb_code(f)
        structure_dict[pdb_id]['ligand'] = get_ligand(f)

    return structure_dict
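# Usage sketch (illustrative; the directory path is hypothetical and assumes
# the imports used by process_files are in scope):
if __name__ == '__main__':
    structures = process_files('/data/pdbbind/refined-set')
    for pdb_id, entry in structures.items():
        print(pdb_id, 'protein' in entry, 'ligand' in entry)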
def __call__(self, x):
    # Strip the file extension from the id and look up the original
    # (unmutated) structure that corresponds to this mutated structure.
    name = os.path.splitext(x['id'])[0]
    x['id'] = name
    orig_file = self.mut_orig_mapping[name]['original']
    x['original_atoms'] = fo.bp_to_df(
        fo.read_any(os.path.join(self.orig_file_dir, orig_file)))
    x['mutated_atoms'] = x.pop('atoms')
    x['label'] = str(self.labels.loc[name])
    return x
def __getitem__(self, index: int):
    if not 0 <= index < self._num_examples:
        raise IndexError(index)

    file_path = self._file_list[index]

    item = {
        'atoms': fo.bp_to_df(fo.read_any(file_path)),
        'id': file_path.name,
        'file_path': str(file_path),
    }
    if self._transform:
        item = self._transform(item)
    return item
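# Usage sketch (the dataset class that owns this __getitem__ is not shown
# here, so the constructor below is a placeholder):
#
#     dataset = StructureDataset('/data/structures', transform=my_transform)
#     item = dataset[0]
#     item['atoms']      # structure as a DataFrame
#     item['id']         # file name
#
# A transform such as the __call__ defined above can be passed in to
# post-process each item before it is returned.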
def convert_to_hdf5(input_dir, label_file, hdf_file):
    cif_files = fi.find_files(input_dir, 'cif')
    proteins = []
    pockets = []
    pdb_codes = []
    for f in tqdm(cif_files, desc='reading structures'):
        pdb_code = fi.get_pdb_code(f)
        if '_protein' in f:
            pdb_codes.append(pdb_code)
            df = dt.bp_to_df(dt.read_any(f))
            proteins.append(df)
        elif '_pocket' in f:
            df = dt.bp_to_df(dt.read_any(f))
            pockets.append(df)

    print('converting proteins...')
    protein_df = pd.concat(proteins)
    pocket_df = pd.concat(pockets)
    # Keep the plain list of codes for indexing below; store a DataFrame copy
    # in the HDF file.
    pdb_codes_df = pd.DataFrame({'pdb': pdb_codes})
    protein_df.to_hdf(hdf_file, 'proteins')
    pocket_df.to_hdf(hdf_file, 'pockets')
    pdb_codes_df.to_hdf(hdf_file, 'pdb_codes')

    print('converting ligands...')
    sdf_files = fi.find_files(input_dir, 'sdf')
    big_sdf = os.path.join(input_dir, 'all_ligands.sdf')
    dt.combine_sdfs(sdf_files, big_sdf)
    lig_df = PandasTools.LoadSDF(big_sdf, molColName='Mol')
    # Assumes the combined SDF preserves the same PDB-code order as the proteins.
    lig_df.index = pdb_codes
    lig_df.to_hdf(hdf_file, 'ligands')

    print('converting labels...')
    label_df = pd.read_csv(label_file)
    label_df = label_df.set_index('pdb').reindex(pdb_codes)
    label_df.to_hdf(hdf_file, 'labels')
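# Usage sketch (illustrative; all three paths are hypothetical):
if __name__ == '__main__':
    convert_to_hdf5(
        input_dir='/data/pdbbind/structures',
        label_file='/data/pdbbind/pdbbind_refined_set_labels.csv',
        hdf_file='/data/pdbbind/pdbbind.h5',
    )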
def process(self):
    label_file = os.path.join(self.root, 'pdbbind_refined_set_labels.csv')
    label_df = pd.read_csv(label_file)
    i = 0
    for raw_path in self.raw_paths:
        pdb_code = fi.get_pdb_code(raw_path)
        y = torch.FloatTensor([get_label(pdb_code, label_df)])
        if '_ligand' in raw_path:
            # The ligand graph is built first and held until the matching
            # pocket file (which follows it in self.raw_paths) is reached.
            mol_graph = graph.mol_to_graph(
                dt.read_sdf_to_mol(raw_path, add_hs=True)[0])
        elif '_pocket' in raw_path:
            prot_graph = graph.prot_df_to_graph(
                dt.bp_to_df(dt.read_any(raw_path, name=pdb_code)))
            node_feats, edge_index, edge_feats, pos = graph.combine_graphs(
                prot_graph, mol_graph, edges_between=True)
            data = Data(node_feats, edge_index, edge_feats, y=y, pos=pos)
            data.pdb = pdb_code
            torch.save(
                data,
                os.path.join(self.processed_dir, 'data_{}.pt'.format(i)))
            i += 1
        else:
            continue
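# Usage sketch (illustrative): once process() has run, each saved graph can be
# reloaded directly; the processed directory below is hypothetical.
#
#     data = torch.load('/data/pdbbind/processed/data_0.pt')
#     data.pdb, data.y, data.num_nodes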
def _lookup(self, file_path):
    return seq.get_chain_sequences(fo.bp_to_df(fo.read_any(file_path)))