Example #1
0
def parse_ensemble(name, ensemble):
    """
    Build one atom DataFrame for a single structure or an ensemble of subunits.

    :param name: when ``ensemble`` is None, the input handed to
        ``dt.read_any``; otherwise the value written to the ``ensemble``
        column of the result
    :param ensemble: None, or a mapping from subunit name to either a
        ``pd.DataFrame`` or an input accepted by ``dt.read_any``

    :return df: for ``ensemble is None``, the DataFrame of the structure read
        from ``name`` (no ``subunit``/``ensemble`` columns are added);
        otherwise the concatenation of all subunit frames with ``subunit``
        and ``ensemble`` columns set
    :rtype df: pandas.DataFrame
    """
    if ensemble is None:
        return dt.bp_to_df(dt.read_any(name))

    frames = []
    for subunit, f in ensemble.items():
        if not isinstance(f, pd.DataFrame):
            f = dt.bp_to_df(dt.read_any(f))
        # assign() returns a new frame, so a DataFrame supplied by the caller
        # is left untouched and the same frame may safely be used for several
        # subunits without the labels overwriting each other.
        frames.append(f.assign(subunit=subunit))

    df = pd.concat(frames)
    df['ensemble'] = name
    return df
Example #2
0
def process_files(input_dir):
    """
    Process all protein (pdb) and ligand (sdf) files in input directory.

    :param input_dir: directory containing PDBBind data
    :type input_dir: str

    :return structure_dict: dictionary containing each structure, keyed by PDB code. Each PDB is a dict containing protein as Biopython object (key 'protein') and ligand as RDKit Mol object (key 'ligand'). A key is absent when no matching file was found for that PDB code.
    :rtype structure_dict: dict
    """
    structure_dict = {}

    for f in tqdm(fi.find_files(input_dir, 'pdb'), desc='pdb files'):
        # Every pdb file registers its code, but only '*_protein*' files
        # contribute a structure.
        entry = structure_dict.setdefault(fi.get_pdb_code(f), {})
        if '_protein' in f:
            entry['protein'] = ft.read_any(f)

    for f in tqdm(fi.find_files(input_dir, 'sdf'), desc='ligand files'):
        pdb_id = fi.get_pdb_code(f)
        # setdefault keeps a ligand whose PDB code had no pdb file from
        # raising KeyError; its entry then simply has no 'protein' key.
        structure_dict.setdefault(pdb_id, {})['ligand'] = get_ligand(f)

    return structure_dict
Example #3
0
 def __call__(self, x):
     """
     Attach the original structure and the label to a mutated-structure item.

     The item ``x`` is modified in place and returned: its 'id' loses the
     file extension, 'atoms' is moved to 'mutated_atoms', 'original_atoms'
     is read from the file that ``self.mut_orig_mapping`` lists as
     'original' for this id (resolved inside ``self.orig_file_dir``), and
     'label' is the string form of ``self.labels.loc[id]``.
     """
     stem, _ext = os.path.splitext(x['id'])
     x['id'] = stem

     original_path = os.path.join(
         self.orig_file_dir, self.mut_orig_mapping[stem]['original'])
     original_structure = fo.read_any(original_path)
     x['original_atoms'] = fo.bp_to_df(original_structure)

     x['mutated_atoms'] = x.pop('atoms')
     x['label'] = str(self.labels.loc[stem])
     return x
Example #4
0
    def __getitem__(self, index: int):
        """
        Load the example stored at position ``index``.

        Returns a dict with 'atoms' (DataFrame built from the file), 'id'
        (the file name) and 'file_path' (the path as a string), passed
        through ``self._transform`` when one is set.

        Raises IndexError when ``index`` is outside
        ``[0, self._num_examples)``; negative indices are not supported.
        """
        in_range = 0 <= index < self._num_examples
        if not in_range:
            raise IndexError(index)

        path = self._file_list[index]
        atoms = fo.bp_to_df(fo.read_any(path))
        item = {'atoms': atoms, 'id': path.name, 'file_path': str(path)}

        return self._transform(item) if self._transform else item
Example #5
0
def convert_to_hdf5(input_dir, label_file, hdf_file):
    """
    Collect structures, ligands and labels from ``input_dir`` into one HDF5 file.

    Writes the keys 'proteins', 'pockets', 'pdb_codes', 'ligands' and
    'labels' to ``hdf_file``. As a side effect, all sdf files found are
    combined into ``all_ligands.sdf`` inside ``input_dir``.

    :param input_dir: directory searched for 'cif' and 'sdf' files
    :param label_file: CSV file with a 'pdb' column used to look up labels
    :param hdf_file: path of the HDF5 file to write
    """
    proteins = []
    pockets = []
    pdb_codes = []
    for f in tqdm(fi.find_files(input_dir, 'cif'), desc='reading structures'):
        if '_protein' in f:
            pdb_codes.append(fi.get_pdb_code(f))
            proteins.append(dt.bp_to_df(dt.read_any(f)))
        elif '_pocket' in f:
            pockets.append(dt.bp_to_df(dt.read_any(f)))

    print('converting proteins...')
    protein_df = pd.concat(proteins)
    pocket_df = pd.concat(pockets)
    # Keep the plain list of codes separate from the DataFrame that is
    # stored: the list is needed below as 1-D index labels, which a
    # DataFrame cannot serve as.
    pdb_code_df = pd.DataFrame({'pdb': pdb_codes})

    protein_df.to_hdf(hdf_file, key='proteins')
    pocket_df.to_hdf(hdf_file, key='pockets')
    pdb_code_df.to_hdf(hdf_file, key='pdb_codes')

    print('converting ligands...')
    sdf_files = fi.find_files(input_dir, 'sdf')
    big_sdf = os.path.join(input_dir, 'all_ligands.sdf')
    dt.combine_sdfs(sdf_files, big_sdf)
    lig_df = PandasTools.LoadSDF(big_sdf, molColName='Mol')
    # NOTE(review): assumes the sdf files are listed in the same order as the
    # '_protein' cif files; a differing count raises a length mismatch here.
    lig_df.index = pdb_codes
    lig_df.to_hdf(hdf_file, key='ligands')

    print('converting labels...')
    label_df = pd.read_csv(label_file)
    label_df = label_df.set_index('pdb').reindex(pdb_codes)
    label_df.to_hdf(hdf_file, key='labels')
Example #6
0
 def process(self):
     """
     Build one combined pocket-ligand graph per complex and save it to disk.

     Walks ``self.raw_paths`` in order. A path containing '_ligand' is turned
     into a molecule graph and remembered. A path containing '_pocket' is
     turned into a protein graph, combined with the remembered molecule
     graph, and saved as ``data_<i>.pt`` in ``self.processed_dir``, where
     ``i`` counts saved files from 0. Any other path is skipped.

     NOTE(review): a '_pocket' path met before any '_ligand' path fails
     because ``mol_graph`` is still unbound, and each pocket is paired with
     whichever ligand was read most recently. Presumably ``raw_paths`` lists
     every complex's ligand right before its pocket; confirm.
     """
     labels = pd.read_csv(
         os.path.join(self.root, 'pdbbind_refined_set_labels.csv'))
     n_saved = 0
     for raw_path in self.raw_paths:
         pdb_code = fi.get_pdb_code(raw_path)
         # The label is looked up for every path, including skipped ones.
         y = torch.FloatTensor([get_label(pdb_code, labels)])

         if '_ligand' in raw_path:
             mols = dt.read_sdf_to_mol(raw_path, add_hs=True)
             mol_graph = graph.mol_to_graph(mols[0])
             continue
         if '_pocket' not in raw_path:
             continue

         pocket_df = dt.bp_to_df(dt.read_any(raw_path, name=pdb_code))
         prot_graph = graph.prot_df_to_graph(pocket_df)
         node_feats, edge_index, edge_feats, pos = graph.combine_graphs(
             prot_graph, mol_graph, edges_between=True)

         data = Data(node_feats, edge_index, edge_feats, y=y, pos=pos)
         data.pdb = pdb_code
         out_path = os.path.join(
             self.processed_dir, 'data_{}.pt'.format(n_saved))
         torch.save(data, out_path)
         n_saved += 1
Example #7
0
 def _lookup(self, file_path):
     """
     Return ``seq.get_chain_sequences`` applied to the structure in ``file_path``.

     The file is read with ``fo.read_any`` and converted to a DataFrame with
     ``fo.bp_to_df`` before the sequences are extracted.
     """
     structure = fo.read_any(file_path)
     atoms_df = fo.bp_to_df(structure)
     return seq.get_chain_sequences(atoms_df)