def test_from_text_raises(self, filepath, format):
    """
    Check that ``DataFrame.from_text`` raises a ValueError for invalid input.

    Note: The test receives a file path; the file's content is read into a
    string, and that string is the actual input of the class method under test.
    """

    # Read the whole file into memory to simulate text input
    with open(filepath, "r") as file_handle:
        content = file_handle.read()

    with pytest.raises(ValueError):
        DataFrame.from_text(content, format)
def _set_data(self):
    """
    Build the atom table of the pocket residues from the stored text.

    Only atoms whose residue ID matches one of the pocket's residue IDs are
    kept; the residue IDs of the remaining atoms are cast to integers.

    Returns
    -------
    pd.DataFrame
        Structural protein data with the following mandatory columns:
        "residue.id", "atom.name", "atom.x", "atom.y", "atom.z".
    """

    # Parse all atoms; at this point the "residue.id" column holds strings
    atoms = DataFrame.from_text(self._text, self._extension)

    # Compare as strings to match the parsed column's type.
    # Falsy entries in self._residue_ids (e.g. None) are skipped.
    wanted_ids = [str(entry) for entry in self._residue_ids if entry]

    # Keep only the pocket's atoms, then switch the residue IDs to integers
    is_pocket_atom = atoms["residue.id"].isin(wanted_ids)
    pocket_atoms = atoms[is_pocket_atom]
    return pocket_atoms.astype({"residue.id": "int32"})
def from_file(cls, filepath, residue_ids, name="", residue_labels=None):
    """
    Create a Pocket object from a structural protein file.

    Parameters
    ----------
    filepath : str or pathlib.Path
        File path to structural protein data.
    residue_ids : list of str
        Pocket residue IDs.
    name : str
        Name of protein (default: empty string).
    residue_labels : None or list of str
        Pocket residue labels. Set to None by default.

    Returns
    -------
    opencadd.structure.pocket.Pocket
        Pocket object.
    """

    instance = cls()
    instance.name = name
    instance._filepath = filepath
    instance._data = DataFrame.from_file(filepath)

    # Residue IDs and labels are normalized together before being stored
    (
        instance._residue_ids,
        instance._residue_labels,
    ) = _format_residue_ids_and_labels(residue_ids, residue_labels)

    return instance
def test_from_file(self, filepath, verbose):
    """
    Test if input produces a DataFrame.

    Parameters
    ----------
    filepath : pathlib.Path or str
        Path to the structure file that is loaded.
    verbose : bool
        Forwarded as second argument to ``DataFrame.from_file``.
    """

    dataframe = DataFrame.from_file(filepath, verbose)

    # A bare ``isinstance(...)`` expression discards its result, so the test
    # could never fail; the check must be asserted.
    assert isinstance(dataframe, pd.DataFrame)
def to_dataframe(self, structure_klifs_id_or_filepath, entity="complex", extension="mol2"):  # pylint: disable=W0221
    """
    Load a structure file as DataFrame and add KLIFS residue IDs to it.

    Parameters
    ----------
    structure_klifs_id_or_filepath : int or str or pathlib.Path
        Structure KLIFS ID or file path; resolved to a file path via
        ``self._to_filepath``.
    entity : str
        Structural entity, forwarded to ``self._to_filepath``
        (default: "complex").
    extension : str
        File extension, forwarded to ``self._to_filepath`` (default: "mol2").

    Returns
    -------
    pandas.DataFrame
        Atom data as returned by ``self._add_residue_klifs_ids``.
    """

    resolved_path = self._to_filepath(structure_klifs_id_or_filepath, entity, extension)
    atoms = DataFrame.from_file(resolved_path)
    return self._add_residue_klifs_ids(atoms, resolved_path)
def by_structure_klifs_id(self, structure_klifs_id, extension="mol2"):  # pylint: disable=W0221
    """
    Get the pocket of a structure as a table that maps residue IDs to KLIFS
    positions.

    Parameters
    ----------
    structure_klifs_id : int
        Structure KLIFS ID.
    extension : str
        File extension of the pocket file to be read (default: "mol2").

    Returns
    -------
    pandas.DataFrame
        Pocket table after standardization (``self._standardize_dataframe``)
        and addition of KLIFS region details
        (``self._add_klifs_region_details``). KLIFS positions 1-85 without a
        residue in the pocket file carry "_" as placeholder.
    """

    # Look up the structure's metadata in the local database
    local_structures = Structures(self._database, self._path_to_klifs_download)
    structure = local_structures.by_structure_klifs_id(structure_klifs_id).squeeze()

    # KLIFS positions are counted from 1; gap positions ("_") are left out
    present_klifs_ids = []
    for position, residue in enumerate(structure["structure.pocket"], 1):
        if residue != "_":
            present_klifs_ids.append(position)

    # Read the pocket atoms from file
    pocket_file = (
        self._path_to_klifs_download / structure["structure.filepath"] / f"pocket.{extension}"
    )
    atoms = DataFrame.from_file(pocket_file)

    # Count the atoms of each residue
    # Note: sort=False is important, otherwise negative residue IDs are sorted to the top
    atoms_per_residue = atoms.groupby(["residue.name", "residue.id"], sort=False).size()

    # Repeat each KLIFS position once per atom of its residue
    klifs_id_per_atom = []
    for klifs_id, atom_count in zip(present_klifs_ids, atoms_per_residue):
        klifs_id_per_atom += [klifs_id] * atom_count

    # Attach the KLIFS positions to the atoms and reduce to one row per residue
    atoms["residue.klifs_id"] = klifs_id_per_atom
    residue_map = atoms[["residue.id", "residue.klifs_id"]].drop_duplicates()

    # Bring in the KLIFS positions missing in the pocket and mark them with "_"
    all_positions = pd.Series(range(1, 86), name="residue.klifs_id").to_frame()
    pockets = all_positions.merge(residue_map, on="residue.klifs_id", how="left")
    pockets.fillna("_", inplace=True)

    # Attach the KLIFS regions
    pockets = pockets.merge(POCKET_KLIFS_REGIONS, on="residue.klifs_id", how="left")
    pockets = pockets.astype({"residue.klifs_id": "Int64"})

    # Standardize DataFrame
    pockets = self._standardize_dataframe(pockets, DATAFRAME_COLUMNS["pockets"])

    # Add KLIFS region and color TODO not so nice to have this after standardization
    return self._add_klifs_region_details(pockets)
def test_from_pdb_file(self, pdb_file):
    """
    Check that a pdb file is loaded into a correctly formatted DataFrame.

    Parameters
    ----------
    pdb_file : pathlib.Path or str
        Path to pdb file.
    """

    parsed = DataFrame._from_pdb_file(pdb_file)

    # Shared format checks for all loaders
    self._dataframe_format_tests(parsed)
def test_from_mol2_file(self, mol2_file):
    """
    Check that a mol2 file is loaded into a correctly formatted DataFrame.

    Parameters
    ----------
    mol2_file : pathlib.Path or str
        Path to mol2 file.
    """

    parsed = DataFrame._from_mol2_file(mol2_file)

    # Shared format checks for all loaders
    self._dataframe_format_tests(parsed)
def test_from_text(self, filepath, format, verbose):
    """
    Test if input produces a DataFrame.

    Note: Use file as test function input; file content will be read as string,
    which is the input for the class method to be tested here!

    Parameters
    ----------
    filepath : pathlib.Path or str
        Path to the file whose content is used as text input.
    format : str
        Format of the text, forwarded to ``DataFrame.from_text``.
    verbose : bool
        Forwarded as third argument to ``DataFrame.from_text``.
    """

    # Let's load a file's content as string (text) to simulate example input data
    with open(filepath, "r") as f:
        text = f.read()

    dataframe = DataFrame.from_text(text, format, verbose)

    # A bare ``isinstance(...)`` expression discards its result, so the test
    # could never fail; the check must be asserted.
    assert isinstance(dataframe, pd.DataFrame)
def _map_residue_ids_names_nglixs(self, pocket):
    """
    Map residue IDs and names to nglview indices depending on file format.

    In case of mol2 files, nglview will use indices starting from 1.
    In case of pdb files, nglview will use the residue IDs as indices.

    The mapping is not returned; it is stored in
    ``self._residue_ids_to_ngl_ixs`` under the key ``pocket.name`` as a
    DataFrame with the columns "residue.name", "residue.id" (int32) and
    "residue.ngl_ix" (str).

    Parameters
    ----------
    pocket : opencadd.structure.pocket.Pocket
        Pocket object.
    """

    # Get atom data
    dataframe = DataFrame.from_text(pocket._text, pocket._extension)

    # Cast residue IDs to integer - drop atoms where this is not possible!
    drop_ixs = []
    for index, residue_id in dataframe["residue.id"].items():
        try:
            # Only the castability matters here; the column is cast below
            int(residue_id)
        except (TypeError, ValueError):
            drop_ixs.append(index)
    dataframe.drop(drop_ixs, inplace=True)
    dataframe = dataframe.astype({"residue.id": "int32"})

    # Get all residue names and IDs (full structure!!)
    residue_id2ix = dataframe[["residue.name", "residue.id"]].drop_duplicates()

    if pocket._extension == "mol2":
        # Map residue names to nglview index (starting from 1)
        residue_id2ix["residue.ngl_ix"] = [str(i) for i in range(1, len(residue_id2ix) + 1)]
    else:
        # In this case, residue ID and nglview index are the same
        residue_id2ix["residue.ngl_ix"] = [str(i) for i in residue_id2ix["residue.id"]]

    self._residue_ids_to_ngl_ixs[pocket.name] = residue_id2ix
def test_from_mol2_text(self, mol2_file):
    """
    Check that mol2 file content (text) is loaded into a correctly formatted
    DataFrame.

    Parameters
    ----------
    mol2_file : pathlib.Path or str
        Path to mol2 file.
    """

    # Read the whole file into memory to simulate text input
    with open(mol2_file, "r") as file_handle:
        content = file_handle.read()

    parsed = DataFrame._from_mol2_text(content)

    # Shared format checks for all loaders
    self._dataframe_format_tests(parsed)
def to_dataframe(self, structure_klifs_id, entity="complex", extension="mol2"):
    """
    Get structural data as text, parse it as DataFrame and add KLIFS residue
    IDs to it.

    Parameters
    ----------
    structure_klifs_id : int
        Structure KLIFS ID.
    entity : str
        Structural entity, forwarded to ``self.to_text`` (default: "complex").
    extension : str
        Format of the text; forwarded to ``self.to_text`` and to
        ``DataFrame.from_text`` (default: "mol2").

    Returns
    -------
    pandas.DataFrame
        Atom data as returned by ``self._add_residue_klifs_ids``.
    """

    raw_text = self.to_text(structure_klifs_id, entity, extension)
    atoms = DataFrame.from_text(raw_text, extension)
    return self._add_residue_klifs_ids(atoms, structure_klifs_id)