def test_conversion(path, model):
    """
    Round-trip a structure through a newly written PDBx file.

    The structure parsed from ``path`` is written into an empty
    :class:`PDBxFile` (data block ``"test"``), the ``auth_atom_id``
    column is removed from its ``atom_site`` category and the structure
    is parsed again.
    Box, bonds, every annotation category and the coordinates of both
    structures must be equal.
    """
    source_file = pdbx.PDBxFile.read(path)
    try:
        original = pdbx.get_structure(source_file, model=model)
    except biotite.InvalidFileError:
        if model is not None:
            raise
        # The file cannot be parsed into an AtomArrayStack,
        # as the models contain different numbers of atoms
        # -> skip this test case
        return

    written_file = pdbx.PDBxFile()
    pdbx.set_structure(written_file, original, data_block="test")

    # Remove one optional auth column
    # to test the fallback to the label fields
    atom_site = written_file.get_category("atom_site", "test")
    atom_site.pop("auth_atom_id")
    written_file.set_category("atom_site", atom_site, "test")

    restored = pdbx.get_structure(written_file, model=model)

    assert original.array_length() > 0
    if original.box is not None:
        assert np.allclose(original.box, restored.box)
    assert original.bonds == restored.bonds
    for category in original.get_annotation_categories():
        expected = original.get_annotation(category).tolist()
        actual = restored.get_annotation(category).tolist()
        assert expected == actual
    assert original.coord.tolist() == restored.coord.tolist()
def test_extra_fields():
    """
    Check that optional annotations survive a PDBx write/read cycle.

    ``1l2y.cif`` is parsed with the extra fields ``atom_id``,
    ``b_factor``, ``occupancy`` and ``charge``, written into an empty
    :class:`PDBxFile` (data block ``"test"``) and parsed again with the
    same extra fields.
    The individual annotations are compared first, so that a failure
    names the offending field, followed by a comparison of the entire
    stacks.
    """
    extra_fields = ["atom_id", "b_factor", "occupancy", "charge"]

    path = join(data_dir("structure"), "1l2y.cif")
    pdbx_file = pdbx.PDBxFile.read(path)
    stack1 = pdbx.get_structure(pdbx_file, extra_fields=extra_fields)

    pdbx_file = pdbx.PDBxFile()
    pdbx.set_structure(pdbx_file, stack1, data_block="test")
    stack2 = pdbx.get_structure(pdbx_file, extra_fields=extra_fields)

    assert stack1.ins_code.tolist() == stack2.ins_code.tolist()
    assert stack1.atom_id.tolist() == stack2.atom_id.tolist()
    # Floating point annotations are compared with a tolerance
    assert stack1.b_factor.tolist() == approx(stack2.b_factor.tolist())
    assert stack1.occupancy.tolist() == approx(stack2.occupancy.tolist())
    assert stack1.charge.tolist() == stack2.charge.tolist()
    assert stack1 == stack2
def test_conversion(path, model):
    """
    Round-trip a structure through a newly written PDBx file.

    The structure parsed from ``path`` is written into an empty
    :class:`PDBxFile` (data block ``"test"``) and parsed again.
    Box, bonds, every annotation category and the coordinates of both
    structures must be equal.
    """
    source_file = pdbx.PDBxFile.read(path)
    try:
        original = pdbx.get_structure(source_file, model=model)
    except biotite.InvalidFileError:
        if model is not None:
            raise
        # The file cannot be parsed into an AtomArrayStack,
        # as the models contain different numbers of atoms
        # -> skip this test case
        return

    written_file = pdbx.PDBxFile()
    pdbx.set_structure(written_file, original, data_block="test")
    restored = pdbx.get_structure(written_file, model=model)

    if original.box is not None:
        assert np.allclose(original.box, restored.box)
    assert original.bonds == restored.bonds
    for category in original.get_annotation_categories():
        expected = original.get_annotation(category).tolist()
        actual = restored.get_annotation(category).tolist()
        assert expected == actual
    assert original.coord.tolist() == restored.coord.tolist()
def test_extra_fields():
    """
    Check that optional annotations survive a PDBx write/read cycle.

    ``1l2y.cif`` is parsed with the extra fields ``atom_id``,
    ``b_factor``, ``occupancy`` and ``charge``, written into an empty
    :class:`PDBxFile` (data block ``"test"``) and parsed again with the
    same extra fields; both stacks must compare equal.
    """
    cif_path = join(data_dir, "1l2y.cif")
    source_file = pdbx.PDBxFile()
    source_file.read(cif_path)
    original = pdbx.get_structure(
        source_file,
        extra_fields=["atom_id", "b_factor", "occupancy", "charge"],
    )

    written_file = pdbx.PDBxFile()
    pdbx.set_structure(written_file, original, data_block="test")
    restored = pdbx.get_structure(
        written_file,
        extra_fields=["atom_id", "b_factor", "occupancy", "charge"],
    )

    assert original == restored
def test_conversion(path, single_model):
    """
    Round-trip a structure through a newly written PDBx file.

    With ``single_model`` set, only model 1 is parsed; otherwise
    ``model=None`` is passed to :func:`get_structure()`.
    Every annotation category and the coordinates must be equal before
    and after the write/read cycle.
    """
    if single_model:
        model = 1
    else:
        model = None

    source_file = pdbx.PDBxFile()
    source_file.read(path)
    original = pdbx.get_structure(source_file, model=model)

    written_file = pdbx.PDBxFile()
    pdbx.set_structure(written_file, original, data_block="test")
    restored = pdbx.get_structure(written_file, model=model)

    for category in original.get_annotation_categories():
        expected = original.get_annotation(category).tolist()
        actual = restored.get_annotation(category).tolist()
        assert expected == actual
    assert original.coord.tolist() == restored.coord.tolist()
def test_fetch(format, as_file_like):
    """
    Fetch entry ``1l2y`` from the RCSB and parse it as a structure.

    If ``as_file_like`` is set, ``None`` is passed as target path to
    :func:`rcsb.fetch()`, otherwise the biotite temporary directory.
    The fetched file is read with the file class belonging to
    ``format`` (``"pdb"``, ``"pdbx"`` or ``"mmtf"``) and converted into
    a structure; any other format is only fetched.
    """
    if as_file_like:
        target_dir = None
    else:
        target_dir = biotite.temp_dir()
    fetched = rcsb.fetch("1l2y", format, target_dir, overwrite=True)

    # format -> (file class, structure parsing function)
    readers = {
        "pdb": (pdb.PDBFile, pdb.get_structure),
        "pdbx": (pdbx.PDBxFile, pdbx.get_structure),
        "mmtf": (mmtf.MMTFFile, mmtf.get_structure),
    }
    if format not in readers:
        return
    file_class, parse_structure = readers[format]
    structure_file = file_class()
    structure_file.read(fetched)
    parse_structure(structure_file)
def test_array_conversion(path):
    """
    Round-trip model 1 of a structure through a PDBQT file.

    The reference structure is parsed from the PDBx file at ``path``
    (including the ``charge`` annotation), gets bonds assigned via
    residue names and is written into a :class:`PDBQTFile`.
    The reference is then restricted to the mask returned by
    :func:`pdbqt.set_structure()`.
    The PDBQT file is written to a temporary file, read back and the
    resulting structure is compared to the reference: coordinates must
    be close and all annotations except ``element`` must be equal.
    """
    pdbx_file = pdbx.PDBxFile.read(path)
    ref_structure = pdbx.get_structure(
        pdbx_file, model=1, extra_fields=["charge"]
    )
    ref_structure.bonds = struc.connect_via_residue_names(ref_structure)

    pdbqt_file = pdbqt.PDBQTFile()
    with warnings.catch_warnings():
        # Ignore warnings about atoms not parametrized
        warnings.simplefilter("ignore")
        mask = pdbqt.set_structure(pdbqt_file, ref_structure)
    ref_structure = ref_structure[mask]

    # The context manager closes the temporary file
    # even if writing or parsing fails
    with TemporaryFile("r+") as temp:
        pdbqt_file.write(temp)
        temp.seek(0)
        pdbqt_file = pdbqt.PDBQTFile.read(temp)
        test_structure = pdbqt.get_structure(pdbqt_file, model=1)

    assert np.allclose(test_structure.coord, ref_structure.coord)
    for category in test_structure.get_annotation_categories():
        if category == "element":
            # PDBQT uses special atom types, which replace the usual
            # elements
            # -> there cannot be equality of the 'element' annotation
            continue
        assert np.array_equal(
            test_structure.get_annotation(category),
            ref_structure.get_annotation(category)
        ), f"Inequality in '{category}' category"
def test_PDBx_consistency(format):
    """
    Compare a trajectory file of ``1l2y`` with the mmCIF file.

    The mmCIF file provides the reference stack and, via its first
    model, the template for the trajectory file.
    All annotation categories must be equal and the coordinates must
    be approximately equal.

    ``format`` selects the trajectory file type, either ``"trr"`` or
    ``"xtc"``; any other value raises a :class:`ValueError`.
    """
    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(join(data_dir, "1l2y.cif"))
    array1 = pdbx.get_structure(pdbx_file)
    template = pdbx.get_structure(pdbx_file, model=1)

    if format == "trr":
        traj_file = trr.TRRFile()
        traj_file.read(join(data_dir, "1l2y.trr"))
    elif format == "xtc":
        traj_file = xtc.XTCFile()
        traj_file.read(join(data_dir, "1l2y.xtc"))
    else:
        # Fail with a clear message instead of an unbound 'traj_file'
        raise ValueError("Unsupported trajectory format: %r" % (format,))
    array2 = traj_file.get_structure(template)

    for cat in array1.get_annotation_categories():
        assert array1.get_annotation(cat).tolist() == \
               array2.get_annotation(cat).tolist()
    assert array1.coord == pytest.approx(array2.coord)
def load_structure(fpath, chain=None):
    """
    Load model 1 of a structure file and keep a single, solvent-free chain.

    Args:
        fpath: filepath to either pdb or cif file (selected by the
            path ending in 'pdb' or 'cif')
        chain: the chain id; if None, the first chain found is used
    Returns:
        biotite.structure.AtomArray
    Raises:
        ValueError: if the file extension is not supported, if no chain
            is left after removing the solvent, or if the requested
            chain is not present
    """
    if fpath.endswith('cif'):
        with open(fpath) as fin:
            pdbxf = pdbx.PDBxFile.read(fin)
        structure = pdbx.get_structure(pdbxf, model=1)
    elif fpath.endswith('pdb'):
        with open(fpath) as fin:
            pdbf = pdb.PDBFile.read(fin)
        structure = pdb.get_structure(pdbf, model=1)
    else:
        # Without this branch 'structure' would be unbound below
        raise ValueError(
            f'Unsupported file extension for {fpath}, '
            'expected a pdb or cif file'
        )

    issolvent = filter_solvent(structure)
    structure = structure[~issolvent]

    chains = get_chains(structure)
    print(f'Found {len(chains)} chains:', chains, '\n')
    # 'chains' is compared by length, as its truth value may be ambiguous
    if len(chains) == 0:
        raise ValueError('No chains found in the input file.')
    if chain is None:
        chain = chains[0]
    if chain not in chains:
        raise ValueError(f'Chain {chain} not found in input file')

    structure = structure[structure.chain_id == chain]
    print(f'Loaded chain {chain}\n')
    return structure
def test_fetch(format, as_file_like):
    """
    Fetch entry ``1l2y`` from the RCSB and parse the result.

    If ``as_file_like`` is set, ``None`` is passed as target path to
    :func:`rcsb.fetch()`, otherwise the biotite temporary directory.
    Structure formats (``"pdb"``, ``"pdbx"``, ``"mmtf"``) are parsed
    into a structure; for ``"fasta"`` the file must contain at least
    one sequence.
    """
    if as_file_like:
        target_dir = None
    else:
        target_dir = biotite.temp_dir()
    fetched = rcsb.fetch("1l2y", format, target_dir, overwrite=True)

    if format == "fasta":
        fasta_file = fasta.FastaFile.read(fetched)
        # Test if the file contains any sequences
        assert len(fasta.get_sequences(fasta_file)) > 0
    elif format == "pdb":
        pdb.get_structure(pdb.PDBFile.read(fetched))
    elif format == "pdbx":
        pdbx.get_structure(pdbx.PDBxFile.read(fetched))
    elif format == "mmtf":
        mmtf.get_structure(mmtf.MMTFFile.read(fetched))
def test_command(command_name, kwargs):
    """
    Run a :class:`PyMOLObject` command on the ``1l2y`` structure.

    The structure gets bonds assigned via residue names and is
    converted into a :class:`PyMOLObject`.
    The method called ``command_name`` is looked up on the class and
    invoked on that object with ``kwargs``; the test only checks that
    the call does not raise.
    """
    reset()

    cif_path = join(data_dir, "1l2y.cif")
    structure = pdbx.get_structure(pdbx.PDBxFile.read(cif_path))
    structure.bonds = struc.connect_via_residue_names(structure)
    pymol_obj = PyMOLObject.from_structure(structure)

    getattr(PyMOLObject, command_name)(pymol_obj, **kwargs)
def test_superimposition_array(path):
    """
    Superimpose a rotated and translated copy onto its original.

    Model 1 of the file at ``path`` is the fixed structure.
    The mobile structure is a copy that is rotated and translated by
    ``(1, 2, 3)`` each.
    After superimposition, restricted to the ``CA`` atoms, the RMSD to
    the fixed structure must be approximately 0, both for the returned
    structure and for the mobile structure with the returned
    transformation applied.
    """
    structure_file = pdbx.PDBxFile()
    structure_file.read(path)
    fixed = pdbx.get_structure(structure_file, model=1)

    rotated = struc.rotate(fixed.copy(), (1, 2, 3))
    mobile = struc.translate(rotated, (1, 2, 3))

    ca_mask = (mobile.atom_name == "CA")
    fitted, transformation = struc.superimpose(fixed, mobile, ca_mask)
    assert struc.rmsd(fixed, fitted) == pytest.approx(0)

    refitted = struc.superimpose_apply(mobile, transformation)
    assert struc.rmsd(fixed, refitted) == pytest.approx(0)
def test_pdbx_consistency(path, single_model):
    """
    Compare a PDB file with the mmCIF file of the same base name.

    With ``single_model`` set, only model 1 is parsed; otherwise
    ``model=None`` is used.
    Every annotation category and the coordinates of both structures
    must be equal.
    """
    if single_model:
        model = 1
    else:
        model = None
    cif_path = splitext(path)[0] + ".cif"

    pdb_file = pdb.PDBFile()
    pdb_file.read(path)
    from_pdb = pdb_file.get_structure(model=model)

    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(cif_path)
    from_cif = pdbx.get_structure(pdbx_file, model=model)

    for category in from_pdb.get_annotation_categories():
        pdb_values = from_pdb.get_annotation(category).tolist()
        cif_values = from_cif.get_annotation(category).tolist()
        assert pdb_values == cif_values
    assert from_pdb.coord.tolist() == from_cif.coord.tolist()
def test_both_directions(path, state):
    """
    Convert a structure into a :class:`PyMOLObject` and back.

    The reference is model ``state`` of the file at ``path`` with bonds
    assigned via residue names.
    The structure obtained from the :class:`PyMOLObject` (including
    bonds) must have equal annotations, close coordinates and equal
    bonds.
    """
    pdbx_file = pdbx.PDBxFile.read(path)
    ref_array = pdbx.get_structure(pdbx_file, model=state)
    ref_array.bonds = struc.connect_via_residue_names(ref_array)

    reset()
    pymol_object = PyMOLObject.from_structure(ref_array)
    test_array = pymol_object.to_structure(state=state, include_bonds=True)

    for cat in ref_array.get_annotation_categories():
        test_values = test_array.get_annotation(cat)
        ref_values = ref_array.get_annotation(cat)
        assert (test_values == ref_values).all()
    assert np.allclose(test_array.coord, ref_array.coord)
    assert test_array.bonds == ref_array.bonds
def test_pdbx_consistency(path, single_model):
    """
    Compare an MMTF file with the mmCIF file of the same base name.

    With ``single_model`` set, only model 1 is parsed; otherwise
    ``model=None`` is used.
    Every annotation category must be equal and the coordinates must
    agree within an absolute tolerance of ``1e-3``.
    """
    # 'single_model' selects the first model, not the entire stack
    model = 1 if single_model else None
    cif_path = splitext(path)[0] + ".cif"

    mmtf_file = mmtf.MMTFFile()
    mmtf_file.read(path)
    a1 = mmtf.get_structure(mmtf_file, model=model)

    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(cif_path)
    a2 = pdbx.get_structure(pdbx_file, model=model)

    for category in a1.get_annotation_categories():
        assert a1.get_annotation(category).tolist() == \
               a2.get_annotation(category).tolist()
    assert a1.coord.flatten().tolist() == \
           approx(a2.coord.flatten().tolist(), abs=1e-3)
def test_to_biotite(path, altloc, state):
    """
    Compare a structure loaded via PyMOL with the one parsed by biotite.

    The reference is parsed from the file at ``path`` with the given
    ``state`` as model and the given ``altloc`` setting.
    The same file is loaded into PyMOL as object ``"test"`` and
    converted with :meth:`PyMOLObject.to_structure()`.
    All annotations except ``altloc_id`` must be equal and the
    coordinates must be close.
    """
    pdbx_file = pdbx.PDBxFile.read(path)
    ref_array = pdbx.get_structure(pdbx_file, model=state, altloc=altloc)

    reset()
    cmd.load(path, "test")
    pymol_object = PyMOLObject("test")
    test_array = pymol_object.to_structure(state=state, altloc=altloc)

    for cat in ref_array.get_annotation_categories():
        if cat == "altloc_id":
            # This category is excluded from the comparison
            continue
        test_values = test_array.get_annotation(cat)
        ref_values = ref_array.get_annotation(cat)
        assert (test_values == ref_values).all()
    assert np.allclose(test_array.coord, ref_array.coord)
def test_pdbx_consistency(path, single_model):
    """
    Compare an MMTF file with the mmCIF file of the same base name.

    With ``single_model`` set, only model 1 is parsed; otherwise
    ``model=None`` is used.
    The boxes must be close (unless the mmCIF box is a dummy entry),
    every annotation category must be equal and the coordinates must
    agree within an absolute tolerance of ``1e-3``.
    """
    # 'single_model' selects the first model, not the entire stack
    model = 1 if single_model else None
    cif_path = splitext(path)[0] + ".cif"

    mmtf_file = mmtf.MMTFFile.read(path)
    a1 = mmtf.get_structure(mmtf_file, model=model)

    pdbx_file = pdbx.PDBxFile.read(cif_path)
    a2 = pdbx.get_structure(pdbx_file, model=model)

    # Sometimes mmCIF files can have 'cell' entry
    # but corresponding MMTF file has not 'unitCell' entry
    # -> Do not assert for dummy entry in mmCIF file
    # (all vector elements = {0, 1})
    if a2.box is not None and not ((a2.box == 0) | (a2.box == 1)).all():
        assert np.allclose(a1.box, a2.box)
    for category in a1.get_annotation_categories():
        assert a1.get_annotation(category).tolist() == \
               a2.get_annotation(category).tolist()
    assert a1.coord.flatten().tolist() == \
           approx(a2.coord.flatten().tolist(), abs=1e-3)
def test_get_assembly(model):
    """
    Test whether :func:`get_assembly()` yields as many peptide chains
    as given in the ``_pdbx_struct_assembly.oligomeric_count`` field.
    In addition, check that the atom count of the complete assembly is
    a multiple of the atom count of the structure in the file.
    """
    cif_path = join(data_dir("structure"), "1f2n.cif")
    pdbx_file = pdbx.PDBxFile.read(cif_path)

    assembly_category = pdbx_file.get_category(
        "pdbx_struct_assembly", expect_looped=True
    )
    assembly_ids = assembly_category["id"]
    oligomer_counts = assembly_category["oligomeric_count"]

    # Test each available assembly
    for assembly_id, ref_oligomer_count in zip(assembly_ids, oligomer_counts):
        print("Assembly ID:", assembly_id)
        try:
            assembly = pdbx.get_assembly(
                pdbx_file, assembly_id=assembly_id, model=model
            )
        except biotite.InvalidFileError:
            if model is not None:
                raise
            # The file cannot be parsed into an AtomArrayStack,
            # as the models contain different numbers of atoms
            # -> skip this test case
            return

        amino_acid_mask = struc.filter_amino_acids(assembly)
        protein_assembly = assembly[..., amino_acid_mask]
        test_oligomer_count = struc.get_chain_count(protein_assembly)

        expected_type = (
            struc.AtomArrayStack if model is None else struc.AtomArray
        )
        assert isinstance(assembly, expected_type)
        assert test_oligomer_count == int(ref_oligomer_count)

        # The atom count of the entire assembly should be a multiple
        # of the atom count of the structure stored in the file
        monomer_atom_count = pdbx.get_structure(pdbx_file).array_length()
        assert assembly.array_length() % monomer_atom_count == 0
def test_pdbx_consistency(path, model):
    """
    Compare an MMTF file with the mmCIF file of the same base name.

    The boxes must be close (unless the mmCIF box is a dummy entry),
    every annotation category must be equal and the coordinates must
    agree within an absolute tolerance of ``1e-3``.
    Differences in the ``hetero`` annotation are tolerated for residues
    whose link type is a peptide, DNA or RNA linking type.
    """
    cif_path = splitext(path)[0] + ".cif"

    mmtf_file = mmtf.MMTFFile.read(path)
    try:
        from_mmtf = mmtf.get_structure(mmtf_file, model=model)
    except biotite.InvalidFileError:
        if model is not None:
            raise
        # The file cannot be parsed into an AtomArrayStack,
        # as the models contain different numbers of atoms
        # -> skip this test case
        return

    pdbx_file = pdbx.PDBxFile.read(cif_path)
    from_cif = pdbx.get_structure(pdbx_file, model=model)

    # Sometimes mmCIF files can have 'cell' entry
    # but corresponding MMTF file has not 'unitCell' entry
    # -> Do not assert for dummy entry in mmCIF file
    # (all vector elements = {0, 1})
    cif_box = from_cif.box
    if cif_box is not None and not ((cif_box == 0) | (cif_box == 1)).all():
        assert np.allclose(from_mmtf.box, cif_box)

    # MMTF might assign some residues, that PDBx assigns as 'hetero',
    # as 'non-hetero' if they are RNA/DNA or peptide linking
    if from_mmtf.hetero.tolist() != from_cif.hetero.tolist():
        conflict_mask = from_mmtf.hetero != from_cif.hetero
        conflict_residues = np.unique(from_mmtf.res_name[conflict_mask])
        for res in conflict_residues:
            assert info.link_type(res) in [
                "L-PEPTIDE LINKING", "PEPTIDE LINKING",
                "DNA LINKING", "RNA LINKING"
            ]

    # Test the remaining categories
    for category in from_mmtf.get_annotation_categories():
        if category == "hetero":
            continue
        mmtf_values = from_mmtf.get_annotation(category).tolist()
        cif_values = from_cif.get_annotation(category).tolist()
        assert mmtf_values == cif_values
    assert from_mmtf.coord.flatten().tolist() == \
           approx(from_cif.coord.flatten().tolist(), abs=1e-3)
def test_superimposition_stack(ca_only):
    """
    Superimpose the remaining models of ``1l2y`` onto its first model.

    With ``ca_only`` set, the superimposition is restricted to the
    ``CA`` atoms and the mean RMSD must improve; otherwise no mask is
    used and the RMSD of every model must improve.
    """
    structure_file = pdbx.PDBxFile()
    structure_file.read(join(data_dir, "1l2y.cif"))
    stack = pdbx.get_structure(structure_file)

    fixed, mobile = stack[0], stack[1:]
    mask = (mobile.atom_name == "CA") if ca_only else None
    fitted, _ = struc.superimpose(fixed, mobile, mask)

    if ca_only:
        # The superimpositions are better for most cases than the
        # superimpositions in the structure file
        # -> Use average
        mean_rmsd_after = np.mean(struc.rmsd(fixed, fitted))
        mean_rmsd_before = np.mean(struc.rmsd(fixed, mobile))
        assert mean_rmsd_after < mean_rmsd_before
    else:
        # The superimpositions are better than the superimpositions
        # in the structure file
        rmsd_after = struc.rmsd(fixed, fitted)
        rmsd_before = struc.rmsd(fixed, mobile)
        assert (rmsd_after < rmsd_before).all()
def test_to_pymol(path):
    """
    Compare a converted chempy model with the model loaded by PyMOL.

    The reference model is obtained by loading ``path`` into PyMOL.
    The test model is created with :func:`convert_to_chempy_model()`
    from model 1 of the same file, parsed with the extra fields
    ``b_factor``, ``occupancy`` and ``charge``.
    Only reference atoms with an ``alt`` value of ``""`` or ``"A"`` are
    taken into account.
    """
    reset()
    cmd.load(path, "test")
    ref_model = cmd.get_model("test", 1)

    pdbx_file = pdbx.PDBxFile.read(path)
    atom_array = pdbx.get_structure(
        pdbx_file, model=1,
        extra_fields=["b_factor", "occupancy", "charge"]
    )
    test_model = convert_to_chempy_model(atom_array)

    test_atoms = test_model.atom
    ref_atoms = [atom for atom in ref_model.atom if atom.alt in ("", "A")]
    assert len(test_atoms) == len(ref_atoms)

    # (attribute name, compare approximately?), in the order of checking
    checks = [
        ("symbol", False),
        ("name", False),
        ("resn", False),
        ("ins_code", False),
        ("resi_number", False),
        ("b", True),
        ("q", True),
        ("hetatm", False),
        ("chain", False),
        ("coord", True),
    ]
    for test_atom, ref_atom in zip(test_atoms, ref_atoms):
        for attr, approximate in checks:
            actual = getattr(test_atom, attr)
            expected = getattr(ref_atom, attr)
            if approximate:
                expected = pytest.approx(expected)
            assert actual == expected
def test_select(random_seed):
    """
    Test :meth:`PyMOLObject.where()` with a random boolean mask.

    The B factor of all atoms is initially 0.
    The selection created from the mask is used to set the B factor of
    the selected atoms to 1 within PyMOL.
    The mask recovered from the B factor must be equal to the original
    mask.
    """
    reset()

    pdbx_file = pdbx.PDBxFile.read(join(data_dir, "1l2y.cif"))
    array = pdbx.get_structure(pdbx_file, model=1)
    # Add bonds to avoid warning
    array.bonds = struc.connect_via_residue_names(array)
    # Use B factor as indicator if the selection was correctly applied
    array.set_annotation("b_factor", np.zeros(array.array_length()))
    pymol_object = PyMOLObject.from_structure(array)

    np.random.seed(random_seed)
    ref_mask = np.random.choice([False, True], array.array_length())

    # The method that is actually tested
    test_selection = pymol_object.where(ref_mask)

    # Set B factor of all masked atoms to 1
    cmd.alter(test_selection, "b=1.0")
    recovered = pymol_object.to_structure(state=1)
    # Get the mask from the B factor back again
    test_mask = (recovered.b_factor == 1.0)

    assert np.array_equal(test_mask, ref_mask)
def test_pdbx_consistency(path, model):
    """
    Compare a PDB file with the mmCIF file of the same base name.

    Box (if present in the mmCIF structure), bonds, every annotation
    category and the coordinates of both structures must be equal.
    """
    cif_path = splitext(path)[0] + ".cif"

    pdb_file = pdb.PDBFile.read(path)
    try:
        from_pdb = pdb_file.get_structure(model=model)
    except biotite.InvalidFileError:
        if model is not None:
            raise
        # The file cannot be parsed into an AtomArrayStack,
        # as the models contain different numbers of atoms
        # -> skip this test case
        return

    pdbx_file = pdbx.PDBxFile.read(cif_path)
    from_cif = pdbx.get_structure(pdbx_file, model=model)

    if from_cif.box is not None:
        assert np.allclose(from_pdb.box, from_cif.box)
    assert from_pdb.bonds == from_cif.bonds
    for category in from_pdb.get_annotation_categories():
        pdb_values = from_pdb.get_annotation(category).tolist()
        cif_values = from_cif.get_annotation(category).tolist()
        assert pdb_values == cif_values
    assert from_pdb.coord.tolist() == from_cif.coord.tolist()
# :class:`ndarray`. # Setting/adding a category in the file is done in a similar way: cif_file["audit_author"] = { "name": ["Doe, Jane", "Doe, John"], "pdbx_ordinal": ["1", "2"] } ######################################################################## # In most applications only the structure itself # (stored in the *atom_site* category) is relevant. # :func:`get_structure()` and :func:`set_structure()` are convenience # functions that are used to convert the # ``atom_site`` category into an atom array (stack) and vice versa. tc5b = pdbx.get_structure(cif_file) # Do some fancy stuff pdbx.set_structure(cif_file, tc5b) ######################################################################## # :func:`get_structure()` automatically creates an # :class:`AtomArrayStack`, even if the file actually contains only a # single model. # If you would like to have an :class:`AtomArray` instead, you have to # specify the :obj:`model` parameter. # # .. currentmodule:: biotite.structure.io.mmtf # # If you want to parse a large batch of structure files or you have to # load very large structure files, the usage of PDB or mmCIF files might # be too slow for your requirements.
# :class:`ndarray`. # Setting/adding a category in the file is done in a similar way: file["audit_author"] = { "name": ["Doe, Jane", "Doe, John"], "pdbx_ordinal": ["1", "2"] } ######################################################################## # In most applications only the structure itself # (stored in the *atom_site* category) is relevant. # :func:`get_structure()` and :func:`set_structure()` are convenience # functions that are used to convert the # *atom_site* category into an atom array (stack) and vice versa. tc5b = pdbx.get_structure(file) # Do some fancy stuff pdbx.set_structure(file, tc5b) ######################################################################## # :func:`get_structure()` automatically creates an # :class:`AtomArrayStack`, even if the file actually contains only a # single model. # If you would like to have an :class:`AtomArray` instead, you have to # specify the :obj:`model` parameter. # # .. currentmodule:: biotite.structure.io.mmtf # # If you want to parse a large batch of structure files or you have to # load very large structure files, the usage of PDB or mmCIF files might # be too slow for your requirements. In this case you probably might
from os.path import join import numpy as np import pytest import biotite.structure as struc import biotite.structure.io.pdbx as pdbx from ammolite import PyMOLObject, reset from .util import data_dir pdbx_file = pdbx.PDBxFile.read(join(data_dir, "1l2y.cif")) structure = pdbx.get_structure(pdbx_file) mask = structure.res_id < 10 expr = "resi 1-10" @pytest.mark.parametrize( "command_name, kwargs", [ ("alter", { "selection": mask, "expression": "chain='B'", }), ("cartoon", { "type": "tube", }), ("cartoon", { "type": "tube", "selection": expr, }), ("cartoon", { "type": "tube", "selection": mask,
def test_get_model_count():
    """
    Check that :func:`get_model_count()` reports the same number of
    models for ``1l2y.cif`` as the depth of the stack parsed from it.
    """
    cif_path = join(data_dir("structure"), "1l2y.cif")
    pdbx_file = pdbx.PDBxFile.read(cif_path)

    counted_models = pdbx.get_model_count(pdbx_file)
    parsed_stack = pdbx.get_structure(pdbx_file)

    assert counted_models == parsed_stack.stack_depth()
import biotite.structure.io.pdbx as pdbx import biotite.sequence as seq import biotite.sequence.io.fasta as fasta import biotite.sequence.align as align import biotite.sequence.graphics as graphics import biotite.application.blast as blast import biotite.application.clustalo as clustalo import biotite.database.rcsb as rcsb import biotite.database.entrez as entrez # Get structure and sequence pdbx_file = pdbx.PDBxFile.read(rcsb.fetch("1GUU", "mmcif")) sequence = pdbx.get_sequence(pdbx_file)[0] # 'use_author_fields' is set to false, # to ensure that values in the 'res_id' annotation point to the sequence structure = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False) structure = structure[struc.filter_amino_acids(structure)] # Identity threshold for a sequence to be counted as homologous sequence IDENTITY_THESHOLD = 0.4 # Find homologous proteins in SwissProt via BLAST app = blast.BlastWebApp("blastp", sequence, database="swissprot") app.start() app.join() alignments = app.get_alignments() hit_seqs = [sequence] hit_ids = ["Query"] hit_starts = [1] for ali in alignments: identity = align.get_sequence_identity(ali) # Do not include the exact same sequence -> identity < 1.0
# Benchmark the parsing of a mmCIF file given as an argument.
# The elapsed time for reading the file and converting it into a
# structure is printed in seconds.

import sys
import time
import biotite.structure.io.pdbx as pdbx


if len(sys.argv) < 2:
    # Give a usage hint instead of an IndexError traceback
    sys.exit("Usage: %s <mmcif_filepath>" % sys.argv[0])
mmcif_filepath = sys.argv[1]

# perf_counter() is a monotonic high-resolution clock intended for
# measuring intervals, in contrast to the wall clock of time()
start = time.perf_counter()
file = pdbx.PDBxFile()
file.read(mmcif_filepath)
pdbx.get_structure(file)
end = time.perf_counter()

print(end - start)
from tempfile import NamedTemporaryFile import biotite.structure as struc import biotite.structure.io.pdbx as pdbx import biotite.database.rcsb as rcsb import numpy as np ku_dna_file = NamedTemporaryFile(suffix=".cif") ku_file = NamedTemporaryFile(suffix=".cif") # The output file names # Modify these values for actual file output ku_dna_file_name = ku_dna_file.name ku_file_name = ku_file.name # Download and parse structure files ku_dna = pdbx.get_structure(pdbx.PDBxFile.read(rcsb.fetch("1JEY", "cif")))[0] ku = pdbx.get_structure(pdbx.PDBxFile.read(rcsb.fetch("1JEQ", "cif")))[0] # Remove DNA and water ku_dna = ku_dna[(ku_dna.chain_id == "A") | (ku_dna.chain_id == "B")] ku_dna = ku_dna[~struc.filter_solvent(ku_dna)] ku = ku[~struc.filter_solvent(ku)] # The structures have a differing amount of atoms missing # at the the start and end of the structure # -> Find common structure ku_dna_common = ku_dna[struc.filter_intersection(ku_dna, ku)] ku_common = ku[struc.filter_intersection(ku, ku_dna)] # Superimpose ku_superimposed, transformation = struc.superimpose( ku_dna_common, ku_common, (ku_common.atom_name == "CA")) # We do not want the cropped structures # -> apply superimposition on original structures