def test_array_conversion(path, model, hybrid36):
    """
    Check that a structure survives a write/read round trip through a
    new ``PDBFile``.

    The module level wrappers ``pdb.get_structure()`` and
    ``pdb.set_structure()`` are used on purpose, so that these thin
    wrappers are exercised as well.
    """
    source_file = pdb.PDBFile.read(path)
    try:
        array1 = pdb.get_structure(source_file, model=model)
    except biotite.InvalidFileError:
        if model is not None:
            raise
        # The file cannot be parsed into an AtomArrayStack,
        # as the models contain different numbers of atoms
        # -> skip this test case
        return

    target_file = pdb.PDBFile()
    if hybrid36 and (array1.res_id < 1).any():
        # Writing must be rejected with exactly this message
        expected_message = (
            "Only positive integers can be converted "
            "into hybrid-36 notation"
        )
        with pytest.raises(ValueError, match=expected_message):
            pdb.set_structure(target_file, array1, hybrid36=hybrid36)
        return

    pdb.set_structure(target_file, array1, hybrid36=hybrid36)
    array2 = pdb.get_structure(target_file, model=model)

    if array1.box is not None:
        assert np.allclose(array1.box, array2.box)
    assert array1.bonds == array2.bonds
    for category in array1.get_annotation_categories():
        written = array1.get_annotation(category).tolist()
        restored = array2.get_annotation(category).tolist()
        assert written == restored
    assert array1.coord.tolist() == array2.coord.tolist()
def test_array_conversion(path, single_model, hybrid36):
    """
    Write a parsed structure into a new ``PDBFile`` and read it back,
    expecting box, bonds, annotations and coordinates to be unchanged.

    The module level wrappers ``pdb.get_structure()`` and
    ``pdb.set_structure()`` are used, so they are tested as well.
    """
    model = None
    if single_model:
        model = 1

    source_file = pdb.PDBFile.read(path)
    array1 = pdb.get_structure(source_file, model=model)

    target_file = pdb.PDBFile()
    if hybrid36 and (array1.res_id < 1).any():
        # Writing must be rejected with exactly this message
        expected_message = (
            "Only positive integers can be converted "
            "into hybrid-36 notation"
        )
        with pytest.raises(ValueError, match=expected_message):
            pdb.set_structure(target_file, array1, hybrid36=hybrid36)
        return

    pdb.set_structure(target_file, array1, hybrid36=hybrid36)
    array2 = pdb.get_structure(target_file, model=model)

    if array1.box is not None:
        assert np.allclose(array1.box, array2.box)
    assert array1.bonds == array2.bonds
    for category in array1.get_annotation_categories():
        written = array1.get_annotation(category).tolist()
        restored = array2.get_annotation(category).tolist()
        assert written == restored
    assert array1.coord.tolist() == array2.coord.tolist()
def test_extra_fields(hybrid36):
    """
    Check that the optional annotations requested via ``extra_fields``
    are preserved when a structure is written to a new ``PDBFile`` and
    read back, and that an unknown field name raises ``ValueError``.
    """
    requested = ["atom_id", "b_factor", "occupancy", "charge"]

    source_file = pdb.PDBFile()
    source_file.read(join(data_dir, "1l2y.pdb"))
    stack1 = source_file.get_structure(extra_fields=list(requested))

    with pytest.raises(ValueError):
        source_file.get_structure(extra_fields=["unsupported_field"])

    target_file = pdb.PDBFile()
    target_file.set_structure(stack1, hybrid36=hybrid36)
    stack2 = target_file.get_structure(extra_fields=list(requested))

    # (annotation name, compare approximately?)
    checks = (
        ("ins_code", False),
        ("atom_id", False),
        ("b_factor", True),
        ("occupancy", True),
        ("charge", False),
    )
    for name, is_float in checks:
        written = getattr(stack1, name).tolist()
        restored = getattr(stack2, name).tolist()
        if is_float:
            assert written == approx(restored)
        else:
            assert written == restored
    assert stack1 == stack2
def test_array_conversion(path, single_model):
    """
    Check that a structure read from a PDB file equals the structure
    obtained after writing it into a new ``PDBFile`` and reading it
    back.
    """
    if single_model:
        model = 1
    else:
        model = None

    reader = pdb.PDBFile()
    reader.read(path)
    original = reader.get_structure(model=model)

    writer = pdb.PDBFile()
    writer.set_structure(original)
    restored = writer.get_structure(model=model)

    assert original == restored
def test_id_overflow():
    """
    Check how atom and residue IDs that exceed the plain PDB number
    range are written.

    Without hybrid-36 a ``UserWarning`` is expected, the file must
    still be readable and the atom ID written for the last atom is
    expected to be ``1``.
    With hybrid-36 no warning may be raised and the last line must
    carry the expected hybrid-36 strings.
    """
    # Function-scope import, so the block does not rely on the
    # import section of the module
    import warnings

    # Create an atom array >= 100k atoms
    length = 100000
    a = struc.AtomArray(length)
    a.coord = np.zeros(a.coord.shape)
    a.chain_id = np.full(length, "A")
    # Create residue IDs over 10000
    a.res_id = np.arange(1, length + 1)
    a.res_name = np.full(length, "GLY")
    a.hetero = np.full(length, False)
    a.atom_name = np.full(length, "CA")
    a.element = np.full(length, "C")

    # The context manager closes the temporary file
    # even if one of the assertions fails
    with TemporaryFile("w+") as temp:
        # Write stack to pdb file and make sure a warning is thrown
        with pytest.warns(UserWarning):
            pdb_file = pdb.PDBFile()
            pdb_file.set_structure(a)
            pdb_file.write(temp)

        # Assert file can be read properly
        temp.seek(0)
        a2 = pdb.get_structure(pdb.PDBFile.read(temp))
        assert a2.array_length() == a.array_length()

        # Manually check if the written atom id is correct
        temp.seek(0)
        last_line = temp.readlines()[-1]
        atom_id = int(last_line.split()[1])
        assert atom_id == 1

    with TemporaryFile("w+") as temp:
        # Write stack as hybrid-36 pdb file: no warning should be thrown
        # 'pytest.warns(None)' is no longer accepted by recent pytest
        # versions, hence any warning is escalated to an exception
        with warnings.catch_warnings():
            warnings.simplefilter("error")
            tmp_pdb_file = pdb.PDBFile()
            tmp_pdb_file.set_structure(a, hybrid36=True)
            tmp_pdb_file.write(temp)

        # Manually check if the output is written as correct hybrid-36
        temp.seek(0)
        last_line = temp.readlines()[-1]
        atom_id = last_line.split()[1]
        assert atom_id == "A0000"
        res_id = last_line.split()[4][1:]
        assert res_id == "BXG0"
def temp_pdb(structure, tmp_dir):
    """
    Write a structure into a new temporary PDB file.

    Parameters
    ----------
    structure
        The structure handed to ``PDB.set_structure()``.
    tmp_dir
        Object whose ``name`` attribute is the directory the file is
        created in (e.g. a ``tempfile.TemporaryDirectory``).

    Returns
    -------
    temp_path : str
        Path of the written file.
    """
    fd, temp_path = tempfile.mkstemp(dir=tmp_dir.name, text=True)
    # Only the reserved path is needed, the file is written via its
    # path below: close the descriptor right away, so it neither stays
    # open during writing nor leaks if writing raises
    os.close(fd)
    pdb_out = PDB.PDBFile()
    PDB.set_structure(pdb_out, structure)
    pdb_out.write(temp_path)
    return temp_path
def test_extra_fields(hybrid36):
    """
    Check that the optional annotations requested via ``extra_fields``
    are preserved when a structure is written to a new ``PDBFile`` and
    read back, and that an unknown field name raises ``ValueError``.
    """
    requested = ["atom_id", "b_factor", "occupancy", "charge"]

    source_file = pdb.PDBFile.read(join(data_dir("structure"), "1l2y.pdb"))
    stack1 = source_file.get_structure(extra_fields=list(requested))

    with pytest.raises(ValueError):
        source_file.get_structure(extra_fields=["unsupported_field"])

    # Add non-neutral charge values,
    # as the input PDB has only neutral charges
    for atom_index, charge in ((0, -1), (1, 2)):
        stack1.charge[atom_index] = charge

    target_file = pdb.PDBFile()
    target_file.set_structure(stack1, hybrid36=hybrid36)
    stack2 = target_file.get_structure(extra_fields=list(requested))

    # (annotation name, compare approximately?)
    checks = (
        ("ins_code", False),
        ("atom_id", False),
        ("b_factor", True),
        ("occupancy", True),
        ("charge", False),
    )
    for name, is_float in checks:
        written = getattr(stack1, name).tolist()
        restored = getattr(stack2, name).tolist()
        if is_float:
            assert written == approx(restored)
        else:
            assert written == restored
    assert stack1 == stack2
def test_single(pdb_id):
    """
    Compare the per-atom SASA computed by ``struc.sasa()`` (using the
    "Single" radii set) with the result of MDTraj's ``shrake_rupley()``
    for the first model of the given PDB entry.
    """
    file_name = join(data_dir, pdb_id + ".pdb")
    n_points = 5000

    # Single atom SASA, compare with MDTraj
    pdb_file = pdb.PDBFile()
    pdb_file.read(file_name)
    array = pdb_file.get_structure(model=1)
    sasa = struc.sasa(array, vdw_radii="Single", point_number=n_points)

    from biotite.structure.info.radii import _SINGLE_RADII as radii
    import mdtraj
    # Use the same atom radii:
    # capitalized element names, values scaled by 1/10
    mdtraj_radii = {}
    for element, radius in radii.items():
        mdtraj_radii[element.capitalize()] = radius / 10
    traj = mdtraj.load(file_name)
    per_frame_sasa = mdtraj.shrake_rupley(
        traj, change_radii=mdtraj_radii, n_sphere_points=n_points
    )
    # Conversion from nm^2 to A^2
    sasa_exp = per_frame_sasa[0] * 100

    def matching_fraction(rtol):
        # Fraction of atoms whose SASA agrees within the given tolerance
        is_close = np.isclose(sasa, sasa_exp, rtol=rtol, atol=1e-1)
        return np.count_nonzero(is_close) / len(sasa)

    # Assert that more than 90% of atoms
    # have less than 10% SASA difference
    assert matching_fraction(1e-1) > 0.9
    # Assert that more than 98% of atoms
    # have less than 1% SASA difference
    assert matching_fraction(1e-2) > 0.98
def test_pdb_to_gro(path, single_model):
    """
    Converting a structure from PDB to GRO format should not change
    atom count, residue IDs, residue names, atom names or (within
    rounding) the coordinates.
    """
    if single_model:
        model = 1
    else:
        model = None

    # Read in data
    pdb_file = pdb.PDBFile()
    pdb_file.read(path)
    a1 = pdb_file.get_structure(model=model)

    # Save stack as gro
    tmp_file_name = biotite.temp_file("gro")
    gro_writer = gro.GROFile()
    gro_writer.set_structure(a1)
    gro_writer.write(tmp_file_name)

    # Reload stack from gro
    gro_reader = gro.GROFile()
    gro_reader.read(tmp_file_name)
    a2 = gro_reader.get_structure(model=model)

    assert a1.array_length() == a2.array_length()
    for category in ("res_id", "res_name", "atom_name"):
        from_pdb = a1.get_annotation(category).tolist()
        from_gro = a2.get_annotation(category).tolist()
        assert from_pdb == from_gro

    # Mind rounding errors when converting pdb to gro (A -> nm)
    pdb_coord = a1.coord.flatten().tolist()
    gro_coord = a2.coord.flatten().tolist()
    assert pdb_coord == approx(gro_coord, abs=1e-2)
def test_array_conversion(path, single_model, hybrid36):
    """
    Write a parsed structure into a new ``PDBFile`` and read it back,
    expecting box, bonds, annotations and coordinates to be unchanged.

    The module level wrappers ``pdb.get_structure()`` and
    ``pdb.set_structure()`` are used, so they are tested as well.
    """
    model = None
    if single_model:
        model = 1

    source_file = pdb.PDBFile()
    source_file.read(path)
    array1 = pdb.get_structure(source_file, model=model)

    target_file = pdb.PDBFile()
    pdb.set_structure(target_file, array1, hybrid36=hybrid36)
    array2 = pdb.get_structure(target_file, model=model)

    if array1.box is not None:
        assert np.allclose(array1.box, array2.box)
    assert array1.bonds == array2.bonds
    for category in array1.get_annotation_categories():
        written = array1.get_annotation(category).tolist()
        restored = array2.get_annotation(category).tolist()
        assert written == restored
    assert array1.coord.tolist() == array2.coord.tolist()
def test_get_coord(model):
    """
    Check that ``PDBFile.get_coord()`` yields the same coordinates as
    the ``coord`` attribute of the structure from ``get_structure()``.
    """
    # Choose a structure without inscodes and altlocs
    # to avoid atom filtering in reference atom array (stack)
    pdb_file = pdb.PDBFile()
    pdb_file.read(join(data_dir, "1l2y.pdb"))

    reference = pdb_file.get_structure(model=model)
    ref_coord = reference.coord
    test_coord = pdb_file.get_coord(model=model)

    assert test_coord.shape == ref_coord.shape
    is_equal = test_coord == ref_coord
    assert is_equal.all()
def test_box_shape(path, single_model):
    """
    Check the shape of the parsed box: ``(3, 3)`` for an ``AtomArray``
    and ``(len(a), 3, 3)`` for any other returned structure.
    """
    if single_model:
        model = 1
    else:
        model = None

    pdb_file = pdb.PDBFile()
    pdb_file.read(path)
    a = pdb_file.get_structure(model=model)

    is_array = isinstance(a, struc.AtomArray)
    expected_box_dim = (3, 3) if is_array else (len(a), 3, 3)
    assert expected_box_dim == a.box.shape
def test_box_parsing():
    """
    Check that the box parsed from ``1igy.pdb`` matches the expected
    box vectors within an absolute tolerance of ``1e-2``.
    """
    pdb_file = pdb.PDBFile()
    pdb_file.read(join(data_dir, "1igy.pdb"))
    a = pdb_file.get_structure()

    # One box (three vectors) with a leading axis of length one
    box_vectors = [
        [66.65, 0.00, 0.00],
        [0.00, 190.66, 0.00],
        [-24.59, 0.00, 68.84],
    ]
    expected_box = np.array([box_vectors])

    expected_values = expected_box.flatten().tolist()
    parsed_values = a.box.flatten().tolist()
    assert expected_values == approx(parsed_values, abs=1e-2)
def test_pdbx_consistency(path, single_model):
    """
    Check that a PDB file and the ``.cif`` file with the same base name
    yield identical annotations and coordinates.
    """
    if single_model:
        model = 1
    else:
        model = None
    base_name, _ = splitext(path)
    cif_path = base_name + ".cif"

    pdb_file = pdb.PDBFile()
    pdb_file.read(path)
    a1 = pdb_file.get_structure(model=model)

    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(cif_path)
    a2 = pdbx.get_structure(pdbx_file, model=model)

    for category in a1.get_annotation_categories():
        from_pdb = a1.get_annotation(category).tolist()
        from_pdbx = a2.get_annotation(category).tolist()
        assert from_pdb == from_pdbx
    assert a1.coord.tolist() == a2.coord.tolist()
def test_extra_fields():
    """
    Check that the optional annotations requested via ``extra_fields``
    are unchanged after the structure is set into the file again and
    read back.
    """
    requested = ["atom_id", "b_factor", "occupancy", "charge"]

    pdb_file = pdb.PDBFile()
    pdb_file.read(join(data_dir, "1l2y.pdb"))
    stack1 = pdb_file.get_structure(extra_fields=list(requested))

    # The same file object is reused for writing
    pdb_file.set_structure(stack1)
    stack2 = pdb_file.get_structure(extra_fields=list(requested))

    for name in ("atom_id", "b_factor", "occupancy", "charge"):
        before = getattr(stack1, name).tolist()
        after = getattr(stack2, name).tolist()
        assert before == after
    assert stack1 == stack2
def test_guess_elements():
    """
    Check that the elements are restored when a structure is written
    with blanked element annotation and read again.
    """
    # Read valid pdb file
    reference_file = pdb.PDBFile()
    reference_file.read(join(data_dir, "1l2y.pdb"))
    stack = reference_file.get_structure()

    # Remove all elements
    removed_stack = stack.copy()
    removed_stack.element[:] = ''

    # Save stack without elements to tmp file
    tmp_file_name = biotite.temp_file(".pdb")
    tmp_pdb_file = pdb.PDBFile()
    tmp_pdb_file.set_structure(removed_stack)
    tmp_pdb_file.write(tmp_file_name)

    # Read new stack from file with guessed elements
    guessed_pdb_file = pdb.PDBFile()
    guessed_pdb_file.read(tmp_file_name)
    guessed_stack = guessed_pdb_file.get_structure()

    guessed_elements = guessed_stack.element.tolist()
    reference_elements = stack.element.tolist()
    assert guessed_elements == reference_elements
def test_fetch(format, as_file_like):
    """
    Fetch the entry ``1l2y`` in the given format and check that the
    result can be read and converted into a structure.

    The file is fetched into ``biotite.temp_dir()``, unless
    `as_file_like` is true, in which case no target path is given.
    """
    if as_file_like:
        target = None
    else:
        target = biotite.temp_dir()
    file_path_or_obj = rcsb.fetch("1l2y", format, target, overwrite=True)

    # format -> (file class, structure getter)
    handlers = {
        "pdb": (pdb.PDBFile, pdb.get_structure),
        "pdbx": (pdbx.PDBxFile, pdbx.get_structure),
        "mmtf": (mmtf.MMTFFile, mmtf.get_structure),
    }
    handler = handlers.get(format)
    if handler is None:
        # Other formats are only fetched
        return
    file_class, get_structure = handler
    structure_file = file_class()
    structure_file.read(file_path_or_obj)
    get_structure(structure_file)
def test_pdb_consistency(path):
    """
    Check that a GRO file and the ``.pdb`` file with the same base name
    agree in atom count, residue IDs, residue names, atom names and
    (within rounding) coordinates of the first model.
    """
    base_name, _ = splitext(path)
    pdb_path = base_name + ".pdb"

    pdb_file = pdb.PDBFile()
    pdb_file.read(pdb_path)
    a1 = pdb_file.get_structure(model=1)

    gro_file = gro.GROFile()
    gro_file.read(path)
    a2 = gro_file.get_structure(model=1)

    assert a1.array_length() == a2.array_length()
    for category in ("res_id", "res_name", "atom_name"):
        from_pdb = a1.get_annotation(category).tolist()
        from_gro = a2.get_annotation(category).tolist()
        assert from_pdb == from_gro

    # Mind rounding errors when converting pdb to gro (A -> nm)
    pdb_coord = a1.coord.flatten().tolist()
    gro_coord = a2.coord.flatten().tolist()
    assert pdb_coord == approx(gro_coord, abs=1e-2)
def load_pdbs_from_tar_gz(file_path, mode="r:gz"):
    """
    Collect coordinates from the PDB files stored in a tar archive.

    Parameters
    ----------
    file_path : str
        Path of the archive.
    mode : str, optional
        Mode handed to ``tarfile.open()``.

    Returns
    -------
    coords : ndarray
        Array built from the first element of ``PDBFile.get_coord()``
        of each read member (presumably the first model - confirm
        against the ``PDBFile`` API).
        Empty if the archive contains no readable member.

    Raises
    ------
    RuntimeError
        If `file_path` is not a string.
    """
    if not isinstance(file_path, str):
        raise RuntimeError("Argument 'file_path' is not of type str!")

    coords = []
    # The context manager closes the archive even if parsing fails
    with tarfile.open(name=file_path, mode=mode) as archive:
        for member in archive.getmembers():
            if member.name == 'test_archive':
                continue
            member_handle = archive.extractfile(member)
            if member_handle is None:
                # Neither a regular file nor a link (e.g. a directory):
                # there is no content to parse
                continue
            with member_handle:
                content = member_handle.read().decode()
            pdbfile = bpdb.PDBFile()
            pdbfile.read(io.StringIO(content))
            coords.append(pdbfile.get_coord()[0])
    return np.array(coords)
def test_fetch(format, as_file_like):
    """
    Fetch the entry ``1l2y`` in the given format and check that the
    result can be read: structure formats are converted into a
    structure, a FASTA file must contain at least one sequence.

    The file is fetched into ``biotite.temp_dir()``, unless
    `as_file_like` is true, in which case no target path is given.
    """
    if as_file_like:
        target = None
    else:
        target = biotite.temp_dir()
    file_path_or_obj = rcsb.fetch("1l2y", format, target, overwrite=True)

    if format == "fasta":
        fasta_file = fasta.FastaFile()
        fasta_file.read(file_path_or_obj)
        # Test if the file contains any sequences
        sequences = fasta.get_sequences(fasta_file)
        assert len(sequences) > 0
        return

    # Structure format -> (file class, structure getter)
    handlers = {
        "pdb": (pdb.PDBFile, pdb.get_structure),
        "pdbx": (pdbx.PDBxFile, pdbx.get_structure),
        "mmtf": (mmtf.MMTFFile, mmtf.get_structure),
    }
    if format in handlers:
        file_class, get_structure = handlers[format]
        structure_file = file_class()
        structure_file.read(file_path_or_obj)
        get_structure(structure_file)
def test_guess_elements():
    """
    Check that the elements are restored when a structure is written
    with blanked element annotation and read again.
    """
    # Read valid pdb file
    path = join(data_dir("structure"), "1l2y.pdb")
    pdb_file = pdb.PDBFile.read(path)
    stack = pdb_file.get_structure()

    # Remove all elements
    removed_stack = stack.copy()
    removed_stack.element[:] = ''

    # The context manager closes the temporary file
    # even if writing or reading raises
    with TemporaryFile("w+") as temp:
        # Save stack without elements to tmp file
        tmp_pdb_file = pdb.PDBFile()
        tmp_pdb_file.set_structure(removed_stack)
        tmp_pdb_file.write(temp)

        # Read new stack from file with guessed elements
        temp.seek(0)
        guessed_pdb_file = pdb.PDBFile.read(temp)

    guessed_stack = guessed_pdb_file.get_structure()
    assert guessed_stack.element.tolist() == stack.element.tolist()
# possible, namely the miniprotein *TC5b* (PDB: ``1L2Y``).
# The structure of this 20-residue protein (304 atoms) has been
# elucidated via NMR.
# Thus, the corresponding PDB file consists of multiple (namely 38)
# models, each showing another conformation.
#
# .. currentmodule:: biotite.structure.io.pdb
#
# At first we load the structure from a PDB file via the class
# :class:`PDBFile` in the subpackage :mod:`biotite.structure.io.pdb`.

import biotite
import biotite.structure.io.pdb as pdb
import biotite.database.rcsb as rcsb
pdb_file_path = rcsb.fetch("1l2y", "pdb", biotite.temp_dir())
file = pdb.PDBFile()
file.read(pdb_file_path)
tc5b = file.get_structure()
print(type(tc5b).__name__)
print(tc5b.stack_depth())
print(tc5b.array_length())

########################################################################
# The method :func:`PDBFile.get_structure()` returns an atom array stack
# unless the :obj:`model` parameter is specified,
# even if the file contains only one model.
# Alternatively, the module level function :func:`get_structure()`
# can be used.
# The following example
# shows how to write an array or stack back into a PDB file:
mask_exit = np.logical_and(inds_chain,np.in1d(struct.res_id, np.array([3586,3661,3668,3832,3840,4024]))) active_sites = traj[:,mask_active_site,:] active_sites = np.mean(active_sites,axis = 1) coords_exit = traj[:,mask_exit,:] coords_exit = np.mean(coords_exit,axis = 1) density = np.empty((traj.shape[0],len(radius))) for j,r in enumerate(radius): for i in range(traj.shape[0]): n_atoms = np.where(in_cylinder(coords_exit[i],active_sites[i],r,traj[i]))[0].shape volume = np.linalg.norm(coords_exit[i] - active_sites[i]) * np.pi * r**2 density[i,j] = n_atoms / volume return density radius = [6,8,9,10,11,12,13,14,15,16] struct = PDB.PDBFile() struct.read("../md_simulations/kinase_dimer.pdb") struct = struct.get_structure()[0] traj = XTC.XTCFile() traj.read("../md_simulations/kinase_dimer_nopbc_cluster_fit.xtc") traj = traj.get_coord() density_a = calculate_densities(struct,traj,"A",radius) density_data_a = pd.DataFrame(data=density_a,columns = [str(i) for i in radius]) density_data_a.to_csv("density_data_kinase_a.csv") density_d = calculate_densities(struct,traj,"D",radius) density_data_d = pd.DataFrame(data=density_d,columns = [str(i) for i in radius]) density_data_d.to_csv("density_data_kinase_d.csv")
import biotite.structure.io.pdb as pdb
import biotite.structure as struc

# NOTE(review): 'snakemake' is not defined in this script; presumably it
# is injected by Snakemake when run via a 'script:' directive - confirm
pdb_file = pdb.PDBFile()
pdb_file.read(snakemake.input[0])
# Only use one model
structure = pdb_file.get_structure(model=1)
# Remove water
is_solvent = struc.filter_solvent(structure)
structure = structure[~is_solvent]
# Remove hydrogens
is_heavy_atom = structure.element != "H"
structure = structure[is_heavy_atom]
# Write the filtered structure via the same file object
pdb_file.set_structure(structure)
pdb_file.write(snakemake.output[0])
def load_pdbs_from_dict(dir_path, sc=None):
    """
    Load coordinates from the PDB files of a directory, printing a
    progress line with an ETA estimate for each file.

    Parameters
    ----------
    dir_path : str
        Directory containing the PDB files.
    sc : optional
        Score table handed to ``pandas.read_csv()`` (whitespace
        delimited, header in the second row).
        If given, the files to load are taken from the non-null entries
        of its ``'description'`` column (base name plus ``".pdb"``)
        instead of the directory listing.

    Returns
    -------
    coords : ndarray, dtype=float64
        Array built from the first element of ``PDBFile.get_coord()``
        of each file (presumably the first model - confirm against the
        ``PDBFile`` API).
        Empty if there is no file to load.

    Raises
    ------
    RuntimeError
        If `dir_path` is not a string.
    SystemExit
        If the coordinates of a file cannot be obtained; raised with a
        non-zero exit status and chained to the original error.
    """
    if not isinstance(dir_path, str):
        raise RuntimeError("Argument 'dir_path' is not of type str!")

    files = os.listdir(dir_path)
    if sc is not None:
        print("using score")
        score = pd.read_csv(sc, header=1, delimiter=r"\s+")
        score = score[~pd.isnull(score['description'])]
        assert (len(score.keys()) > 0)
        assert ('description' in score.keys())
        files = [
            os.path.join(dir_path, f.split("/")[-1] + ".pdb")
            for f in score['description']
        ]
    else:
        files = [os.path.join(dir_path, f) for f in files]

    coords = []
    N_files = len(files)
    # A marker character is appended for each percent of progress
    p_marker_delta = 1.0 / 100
    marker = ''
    p_last = 0
    times = []
    if files:
        # Indexing an empty list would fail before anything is loaded
        print("first file ::" + str(files[0]))
        print("last file ::" + str(files[-1]))
    for i, f in enumerate(files):
        t1 = time.time()
        pdbfile = bpdb.PDBFile()
        pdbfile.read(f)
        try:
            coords.append(pdbfile.get_coord()[0])
        except Exception as err:
            print("")
            print("")
            print(" tried loading file ::" + str(f))
            print("")
            # Abort as before, but report failure via the exit status
            # and keep the original error as cause
            raise SystemExit(1) from err
        t2 = time.time()
        times.append(t2 - t1)

        p = float(i + 1) / float(N_files)
        if p - p_last >= p_marker_delta:
            p_last = p
            marker = marker + "="
        # Remaining time is estimated from the mean load time so far
        T_eta = np.mean(times) * (N_files - (i + 1))
        T_eta_err = np.std(times)
        print(
            f"loading file [ | {i} / {N_files}] |{marker}> "
            f"{p * 100}% complete ETA :: "
            f"{datetime.timedelta(seconds=T_eta)} +/- {T_eta_err}s ",
            end='\r'
        )
    print("")
    print("")
    print(" ... finished loading pdbs ... ")
    return np.array(coords, dtype=np.float64)