def test_extra_fields():
    """
    Check that the optional annotations survive an MMTF round trip.

    The structure stored in ``1l2y.mmtf`` is read including the extra
    fields, written into a fresh ``MMTFFile`` via ``set_structure()``
    and read back again.
    The extra annotations of both structures must agree.
    """
    extra_fields = ["atom_id", "b_factor", "occupancy", "charge"]

    path = join(data_dir, "1l2y.mmtf")
    mmtf_file = mmtf.MMTFFile()
    mmtf_file.read(path)
    stack1 = mmtf.get_structure(mmtf_file, extra_fields=list(extra_fields))

    # Start from an empty file, so that everything read back below can
    # only originate from 'set_structure()'
    # (the original used '==' here, which silently kept the input file)
    mmtf_file = mmtf.MMTFFile()
    mmtf.set_structure(mmtf_file, stack1)
    stack2 = mmtf.get_structure(mmtf_file, extra_fields=list(extra_fields))

    assert stack1.atom_id.tolist() == stack2.atom_id.tolist()
    # Floating point annotations are compared with tolerance
    assert stack1.b_factor.tolist() == approx(stack2.b_factor.tolist())
    assert stack1.occupancy.tolist() == approx(stack2.occupancy.tolist())
    assert stack1.charge.tolist() == stack2.charge.tolist()
def test_array_conversion(path, single_model):
    """
    Round trip a structure through ``set_structure()`` and
    ``get_structure()`` and expect annotations, coordinates and bonds
    to be preserved.

    With `single_model` only model 1 is read, otherwise ``model=None``
    is passed to ``get_structure()``.
    """
    model = None
    if single_model:
        model = 1

    source_file = mmtf.MMTFFile()
    source_file.read(path)
    original = mmtf.get_structure(
        source_file, model=model, include_bonds=True
    )

    rebuilt_file = mmtf.MMTFFile()
    mmtf.set_structure(rebuilt_file, original)
    restored = mmtf.get_structure(
        rebuilt_file, model=model, include_bonds=True
    )

    for category in original.get_annotation_categories():
        expected = original.get_annotation(category).tolist()
        observed = restored.get_annotation(category).tolist()
        assert expected == observed
    # Coordinates are only required to agree within 1e-3
    original_coord = original.coord.flatten().tolist()
    restored_coord = restored.coord.flatten().tolist()
    assert original_coord == approx(restored_coord, abs=1e-3)
    assert original.bonds == restored.bonds
def api_route():
    """
    Return the sequence and secondary structure annotations of a PDB
    entry as JSON.

    Query parameters read from ``request.args``:

    - ``pdb_id``: entry to fetch, defaults to ``"1Q2W"``
    - ``format``: file format passed to ``rcsb.fetch()``, defaults to
      ``"mmtf"``

    The response contains the keys ``sequence``, ``mmtf``, ``dssp``,
    ``psea`` and ``diffs``.
    An annotation that cannot be computed is reported as an empty list
    instead of failing the whole request.
    """
    pdb_id = request.args.get("pdb_id", "1Q2W")
    file_format = request.args.get("format", "mmtf")
    file_name = rcsb.fetch(pdb_id, file_format, biotite.temp_dir())
    # NOTE(review): the downloaded file is always parsed as MMTF,
    # regardless of the requested 'format' - confirm this is intended
    mmtf_file = mmtf.MMTFFile()
    mmtf_file.read(file_name)

    annotators = (
        ("mmtf", mmtf_sec),
        ("dssp", dssp_sec),
        ("psea", psea_sec),
    )
    structs = {}
    for name, annotate in annotators:
        try:
            structs[name] = annotate(mmtf_file).tolist()
        except Exception:
            # Best effort: each annotation falls back to its *own*
            # empty result (the PSEA fallback formerly overwrote the
            # DSSP result and left the PSEA result undefined)
            structs[name] = []

    return jsonify(
        sequence=list(mmtf_file["entityList"][0]["sequence"]),
        **structs,
        diffs=diff_all(**structs),
    )
def test_array_conversion(path, model):
    """
    Round trip a structure through an MMTF file written to a temporary
    file object and expect annotations, coordinates, bonds and box to
    be preserved.

    If `model` is None and the file raises ``InvalidFileError`` on
    reading, the test case is skipped silently.
    """
    mmtf_file = mmtf.MMTFFile.read(path)
    try:
        a1 = mmtf.get_structure(mmtf_file, model=model, include_bonds=True)
    except biotite.InvalidFileError:
        if model is None:
            # The file cannot be parsed into an AtomArrayStack,
            # as the models contain different numbers of atoms
            # -> skip this test case
            return
        raise

    mmtf_file = mmtf.MMTFFile()
    mmtf.set_structure(mmtf_file, a1)
    # The context manager closes the temporary file
    # even if writing or reading fails
    with TemporaryFile("w+b") as temp:
        mmtf_file.write(temp)
        # Rewind, so that reading starts at the beginning of the file
        temp.seek(0)
        mmtf_file = mmtf.MMTFFile.read(temp)

    a2 = mmtf.get_structure(mmtf_file, model=model, include_bonds=True)

    for category in a1.get_annotation_categories():
        assert a1.get_annotation(category).tolist() == \
               a2.get_annotation(category).tolist()
    assert a1.coord.flatten().tolist() == \
           approx(a2.coord.flatten().tolist(), abs=1e-3)
    assert a1.bonds == a2.bonds
    if a1.box is not None:
        assert np.allclose(a1.box, a2.box)
def test_dssp(path):
    """
    Compare the secondary structure computed by ``DsspApp`` with the
    annotation stored in the ``secStructList`` field of an MMTF file.

    Only the first chain of model 1 without hetero residues is
    compared.
    The test passes if more than 90 % of the residues agree.
    """
    # Translation of the integer codes in 'secStructList'
    # into one-letter codes
    sec_struct_codes = {0: "I",
                        1: "S",
                        2: "H",
                        3: "E",
                        4: "G",
                        5: "B",
                        6: "T",
                        7: "C"}

    mmtf_file = mmtf.MMTFFile()
    mmtf_file.read(path)
    array = mmtf.get_structure(mmtf_file, model=1)
    # Elementwise comparison on the annotation array
    array = array[array.hetero == False]
    first_chain_id = array.chain_id[0]
    chain = array[array.chain_id == first_chain_id]
    n_residues = struc.get_residue_count(chain)

    # Secondary structure annotation in PDB use also DSSP
    # -> compare PDB and local DSSP
    sse = mmtf_file["secStructList"]
    sse = sse[:n_residues]
    if (sse == -1).all():
        # First chain is not a polypeptide chain (presumably DNA/RNA)
        # DSSP not applicable -> return
        return
    sse = np.array([sec_struct_codes[code] for code in sse], dtype="U1")

    sse_from_app = DsspApp.annotate_sse(chain)
    # PDB uses different DSSP version -> slight differences possible
    # -> only 90% must be identical
    assert np.count_nonzero(sse_from_app == sse) / len(sse) > 0.9
def test_dihedral_backbone_result(file_name):
    """
    Compare the backbone dihedral angles computed by
    ``struc.dihedral_backbone()`` with the ones computed by MDTraj.

    Each amino acid chain of model 1 is written to a temporary PDB
    file, loaded with MDTraj and compared separately.
    Chains for which ``check_id_continuity()`` reports discontinuities
    are skipped.
    """
    # Imported in the function, so that importing this module
    # does not require MDTraj
    import mdtraj

    mmtf_file = mmtf.MMTFFile()
    mmtf_file.read(file_name)
    array = mmtf.get_structure(mmtf_file, model=1)
    array = array[struc.filter_amino_acids(array)]
    for chain in struc.chain_iter(array):
        print("Chain: ", chain.chain_id[0])
        if len(struc.check_id_continuity(chain)) != 0:
            # Do not test discontinuous chains,
            # but keep testing the remaining chains
            continue
        test_phi, test_psi, test_ome = struc.dihedral_backbone(chain)

        temp_file_name = biotite.temp_file("pdb")
        strucio.save_structure(temp_file_name, chain)
        traj = mdtraj.load(temp_file_name)
        _, ref_phi = mdtraj.compute_phi(traj)
        _, ref_psi = mdtraj.compute_psi(traj)
        _, ref_ome = mdtraj.compute_omega(traj)
        # Only the first frame of the trajectory is of interest
        ref_phi, ref_psi, ref_ome = ref_phi[0], ref_psi[0], ref_ome[0]

        # The first phi and the last psi/omega value are excluded;
        # presumably MDTraj does not report an angle for these
        # terminal positions - TODO confirm
        assert test_phi[1:] == pytest.approx(ref_phi, abs=1e-5, rel=5e-3)
        assert test_psi[:-1] == pytest.approx(ref_psi, abs=1e-5, rel=5e-3)
        assert test_ome[:-1] == pytest.approx(ref_ome, abs=1e-5, rel=5e-3)
def test_numpy_objects():
    """
    Test whether the Msgpack encoder is able to handle NumPy values
    (e.g. np.float32) properly.

    Only check if no error occurs.
    """
    mmtf_file = mmtf.MMTFFile()
    mmtf_file["A float"] = np.float32(42.0)
    mmtf_file["A list"] = [np.int64(1), np.int64(2), np.int64(3)]
    # 'np.bool_' is the NumPy boolean scalar type in every NumPy
    # version, whereas the 'np.bool' alias was deprecated and is
    # missing in some releases
    mmtf_file["A dictionary"] = {"a": np.bool_(True), "b": np.bool_(False)}
    mmtf_file.write(biotite.temp_file("mmtf"))
def test_array_conversion(path, single_model):
    """
    Write a structure into an MMTF file on disk, read it back and
    expect annotations, coordinates, bonds and box to be preserved.

    With `single_model` only model 1 is read, otherwise ``model=None``
    is passed to ``get_structure()``.
    """
    model = None
    if single_model:
        model = 1

    source_file = mmtf.MMTFFile()
    source_file.read(path)
    original = mmtf.get_structure(
        source_file, model=model, include_bonds=True
    )

    # Serialize the structure into a temporary MMTF file ...
    out_file = mmtf.MMTFFile()
    mmtf.set_structure(out_file, original)
    temp_file_name = biotite.temp_file("mmtf")
    out_file.write(temp_file_name)

    # ... and parse that file again
    in_file = mmtf.MMTFFile()
    in_file.read(temp_file_name)
    restored = mmtf.get_structure(
        in_file, model=model, include_bonds=True
    )

    for category in original.get_annotation_categories():
        expected = original.get_annotation(category).tolist()
        observed = restored.get_annotation(category).tolist()
        assert expected == observed
    # Coordinates are only required to agree within 1e-3
    original_coord = original.coord.flatten().tolist()
    restored_coord = restored.coord.flatten().tolist()
    assert original_coord == approx(restored_coord, abs=1e-3)
    assert original.bonds == restored.bonds
    if original.box is not None:
        assert np.allclose(original.box, restored.box)
def test_numpy_objects():
    """
    Test whether the Msgpack encoder is able to handle NumPy values
    (e.g. np.float32) properly.

    Only check if no error occurs.
    """
    mmtf_file = mmtf.MMTFFile()
    mmtf_file["A float"] = np.float32(42.0)
    mmtf_file["A list"] = [np.int64(1), np.int64(2), np.int64(3)]
    # 'np.bool_' is the NumPy boolean scalar type in every NumPy
    # version, whereas the 'np.bool' alias was deprecated and is
    # missing in some releases
    mmtf_file["A dictionary"] = {"a": np.bool_(True), "b": np.bool_(False)}
    # The context manager closes the file even if writing fails
    with TemporaryFile("w+b") as temp:
        mmtf_file.write(temp)
def test_pdbx_consistency(path, single_model):
    """
    Check that an MMTF file and the mmCIF file of the same name yield
    the same annotations and coordinates.

    The mmCIF file is expected next to the MMTF file, i.e. the same
    path with the extension replaced by ``.cif``.
    With `single_model` only model 1 is read, otherwise ``model=None``
    is passed to both ``get_structure()`` functions.
    """
    # The selection was inverted before (None for 'single_model');
    # now it follows the meaning of the parameter name
    model = 1 if single_model else None
    cif_path = splitext(path)[0] + ".cif"

    mmtf_file = mmtf.MMTFFile()
    mmtf_file.read(path)
    a1 = mmtf.get_structure(mmtf_file, model=model)

    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(cif_path)
    a2 = pdbx.get_structure(pdbx_file, model=model)

    for category in a1.get_annotation_categories():
        assert a1.get_annotation(category).tolist() == \
               a2.get_annotation(category).tolist()
    assert a1.coord.flatten().tolist() == \
           approx(a2.coord.flatten().tolist(), abs=1e-3)
def test_fetch(format, as_file_like):
    """
    Fetch the entry ``1l2y`` from the RCSB in the given `format` and
    parse it into a structure; only the absence of errors is checked.

    With `as_file_like` no target directory is given to
    ``rcsb.fetch()``, otherwise the file is saved in the temporary
    directory.
    """
    if as_file_like:
        path = None
    else:
        path = biotite.temp_dir()
    file_path_or_obj = rcsb.fetch("1l2y", format, path, overwrite=True)

    # File class and structure getter for each structure file format
    readers = {
        "pdb": (pdb.PDBFile, pdb.get_structure),
        "pdbx": (pdbx.PDBxFile, pdbx.get_structure),
        "mmtf": (mmtf.MMTFFile, mmtf.get_structure),
    }
    if format in readers:
        file_class, get_structure = readers[format]
        parsed_file = file_class()
        parsed_file.read(file_path_or_obj)
        get_structure(parsed_file)
def test_codecs(path):
    """
    Re-encode every encoded array of an MMTF file with its own codec
    and parameter and expect the decoded array to be unchanged.

    ``float32`` arrays are compared with a tolerance of ``1/param``
    (or exactly, if the parameter is 0), all other arrays must be
    identical.
    """
    mmtf_file = mmtf.MMTFFile()
    mmtf_file.read(path)
    for key in mmtf_file:
        codec = mmtf_file.get_codec(key)
        if codec is None:
            # Nothing to re-encode for fields without codec
            continue
        param = mmtf_file.get_param(key)

        before = mmtf_file[key]
        mmtf_file.set_array(key, before, codec, param)
        after = mmtf_file[key]

        if before.dtype != np.float32:
            assert (before == after).all()
            continue
        tol = 1/param if param != 0 else 0
        assert np.isclose(before, after, atol=tol).all()
def test_connect_via_residue_names(single_model):
    """
    Test whether the created bond list is equal to the bonds deposited
    in the MMTF file.
    """
    # Only request a specific model in the single model case
    read_options = {"include_bonds": True}
    if single_model:
        read_options["model"] = 1

    # Structure with peptide, nucleotide, small molecules and water
    structure_file = mmtf.MMTFFile()
    structure_file.read(join(data_dir, "5ugo.mmtf"))
    atoms = mmtf.get_structure(structure_file, **read_options)

    expected_bonds = atoms.bonds
    created_bonds = struc.connect_via_residue_names(atoms)
    assert created_bonds == expected_bonds
def test_fetch(format, as_file_like):
    """
    Fetch the entry ``1l2y`` from the RCSB in the given `format` and
    parse it.

    Structure formats are only parsed into a structure, for FASTA
    files at least one sequence is expected.
    With `as_file_like` no target directory is given to
    ``rcsb.fetch()``, otherwise the file is saved in the temporary
    directory.
    """
    if as_file_like:
        path = None
    else:
        path = biotite.temp_dir()
    file_path_or_obj = rcsb.fetch("1l2y", format, path, overwrite=True)

    # File class and structure getter for each structure file format
    structure_readers = {
        "pdb": (pdb.PDBFile, pdb.get_structure),
        "pdbx": (pdbx.PDBxFile, pdbx.get_structure),
        "mmtf": (mmtf.MMTFFile, mmtf.get_structure),
    }
    if format in structure_readers:
        file_class, get_structure = structure_readers[format]
        parsed_file = file_class()
        parsed_file.read(file_path_or_obj)
        get_structure(parsed_file)
    elif format == "fasta":
        fasta_file = fasta.FastaFile()
        fasta_file.read(file_path_or_obj)
        # Test if the file contains any sequences
        assert len(fasta.get_sequences(fasta_file)) > 0
def test_pdbx_consistency(path, single_model):
    """
    Check that an MMTF file and the mmCIF file of the same name yield
    the same box, annotations and coordinates.

    The mmCIF file is expected next to the MMTF file, i.e. the same
    path with the extension replaced by ``.cif``.
    With `single_model` only model 1 is read, otherwise ``model=None``
    is passed to both ``get_structure()`` functions.
    """
    # The selection was inverted before (None for 'single_model');
    # now it follows the meaning of the parameter name
    model = 1 if single_model else None
    cif_path = splitext(path)[0] + ".cif"

    mmtf_file = mmtf.MMTFFile()
    mmtf_file.read(path)
    a1 = mmtf.get_structure(mmtf_file, model=model)

    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(cif_path)
    a2 = pdbx.get_structure(pdbx_file, model=model)

    # Sometimes mmCIF files can have 'cell' entry
    # but corresponding MMTF file has not 'unitCell' entry
    # -> Do not assert for dummy entry in mmCIF file
    # (all vector elements = {0, 1})
    if a2.box is not None and not ((a2.box == 0) | (a2.box == 1)).all():
        assert np.allclose(a1.box, a2.box)
    for category in a1.get_annotation_categories():
        assert a1.get_annotation(category).tolist() == \
               a2.get_annotation(category).tolist()
    assert a1.coord.flatten().tolist() == \
           approx(a2.coord.flatten().tolist(), abs=1e-3)
def test_connect_via_distances():
    """
    Test whether the created bond list is equal to the bonds deposited
    in the MMTF file.
    """
    structure_file = mmtf.MMTFFile()
    structure_file.read(join(data_dir, "1l2y.mmtf"))
    atoms = mmtf.get_structure(structure_file, include_bonds=True, model=1)

    # Remove termini to solve the issue that the reference bonds do not
    # contain proper bonds for the protonated/deprotonated termini
    is_inner_residue = (atoms.res_id > 1) & (atoms.res_id < 20)
    atoms = atoms[is_inner_residue]

    # Convert all bonds to BondType.ANY
    # by keeping only the two atom index columns
    deposited_bonds = atoms.bonds
    atom_index_pairs = deposited_bonds.as_array()[:, :2]
    expected_bonds = struc.BondList(
        deposited_bonds.get_atom_count(), atom_index_pairs
    )

    created_bonds = struc.connect_via_distances(atoms)
    assert created_bonds == expected_bonds
def test_bonds(path):
    """
    Test whether the bond data is consistent with the content of MMTF
    files.
    """
    bond_data = strucinfo.bond_dataset()
    mmtf_file = mmtf.MMTFFile()
    mmtf_file.read(path)

    for group in mmtf_file["groupList"]:
        res_name = group["groupName"]
        names = group["atomNameList"]
        # Flat list: two consecutive entries form one bond
        flat_indices = group["bondAtomList"]
        orders = group["bondOrderList"]

        pair_starts = range(0, len(flat_indices), 2)
        for bond_number, first in enumerate(pair_starts):
            name1 = names[flat_indices[first]]
            name2 = names[flat_indices[first + 1]]
            expected_order = orders[bond_number]
            bonded_pair = frozenset((name1, name2))

            found_order = strucinfo.bond_order(res_name, name1, name2)
            assert found_order == expected_order
            assert bonded_pair in strucinfo.bonds_in_residue(res_name)
            assert bonded_pair in bond_data[res_name]
def test_coarse_grained(pdb_id):
    """
    Compare the SASA obtained with the ``"ProtOr"`` radii with the SASA
    obtained with the ``"Single"`` radii, after summing both up
    residue-wise.
    """
    structure_file = mmtf.MMTFFile()
    structure_file.read(join(data_dir, pdb_id + ".mmtf"))
    array = mmtf.get_structure(structure_file, model=1)
    array = array[struc.filter_amino_acids(array)]

    def summed_sasa(radii):
        # NaN values are ignored in the sum
        atom_sasa = struc.sasa(array, vdw_radii=radii)
        return struc.apply_residue_wise(array, atom_sasa, np.nansum)

    # Multi atom SASA (ProtOr), compare with single atom SASA
    # on residue level
    sasa = summed_sasa("ProtOr")
    sasa_exp = summed_sasa("Single")

    def agreeing_fraction(rtol):
        is_close = np.isclose(sasa, sasa_exp, rtol=rtol, atol=1)
        return np.count_nonzero(is_close) / len(sasa)

    # More than 90% of the values
    # must have less than 10% SASA difference
    assert agreeing_fraction(1e-1) > 0.9
    # More than 98% of the values
    # must have less than 40% SASA difference
    assert agreeing_fraction(4e-1) > 0.98
""" # Code source: Patrick Kunzmann # License: BSD 3 clause import numpy as np import biotite.structure as struc import biotite.structure.io.mmtf as mmtf import biotite.database.rcsb as rcsb # The maximum distance between an atom in the repressor and an atom in # the DNA for them to be considered 'in contact' THRESHOLD_DISTANCE = 4.0 # Fetch and load structure mmtf_file = mmtf.MMTFFile() mmtf_file.read(rcsb.fetch("2or1", "mmtf")) structure = mmtf.get_structure(mmtf_file, model=1) # Separate structure into the DNA and the two identical protein chains dna = structure[np.isin(structure.chain_id, ["A", "B"]) & (structure.hetero == False)] protein_l = structure[(structure.chain_id == "L") & (structure.hetero == False)] protein_r = structure[(structure.chain_id == "R") & (structure.hetero == False)] # Quick check if the two protein chains are really identical assert len(struc.get_residues(protein_l)) == len(struc.get_residues(protein_r)) # Fast identification of contacts via a cell list: # The cell list is initiliazed with the coordinates of the DNA
def build_patterns(structfam, folder):
    r"""
    Build the most frequent secondary structure pattern of a family

    For each entry the MMTF file is fetched from the RCSB and the
    ``secStructList`` codes of the requested chain and residue range are
    turned into a pattern, once with the 8 state codes and once with the
    3 state ("abc") codes. The most frequent pattern of each kind is
    stored via ``push`` in ``{folder}/data.pt`` under the key "pattern".

    Args:
        structfam (iterable): tuples ``(pdb, chain, start, end)``; ``start`` and ``end``
            are inclusive positions in the secondary structure of the chain
        folder (str): folder of the ``data.pt`` file the pattern is pushed to

    Returns:
        tuple: ``(c_pattern3, n_pattern3, c_pattern8, n_pattern8, occ_sum3, occ_sum8)``,
        i.e. the selected patterns as string and as list of codes, followed by the
        occurrence count of every pattern. If `structfam` yields no entry,
        ``(None, None, None, None)`` is returned instead.
    """
    patterns = []
    for pdb, c, start, end in tqdm(structfam):
        file_name = rcsb.fetch(pdb, "mmtf", biotite.temp_dir())
        mmtf_file = mmtf.MMTFFile()
        mmtf_file.read(file_name)
        array = mmtf.get_structure(mmtf_file, model=1)
        tk_dimer = array[struc.filter_amino_acids(array)]
        # The chain ID corresponding to each residue.
        # The residue starts are indices into the filtered array, hence
        # the chain IDs must be taken from the filtered array as well
        # (formerly the unfiltered array was indexed)
        chain_id_per_res = tk_dimer.chain_id[
            struc.get_residue_starts(tk_dimer)
        ]
        sse = mmtf_file["secStructList"]
        sse = sse[:chain_id_per_res.shape[0]][chain_id_per_res == c]
        sse = np.array(sse[start:end + 1])
        # The modulo maps the code -1 onto 7
        sse = np.array([sec_struct_codes[code % 8] for code in sse],
                       dtype="U1")

        # 8 state pattern: The difference of consecutive one-hot rows is
        # -1 in the column of a state that ends and 1 in the column of a
        # state that begins (assumes 'to_onehot()' returns an array of
        # shape (n, n_classes) - TODO confirm)
        sse8 = to_onehot([dssp_codes[x] for x in sse], (None, 8))
        dss8 = (sse8[1:] - sse8[:-1])
        cls = to_onehot(np.where(dss8 == -1)[1], (None, 8)).T
        bbox = np.array(
            [np.where(dss8 == 1)[0], np.where(dss8 == -1)[0], *cls]).T
        pat8 = np.argmax(bbox[:, 2:], 1)

        # 3 state pattern, obtained in the same way
        sse3 = to_onehot([abc_codes[dssp_to_abc[x]] for x in sse], (None, 3))
        dss3 = (sse3[1:] - sse3[:-1])
        cls = to_onehot(np.where(dss3 == -1)[1], (None, 3)).T
        bbox = np.array(
            [np.where(dss3 == 1)[0], np.where(dss3 == -1)[0], *cls]).T
        pat3 = np.argmax(bbox[:, 2:], 1)
        patterns.append((pat3, pat8))

    if len(patterns) == 0:
        print("No pattern find")
        # NOTE(review): 4 values are returned here, but 6 values at the
        # end of the function - confirm what the callers expect
        return None, None, None, None

    # String and list representation of every pattern
    c_patterns3, n_patterns3, c_patterns8, n_patterns8 = [], [], [], []
    for pat3, pat8 in patterns:
        char_pat8 = "".join([sec_struct_codes[x] for x in pat8])
        char_pat3 = "".join(["abc"[x] for x in pat3])
        c_patterns8.append(char_pat8)
        n_patterns8.append(list(pat8))
        c_patterns3.append(char_pat3)
        n_patterns3.append(list(pat3))

    # Count how often each pattern occurs
    occ_sum8 = dict()
    occ_sum3 = dict()
    correspondings8 = dict()
    correspondings3 = dict()
    for c8, n8, c3, n3 in zip(c_patterns8, n_patterns8,
                              c_patterns3, n_patterns3):
        if len(c3) == 0:
            continue
        # Each pattern is forced to start and end with "c" (code 2)
        # or "C" (code 7), respectively
        if c3[0] != "c":
            c3 = "c" + c3
            n3 = [2] + n3
        if c3[-1] != "c":
            c3 = c3 + "c"
            n3 = n3 + [2]
        if c8[0] != "C":
            c8 = "C" + c8
            n8 = [7] + n8
        if c8[-1] != "C":
            c8 = c8 + "C"
            n8 = n8 + [7]

        if c8 not in occ_sum8:
            occ_sum8[c8] = 0
            correspondings8[c8] = c8, n8
        occ_sum8[c8] += 1
        if c3 not in occ_sum3:
            occ_sum3[c3] = 0
            correspondings3[c3] = c3, n3
        occ_sum3[c3] += 1

    # Select the pattern with the most occurrences
    c_pattern8, n_pattern8 = correspondings8[max(occ_sum8, key=occ_sum8.get)]
    c_pattern3, n_pattern3 = correspondings3[max(occ_sum3, key=occ_sum3.get)]
    push(f"{folder}/data.pt", "pattern",
         (c_pattern3, n_pattern3, c_pattern8, n_pattern8))
    return c_pattern3, n_pattern3, c_pattern8, n_pattern8, occ_sum3, occ_sum8
def search_pattern(path, uniprot, seq_nat):
    r"""
    Search a pattern with PDB

    Every PDB entry listed for the UniProt ID in ``{CROSS}/uniprot_pdb.csv``
    is fetched as MMTF file. Only the patterns of the entries with the longest
    matched region are kept and the most frequent 8 state pattern among them
    is selected. The result is stored via ``push`` in ``{path}/data.pt`` under
    the key "pattern". Entries that raise an error are skipped.

    Args:
        path (str): path to save data
        uniprot (str): uniprot id of the search sequence
        seq_nat (str): raw sequences for a better alignment of the pattern with the sequence

    Returns:
        tuple: ``((c_pattern3, n_pattern3, c_pattern8, n_pattern8), ratio_covered)``,
        where ``ratio_covered`` is the length of the longest matched region
        divided by the length of `seq_nat`. If ``ratio_covered`` is not greater
        than 0.9, ``(None, ratio_covered)`` is returned instead.
    """
    pdb_uniprot = pd.read_csv(f"{CROSS}/uniprot_pdb.csv", index_col=0)
    longest, patterns = 0, []
    for pdb in pdb_uniprot[pdb_uniprot.uni == uniprot].pdb.values:
        try:
            file_name = rcsb.fetch(pdb, "mmtf", biotite.temp_dir())
            mmtf_file = mmtf.MMTFFile()
            mmtf_file.read(file_name)
            entity_seq = "".join(mmtf_file["entityList"][0]["sequence"])
            # Only the matched range in the PDB sequence is required
            _, (_, _, m_mut, M_mut), _ = lcs_pattern(seq_nat, entity_seq)
            sse = mmtf_file["secStructList"]
            sse = np.array(sse[m_mut:M_mut + 1])
            length = len(sse)
            if max(sse) == -1:
                # All codes in the range are -1
                continue
            if length < longest:
                continue
            if length > longest:
                # A longer match supersedes all patterns found so far
                longest = length
                patterns = []

            # The modulo maps the code -1 onto 7
            sse = np.array([pdb_codes[code % 8] for code in sse], dtype="U1")

            # 8 state pattern: The difference of consecutive one-hot rows
            # is -1 in the column of a state that ends and 1 in the column
            # of a state that begins (assumes 'to_onehot()' returns an
            # array of shape (n, n_classes) - TODO confirm)
            sse8 = to_onehot([dssp_codes[x] for x in sse], (None, 8))
            dss8 = (sse8[1:] - sse8[:-1])
            cls = to_onehot(np.where(dss8 == -1)[1], (None, 8)).T
            bbox = np.array(
                [np.where(dss8 == 1)[0], np.where(dss8 == -1)[0], *cls]).T
            pat8 = np.argmax(bbox[:, 2:], 1)

            # 3 state pattern, obtained in the same way
            sse3 = to_onehot(
                [abc_codes[dssp_to_abc[x]] for x in sse], (None, 3))
            dss3 = (sse3[1:] - sse3[:-1])
            cls = to_onehot(np.where(dss3 == -1)[1], (None, 3)).T
            bbox = np.array(
                [np.where(dss3 == 1)[0], np.where(dss3 == -1)[0], *cls]).T
            pat3 = np.argmax(bbox[:, 2:], 1)
            patterns.append((list(pat3), list(pat8)))
        except Exception:
            # Best effort: skip entries that cannot be fetched or parsed,
            # but let KeyboardInterrupt and SystemExit pass
            continue

    ratio_covered = longest / len(seq_nat)
    if ratio_covered <= 0.9:
        push(f"{path}/data.pt", "pattern", (None, None, None, None))
        return None, ratio_covered

    # String and list representation of every pattern
    c_patterns3, n_patterns3, c_patterns8, n_patterns8 = [], [], [], []
    for pat3, pat8 in patterns:
        if len(pat3) == 0:
            continue
        # Each pattern is forced to start and end with code 2 or 7,
        # respectively
        if pat3[0] != 2:
            pat3 = [2] + pat3
        if pat3[-1] != 2:
            pat3 = pat3 + [2]
        if pat8[0] != 7:
            pat8 = [7] + pat8
        if pat8[-1] != 7:
            pat8 = pat8 + [7]
        char_pat8 = "".join([sec_struct_codes[x] for x in pat8])
        char_pat3 = "".join(["abc"[x] for x in pat3])
        c_patterns8.append(char_pat8)
        n_patterns8.append(list(pat8))
        c_patterns3.append(char_pat3)
        n_patterns3.append(list(pat3))

    # Select the first pattern whose 8 state string occurs most often
    max_occ = 0
    c_pattern3, n_pattern3, c_pattern8, n_pattern8 = None, None, None, None
    for c3, n3, c8, n8 in zip(c_patterns3, n_patterns3,
                              c_patterns8, n_patterns8):
        n_occ = c_patterns8.count(c8)
        if n_occ > max_occ:
            max_occ = n_occ
            c_pattern3, n_pattern3 = c3, n3
            c_pattern8, n_pattern8 = c8, n8

    push(f"{path}/data.pt", "pattern",
         (c_pattern3, n_pattern3, c_pattern8, n_pattern8))
    return (c_pattern3, n_pattern3, c_pattern8, n_pattern8), ratio_covered