Esempio n. 1
0
def test_extra_fields():
    """
    Check that the optional annotation categories survive a round trip
    through ``set_structure()`` and ``get_structure()``.
    """
    extra_fields = ["atom_id", "b_factor", "occupancy", "charge"]

    path = join(data_dir, "1l2y.mmtf")
    mmtf_file = mmtf.MMTFFile()
    mmtf_file.read(path)
    stack1 = mmtf.get_structure(mmtf_file, extra_fields=extra_fields)

    # Start from a fresh, empty file, so that everything read back
    # below was actually written by 'set_structure()'
    # (the original line used '==', which is a no-op comparison)
    mmtf_file = mmtf.MMTFFile()
    mmtf.set_structure(mmtf_file, stack1)

    stack2 = mmtf.get_structure(mmtf_file, extra_fields=extra_fields)

    assert stack1.atom_id.tolist() == stack2.atom_id.tolist()
    assert stack1.b_factor.tolist() == approx(stack2.b_factor.tolist())
    assert stack1.occupancy.tolist() == approx(stack2.occupancy.tolist())
    assert stack1.charge.tolist() == stack2.charge.tolist()
Esempio n. 2
0
def test_array_conversion(path, single_model):
    """
    Read a structure from an MMTF file, put it into a new in-memory
    MMTF file and read it again: annotations, coordinates and bonds
    must be preserved.
    """
    model = None
    if single_model:
        model = 1

    source_file = mmtf.MMTFFile()
    source_file.read(path)
    original = mmtf.get_structure(
        source_file, model=model, include_bonds=True
    )

    converted_file = mmtf.MMTFFile()
    mmtf.set_structure(converted_file, original)
    restored = mmtf.get_structure(
        converted_file, model=model, include_bonds=True
    )

    for category in original.get_annotation_categories():
        ref_annotation = original.get_annotation(category).tolist()
        test_annotation = restored.get_annotation(category).tolist()
        assert ref_annotation == test_annotation

    # Coordinates are stored with limited precision
    ref_coord = original.coord.flatten().tolist()
    test_coord = restored.coord.flatten().tolist()
    assert ref_coord == approx(test_coord, abs=1e-3)

    assert original.bonds == restored.bonds
Esempio n. 3
0
def api_route():
    """
    Return the sequence and the secondary structure annotations of a
    PDB entry as JSON.

    The entry is selected via the ``pdb_id`` query parameter
    (default ``"1Q2W"``).
    The response contains the sequence of the first entity, one list
    per annotation method (``mmtf``, ``dssp``, ``psea``) and the result
    of ``diff_all()`` for these lists.
    An annotation method that fails contributes an empty list instead
    of aborting the request.
    """
    pdb_id = request.args.get("pdb_id", "1Q2W")
    # NOTE(review): the requested format is passed to the download,
    # but the file is always parsed as MMTF below -- confirm that other
    # formats are intended to be supported
    file_format = request.args.get("format", "mmtf")
    file_name = rcsb.fetch(pdb_id, file_format, biotite.temp_dir())
    mmtf_file = mmtf.MMTFFile()
    mmtf_file.read(file_name)

    annotators = (
        ("mmtf", mmtf_sec),
        ("dssp", dssp_sec),
        ("psea", psea_sec),
    )
    structs = {}
    for key, annotate in annotators:
        # Best effort: each method falls back to its OWN empty list.
        # The original P-SEA fallback overwrote the DSSP result and
        # left the P-SEA result undefined.
        try:
            structs[key] = annotate(mmtf_file).tolist()
        except Exception:
            structs[key] = []

    return jsonify(
        sequence=list(mmtf_file["entityList"][0]["sequence"]),
        **structs,
        diffs=diff_all(**structs),
    )
Esempio n. 4
0
def test_array_conversion(path, model):
    """
    Read a structure from an MMTF file, write it into a temporary file
    and read it again: annotations, coordinates, bonds and the box
    must be preserved.
    """
    mmtf_file = mmtf.MMTFFile.read(path)
    try:
        a1 = mmtf.get_structure(mmtf_file, model=model, include_bonds=True)
    except biotite.InvalidFileError:
        if model is None:
            # The file cannot be parsed into an AtomArrayStack,
            # as the models contain different numbers of atoms
            # -> skip this test case
            return
        else:
            raise

    mmtf_file = mmtf.MMTFFile()
    mmtf.set_structure(mmtf_file, a1)
    # The context manager closes the temporary file,
    # even if writing or reading fails
    with TemporaryFile("w+b") as temp:
        mmtf_file.write(temp)
        temp.seek(0)
        mmtf_file = mmtf.MMTFFile.read(temp)
    a2 = mmtf.get_structure(mmtf_file, model=model, include_bonds=True)

    for category in a1.get_annotation_categories():
        assert a1.get_annotation(category).tolist() == \
               a2.get_annotation(category).tolist()
    assert a1.coord.flatten().tolist() == \
           approx(a2.coord.flatten().tolist(), abs=1e-3)
    assert a1.bonds == a2.bonds
    if a1.box is not None:
        assert np.allclose(a1.box, a2.box)
Esempio n. 5
0
def test_dssp(path):
    """
    Compare the secondary structure stored in the MMTF file with the
    annotation obtained from ``DsspApp`` for the first chain.
    """
    # Mapping from the integer codes in 'secStructList' to letters
    sec_struct_codes = dict(enumerate("ISHEGBTC"))

    mmtf_file = mmtf.MMTFFile()
    mmtf_file.read(path)
    array = mmtf.get_structure(mmtf_file, model=1)
    array = array[array.hetero == False]
    chain = array[array.chain_id == array.chain_id[0]]

    # Secondary structure annotation in PDB use also DSSP
    # -> compare PDB and local DSSP
    residue_count = struc.get_residue_count(chain)
    ref_codes = mmtf_file["secStructList"][:residue_count]
    if (ref_codes == -1).all():
        # First chain is not a polypeptide chain (presumably DNA/RNA)
        # DSSP not applicable -> return
        return
    ref_sse = np.array(
        [sec_struct_codes[code] for code in ref_codes], dtype="U1"
    )

    test_sse = DsspApp.annotate_sse(chain)
    np.set_printoptions(threshold=10000)
    # PDB uses different DSSP version -> slight differences possible
    # -> only 90% must be identical
    identical_count = np.count_nonzero(test_sse == ref_sse)
    assert identical_count / len(ref_sse) > 0.9
Esempio n. 6
0
def test_dihedral_backbone_result(file_name):
    """
    Compare the backbone dihedral angles of each continuous peptide
    chain with the angles computed by MDTraj for the same chain.
    """
    import mdtraj

    mmtf_file = mmtf.MMTFFile()
    mmtf_file.read(file_name)
    array = mmtf.get_structure(mmtf_file, model=1)
    array = array[struc.filter_amino_acids(array)]
    for chain in struc.chain_iter(array):
        print("Chain: ", chain.chain_id[0])
        if len(struc.check_id_continuity(chain)) != 0:
            # Do not test discontinuous chains,
            # but still test the remaining chains of the structure
            # (a 'return' here would silently end the whole test)
            continue
        test_phi, test_psi, test_ome = struc.dihedral_backbone(chain)

        # MDTraj needs a file -> write the chain into a temporary PDB
        temp_file_name = biotite.temp_file("pdb")
        strucio.save_structure(temp_file_name, chain)
        traj = mdtraj.load(temp_file_name)
        _, ref_phi = mdtraj.compute_phi(traj)
        _, ref_psi = mdtraj.compute_psi(traj)
        _, ref_ome = mdtraj.compute_omega(traj)
        # Only the first (and only) frame is relevant
        ref_phi, ref_psi, ref_ome = ref_phi[0], ref_psi[0], ref_ome[0]

        # The test arrays contain one value per residue; the slices
        # drop the terminal position that has no reference angle
        assert test_phi[1:] == pytest.approx(ref_phi, abs=1e-5, rel=5e-3)
        assert test_psi[:-1] == pytest.approx(ref_psi, abs=1e-5, rel=5e-3)
        assert test_ome[:-1] == pytest.approx(ref_ome, abs=1e-5, rel=5e-3)
Esempio n. 7
0
def test_numpy_objects():
    """
    Test whether the Msgpack encoder is able to handle NumPy values
    (e.g. np.float32) properly.

    Only check if no error occurs.
    """
    mmtf_file = mmtf.MMTFFile()
    mmtf_file["A float"] = np.float32(42.0)
    mmtf_file["A list"] = [np.int64(1), np.int64(2), np.int64(3)]
    # 'np.bool_' is the NumPy boolean scalar type in every NumPy version.
    # 'np.bool' was merely a deprecated alias of the builtin 'bool'
    # and does not exist in NumPy 1.24 - 1.26.
    mmtf_file["A dictionary"] = {"a": np.bool_(True), "b": np.bool_(False)}
    mmtf_file.write(biotite.temp_file("mmtf"))
Esempio n. 8
0
def test_array_conversion(path, single_model):
    """
    Read a structure from an MMTF file, write it into a temporary MMTF
    file and read it again: annotations, coordinates, bonds and the box
    must be preserved.
    """
    model = None
    if single_model:
        model = 1

    source_file = mmtf.MMTFFile()
    source_file.read(path)
    original = mmtf.get_structure(
        source_file, model=model, include_bonds=True
    )

    # Write the structure to disk ...
    written_file = mmtf.MMTFFile()
    mmtf.set_structure(written_file, original)
    temp_file_name = biotite.temp_file("mmtf")
    written_file.write(temp_file_name)

    # ... and load it again
    reloaded_file = mmtf.MMTFFile()
    reloaded_file.read(temp_file_name)
    restored = mmtf.get_structure(
        reloaded_file, model=model, include_bonds=True
    )

    for category in original.get_annotation_categories():
        ref_annotation = original.get_annotation(category).tolist()
        test_annotation = restored.get_annotation(category).tolist()
        assert ref_annotation == test_annotation

    # Coordinates are stored with limited precision
    ref_coord = original.coord.flatten().tolist()
    test_coord = restored.coord.flatten().tolist()
    assert ref_coord == approx(test_coord, abs=1e-3)

    assert original.bonds == restored.bonds
    if original.box is not None:
        assert np.allclose(original.box, restored.box)
Esempio n. 9
0
def test_numpy_objects():
    """
    Test whether the Msgpack encoder is able to handle NumPy values
    (e.g. np.float32) properly.

    Only check if no error occurs.
    """
    mmtf_file = mmtf.MMTFFile()
    mmtf_file["A float"] = np.float32(42.0)
    mmtf_file["A list"] = [np.int64(1), np.int64(2), np.int64(3)]
    # 'np.bool_' is the NumPy boolean scalar type in every NumPy version.
    # 'np.bool' was merely a deprecated alias of the builtin 'bool'
    # and does not exist in NumPy 1.24 - 1.26.
    mmtf_file["A dictionary"] = {"a": np.bool_(True), "b": np.bool_(False)}
    # The context manager closes the file, even if writing fails
    with TemporaryFile("w+b") as temp:
        mmtf_file.write(temp)
Esempio n. 10
0
def test_pdbx_consistency(path, single_model):
    """
    Check that the structure parsed from an MMTF file is equal to the
    structure parsed from the mmCIF file with the same base name.
    """
    # A single model is requested via its model number,
    # 'None' requests all models
    # (the original expression had the two cases swapped)
    model = 1 if single_model else None
    cif_path = splitext(path)[0] + ".cif"
    mmtf_file = mmtf.MMTFFile()
    mmtf_file.read(path)
    a1 = mmtf.get_structure(mmtf_file, model=model)
    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(cif_path)
    a2 = pdbx.get_structure(pdbx_file, model=model)
    for category in a1.get_annotation_categories():
        assert a1.get_annotation(category).tolist() == \
               a2.get_annotation(category).tolist()
    # Coordinates are stored with limited precision
    assert a1.coord.flatten().tolist() == \
           approx(a2.coord.flatten().tolist(), abs=1e-3)
Esempio n. 11
0
def test_fetch(format, as_file_like):
    """
    Fetch the entry ``1l2y`` in the given format, either into the
    temporary directory or as file-like object, and check that a
    structure can be parsed from it.
    """
    if as_file_like:
        target_dir = None
    else:
        target_dir = biotite.temp_dir()
    fetched = rcsb.fetch("1l2y", format, target_dir, overwrite=True)

    # Select the file class and parser function matching the format
    if format == "pdb":
        file_class, parse = pdb.PDBFile, pdb.get_structure
    elif format == "pdbx":
        file_class, parse = pdbx.PDBxFile, pdbx.get_structure
    elif format == "mmtf":
        file_class, parse = mmtf.MMTFFile, mmtf.get_structure
    else:
        # Other formats are only downloaded
        return

    structure_file = file_class()
    structure_file.read(fetched)
    parse(structure_file)
Esempio n. 12
0
def test_codecs(path):
    """
    Re-encode every encoded array of an MMTF file with its own codec
    and parameter and check that decoding gives the same values.
    """
    mmtf_file = mmtf.MMTFFile()
    mmtf_file.read(path)
    for key in mmtf_file:
        codec = mmtf_file.get_codec(key)
        if codec is None:
            # Field is not an encoded array
            continue
        param = mmtf_file.get_param(key)

        before = mmtf_file[key]
        mmtf_file.set_array(key, before, codec, param)
        after = mmtf_file[key]

        if before.dtype != np.float32:
            assert (before == after).all()
            continue
        # Floating point values are only compared within a tolerance
        # derived from the codec parameter
        tol = 1/param if param != 0 else 0
        assert np.isclose(before, after, atol=tol).all()
Esempio n. 13
0
def test_connect_via_residue_names(single_model):
    """
    Test whether the created bond list is equal to the bonds deposited
    in the MMTF file.
    """
    # Structure with peptide, nucleotide, small molecules and water
    structure_file = mmtf.MMTFFile()
    structure_file.read(join(data_dir, "5ugo.mmtf"))

    # Only restrict to the first model, if a single model is requested
    options = {"include_bonds": True}
    if single_model:
        options["model"] = 1
    atoms = mmtf.get_structure(structure_file, **options)

    expected = atoms.bonds
    created = struc.connect_via_residue_names(atoms)

    assert created == expected
Esempio n. 14
0
def test_fetch(format, as_file_like):
    """
    Fetch the entry ``1l2y`` in the given format, either into the
    temporary directory or as file-like object, and check that its
    content can be parsed.
    """
    if as_file_like:
        target_dir = None
    else:
        target_dir = biotite.temp_dir()
    fetched = rcsb.fetch("1l2y", format, target_dir, overwrite=True)

    if format == "fasta":
        sequence_file = fasta.FastaFile()
        sequence_file.read(fetched)
        # Test if the file contains any sequences
        assert len(fasta.get_sequences(sequence_file)) > 0
        return

    # Select the file class and parser function matching the format
    if format == "pdb":
        file_class, parse = pdb.PDBFile, pdb.get_structure
    elif format == "pdbx":
        file_class, parse = pdbx.PDBxFile, pdbx.get_structure
    elif format == "mmtf":
        file_class, parse = mmtf.MMTFFile, mmtf.get_structure
    else:
        # Other formats are only downloaded
        return

    structure_file = file_class()
    structure_file.read(fetched)
    parse(structure_file)
Esempio n. 15
0
def test_pdbx_consistency(path, single_model):
    """
    Check that the structure parsed from an MMTF file is equal to the
    structure parsed from the mmCIF file with the same base name.
    """
    # A single model is requested via its model number,
    # 'None' requests all models
    # (the original expression had the two cases swapped)
    model = 1 if single_model else None
    cif_path = splitext(path)[0] + ".cif"
    mmtf_file = mmtf.MMTFFile()
    mmtf_file.read(path)
    a1 = mmtf.get_structure(mmtf_file, model=model)
    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(cif_path)
    a2 = pdbx.get_structure(pdbx_file, model=model)
    # Sometimes mmCIF files can have 'cell' entry
    # but corresponding MMTF file has not 'unitCell' entry
    # -> Do not assert for dummy entry in mmCIF file
    # (all vector elements = {0, 1})
    if a2.box is not None and not ((a2.box == 0) | (a2.box == 1)).all():
        assert np.allclose(a1.box, a2.box)
    for category in a1.get_annotation_categories():
        assert a1.get_annotation(category).tolist() == \
               a2.get_annotation(category).tolist()
    # Coordinates are stored with limited precision
    assert a1.coord.flatten().tolist() == \
           approx(a2.coord.flatten().tolist(), abs=1e-3)
Esempio n. 16
0
def test_connect_via_distances():
    """
    Test whether the created bond list is equal to the bonds deposited
    in the MMTF file.
    """
    structure_file = mmtf.MMTFFile()
    structure_file.read(join(data_dir, "1l2y.mmtf"))
    atoms = mmtf.get_structure(structure_file, include_bonds=True, model=1)
    # Remove termini to solve the issue that the reference bonds do not
    # contain proper bonds for the protonated/deprotonated termini
    is_internal = (atoms.res_id > 1) & (atoms.res_id < 20)
    atoms = atoms[is_internal]

    deposited = atoms.bonds
    # Convert all bonds to BondType.ANY:
    # only the two atom index columns are kept
    atom_index_pairs = deposited.as_array()[:, :2]
    expected = struc.BondList(deposited.get_atom_count(), atom_index_pairs)

    created = struc.connect_via_distances(atoms)

    assert created == expected
Esempio n. 17
0
def test_bonds(path):
    """
    Test whether the bond data is consistent with the content of MMTF
    files.
    """
    bond_data = strucinfo.bond_dataset()
    mmtf_file = mmtf.MMTFFile()
    mmtf_file.read(path)
    for group in mmtf_file["groupList"]:
        res_name = group["groupName"]
        names = group["atomNameList"]
        indices = group["bondAtomList"]
        orders = group["bondOrderList"]
        # 'bondAtomList' is a flat list:
        # two consecutive entries form one bond
        for bond_no, first in enumerate(range(0, len(indices), 2)):
            name1 = names[indices[first]]
            name2 = names[indices[first + 1]]
            expected_order = orders[bond_no]
            bond = frozenset((name1, name2))

            assert strucinfo.bond_order(res_name, name1, name2) \
                   == expected_order
            assert bond in strucinfo.bonds_in_residue(res_name)
            assert bond in bond_data[res_name]
Esempio n. 18
0
def test_coarse_grained(pdb_id):
    """
    Compare the residue-wise SASA computed with the ``"ProtOr"`` radii
    with the one computed with the ``"Single"`` radii.
    """
    # Multi atom SASA (ProtOr), compare with single atom SASA
    # on residue level
    structure_file = mmtf.MMTFFile()
    structure_file.read(join(data_dir, pdb_id + ".mmtf"))
    array = mmtf.get_structure(structure_file, model=1)
    array = array[struc.filter_amino_acids(array)]

    atom_sasa = struc.sasa(array, vdw_radii="ProtOr")
    sasa = struc.apply_residue_wise(array, atom_sasa, np.nansum)
    atom_sasa_exp = struc.sasa(array, vdw_radii="Single")
    sasa_exp = struc.apply_residue_wise(array, atom_sasa_exp, np.nansum)

    # Assert that more than 90% of atoms
    # have less than 10% SASA difference
    within_10_percent = np.isclose(sasa, sasa_exp, rtol=1e-1, atol=1)
    assert np.count_nonzero(within_10_percent) / len(sasa) > 0.9
    # Assert that more than 98% of atoms
    # have less than 40% SASA difference
    within_40_percent = np.isclose(sasa, sasa_exp, rtol=4e-1, atol=1)
    assert np.count_nonzero(within_40_percent) / len(sasa) > 0.98
Esempio n. 19
0
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import numpy as np
import biotite.structure as struc
import biotite.structure.io.mmtf as mmtf
import biotite.database.rcsb as rcsb

# The maximum distance between an atom in the repressor and an atom in
# the DNA for them to be considered 'in contact'
THRESHOLD_DISTANCE = 4.0

# Fetch and load structure (only the first model is used)
mmtf_file = mmtf.MMTFFile()
mmtf_file.read(rcsb.fetch("2or1", "mmtf"))
structure = mmtf.get_structure(mmtf_file, model=1)

# Separate structure into the DNA and the two identical protein chains
# Hetero atoms are excluded from each selection
dna = structure[np.isin(structure.chain_id, ["A", "B"])
                & (structure.hetero == False)]
protein_l = structure[(structure.chain_id == "L")
                      & (structure.hetero == False)]
protein_r = structure[(structure.chain_id == "R")
                      & (structure.hetero == False)]
# Quick check if the two protein chains are really identical:
# compare the number of residues
# ('get_residues()' returns an '(ids, names)' tuple, so its length is
# always 2 and comparing it would make the check pass unconditionally)
assert struc.get_residue_count(protein_l) \
    == struc.get_residue_count(protein_r)

# Fast identification of contacts via a cell list:
# The cell list is initialized with the coordinates of the DNA
Esempio n. 20
0
def build_patterns(structfam, folder):
    """
    Determine the most frequent secondary structure pattern of a
    family of structures.

    For each entry the MMTF file is fetched from the RCSB PDB and the
    ``secStructList`` codes of the requested chain segment are reduced
    to a pattern in an 8-state and in a 3-state alphabet.
    The most frequent pattern of each alphabet is stored with
    ``push()`` under the key ``"pattern"`` in ``{folder}/data.pt``.

    Args:
        structfam (iterable): tuples ``(pdb, chain, start, end)`` with
            the PDB ID, the chain ID and the first and last (inclusive)
            residue position in the chain
        folder (str): directory containing the ``data.pt`` file

    Returns:
        tuple: ``(c_pattern3, n_pattern3, c_pattern8, n_pattern8,
        occ_sum3, occ_sum8)``: the most frequent pattern as string and
        as list of codes for both alphabets, followed by the
        occurrence count of every pattern.
        If no usable pattern is found, all six elements are ``None``
        and nothing is stored.
    """
    patterns = []
    for pdb_id, chain, start, end in tqdm(structfam):
        file_name = rcsb.fetch(pdb_id, "mmtf", biotite.temp_dir())
        mmtf_file = mmtf.MMTFFile()
        mmtf_file.read(file_name)

        array = mmtf.get_structure(mmtf_file, model=1)
        peptide = array[struc.filter_amino_acids(array)]

        # The chain ID corresponding to each residue
        # NOTE(review): the residue starts refer to the filtered
        # structure, but index the unfiltered one; this only matches
        # if no other atoms precede the amino acids -- confirm
        chain_id_per_res = array.chain_id[struc.get_residue_starts(peptide)]

        sse = mmtf_file["secStructList"]
        sse = sse[:chain_id_per_res.shape[0]][chain_id_per_res == chain]
        sse = np.array(sse[start:end + 1])
        # The modulo maps the code -1 (undefined) to 7
        sse = np.array([sec_struct_codes[code % 8] for code in sse],
                       dtype="U1")

        # 8-state pattern: 'dss8' is the difference of consecutive
        # one-hot rows (assuming 'to_onehot' returns one row per
        # residue -- TODO confirm), so -1 marks the state that is left
        # and 1 the state that is entered at a transition
        sse8 = to_onehot([dssp_codes[x] for x in sse], (None, 8))
        dss8 = (sse8[1:] - sse8[:-1])
        cls = to_onehot(np.where(dss8 == -1)[1], (None, 8)).T
        bbox = np.array(
            [np.where(dss8 == 1)[0],
             np.where(dss8 == -1)[0], *cls]).T
        pat8 = np.argmax(bbox[:, 2:], 1)

        # 3-state pattern: same procedure with the reduced alphabet
        sse3 = to_onehot([abc_codes[dssp_to_abc[x]] for x in sse], (None, 3))
        dss3 = (sse3[1:] - sse3[:-1])
        cls = to_onehot(np.where(dss3 == -1)[1], (None, 3)).T
        bbox = np.array(
            [np.where(dss3 == 1)[0],
             np.where(dss3 == -1)[0], *cls]).T
        pat3 = np.argmax(bbox[:, 2:], 1)
        patterns.append((pat3, pat8))

    occ_sum8 = dict()
    occ_sum3 = dict()
    correspondings8 = dict()
    correspondings3 = dict()
    for pat3, pat8 in patterns:
        c8 = "".join([sec_struct_codes[x] for x in pat8])
        c3 = "".join(["abc"[x] for x in pat3])
        n8 = list(pat8)
        n3 = list(pat3)
        if len(c3) == 0:
            # No transition in the segment -> no pattern
            continue
        # Make sure each pattern starts and ends with the coil state
        if c3[0] != "c":
            c3 = "c" + c3
            n3 = [2] + n3
        if c3[-1] != "c":
            c3 = c3 + "c"
            n3 = n3 + [2]
        if c8[0] != "C":
            c8 = "C" + c8
            n8 = [7] + n8
        if c8[-1] != "C":
            c8 = c8 + "C"
            n8 = n8 + [7]
        # Count the occurrences and remember the first list of codes
        # belonging to each pattern string
        correspondings8.setdefault(c8, (c8, n8))
        occ_sum8[c8] = occ_sum8.get(c8, 0) + 1
        correspondings3.setdefault(c3, (c3, n3))
        occ_sum3[c3] = occ_sum3.get(c3, 0) + 1

    if not occ_sum3:
        # Either no structure was given or none contained a transition
        print("No pattern find")
        # Same number of elements as the regular return value
        return None, None, None, None, None, None

    c_pattern8, n_pattern8 = correspondings8[max(occ_sum8, key=occ_sum8.get)]
    c_pattern3, n_pattern3 = correspondings3[max(occ_sum3, key=occ_sum3.get)]

    push(f"{folder}/data.pt", "pattern",
         (c_pattern3, n_pattern3, c_pattern8, n_pattern8))

    return c_pattern3, n_pattern3, c_pattern8, n_pattern8, occ_sum3, occ_sum8
Esempio n. 21
0
def search_pattern(path, uniprot, seq_nat):
    r"""
    Search a pattern with PDB

    Only the PDB entries whose aligned secondary structure segment is
    the longest are considered.
    Entries that cannot be fetched or processed are skipped.
    The selected pattern is stored with ``push()`` under the key
    ``"pattern"`` in ``{path}/data.pt``.

    Args:
        path (str): path to save data
        uniprot (str): uniprot id of the search sequence
        seq_nat (str): raw sequences for a better alignment of the pattern with the sequence

    Returns:
        tuple: ``(pattern, ratio_covered)``, where ``ratio_covered`` is
        the length of the longest segment divided by the length of
        `seq_nat`.
        ``pattern`` is ``None`` if this ratio is not greater than 0.9,
        otherwise it is the tuple
        ``(c_pattern3, n_pattern3, c_pattern8, n_pattern8)`` of the
        most frequent pattern (all elements ``None`` if no entry
        contains a transition).
    """
    pdb_uniprot = pd.read_csv(f"{CROSS}/uniprot_pdb.csv", index_col=0)
    longest, patterns = 0, []
    for pdb_id in pdb_uniprot[pdb_uniprot.uni == uniprot].pdb.values:
        try:
            file_name = rcsb.fetch(pdb_id, "mmtf", biotite.temp_dir())
            mmtf_file = mmtf.MMTFFile()
            mmtf_file.read(file_name)
            ss_seq = np.array(list(mmtf_file["entityList"][0]["sequence"]))
            # Only the segment boundaries in the PDB sequence are used
            _, (m_nat, M_nat, m_mut,
                M_mut), _ = lcs_pattern(seq_nat, "".join(ss_seq))
            sse = mmtf_file["secStructList"]
            sse = np.array(sse[m_mut:M_mut + 1])
            length = len(sse)
            if max(sse) == -1:
                # No secondary structure defined in the segment
                continue
            if length < longest:
                continue
            if length > longest:
                # Longer segment found -> discard the shorter ones
                longest = length
                patterns = []
            # The modulo maps the code -1 (undefined) to 7
            sse = np.array([pdb_codes[code % 8] for code in sse], dtype="U1")

            # 8-state pattern: 'dss8' is the difference of consecutive
            # one-hot rows (assuming 'to_onehot' returns one row per
            # residue -- TODO confirm), so -1 marks the state that is
            # left and 1 the state that is entered at a transition
            sse8 = to_onehot([dssp_codes[x] for x in sse], (None, 8))
            dss8 = (sse8[1:] - sse8[:-1])
            cls = to_onehot(np.where(dss8 == -1)[1], (None, 8)).T
            bbox = np.array(
                [np.where(dss8 == 1)[0],
                 np.where(dss8 == -1)[0], *cls]).T
            pat8 = np.argmax(bbox[:, 2:], 1)

            # 3-state pattern: same procedure with the reduced alphabet
            sse3 = to_onehot([abc_codes[dssp_to_abc[x]] for x in sse],
                             (None, 3))
            dss3 = (sse3[1:] - sse3[:-1])
            cls = to_onehot(np.where(dss3 == -1)[1], (None, 3)).T
            bbox = np.array(
                [np.where(dss3 == 1)[0],
                 np.where(dss3 == -1)[0], *cls]).T
            pat3 = np.argmax(bbox[:, 2:], 1)
            patterns.append((list(pat3), list(pat8)))
        except Exception:
            # Best effort: skip entries that cannot be processed,
            # but do not swallow KeyboardInterrupt or SystemExit
            continue

    # An empty search sequence cannot be covered
    ratio_covered = longest / len(seq_nat) if len(seq_nat) > 0 else 0.0
    if ratio_covered <= 0.9:
        push(f"{path}/data.pt", "pattern", (None, None, None, None))
        return None, ratio_covered

    c_patterns3, n_patterns3, c_patterns8, n_patterns8 = [], [], [], []
    for pat3, pat8 in patterns:
        if len(pat3) == 0:
            # No transition in the segment -> no pattern
            continue
        # Make sure each pattern starts and ends with the coil state
        if pat3[0] != 2:
            pat3 = [2] + pat3
        if pat3[-1] != 2:
            pat3 = pat3 + [2]
        if pat8[0] != 7:
            pat8 = [7] + pat8
        if pat8[-1] != 7:
            pat8 = pat8 + [7]
        c_patterns8.append("".join([sec_struct_codes[x] for x in pat8]))
        n_patterns8.append(list(pat8))
        c_patterns3.append("".join(["abc"[x] for x in pat3]))
        n_patterns3.append(list(pat3))

    # Select the pattern whose 8-state string occurs most often
    # (the first one in case of a tie)
    max_occ = 0
    c_pattern3, n_pattern3, c_pattern8, n_pattern8 = None, None, None, None
    for c3, n3, c8, n8 in zip(c_patterns3, n_patterns3, c_patterns8,
                              n_patterns8):
        n_occ = c_patterns8.count(c8)
        if n_occ > max_occ:
            max_occ = n_occ
            c_pattern3, n_pattern3 = c3, n3
            c_pattern8, n_pattern8 = c8, n8
    push(f"{path}/data.pt", "pattern",
         (c_pattern3, n_pattern3, c_pattern8, n_pattern8))

    return (c_pattern3, n_pattern3, c_pattern8, n_pattern8), ratio_covered