Example #1
def test_extra_fields():
    path = join(data_dir("structure"), "1l2y.cif")
    pdbx_file = pdbx.PDBxFile.read(path)
    stack1 = pdbx.get_structure(
        pdbx_file, extra_fields=["atom_id", "b_factor", "occupancy", "charge"])

    pdbx_file = pdbx.PDBxFile()
    pdbx.set_structure(pdbx_file, stack1, data_block="test")

    stack2 = pdbx.get_structure(
        pdbx_file, extra_fields=["atom_id", "b_factor", "occupancy", "charge"])

    assert stack1.ins_code.tolist() == stack2.ins_code.tolist()
    assert stack1.atom_id.tolist() == stack2.atom_id.tolist()
    assert stack1.b_factor.tolist() == approx(stack2.b_factor.tolist())
    assert stack1.occupancy.tolist() == approx(stack2.occupancy.tolist())
    assert stack1.charge.tolist() == stack2.charge.tolist()
    assert stack1 == stack2
Example #2
def test_extra_fields():
    path = join(data_dir, "1l2y.cif")
    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(path)
    stack1 = pdbx.get_structure(pdbx_file, extra_fields=["atom_id","b_factor",
                                "occupancy","charge"])
    pdbx_file = pdbx.PDBxFile()
    pdbx.set_structure(pdbx_file, stack1, data_block="test")
    stack2 = pdbx.get_structure(pdbx_file, extra_fields=["atom_id","b_factor",
                                "occupancy","charge"])
    assert stack1 == stack2
Example #3
def test_conversion(path, single_model):
    model = 1 if single_model else None
    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(path)
    array1 = pdbx.get_structure(pdbx_file, model=model)
    pdbx_file = pdbx.PDBxFile()
    pdbx.set_structure(pdbx_file, array1, data_block="test")
    array2 = pdbx.get_structure(pdbx_file, model=model)
    for category in array1.get_annotation_categories():
        assert array1.get_annotation(category).tolist() == \
               array2.get_annotation(category).tolist()
    assert array1.coord.tolist() == array2.coord.tolist()
Example #4
def test_conversion(path, model):
    pdbx_file = pdbx.PDBxFile.read(path)

    try:
        array1 = pdbx.get_structure(pdbx_file, model=model)
    except biotite.InvalidFileError:
        if model is None:
            # The file cannot be parsed into an AtomArrayStack,
            # as the models contain different numbers of atoms
            # -> skip this test case
            return
        else:
            raise

    pdbx_file = pdbx.PDBxFile()
    pdbx.set_structure(pdbx_file, array1, data_block="test")

    array2 = pdbx.get_structure(pdbx_file, model=model)

    if array1.box is not None:
        assert np.allclose(array1.box, array2.box)
    assert array1.bonds == array2.bonds
    for category in array1.get_annotation_categories():
        assert array1.get_annotation(category).tolist() == \
               array2.get_annotation(category).tolist()
    assert array1.coord.tolist() == array2.coord.tolist()
Example #5
def test_conversion(path, model):
    pdbx_file = pdbx.PDBxFile.read(path)

    try:
        array1 = pdbx.get_structure(pdbx_file, model=model)
    except biotite.InvalidFileError:
        if model is None:
            # The file cannot be parsed into an AtomArrayStack,
            # as the models contain different numbers of atoms
            # -> skip this test case
            return
        else:
            raise

    pdbx_file = pdbx.PDBxFile()
    pdbx.set_structure(pdbx_file, array1, data_block="test")

    # Remove one optional 'auth' field
    # to test the fallback to the corresponding 'label' field
    atom_cat = pdbx_file.get_category("atom_site", "test")
    atom_cat.pop("auth_atom_id")
    pdbx_file.set_category("atom_site", atom_cat, "test")

    array2 = pdbx.get_structure(pdbx_file, model=model)

    assert array1.array_length() > 0
    if array1.box is not None:
        assert np.allclose(array1.box, array2.box)
    assert array1.bonds == array2.bonds
    for category in array1.get_annotation_categories():
        assert (array1.get_annotation(category).tolist() ==
                array2.get_annotation(category).tolist())
    assert array1.coord.tolist() == array2.coord.tolist()
Example #6
def create_bond_dict(components_pdbx_file_path, msgpack_file_path):
    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(components_pdbx_file_path)
    components = pdbx_file.get_block_names()
    bond_dict = {}
    for component in components:
        print(component)
        cif_bonds = pdbx_file.get_category("chem_comp_bond", block=component)
        if cif_bonds is None:
            # No bond info for this compound
            continue
        if isinstance(cif_bonds["comp_id"], str):
            # Single string -> single bond
            group_bonds = {
                (cif_bonds["atom_id_1"], cif_bonds["atom_id_2"]):
                BOND_ORDERS[cif_bonds["value_order"]]
            }
        else:
            # Looped values -> multiple bonds
            group_bonds = {(atom1, atom2): BOND_ORDERS[order]
                           for atom1, atom2, order in zip(
                               cif_bonds["atom_id_1"], cif_bonds["atom_id_2"],
                               cif_bonds["value_order"])}
        bond_dict[component] = group_bonds
    with open(msgpack_file_path, "wb") as msgpack_file:
        msgpack.dump(bond_dict, msgpack_file)
Example #7
def test_get_assembly(single_model):
    """
    Test whether the :func:`get_assembly()` function produces the same
    number of peptide chains as the
    ``_pdbx_struct_assembly.oligomeric_count`` field indicates.
    """
    model = 1 if single_model else None

    path = join(data_dir, "1f2n.cif")
    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(path)

    assembly_category = pdbx_file.get_category("pdbx_struct_assembly",
                                               expect_looped=True)
    # Test each available assembly
    for id, ref_oligomer_count in zip(assembly_category["id"],
                                      assembly_category["oligomeric_count"]):
        assembly = pdbx.get_assembly(pdbx_file, assembly_id=id, model=model)
        protein_assembly = assembly[..., struc.filter_amino_acids(assembly)]
        test_oligomer_count = struc.get_chain_count(protein_assembly)

        if single_model:
            assert isinstance(assembly, struc.AtomArray)
        else:
            assert isinstance(assembly, struc.AtomArrayStack)
        assert test_oligomer_count == int(ref_oligomer_count)
Example #8
def create_dict(components_pdbx_file_path, msgpack_file_path,
                subcategory, expected_type):
    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(components_pdbx_file_path)
    components = pdbx_file.get_block_names()
    data_dict = {}
    for i, component in enumerate(components):
        print(f"{((i+1) / len(components) * 100):4.1f} %", end="\r")
        try:
            cif_dict = pdbx_file.get_category("chem_comp", block=component)
        except ValueError:
            # The 'chem_comp' category may contain unparsable names
            # with wrong quote escaping
            # In this case the PDBx file parser raises an Exception
            cif_dict = None
        if cif_dict is None:
            # No or erroneous info for this compound
            data_dict[component] = None
        else:
            try:
                data = expected_type(cif_dict[subcategory])
            except ValueError:
                # Unparsable data, e.g. '?' as float
                data = None
            data_dict[component] = data
    print()
    with open(msgpack_file_path, "wb") as msgpack_file:
        msgpack.dump(data_dict, msgpack_file)
Example #9
def test_parsing(category, key, exp_value):
    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(join(data_dir, "1l2y.cif"))
    cat_dict = pdbx_file[category]
    value = cat_dict[key]
    if isinstance(value, np.ndarray):
        assert value.tolist() == exp_value
    else:
        assert value == exp_value
Example #10
def test_superimposition_array(path):
    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(path)
    fixed = pdbx.get_structure(pdbx_file, model=1)
    mobile = fixed.copy()
    mobile = struc.rotate(mobile, (1, 2, 3))
    mobile = struc.translate(mobile, (1, 2, 3))
    fitted, transformation = struc.superimpose(fixed, mobile,
                                               (mobile.atom_name == "CA"))
    assert struc.rmsd(fixed, fitted) == pytest.approx(0)
    fitted = struc.superimpose_apply(mobile, transformation)
    assert struc.rmsd(fixed, fitted) == pytest.approx(0)
Example #11
def test_pdbx_consistency(path, single_model):
    model = 1 if single_model else None
    cif_path = splitext(path)[0] + ".cif"
    pdb_file = pdb.PDBFile()
    pdb_file.read(path)
    a1 = pdb_file.get_structure(model=model)
    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(cif_path)
    a2 = pdbx.get_structure(pdbx_file, model=model)
    for category in a1.get_annotation_categories():
        assert a1.get_annotation(category).tolist() == \
               a2.get_annotation(category).tolist()
    assert a1.coord.tolist() == a2.coord.tolist()
Example #12
def test_unequal_lengths():
    valid_category_dict = {"foo1": ["1", "2", "3"], "foo2": ["1", "2", "3"]}
    # Arrays have unequal lengths -> invalid
    invalid_category_dict = {
        "foo1": ["1", "2", "3"],
        "foo2": ["1", "2", "3", "4"]
    }
    pdbx_file = pdbx.PDBxFile()
    pdbx_file.set_category("test", valid_category_dict, block="test_block")
    with pytest.raises(ValueError):
        pdbx_file.set_category("test",
                               invalid_category_dict,
                               block="test_block")
Example #13
def test_pdbx_consistency(path, single_model):
    model = 1 if single_model else None
    cif_path = splitext(path)[0] + ".cif"
    mmtf_file = mmtf.MMTFFile()
    mmtf_file.read(path)
    a1 = mmtf.get_structure(mmtf_file, model=model)
    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(cif_path)
    a2 = pdbx.get_structure(pdbx_file, model=model)
    for category in a1.get_annotation_categories():
        assert a1.get_annotation(category).tolist() == \
               a2.get_annotation(category).tolist()
    assert a1.coord.flatten().tolist() == \
           approx(a2.coord.flatten().tolist(), abs=1e-3)
Example #14
def test_fetch(format, as_file_like):
    path = None if as_file_like else biotite.temp_dir()
    file_path_or_obj = rcsb.fetch("1l2y", format, path, overwrite=True)
    if format == "pdb":
        file = pdb.PDBFile()
        file.read(file_path_or_obj)
        pdb.get_structure(file)
    elif format == "pdbx":
        file = pdbx.PDBxFile()
        file.read(file_path_or_obj)
        pdbx.get_structure(file)
    elif format == "mmtf":
        file = mmtf.MMTFFile()
        file.read(file_path_or_obj)
        mmtf.get_structure(file)
Example #15
def test_PDBx_consistency(format):
    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(join(data_dir, "1l2y.cif"))
    array1 = pdbx.get_structure(pdbx_file)
    template = pdbx.get_structure(pdbx_file, model=1)
    if format == "trr":
        traj_file = trr.TRRFile()
        traj_file.read(join(data_dir, "1l2y.trr"))
    if format == "xtc":
        traj_file = xtc.XTCFile()
        traj_file.read(join(data_dir, "1l2y.xtc"))
    array2 = traj_file.get_structure(template)
    for cat in array1.get_annotation_categories():
        assert array1.get_annotation(cat).tolist() == \
               array2.get_annotation(cat).tolist()
    assert array1.coord == pytest.approx(array2.coord)
Example #16
def test_empty_values(string, use_array):
    """
    Test whether empty strings for field values are properly replaced
    by ``'.'``.
    """
    LENGTH = 10
    ref_value = np.full(LENGTH, string, dtype="U1") if use_array else ""
    pdbx_file = pdbx.PDBxFile()
    pdbx_file.set_category(category="test_category",
                           block="test",
                           category_dict={"test_field": ref_value})

    test_value = pdbx_file["test_category"]["test_field"]

    if use_array:
        assert test_value.tolist() == ["."] * LENGTH
    else:
        assert test_value == "."
Example #17
def test_list_assemblies():
    """
    Test the :func:`list_assemblies()` function based on a known
    example.
    """
    path = join(data_dir, "1f2n.cif")
    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(path)

    assembly_list = pdbx.list_assemblies(pdbx_file)
    assert assembly_list == {
        "1": "complete icosahedral assembly",
        "2": "icosahedral asymmetric unit",
        "3": "icosahedral pentamer",
        "4": "icosahedral 23 hexamer",
        "5": "icosahedral asymmetric unit, std point frame",
        "6": "crystal asymmetric unit, crystal frame"
    }
Example #18
def test_fetch(format, as_file_like):
    path = None if as_file_like else biotite.temp_dir()
    file_path_or_obj = rcsb.fetch("1l2y", format, path, overwrite=True)
    if format == "pdb":
        file = pdb.PDBFile()
        file.read(file_path_or_obj)
        pdb.get_structure(file)
    elif format == "pdbx":
        file = pdbx.PDBxFile()
        file.read(file_path_or_obj)
        pdbx.get_structure(file)
    elif format == "mmtf":
        file = mmtf.MMTFFile()
        file.read(file_path_or_obj)
        mmtf.get_structure(file)
    elif format == "fasta":
        file = fasta.FastaFile()
        file.read(file_path_or_obj)
        # Test if the file contains any sequences
        assert len(fasta.get_sequences(file)) > 0
Example #19
def test_pdbx_consistency(path, single_model):
    model = 1 if single_model else None
    cif_path = splitext(path)[0] + ".cif"
    mmtf_file = mmtf.MMTFFile()
    mmtf_file.read(path)
    a1 = mmtf.get_structure(mmtf_file, model=model)
    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(cif_path)
    a2 = pdbx.get_structure(pdbx_file, model=model)
    # Sometimes mmCIF files have a 'cell' entry,
    # but the corresponding MMTF file has no 'unitCell' entry
    # -> Do not assert for dummy entries in the mmCIF file
    # (all vector elements = {0, 1})
    if a2.box is not None and not ((a2.box == 0) | (a2.box == 1)).all():
        assert np.allclose(a1.box, a2.box)
    for category in a1.get_annotation_categories():
        assert a1.get_annotation(category).tolist() == \
               a2.get_annotation(category).tolist()
    assert a1.coord.flatten().tolist() == \
           approx(a2.coord.flatten().tolist(), abs=1e-3)
Example #20
def test_superimposition_stack(ca_only):
    path = join(data_dir, "1l2y.cif")
    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(path)
    stack = pdbx.get_structure(pdbx_file)
    fixed = stack[0]
    mobile = stack[1:]
    if ca_only:
        mask = (mobile.atom_name == "CA")
    else:
        mask = None
    fitted, transformation = struc.superimpose(fixed, mobile, mask)
    if ca_only:
        # The superimpositions are better for most cases than the
        # superimpositions in the structure file
        # -> Use average
        assert np.mean(struc.rmsd(fixed, fitted)) \
             < np.mean(struc.rmsd(fixed, mobile))
    else:
        # The superimpositions are better than the superimpositions
        # in the structure file
        assert (struc.rmsd(fixed, fitted) < struc.rmsd(fixed, mobile)).all()
Example #21
def create_bond_dict(components_pdbx_file_path, msgpack_file_path):
    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(components_pdbx_file_path)
    components = pdbx_file.get_block_names()
    bond_dict = {}
    for i, component in enumerate(components):
        print(f"{component:3}   {int(i/len(components)*100):>3d}%", end="\r")
        cif_bonds = pdbx_file.get_category("chem_comp_bond",
                                           block=component,
                                           expect_looped=True)
        if cif_bonds is None:
            # No bond info for this compound
            continue
        else:
            group_bonds = {}
            for atom1, atom2, order, aromatic_flag in zip(
                    cif_bonds["atom_id_1"], cif_bonds["atom_id_2"],
                    cif_bonds["value_order"], cif_bonds["pdbx_aromatic_flag"]):
                bond_type = BOND_ORDERS[order, aromatic_flag]
                group_bonds[(atom1, atom2)] = bond_type
        bond_dict[component] = group_bonds
    with open(msgpack_file_path, "wb") as msgpack_file:
        msgpack.dump(bond_dict, msgpack_file)
Example #22
ku_dna_file = biotite.temp_file("ku_dna.cif")
ku_file = biotite.temp_file("ku.cif")

# Download and parse structure files
file = rcsb.fetch("1JEY", "mmtf", biotite.temp_dir())
ku_dna = strucio.load_structure(file)
file = rcsb.fetch("1JEQ", "mmtf", biotite.temp_dir())
ku = strucio.load_structure(file)
# Remove DNA and water
ku_dna = ku_dna[(ku_dna.chain_id == "A") | (ku_dna.chain_id == "B")]
ku_dna = ku_dna[~struc.filter_solvent(ku_dna)]
ku = ku[~struc.filter_solvent(ku)]
# The structures have a differing number of atoms missing
# at the start and end of the structure
# -> Find common structure
ku_dna_common = ku_dna[struc.filter_intersection(ku_dna, ku)]
ku_common = ku[struc.filter_intersection(ku, ku_dna)]
# Superimpose
ku_superimposed, transformation = struc.superimpose(
    ku_dna_common, ku_common, (ku_common.atom_name == "CA"))
# We do not want the cropped structures
# -> apply superimposition on structures before intersection filtering
ku_superimposed = struc.superimpose_apply(ku, transformation)
# Write PDBx files as input for PyMOL
cif_file = pdbx.PDBxFile()
pdbx.set_structure(cif_file, ku_dna, data_block="ku_dna")
cif_file.write(ku_dna_file)
cif_file = pdbx.PDBxFile()
pdbx.set_structure(cif_file, ku_superimposed, data_block="ku")
cif_file.write(ku_file)
# Visualization with PyMOL...
# biotite_static_image = ku_superimposition.png
Example #23
def create_residue_dict(components_pdbx_file_path, msgpack_file_path):
    pdbx_file = pdbx.PDBxFile()
    pdbx_file.read(components_pdbx_file_path)
    components = pdbx_file.get_block_names()
    residue_dict = {}

    for i, component in enumerate(components):
        print(f"{component:3}   {int(i/len(components)*100):>3d}%", end="\r")
        try:
            # Some entries use invalid quotation for the component name
            cif_general = pdbx_file.get_category("chem_comp", block=component)
        except ValueError:
            cif_general = None
        cif_atoms = pdbx_file.get_category("chem_comp_atom",
                                           block=component,
                                           expect_looped=True)
        cif_bonds = pdbx_file.get_category("chem_comp_bond",
                                           block=component,
                                           expect_looped=True)
        if cif_atoms is None:
            continue

        array = struc.AtomArray(len(list(cif_atoms.values())[0]))

        array.res_name = cif_atoms["comp_id"]
        array.atom_name = cif_atoms["atom_id"]
        array.element = cif_atoms["type_symbol"]
        array.add_annotation("charge", int)
        array.charge = np.array(
            [int(c) if c != "?" else 0 for c in cif_atoms["charge"]])
        if cif_general is None:
            array.hetero[:] = True
        else:
            array.hetero[:] = (cif_general["type"] == "NON-POLYMER")

        # For some entries only 'model_Cartn' is defined,
        # for some entries only 'pdbx_model_Cartn_ideal',
        # and for some entries neither of them is defined
        try:
            array.coord[:, 0] = cif_atoms["pdbx_model_Cartn_x_ideal"]
            array.coord[:, 1] = cif_atoms["pdbx_model_Cartn_y_ideal"]
            array.coord[:, 2] = cif_atoms["pdbx_model_Cartn_z_ideal"]
        except (KeyError, ValueError):
            try:
                array.coord[:, 0] = cif_atoms["model_Cartn_x"]
                array.coord[:, 1] = cif_atoms["model_Cartn_y"]
                array.coord[:, 2] = cif_atoms["model_Cartn_z"]
            except (KeyError, ValueError):
                # If none of them is defined, skip this component
                continue

        bonds = struc.BondList(array.array_length())
        if cif_bonds is not None:
            for atom1, atom2, order, aromatic_flag in zip(
                    cif_bonds["atom_id_1"], cif_bonds["atom_id_2"],
                    cif_bonds["value_order"], cif_bonds["pdbx_aromatic_flag"]):
                atom_i = np.where(array.atom_name == atom1)[0][0]
                atom_j = np.where(array.atom_name == atom2)[0][0]
                bond_type = BOND_ORDERS[order, aromatic_flag]
                bonds.add_bond(atom_i, atom_j, bond_type)
        array.bonds = bonds

        residue_dict[component] = array_to_dict(array)

    with open(msgpack_file_path, "wb") as msgpack_file:
        msgpack.dump(residue_dict, msgpack_file)
Example #24
capsid from *Paramecium bursaria Chlorella virus type 1*
- a homo-5040-mer!

First, we will check which assemblies are available to us.
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import numpy as np
import biotite.structure as struc
import biotite.structure.io.pdbx as pdbx
import biotite.structure.io as strucio
import biotite.database.rcsb as rcsb

pdbx_file = pdbx.PDBxFile()
pdbx_file.read(rcsb.fetch("1M4X", "mmcif"))

assemblies = pdbx.list_assemblies(pdbx_file)
print("ID    name")
print()
for assembly_id, name in assemblies.items():
    print(f"{assembly_id:2}    {name}")

########################################################################
# ``'complete icosahedral assembly'`` sounds good.
# In fact, often the first assembly is the complete one.
# Hence, the :func:`get_assembly()` function builds the first assembly
# by default.
# Since we know the ID we want (``'1'``), we will provide it to this
# function anyway.
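
# A minimal sketch of such a call; the variable name and the choice of
# ``model=1`` are illustrative assumptions:
assembly = pdbx.get_assembly(pdbx_file, assembly_id="1", model=1)
print("Number of atoms in the assembly:", assembly.array_length())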
Example #25
# modern PDBx/mmCIF format in favor of the PDB format.
# It solves limitations of the PDB format that arise from its column
# restrictions.
# Furthermore, much more additional information is stored in these
# files.
# 
# .. currentmodule:: biotite.structure.io.pdbx
# 
# In contrast to PDB files, *Biotite* can read the entire content of
# PDBx/mmCIF files, which can be accessed in a dictionary-like manner.
# At first, we read the file similarly to before, but this time we
# use the :class:`PDBxFile` class.

import biotite.structure.io.pdbx as pdbx
cif_file_path = rcsb.fetch("1l2y", "cif", biotite.temp_dir())
file = pdbx.PDBxFile()
file.read(cif_file_path)

########################################################################
# Now we can access the data like a dictionary of dictionaries.

print(file["1L2Y", "audit_author"]["name"])

########################################################################
# The first index contains the data block and the category name.
# The data block could be omitted, since there is only one block in the
# file.
# This returns a dictionary.
# If the category is in a *loop*, the dictionary contains `ndarrays`
# of strings as values; otherwise, the dictionary contains strings
# directly.
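
########################################################################
# A minimal sketch of both access patterns, assuming that the category
# and field names used below are present in the parsed file:

# Looped category -> the values are ndarrays of strings
print(type(file["atom_site"]["Cartn_x"]))
# Non-looped category -> the values are plain strings
print(type(file["struct"]["title"]))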