Example #1
0
def api_route():
    """Return secondary-structure annotations of a PDB entry as JSON.

    Reads two optional query parameters from ``request.args``:
    ``pdb_id`` (default ``"1Q2W"``) and ``format`` (default ``"mmtf"``),
    fetches the entry via ``rcsb.fetch`` into the Biotite temp directory
    and parses it with ``mmtf.MMTFFile``.

    The JSON response contains the ``sequence`` of the first entity, the
    ``mmtf``, ``dssp`` and ``psea`` annotations, and ``diffs`` computed by
    ``diff_all`` over those three annotations.  Annotation is best
    effort: a method that raises contributes an empty list instead of
    failing the whole request.

    NOTE(review): the file is always parsed as MMTF, whatever ``format``
    was requested -- confirm that other formats are really intended.
    """
    pdb_id = request.args.get("pdb_id", "1Q2W")
    file_format = request.args.get("format", "mmtf")
    file_name = rcsb.fetch(pdb_id, file_format, biotite.temp_dir())
    mmtf_file = mmtf.MMTFFile()
    mmtf_file.read(file_name)

    # ``except Exception`` keeps the best-effort behaviour without
    # swallowing KeyboardInterrupt / SystemExit like a bare ``except``.
    try:
        mmtf_s = mmtf_sec(mmtf_file).tolist()
    except Exception:
        mmtf_s = []
    try:
        dssp_s = dssp_sec(mmtf_file).tolist()
    except Exception:
        dssp_s = []
    try:
        psea_s = psea_sec(mmtf_file).tolist()
    except Exception:
        # Must reset the PSEA result here; resetting ``dssp_s`` would
        # discard a valid DSSP annotation and leave ``psea_s`` unbound.
        psea_s = []

    structs = {
        "mmtf": mmtf_s,
        "dssp": dssp_s,
        "psea": psea_s,
    }
    return jsonify(
        sequence=list(mmtf_file["entityList"][0]["sequence"]),
        **structs,
        diffs=diff_all(**structs),
    )
Example #2
0
def test_fetch(common_name, as_file_like):
    """Fetch the FASTA entry ``1L2Y_A`` from Entrez and parse it.

    ``as_file_like`` selects ``None`` instead of the temp directory as
    target path; ``common_name`` selects the database name ``"Protein"``
    instead of ``"protein"``.  The test passes if fetching, reading and
    sequence extraction raise no exception.
    """
    if as_file_like:
        target = None
    else:
        target = biotite.temp_dir()
    database = "protein"
    if common_name:
        database = "Protein"
    fetched = entrez.fetch(
        "1L2Y_A", target, "fa", database, "fasta", overwrite=True
    )
    parsed = fasta.FastaFile()
    parsed.read(fetched)
    # Only the absence of an exception is checked
    fasta.get_sequence(parsed)
Example #3
0
def test_fetch_invalid():
    """Expect a ``ValueError`` when fetching the UID ``"xxxx"``."""
    with pytest.raises(ValueError):
        entrez.fetch(
            "xxxx", biotite.temp_dir(), "fa", "protein", "fasta",
            overwrite=True,
        )
Example #4
0
def test_fetch():
    """Fetch the FASTA entry ``1L2Y_A`` into the temp directory and parse it.

    The test passes if fetching, reading and sequence extraction raise
    no exception.
    """
    fetched = entrez.fetch(
        "1L2Y_A", biotite.temp_dir(), "fa", "protein", "fasta",
        overwrite=True,
    )
    parsed = fasta.FastaFile()
    parsed.read(fetched)
    # Only the absence of an exception is checked
    fasta.get_sequence(parsed)
Example #5
0
def fetch_gb_annotation(pdb_chain: str):
    """Fetch a GenBank protein record and return its ``SecStr`` features.

    Parameters
    ----------
    pdb_chain : str
        Identifier passed to ``entrez.fetch`` for the ``"protein"``
        database, e.g. ``"6FRH_A"``.

    Returns
    -------
    annotation
        The result of ``gb.get_annotation`` restricted to ``"SecStr"``
        features.
    """
    # Download the GenBank file into the Biotite temp directory
    file_name = entrez.fetch(
        pdb_chain, biotite.temp_dir(), "gb", "protein", "gb"
    )
    gb_file = gb.GenBankFile()
    gb_file.read(file_name)
    # Extract the annotation, keeping only secondary structure features
    annotation = gb.get_annotation(gb_file, include_only=["SecStr"])
    return annotation
Example #6
0
def get_diameter(pdb_id):
    """Return the largest pairwise atom distance among amino acid atoms.

    The structure for *pdb_id* is fetched as MMTF into the Biotite temp
    directory, everything that is not an amino acid is discarded, and
    the maximum over all pairwise coordinate distances is returned.
    """
    structure = strucio.load_structure(
        rcsb.fetch(pdb_id, "mmtf", biotite.temp_dir())
    )
    # Keep amino acid atoms only
    amino_acid_mask = struc.filter_amino_acids(structure)
    positions = structure[amino_acid_mask].coord
    # Broadcasting yields the difference vector for every atom pair
    displacement = positions[:, np.newaxis, :] - positions[np.newaxis, :, :]
    squared_distance = np.sum(displacement * displacement, axis=-1)
    # Take the root only once, on the largest squared distance
    return np.sqrt(np.max(squared_distance))
Example #7
0
def test_fetch(format, as_file_like):
    """Fetch ``1l2y`` in the given format and parse a structure from it.

    ``as_file_like`` selects ``None`` instead of the temp directory as
    target path.  For the formats ``"pdb"``, ``"pdbx"`` and ``"mmtf"``
    the fetched data is read with the matching file class and handed to
    the matching ``get_structure()``; other formats are only fetched.
    """
    if as_file_like:
        target = None
    else:
        target = biotite.temp_dir()
    fetched = rcsb.fetch("1l2y", format, target, overwrite=True)

    # Select the file object and the parser module for this format
    if format == "pdb":
        parsed, module = pdb.PDBFile(), pdb
    elif format == "pdbx":
        parsed, module = pdbx.PDBxFile(), pdbx
    elif format == "mmtf":
        parsed, module = mmtf.MMTFFile(), mmtf
    else:
        return

    parsed.read(fetched)
    module.get_structure(parsed)
Example #8
0
def test_fetch(format, as_file_like):
    """Fetch ``1l2y`` in the given format and parse the result.

    ``as_file_like`` selects ``None`` instead of the temp directory as
    target path.  Structure formats are parsed via the matching
    ``get_structure()``; for ``"fasta"`` the file must contain at least
    one sequence.  Other formats are only fetched.
    """
    if as_file_like:
        target = None
    else:
        target = biotite.temp_dir()
    fetched = rcsb.fetch("1l2y", format, target, overwrite=True)

    if format == "fasta":
        fasta_file = fasta.FastaFile.read(fetched)
        # Test if the file contains any sequences
        assert len(fasta.get_sequences(fasta_file)) > 0
        return
    if format == "pdb":
        pdb.get_structure(pdb.PDBFile.read(fetched))
        return
    if format == "pdbx":
        pdbx.get_structure(pdbx.PDBxFile.read(fetched))
        return
    if format == "mmtf":
        mmtf.get_structure(mmtf.MMTFFile.read(fetched))
Example #9
0
def test_doctest(package_name, context_package_names):
    """
    Run the doctests of every module listed for *package_name*.

    The public names (``__all__``) of all modules of the context
    packages and of the package itself are made available to the
    doctests, together with a few fixed paths and example objects.
    """
    # Modules whose public attributes form the doctest namespace;
    # the tested package itself is also part of the context
    context_modules = []
    for name in context_package_names + [package_name]:
        context_modules.extend(_list_modules(import_module(name), False))

    globs = {}
    for modname in context_modules:
        mod = import_module(modname)
        for attr in mod.__all__:
            globs[attr] = getattr(mod, attr)

    # Fixed names for certain paths
    globs["path_to_directory"] = biotite.temp_dir()
    globs["path_to_structures"] = "./tests/structure/data/"
    globs["path_to_sequences"] = "./tests/sequence/data/"
    # Frequently used modules
    globs["np"] = np
    # Frequently used objects
    stack = strucio.load_structure("./tests/structure/data/1l2y.mmtf")
    globs["atom_array_stack"] = stack
    globs["atom_array"] = stack[0]
    # Adjust NumPy print formatting
    np.set_printoptions(precision=3, floatmode="maxprec_equal")

    # Run the doctests module by module
    flags = doctest.ELLIPSIS | doctest.REPORT_ONLY_FIRST_FAILURE
    for modname in _list_modules(import_module(package_name), False):
        mod = import_module(modname)
        results = doctest.testmod(
            mod,
            extraglobs=globs,
            optionflags=flags,
            verbose=False,
            report=False,
        )
        try:
            assert results.failed == 0
        except AssertionError:
            # Name the offending module before the failure propagates
            print(f"Failing doctest in module {mod}")
            raise
Example #10
0
def build_patterns(structfam, folder):
    """Find the most frequent secondary-structure pattern of a family.

    For every entry of *structfam* the MMTF file is fetched, the
    secondary structure codes of the requested chain segment are read
    from ``secStructList`` and reduced to the sequence of element
    classes, once in the 8-class and once in the 3-class alphabet.  The
    most frequent pattern of each alphabet is stored via ``push`` and
    returned.

    Parameters
    ----------
    structfam : iterable of (pdb, c, start, end)
        PDB ID, chain ID and inclusive residue index range of each
        family member.
    folder : str
        Directory part of the ``{folder}/data.pt`` path handed to
        ``push``.

    Returns
    -------
    tuple
        ``(c_pattern3, n_pattern3, c_pattern8, n_pattern8, occ_sum3,
        occ_sum8)``: the winning pattern as string and as code list for
        both alphabets, followed by the occurrence counts of all
        patterns.  Six ``None`` values if no pattern was found.
    """
    # Same arity as the regular return value, so callers can always
    # unpack six values
    no_result = (None, None, None, None, None, None)

    patterns = []
    for pdb_id, chain, start, end in tqdm(structfam):
        file_name = rcsb.fetch(pdb_id, "mmtf", biotite.temp_dir())
        mmtf_file = mmtf.MMTFFile()
        mmtf_file.read(file_name)

        array = mmtf.get_structure(mmtf_file, model=1)
        tk_dimer = array[struc.filter_amino_acids(array)]

        # The chain ID corresponding to each residue
        # NOTE(review): the residue starts are computed on the filtered
        # array but index the unfiltered one -- confirm this is intended.
        chain_id_per_res = array.chain_id[struc.get_residue_starts(tk_dimer)]

        sse = mmtf_file["secStructList"]
        sse = sse[:chain_id_per_res.shape[0]][chain_id_per_res == chain]
        sse = np.array(sse[start:end + 1])
        sse = np.array([sec_struct_codes[code % 8] for code in sse],
                       dtype="U1")

        # 8-class alphabet: the difference of consecutive one-hot rows
        # is +1 where a class is entered and -1 where it is left
        sse8 = to_onehot([dssp_codes[x] for x in sse], (None, 8))
        dss8 = (sse8[1:] - sse8[:-1])
        cls = to_onehot(np.where(dss8 == -1)[1], (None, 8)).T
        bbox = np.array(
            [np.where(dss8 == 1)[0],
             np.where(dss8 == -1)[0], *cls]).T
        pat8 = np.argmax(bbox[:, 2:], 1)

        # Same procedure for the 3-class alphabet
        sse3 = to_onehot([abc_codes[dssp_to_abc[x]] for x in sse], (None, 3))
        dss3 = (sse3[1:] - sse3[:-1])
        cls = to_onehot(np.where(dss3 == -1)[1], (None, 3)).T
        bbox = np.array(
            [np.where(dss3 == 1)[0],
             np.where(dss3 == -1)[0], *cls]).T
        pat3 = np.argmax(bbox[:, 2:], 1)
        patterns.append((pat3, pat8))

    if not patterns:
        print("No pattern find")
        return no_result

    occ_sum8 = dict()
    occ_sum3 = dict()
    correspondings8 = dict()
    correspondings3 = dict()
    for pat3, pat8 in patterns:
        c8 = "".join([sec_struct_codes[x] for x in pat8])
        n8 = list(pat8)
        c3 = "".join(["abc"[x] for x in pat3])
        n3 = list(pat3)
        if not c3:
            continue
        # Make every pattern start and end with the class "c" (code 2)
        # respectively "C" (code 7) -- presumably the coil class
        if c3[0] != "c":
            c3 = "c" + c3
            n3 = [2] + n3
        if c3[-1] != "c":
            c3 = c3 + "c"
            n3 = n3 + [2]
        if c8[0] != "C":
            c8 = "C" + c8
            n8 = [7] + n8
        if c8[-1] != "C":
            c8 = c8 + "C"
            n8 = n8 + [7]
        if c8 not in occ_sum8:
            occ_sum8[c8] = 0
            correspondings8[c8] = c8, n8
        occ_sum8[c8] += 1
        if c3 not in occ_sum3:
            occ_sum3[c3] = 0
            correspondings3[c3] = c3, n3
        occ_sum3[c3] += 1

    if not occ_sum3:
        # Every pattern was empty; max() below would raise on empty dicts
        print("No pattern find")
        return no_result

    c_pattern8, n_pattern8 = correspondings8[max(occ_sum8, key=occ_sum8.get)]
    c_pattern3, n_pattern3 = correspondings3[max(occ_sum3, key=occ_sum3.get)]

    push(f"{folder}/data.pt", "pattern",
         (c_pattern3, n_pattern3, c_pattern8, n_pattern8))

    return c_pattern3, n_pattern3, c_pattern8, n_pattern8, occ_sum3, occ_sum8
Example #11
0
Since we want to perform a six-frame translation we have to look at
the complementary strand of the genome as well.
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import biotite
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez
import matplotlib.pyplot as plt

# Download Porcine circovirus genome
file_name = entrez.fetch(
    "KP282147", biotite.temp_dir(), "fa", "nuccore", "fasta"
)
fasta_file = fasta.FastaFile()
fasta_file.read(file_name)
genome = fasta.get_sequence(fasta_file)
# Translate the forward strand and list each protein with its position
proteins, positions = genome.translate()
print("Forward strand:")
for protein, position in zip(proteins, positions):
    print(f"{position[0]:4d} - {position[1]:4d}:   {str(protein)}")
print("\n")
# Translate the complementary strand as well
genome_rev = genome.reverse().complement()
proteins, positions = genome_rev.translate()
print("Reverse strand:")
# License: BSD 3 clause

import biotite
import biotite.structure as struc
import biotite.structure.io as strucio
import biotite.structure.io.pdbx as pdbx
import biotite.database.rcsb as rcsb
import numpy as np

# The output file names
# Modify these values for actual file output
ku_dna_file = biotite.temp_file("ku_dna.cif")
ku_file = biotite.temp_file("ku.cif")

# Download and parse structure files
# (presumably 1JEY is the DNA-bound and 1JEQ the free structure,
# judging from the variable names)
file = rcsb.fetch("1JEY", "mmtf", biotite.temp_dir())
ku_dna = strucio.load_structure(file)
file = rcsb.fetch("1JEQ", "mmtf", biotite.temp_dir())
ku = strucio.load_structure(file)
# Remove DNA and water:
# keep only chains A and B, then drop all solvent atoms
ku_dna = ku_dna[(ku_dna.chain_id == "A") | (ku_dna.chain_id == "B")]
ku_dna = ku_dna[~struc.filter_solvent(ku_dna)]
ku = ku[~struc.filter_solvent(ku)]
# The structures have a differing amount of atoms missing
# at the start and end of the structure
# -> Find common structure
ku_dna_common = ku_dna[struc.filter_intersection(ku_dna, ku)]
ku_common = ku[struc.filter_intersection(ku, ku_dna)]
# Superimpose; the third argument is a mask selecting the CA atoms
ku_superimposed, transformation = struc.superimpose(
    ku_dna_common, ku_common, (ku_common.atom_name == "CA"))
Example #13
0
Downloading structure files from the *RCSB PDB* is quite easy:
Simply specify the PDB ID, the file format and the target directory
for the :func:`fetch()` function and you are done.
The function even returns the path to the downloaded file, so you
can just load it via the other *Biotite* subpackages
(more on this later).
We will download a protein structure of the miniprotein *TC5b*
(PDB: 1L2Y) into a temporary directory.
"""

from os.path import relpath
import biotite
import biotite.database.rcsb as rcsb

# Fetch a single file and show its path relative to the working directory
file_path = rcsb.fetch("1l2y", "pdb", biotite.temp_dir())
print(relpath(file_path))

########################################################################
# In case you want to download multiple files, you are able to specify a
# list of PDB IDs, which in return gives you a list of file paths.

# Download files in the more modern mmCIF format
file_paths = rcsb.fetch(["1l2y", "1aki"], "cif", biotite.temp_dir())
# Print every returned path relative to the working directory
print([relpath(file_path) for file_path in file_paths])

########################################################################
# By default :func:`fetch()` checks whether the file to be fetched
# already exists in the directory, and downloads it, if it does not
# exist yet.
# If you want to download files irrespectively, set :obj:`overwrite` to
Example #14
0
    [0.40, -81.83, 4.91, -100.59, 85.50, -71.65, 130.78, 84.98],
    [119.14, -102.58, 130.83, -67.91, 121.55, 76.25, -2.95, -90.88],
    [130.68, -56.92, 119.26, 77.85, 10.42, -99.43, 141.40, -98.01],
    [114.32, -121.47, 118.14, 82.88, -150.05, -83.81, 23.35, -85.82],
    [117.16, -95.41, 140.40, -59.35, -29.23, -72.39, -25.08, -76.16],
    [139.20, -55.96, -32.70, -68.51, -26.09, -74.44, -22.60, -71.74],
    [-39.62, -64.73, -39.52, -65.54, -38.88, -66.89, -37.76, -70.19],
    [-35.34, -65.03, -38.12, -66.34, -29.51, -89.10, -2.91, 77.90],
    [-45.29, -67.44, -27.72, -87.27, 5.13, 77.49, 30.71, -93.23],
    [-27.09, -86.14, 0.30, 59.85, 21.51, -96.30, 132.67, -92.91],
])

# Fetch animal lysozyme structures
lyso_files = rcsb.fetch(["1REX", "1AKI", "1DKJ", "1GD6"],
                        format="mmtf",
                        target_path=biotite.temp_dir())
# Organism names, presumably in the same order as the PDB IDs above
organisms = ["H. sapiens", "G. gallus", "C. viginianus", "B. mori"]

# Create a PB sequence from each structure
pb_seqs = []
for file_name in lyso_files:
    file = mmtf.MMTFFile()
    file.read(file_name)
    # Take only the first model into account
    array = mmtf.get_structure(file, model=1)
    # Remove everything but the first protein chain
    array = array[struc.filter_amino_acids(array)]
    array = array[array.chain_id == array.chain_id[0]]

    # Calculate backbone dihedral angles,
    # as the PBs are determined from them
Example #15
0
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import biotite
import biotite.structure as struc
import biotite.structure.io as strucio
import biotite.database.rcsb as rcsb
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import colors
import scipy.stats as sts

# Download and parse file
file = rcsb.fetch("3vkh", "cif", biotite.temp_dir())
atom_array = strucio.load_structure(file)
# Calculate backbone dihedral angles
# from one of the two identical chains in the asymmetric unit
phi, psi, omega = struc.dihedral_backbone(
    atom_array[atom_array.chain_id == "A"])
# Conversion from radians into degree
# (only phi and psi are converted, omega is left untouched)
phi *= 180 / np.pi
psi *= 180 / np.pi
# Remove invalid values (NaN) at first and last position
phi = phi[1:-1]
psi = psi[1:-1]

# Plot density
# Create the figure with a single axes to draw on
figure = plt.figure()
ax = figure.add_subplot(111)
Example #16
0
and the selectivity filter of the channel protein KcsA (PDB: 2KB1).
The structure was resolved using NMR, so multiple models are present
in the structure.
Hence, we can also calculate the frequency of each bond.
"""

# Code source: Daniel Bauer
# License: BSD 3 clause

import biotite
import matplotlib.pyplot as plt
import biotite.structure as struc
import biotite.structure.io as strucio
import biotite.database.rcsb as rcsb

# Download and parse the structure; it contains multiple models
file_name = rcsb.fetch("2KB1", "mmtf", biotite.temp_dir())
stack = strucio.load_structure(file_name)
# Four identical chains, consider only chain A
# (the first index keeps all models, the second one filters atoms)
chain_a = stack[:, stack.chain_id == "A"]
# Selection for p-helix: residue IDs 40 to 52
p_helix = (chain_a.res_id >= 40) & (chain_a.res_id <= 52)
# Selection for selectivity filter: residue IDs 53 to 58
sf = (chain_a.res_id >= 53) & (chain_a.res_id <= 58)

# Calculate the hydrogen bonds and the frequency of each bond
# between the two selections
triplets, mask = struc.hbond(chain_a, selection1=p_helix, selection2=sf)
freq = struc.hbond_frequency(mask)

# Create names of bonds
# Template with placeholders for the donor (d_*) and acceptor (a_*) side
label = "{d_resid}{d_resnm}-{d_a} -- {a_resid}{a_resnm}-{a_a}"
names = [label.format(
Example #17
0
# For our purpose, we will work on a protein structure as small as
# possible, namely the miniprotein *TC5b* (PDB: ``1L2Y``).
# The structure of this 20-residue protein (304 atoms) has been
# elucidated via NMR.
# Thus, the corresponding PDB file consists of multiple (namely 38)
# models, each showing another conformation.
#
# .. currentmodule:: biotite.structure.io.pdb
#
# At first we load the structure from a PDB file via the class
# :class:`PDBFile` in the subpackage :mod:`biotite.structure.io.pdb`.

import biotite
import biotite.structure.io.pdb as pdb
import biotite.database.rcsb as rcsb
# Download the PDB file and read it into a PDBFile object
pdb_file_path = rcsb.fetch("1l2y", "pdb", biotite.temp_dir())
file = pdb.PDBFile()
file.read(pdb_file_path)
# No 'model' given -> all models are returned
tc5b = file.get_structure()
# Show the type of the result, its number of models
# and its number of atoms
print(type(tc5b).__name__)
print(tc5b.stack_depth())
print(tc5b.array_length())

########################################################################
# The method :func:`PDBFile.get_structure()` returns an atom array stack
# unless the :obj:`model` parameter is specified,
# even if the file contains only one model.
# Alternatively, the module level function :func:`get_structure()`
# can be used.
# The following example
# shows how to write an array or stack back into a PDB file:
Example #18
0
########################################################################
# As test case a structure of a *cysteine knot* protein is used,
# specifically the squash trypsin inhibitor *EETI-II*
# (PDB: `2IT7 <http://www.rcsb.org/structure/2IT7>`_).
# This motif is famous for its three characteristic disulfide bridges
# forming a 'knot'.
# However, the loaded MMTF file already has information about the
# covalent bonds - including the disulfide bridges.
# To have a proper test case, all disulfide bonds are removed from the
# structure and we pretend that the structure never had information
# about the disulfide bonds.
# For later verification that the implemented function works correctly,
# the disulfide bonds that are removed are printed out.

# Download and parse the first model, including its bond information
mmtf_file = mmtf.MMTFFile()
mmtf_file.read(rcsb.fetch("2IT7", "mmtf", biotite.temp_dir()))
knottin = mmtf.get_structure(mmtf_file, include_bonds=True, model=1)
# Indices of the SG atoms of all cysteine residues
sulfide_indices = np.where((knottin.res_name == "CYS")
                           & (knottin.atom_name == "SG"))[0]
# Each row holds two atom indices; the third value is not needed here
for i, j, _ in knottin.bonds.as_array():
    # A bond between two cysteine SG atoms is a disulfide bridge:
    # print both atoms and remove the bond from the structure
    if i in sulfide_indices and j in sulfide_indices:
        print(knottin[i])
        print(knottin[j])
        print()
        knottin.bonds.remove_bond(i, j)

########################################################################
# Now the sanitized structure is put into the disulfide detection
# function.
# The detected bonds are printed out and we expect to see the same
# bonds, that were removed in the code snippet above.
Example #19
0
########################################################################
# As test case a structure of a *cysteine knot* protein is used,
# specifically the squash trypsin inhibitor *EETI-II*
# (PDB: `2IT7 <http://www.rcsb.org/structure/2IT7>`_).
# This motif is famous for its three characteristic disulfide bridges
# forming a 'knot'.
# However, the loaded MMTF file already has information about the
# covalent bonds - including the disulfide bridges.
# To have a proper test case, all disulfide bonds are removed from the
# structure and we pretend that the structure never had information
# about the disulfide bonds.
# For later verification that the implemented function works correctly,
# the disulfide bonds that are removed are printed out.

# Download and parse the first model, including its bond information
mmtf_file = mmtf.MMTFFile.read(
    rcsb.fetch("2IT7", "mmtf", biotite.temp_dir())
)
knottin = mmtf.get_structure(mmtf_file, include_bonds=True, model=1)
# Indices of the SG atoms of all cysteine residues
sulfide_indices = np.where(
    (knottin.res_name == "CYS") & (knottin.atom_name == "SG")
)[0]
# Each row holds two atom indices; the third value is not needed here
for i, j, _ in knottin.bonds.as_array():
    # A bond between two cysteine SG atoms is a disulfide bridge:
    # print both atoms and remove the bond from the structure
    if i in sulfide_indices and j in sulfide_indices:
        print(knottin[i])
        print(knottin[j])
        print()
        knottin.bonds.remove_bond(i,j)

########################################################################
# Now the sanitized structure is put into the disulfide detection
# function.
Example #20
0
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import biotite
import biotite.sequence as seq
import biotite.sequence.graphics as graphics
import biotite.sequence.io.genbank as gb
import biotite.database.entrez as entrez
import numpy as np
import matplotlib.pyplot as plt

# Download E. coli BL21 genome
file_name = entrez.fetch("CP001509",
                         biotite.temp_dir(),
                         suffix="gb",
                         db_name="nuccore",
                         ret_type="gb")
gb_file = gb.GenBankFile()
gb_file.read(file_name)
# Only the second locus field is needed; it is used as sequence length
_, seq_length, _, _, _, _ = gb.get_locus(gb_file)
# Restrict the annotation to 'gene' features
annotation = gb.get_annotation(gb_file, include_only=["gene"])
# Find the minimum and maximum locations of lac genes
# Start with the widest possible bounds, to be narrowed down afterwards
min_loc = seq_length
max_loc = 1
for feature in annotation:
    for loc in feature.locs:
        # Ignore if feature is only a pseudo-gene (e.g. gene fragment)
        # and check if feature is lacA gene (begin of lac operon)
        if "gene" in feature.qual \
Example #21
0
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import biotite
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.graphics as graphics
import biotite.application.muscle as muscle
import biotite.application.blast as blast
import biotite.database.entrez as entrez
import matplotlib.pyplot as plt

# Fetch the Streptococcus pyogenes Cas9 sequence as reference
file_name = entrez.fetch(
    "Q99ZW2", biotite.temp_dir(), "fa", "protein", "fasta"
)
file = fasta.FastaFile.read(file_name)
ref_seq = fasta.get_sequence(file)
# Search homologous proteins with NCBI Blast,
# restricted to the UniProt/SwissProt database
blast_app = blast.BlastWebApp(
    "blastp", ref_seq, "swissprot", obey_rules=False
)
blast_app.start()
blast_app.join()
alignments = blast_app.get_alignments()
# Keep the IDs of all hits scoring above 200
hits = [ali.hit_id for ali in alignments if ali.score > 200]
# Get the sequences from hit IDs
hit_seqs = []
Example #22
0
    multi_line=False,
    loc_range=(1, 100),
    # Register our drawing functions
    feature_plotters=[HelixPlotter(), SheetPlotter()])
fig.tight_layout()

########################################################################
# Now let us do some serious application.
# We want to visualize the secondary structure of one monomer of the
# homodimeric transketolase (PDB: 1QGD).
# The simplest way to do that, is to fetch the corresponding GenBank
# file, extract an `Annotation` object from the file and draw the
# annotation.

# Fetch GenBank file of the TK's first chain and extract annotation
file_name = entrez.fetch("1QGD_A", biotite.temp_dir(), "gb", "protein", "gb")
gb_file = gb.GenBankFile()
gb_file.read(file_name)
# Keep only the secondary structure ('SecStr') features
annotation = gb.get_annotation(gb_file, include_only=["SecStr"])
# Length of the sequence (second field of the locus information)
_, length, _, _, _, _ = gb.get_locus(gb_file)

# Create a figure with a single axes for the feature map
fig = plt.figure(figsize=(8.0, 3.0))
ax = fig.add_subplot(111)
graphics.plot_feature_map(
    ax,
    annotation,
    symbols_per_line=150,
    show_numbers=True,
    show_line_position=True,
    # 'loc_range' takes exclusive stop -> length+1 is required
Example #23
0
    [119.14, -102.58, 130.83,  -67.91,  121.55,   76.25,  -2.95,  -90.88],
    [130.68,  -56.92, 119.26,   77.85,   10.42,  -99.43, 141.40,  -98.01],
    [114.32, -121.47, 118.14,   82.88, -150.05,  -83.81,  23.35,  -85.82],
    [117.16,  -95.41, 140.40,  -59.35,  -29.23,  -72.39, -25.08,  -76.16],
    [139.20,  -55.96, -32.70,  -68.51,  -26.09,  -74.44, -22.60,  -71.74],
    [-39.62,  -64.73, -39.52,  -65.54,  -38.88,  -66.89, -37.76,  -70.19],
    [-35.34,  -65.03, -38.12,  -66.34,  -29.51,  -89.10,  -2.91,   77.90],
    [-45.29,  -67.44, -27.72,  -87.27,    5.13,   77.49,  30.71,  -93.23],
    [-27.09,  -86.14,   0.30,   59.85,   21.51,  -96.30, 132.67,  -92.91],
])


# Fetch animal lysozyme structures
lyso_files = rcsb.fetch(
    ["1REX", "1AKI", "1DKJ", "1GD6"],
    format="mmtf", target_path=biotite.temp_dir()
)
# Organism names, presumably in the same order as the PDB IDs above
organisms = ["H. sapiens", "G. gallus", "C. viginianus", "B. mori"]

# Create a PB sequence from each structure
pb_seqs = []
for file_name in lyso_files:
    file = mmtf.MMTFFile.read(file_name)
    # Take only the first model into account
    array = mmtf.get_structure(file, model=1)
    # Remove everything but the first protein chain
    array = array[struc.filter_amino_acids(array)]
    array = array[array.chain_id == array.chain_id[0]]
    
    # Calculate backbone dihedral angles,
    # as the PBs are determined from them
Example #24
0
The values in the adjacency matrix ``m`` are
``m[i,j] = 1 if distance(i,j) <= threshold else 0``. 
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import biotite
import biotite.structure as struc
import biotite.structure.io as strucio
import biotite.database.rcsb as rcsb
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# Download and parse the structure
file_name = rcsb.fetch("1aki", "mmtf", biotite.temp_dir())
array = strucio.load_structure(file_name)
# We only consider CA atoms
ca = array[array.atom_name == "CA"]
# 7 Angstrom adjacency threshold
threshold = 7
# Create cell list of the CA atom array
# for efficient measurement of adjacency
cell_list = struc.CellList(ca, cell_size=threshold)
adjacency_matrix = cell_list.create_adjacency_matrix(threshold)

figure = plt.figure()
ax = figure.add_subplot(111)
# Two-color map for the two possible values of the matrix
cmap = ListedColormap(["white", biotite.colors["dimgreen"]])
# Alternative plotting call, currently disabled:
#ax.matshow(adjacency_matrix, cmap=cmap, origin="lower")
# Use the residue IDs as coordinates on both axes
ax.pcolormesh(ca.res_id, ca.res_id, adjacency_matrix, cmap=cmap)
def analyze_chirality(array):
    """Return one enantiomer code per amino acid residue of *array*.

    Only the backbone atoms and the CB atom of each residue are
    considered.  A residue for which this selection does not contain
    exactly four atoms (e.g. glycine, which has no chirality) gets the
    code 0; otherwise the code is the result of ``get_enantiomer()`` on
    the four atom coordinates.
    """
    # Reduce the structure to backbone + CB atoms of amino acids
    amino_acids = array[struc.filter_amino_acids(array)]
    keep = (amino_acids.atom_name == "CB") | struc.filter_backbone(amino_acids)
    selected = amino_acids[keep]

    res_ids, _ = struc.get_residues(selected)
    # Zero-initialized, so residues without four atoms keep the code 0
    enantiomers = np.zeros(len(res_ids), dtype=int)
    for index, res_id in enumerate(res_ids):
        coord = selected.coord[selected.res_id == res_id]
        if len(coord) == 4:
            enantiomers[index] = get_enantiomer(*coord)
    return enantiomers


# Fetch and parse structure file
file = rcsb.fetch("1l2y", "mmtf", biotite.temp_dir())
stack = strucio.load_structure(file)
# Get first model
array = stack[0]
# Get enantiomers
print("1l2y            ", analyze_chirality(array))
# Reflected structures have opposite enantiomers
# Test via reflection at x-y-plane, z -> -z
# Work on a copy, so the original coordinates stay unchanged
array_reflect = array.copy()
array_reflect.coord[:, 2] *= -1
print("1l2y (reflected)", analyze_chirality(array_reflect))
Example #26
0
def test_fetch_invalid(format):
    """Expect a ``RequestError`` when fetching the ID ``"xxxx"``."""
    with pytest.raises(RequestError):
        rcsb.fetch(
            "xxxx", format, biotite.temp_dir(),
            overwrite=True,
        )
Example #27
0
# Code source: Patrick Kunzmann
# License: BSD 3 clause

import biotite
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.io.genbank as gb
import biotite.sequence.graphics as graphics
import biotite.sequence.align as align
import biotite.database.entrez as entrez
import numpy as np
import matplotlib.pyplot as plt

# Download E. coli BL21 genome
file_name = entrez.fetch("CP001509", biotite.temp_dir(), "gb", "nuccore", "gb")
gb_file = gb.GenBankFile()
gb_file.read(file_name)
# Sequence with annotation, restricted to 'gene' features
annot_seq = gb_file.get_annotated_sequence(include_only=["gene"])
# Find leuL gene
# If several features match, the last one wins.
# NOTE: if no feature matches, 'leul_feature' is never assigned
# and the indexing below raises a NameError
for feature in annot_seq.annotation:
    if "gene" in feature.qual and feature.qual["gene"] == "leuL":
        leul_feature = feature
# Get leuL sequence
leul_seq = annot_seq[leul_feature]

# Download Salmonella enterica genome without annotations
# ('file_name' is reused for the second download)
file_name = entrez.fetch("CP019649", biotite.temp_dir(), "fa", "nuccore",
                         "fasta")
fasta_file = fasta.FastaFile()
fasta_file.read(file_name)
Example #28
0
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import itertools
import numpy as np
import biotite
import biotite.sequence as seq
import biotite.sequence.io.genbank as gb
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez

# Get the E. coli K-12 genome as annotated sequence
gb_file = gb.GenBankFile.read(
    entrez.fetch("U00096", biotite.temp_dir(), "gb", "nuccore", "gb"))
# We are only interested in CDS features
k12_genome = gb.get_annotated_sequence(gb_file, include_only=["CDS"])

# This dictionary will count how often each codon occurs in the genome
# For increased performance the dictionary uses symbol codes ([0 3 2])
# instead of symbols (['A' 'T' 'G']) as keys
# The keys are all 3-tuples of symbol codes (cartesian product of the
# code range with itself), each one starting with a count of 0
codon_counter = {
    codon: 0
    for codon in itertools.product(
        *([range(len(k12_genome.sequence.alphabet))] * 3))
}
# For demonstration purposes print the 64 codons in symbol code form
print(list(codon_counter.keys()))

########################################################################
Example #29
0
It is basically very similar to using normal functions.

In the following sections you will get an overview over the mentioned
subpackages, so go and grab some tea and cookies and let us begin.

Preliminary note
----------------

The files used in this tutorial will be stored in a temporary directory.
The top level package :mod:`biotite` provides functionality to create
a temporary directory,
called ``.biotitetemp`` in your current working directory.
You can either obtain the path to this directory via
:func:`temp_dir` or directly create an unambiguous file name in this
directory using :func:`temp_file`.

At the end of the session the temporary directory and all its contents
will be automatically deleted, so make sure to put the files you want
to keep somewhere else.
"""

from os.path import relpath
import biotite
# Create the temporary directory and show its path
# relative to the working directory
dir_path = biotite.temp_dir()
print(relpath(dir_path))
# Get a path to a temporary FASTA file
# This would also create the temporary directory,
# if it had not been created yet
file_path = biotite.temp_file("fasta")
print(relpath(file_path))
Example #30
0
def plot_gaps(pdb_id, chain_id, ax):
    """Draw the residue coverage of one chain as colored rectangles.

    Every residue ID from 1 to the last residue ID of the chain is
    assigned one of three states: missing (no atom with this residue ID,
    red), present but named ``UNK`` (yellow), or present (green).
    Consecutive residues of equal state are drawn as one rectangle.

    Parameters
    ----------
    pdb_id : str
        PDB ID of the structure, fetched in MMTF format.
    chain_id : str
        ID of the chain to be analyzed.
    ax : matplotlib Axes
        The axes the rectangles are added to; its spines, y-axis and
        limits are adjusted as well.
    """
    state_colors = ("firebrick", "gold", "forestgreen")

    # Download and parse structure file
    path = rcsb.fetch(pdb_id, "mmtf", biotite.temp_dir())
    atom_array = strucio.load_structure(path)
    # Consider only one chain
    atom_array = atom_array[atom_array.chain_id == chain_id]
    # Array for saving the 'green', 'yellow' and 'red' state;
    # index i belongs to residue ID i + 1
    states = np.zeros(atom_array.res_id[-1], dtype=int)
    for i in range(len(states)):
        # Get array for only one residue ID
        residue = atom_array[atom_array.res_id == i + 1]
        if len(residue) == 0:
            # not existing
            states[i] = 0
        elif residue.res_name[0] == "UNK":
            # existing but polyalanine
            states[i] = 1
        else:
            # existing
            states[i] = 2

    # Find the intervals for each state
    # as (start, exclusive stop, state) tuples
    state_intervals = []
    curr_state = None
    curr_start = None
    for i, state in enumerate(states):
        if curr_start is None:
            curr_start = i
            curr_state = state
        elif state != curr_state:
            state_intervals.append((curr_start, i, curr_state))
            curr_start = i
            curr_state = state
    if curr_start is not None:
        # The stop is exclusive, so the last interval must end at
        # len(states); ending it at the last index would cut off
        # the final residue
        state_intervals.append((curr_start, len(states), curr_state))

    # Draw the state intervals as colored rectangles;
    # a residue with ID n covers the x-range [n - 0.5, n + 0.5]
    for start, stop, state in state_intervals:
        ax.add_patch(
            Rectangle((start + 1 - 0.5, 0),
                      stop - start,
                      1,
                      edgecolor="None",
                      facecolor=state_colors[state]))
    # Some other visual stuff
    for side in ("left", "bottom", "right", "top"):
        ax.spines[side].set_visible(False)
    ax.yaxis.set_visible(False)
    ax.set_xlim(0.5, len(states) + 0.5)
    ax.set_ylim(0, 2)