Exemple #1
0
def test_dssp(path):
    """
    Compare the secondary structure computed by the local DSSP
    installation with the annotation stored in an MMTF file.

    Only the non-hetero atoms of the first chain in the first model are
    evaluated.
    The test passes silently if the MMTF file carries no secondary
    structure for that chain (all codes are ``-1``).

    Parameters
    ----------
    path : str
        Path of the MMTF file to be read.
    """
    # Translation of the integer codes in the 'secStructList' field
    # into one-letter secondary structure codes
    sec_struct_codes = {
        0: "I",
        1: "S",
        2: "H",
        3: "E",
        4: "G",
        5: "B",
        6: "T",
        7: "C"
    }

    mmtf_file = mmtf.MMTFFile.read(path)
    array = mmtf.get_structure(mmtf_file, model=1)
    array = array[~array.hetero]
    first_chain_id = array.chain_id[0]
    chain = array[array.chain_id == first_chain_id]

    n_residues = struc.get_residue_count(chain)
    # The secondary structure annotation in the PDB also uses DSSP
    # -> compare PDB and local DSSP
    sse = mmtf_file["secStructList"]
    sse = sse[:n_residues]
    if (sse == -1).all():
        # First chain is not a polypeptide chain (presumably DNA/RNA)
        # DSSP not applicable -> return
        return
    sse = np.array([sec_struct_codes[code] for code in sse], dtype="U1")

    sse_from_app = DsspApp.annotate_sse(chain)
    # PDB uses different DSSP version -> slight differences possible
    # -> only 90% must be identical
    assert np.count_nonzero(sse_from_app == sse) / len(sse) > 0.9
Exemple #2
0
def test_get_residues(array):
    """
    Check that `get_residues()` yields the expected residue IDs and
    names for the given structure and that the number of returned
    residues agrees with `get_residue_count()`.
    """
    expected_names = [
        "ASN", "LEU", "TYR", "ILE", "GLN",
        "TRP", "LEU", "LYS", "ASP", "GLY",
        "GLY", "PRO", "SER", "SER", "GLY",
        "ARG", "PRO", "PRO", "PRO", "SER",
    ]
    # Residue IDs are expected to run consecutively from 1 to 20
    expected_ids = list(range(1, len(expected_names) + 1))

    res_ids, res_names = struc.get_residues(array)

    assert res_ids.tolist() == expected_ids
    assert res_names.tolist() == expected_names
    assert len(res_ids) == struc.get_residue_count(array)
Exemple #3
0
def plot_rna(pdb_id, axes):
    """
    Fetch a structure from the RCSB PDB and draw the secondary
    structure of its nucleotides onto the given axes.

    Parameters
    ----------
    pdb_id : str
        The PDB ID of the structure to be downloaded.
        It is also used as title of the plot.
    axes : Axes
        The *Matplotlib* axes to plot on.
    """
    # Fetch the PDB file into the temporary directory and keep only
    # the nucleotides of the first model
    file_path = rcsb.fetch(pdb_id, "pdb", gettempdir())
    structure = pdb.get_structure(pdb.PDBFile.read(file_path))[0]
    nucleotides = structure[struc.filter_nucleotides(structure)]

    # Base pairs are reported as atom indices
    # -> convert them into residue positions, keeping the pair layout
    pair_atom_indices = struc.base_pairs(nucleotides)
    residue_positions = struc.get_residue_positions(
        nucleotides, pair_atom_indices.flatten()
    )
    base_pairs = residue_positions.reshape(pair_atom_indices.shape)
    # Only the first entry returned by 'pseudoknots()' is used
    pseudoknot_order = struc.pseudoknots(base_pairs)[0]
    n_pairs = base_pairs.shape[0]

    # Solid lines by default, other line styles for pseudoknot order
    # 1 and 2
    linestyles = np.full(n_pairs, "-", dtype=object)
    for order, style in ((1, "--"), (2, ":")):
        linestyles[pseudoknot_order == order] = style

    # Upper case one-letter-code for canonical nucleotides,
    # lower case one-letter-code for non-canonical nucleotides
    base_labels = []
    for residue in struc.residue_iter(nucleotides):
        code, is_exact = struc.map_nucleotide(residue)
        base_labels.append(code if is_exact else code.lower())

    # Canonical Watson-Crick base pairs get the darker orange,
    # all other base pairs keep the lighter orange
    watson_crick_pairs = (["A", "U"], ["C", "G"])
    colors = np.full(n_pairs, biotite.colors["brightorange"])
    for i, (pos1, pos2) in enumerate(base_pairs):
        label_pair = sorted([base_labels[pos1], base_labels[pos2]])
        if label_pair in watson_crick_pairs:
            colors[i] = biotite.colors["dimorange"]

    # Draw the secondary structure
    graphics.plot_nucleotide_secondary_structure(
        axes,
        base_labels,
        base_pairs,
        struc.get_residue_count(nucleotides),
        pseudoknot_order=pseudoknot_order,
        bond_linestyle=linestyles,
        bond_color=colors,
        # Margin to compensate for reduced axis limits in shared axis
        border=0.13,
    )

    # Each plot is labeled with its PDB ID
    axes.set_title(pdb_id, loc="left")
                                                   atom_name == "CA"]
# In all models of the right kinase trajectory, keep only the CA atoms
ca_trajectory_kinase_right = trajectory_kinase_right[:,
                                                     trajectory_kinase_right.
                                                     atom_name == "CA"]

# RMSF of the CA atoms with respect to the average structure;
# the upper y-axis limit gets 10 % headroom above the largest value
rmsf_kinase_left = struc.rmsf(struc.average(ca_trajectory_kinase_left),
                              ca_trajectory_kinase_left)
rmsf_upper_kinase_left = rmsf_kinase_left.max() * 1.1

rmsf_kinase_right = struc.rmsf(struc.average(ca_trajectory_kinase_right),
                               ca_trajectory_kinase_right)
rmsf_upper_kinase_right = rmsf_kinase_right.max() * 1.1

# Two plots stacked vertically: left kinase on top, right kinase below
fig, (ax1, ax2) = plt.subplots(2, 1)

# The x-values are the 1-based residue positions shifted by 2801
# NOTE(review): 2801 is presumably the residue numbering offset of the
# kinase within the full-length protein - confirm
res_count = struc.get_residue_count(trajectory_kinase_left)
ax1.plot(np.arange(1, res_count + 1) + 2801,
         rmsf_kinase_left,
         color=biotite.colors["dimorange"])
ax1.set_title("Kinase Left")
# Disabled vertical marker lines at x = 3828 and x = 3838:
#ax1.axvline(3828, ls="--", color="k")
#ax1.axvline(3838, ls="--", color="k")
ax1.set_xlim(2801 + 1, 2801 + res_count)
ax1.set_ylim(0, rmsf_upper_kinase_left)
ax1.set_xlabel("Residue")
ax1.set_ylabel("RMSF (Å)")

# Same plot for the right kinase, 'res_count' is reused
res_count = struc.get_residue_count(trajectory_kinase_right)
ax2.plot(np.arange(1, res_count + 1) + 2801,
         rmsf_kinase_right,
         color=biotite.colors["dimorange"])
Exemple #5
0
            annotation += "W"
        elif edge == 2:
            annotation += "H"
        else:
            annotation += "S"
        base_labels[base] = annotation

# Create a square Matplotlib figure with a single axes
fig, ax = plt.subplots(figsize=(8.0, 8.0))

# Plot the secondary structure: each base is drawn with its label
# from 'base_labels' and each base pair bond is colored according to
# 'colors'
graphics.plot_nucleotide_secondary_structure(
    ax,
    base_labels,
    base_pairs,
    struc.get_residue_count(nucleotides),
    bond_color=colors)

# Display the plot
plt.show()

########################################################################
# The sarcin-ricin loop is part of the 23S rRNA and is considered
# crucial to the ribosome's activity. The incorporation of the
# Leontis-Westhof nomenclature into the 2D-plot shows how the individual
# base pairs are oriented and how their glycosidic bonds are oriented
# relative to each other.
#
# This visualization enables one to see a pattern that cannot be
# communicated through the 2D structure alone. The upper part of the
# sarcin-ricin loop consists of only cis (c) oriented glycosidic bonds.
import biotite.structure as struc
import biotite.structure.io as strucio
import biotite.structure.io.xtc as xtc
from biotite.application.dssp import DsspApp

# Put here the path of the downloaded files
templ_file_path = "../../download/lysozyme_md.pdb"
traj_file_path = "../../download/lysozyme_md.xtc"

# The XTC file contains only coordinates
# -> the annotations are taken from the template structure
xtc_file = xtc.XTCFile.read(traj_file_path)
traj = xtc_file.get_structure(template=strucio.load_structure(templ_file_path))
time = xtc_file.get_time()
# Restrict the trajectory to the amino acid residues
traj = traj[:, struc.filter_amino_acids(traj)]

# DSSP does not assign an SSE to the last residue -> -1
# One row per frame, one column per annotated residue
sse = np.empty((traj.shape[0], struc.get_residue_count(traj) - 1), dtype='U1')
for idx, frame in enumerate(traj):
    # Run DSSP on each frame and wait for it to finish
    app = DsspApp(frame)
    app.start()
    app.join()
    sse[idx] = app.get_sse()


# Matplotlib needs numbers to assign colors correctly
def sse_to_num(sse):
    num = np.empty(sse.shape, dtype=int)
    num[sse == 'C'] = 0
    num[sse == 'E'] = 1
    num[sse == 'B'] = 2
    num[sse == 'S'] = 3
    num[sse == 'T'] = 4
Exemple #7
0
        else:
            annotation = "t"
        if edge == 1:
            annotation += "W"
        elif edge == 2:
            annotation += "H"
        else:
            annotation += "S"
        base_labels[base] = annotation

# Create a square Matplotlib figure with a single axes
fig, ax = plt.subplots(figsize=(8.0, 8.0))

# Plot the secondary structure: each base is drawn with its label
# from 'base_labels' and each base pair bond is colored according to
# 'colors'
graphics.plot_nucleotide_secondary_structure(
    ax, base_labels, base_pairs, struc.get_residue_count(nucleotides),
    bond_color=colors
)

# Display the plot
plt.show()

########################################################################
# The sarcin-ricin loop is part of the 23S rRNA and is considered
# crucial to the ribosome's activity. The incorporation of the
# Leontis-Westhof nomenclature into the 2D-plot shows how the individual 
# base pairs are oriented and how their glycosidic bonds are oriented 
# relative to each other.
#
# This visualization enables one to see a pattern that cannot be 
# communicated through the 2D structure alone. The upper part of the 
Exemple #8
0
def rmsf_plot(topology,
              xtc_traj,
              start_frame=None,
              stop_frame=None,
              write_dat_files=None):
    """
    Plot the RMSD, the radius of gyration and the CA-RMSF of the
    protein part of an MD trajectory.

    Three separate figures are created and displayed via
    ``plt.show()``.
    The RMSD is computed relative to the first frame read, after the
    trajectory has been superimposed onto that frame.
    The RMSF is computed for the CA atoms relative to their average
    structure.

    Parameters
    ----------
    topology : str
        Path of the structure file that serves as template for the
        trajectory.
        Only its amino acid residues are used.
    xtc_traj : str
        Path of the XTC trajectory file.
    start_frame : int, optional
        The first frame to be read.
        By default the trajectory is read from its beginning.
    stop_frame : int, optional
        The last frame to be read (inclusive).
        By default the trajectory is read until its end.
    write_dat_files : bool, optional
        If true, the RMSD and RMSF values are written as
        tab-separated tables into ``rmsd.dat`` and ``rmsf.dat`` in the
        current working directory.
    """
    # Gromacs does not set the element symbol in its PDB files,
    # but Biotite guesses the element names from the atom names,
    # emitting a warning
    template = strucio.load_structure(topology)

    # The structure still has water and ions, that are not needed for our
    # calculations, we are only interested in the protein itself
    # These are removed for the sake of computational speed using a boolean
    # mask
    protein_mask = struc.filter_amino_acids(template)
    template = template[protein_mask]
    residue_names = struc.get_residues(template)[1]

    # 'stop_frame' is inclusive, while the 'stop' of the reader is not
    # -> shift by one, but keep 'None' (read until the end) untouched
    stop = None if stop_frame is None else stop_frame + 1
    xtc_file = XTCFile()
    xtc_file.read(xtc_traj,
                  atom_i=np.where(protein_mask)[0],
                  start=start_frame,
                  stop=stop)

    trajectory = xtc_file.get_structure(template)

    # Simulation time for plotting purposes
    sim_time = xtc_file.get_time()

    trajectory = struc.remove_pbc(trajectory)
    # The returned transformation is not needed
    trajectory, _ = struc.superimpose(trajectory[0], trajectory)

    rmsd = struc.rmsd(trajectory[0], trajectory)
    _plot_series(
        sim_time, rmsd,
        xlim=(sim_time[0], sim_time[-1]), ylim=(0, 2),
        xlabel="Time (ps)", ylabel="RMSD (Å)"
    )

    radius = struc.gyration_radius(trajectory)
    _plot_series(
        sim_time, radius,
        xlim=(sim_time[0], sim_time[-1]), ylim=(14.0, 14.5),
        xlabel="Time (ps)", ylabel="Radius of gyration (Å)"
    )

    # In all models, mask the CA atoms
    ca_trajectory = trajectory[:, trajectory.atom_name == "CA"]
    rmsf = struc.rmsf(struc.average(ca_trajectory), ca_trajectory)
    res_count = struc.get_residue_count(trajectory)
    _plot_series(
        np.arange(1, res_count + 1), rmsf,
        xlim=(1, res_count), ylim=(0, 1.5),
        xlabel="Residue", ylabel="RMSF (Å)"
    )

    if write_dat_files:
        # Write RMSD *.dat file
        # The index is derived from the number of frames actually read,
        # so it always matches the length of the RMSD values,
        # even if no frame range was given
        first_frame = 0 if start_frame is None else start_frame - 1
        frames = np.arange(first_frame, first_frame + len(rmsd), dtype=int)
        # NOTE(review): the first frame is always labeled as 0,
        # irrespective of 'start_frame' - kept from the original
        # implementation, confirm that this is intended
        frames[0] = 0
        df = pd.DataFrame(data=rmsd, index=frames, columns=["RMSD Values"])
        df.index.name = 'Frames'
        df.to_csv('rmsd.dat', header=True, index=True, sep='\t', mode='w')

        # Write RMSF *.dat file
        df1 = pd.DataFrame(data=rmsf,
                           index=residue_names,
                           columns=["RMSF Values"])
        df1.index.name = 'Residues'
        df1.to_csv('rmsf.dat', header=True, index=True, sep='\t', mode='w')
    plt.show()


def _plot_series(x, y, xlim, ylim, xlabel, ylabel):
    """
    Draw `y` against `x` as a line plot in a new 6 x 3 inch figure
    with the given axis limits and axis labels.
    """
    figure = plt.figure(figsize=(6, 3))
    ax = figure.add_subplot(111)
    ax.plot(x, y, color=biotite.colors["dimorange"])
    ax.set_xlim(*xlim)
    ax.set_ylim(*ylim)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    figure.tight_layout()
    return figure
# during the entire simulation.
#
# Let's have a look at single amino acids:
# Which residues fluctuate most?
# To answer this question, we calculate the RMSF
# (root mean square fluctuation).
# It is similar to the RMSD, but instead of averaging over the atoms
# and looking at each time step, we average over the time and look at
# each residue.
# Usually the average model is taken as reference
# (compared to the starting model for RMSD).
#
# Since side chain atoms fluctuate quite a lot, they are not suitable
# for evaluation of the residue flexibility. Therefore, we consider only
# CA atoms.

# In all models, mask the CA atoms
ca_trajectory = trajectory[:, trajectory.atom_name == "CA"]
# The average structure of the CA atoms is the reference for the RMSF
rmsf = struc.rmsf(struc.average(ca_trajectory), ca_trajectory)

figure = plt.figure(figsize=(6, 3))
ax = figure.add_subplot(111)
res_count = struc.get_residue_count(trajectory)
# The x-values are the 1-based residue positions
ax.plot(np.arange(1, res_count + 1), rmsf, color=biotite.colors["dimorange"])
ax.set_xlim(1, res_count)
ax.set_ylim(0, 1.5)
ax.set_xlabel("Residue")
ax.set_ylabel("RMSF (Å)")
figure.tight_layout()

# Display the plot
plt.show()