def api_route():
    """
    Serve the secondary structure annotations of a PDB entry as JSON.

    The query parameters ``pdb_id`` (default ``"1Q2W"``) and ``format``
    (default ``"mmtf"``) are read from the current request.
    The structure file is fetched into the Biotite temporary directory
    and annotated via ``mmtf_sec``, ``dssp_sec`` and ``psea_sec``.

    Returns
    -------
    The ``jsonify`` response containing the sequence of the first
    entity, one list per annotation method (``mmtf``, ``dssp``,
    ``psea``) and the result of ``diff_all`` under ``diffs``.
    """
    pdb_id = request.args.get("pdb_id", "1Q2W")
    file_format = request.args.get("format", "mmtf")
    file_name = rcsb.fetch(pdb_id, file_format, biotite.temp_dir())
    # NOTE(review): the file is always parsed as MMTF, regardless of
    # the requested 'format' - confirm that other formats are intended
    mmtf_file = mmtf.MMTFFile()
    mmtf_file.read(file_name)

    # Each annotation is best-effort:
    # a failing method contributes an empty list instead of an error
    try:
        mmtf_s = mmtf_sec(mmtf_file).tolist()
    except Exception:
        mmtf_s = []
    try:
        dssp_s = dssp_sec(mmtf_file).tolist()
    except Exception:
        dssp_s = []
    try:
        psea_s = psea_sec(mmtf_file).tolist()
    except Exception:
        # Reset the PSEA result itself; the DSSP result must stay intact
        psea_s = []

    structs = {
        "mmtf": mmtf_s,
        "dssp": dssp_s,
        "psea": psea_s,
    }
    return jsonify(
        sequence=list(mmtf_file["entityList"][0]["sequence"]),
        **structs,
        diffs=diff_all(**structs),
    )
def test_fetch(common_name, as_file_like):
    """
    Fetch a protein FASTA entry from Entrez and parse its sequence.

    `common_name` selects the capitalized database name instead of the
    lower case one; `as_file_like` passes ``None`` as target path
    instead of the temporary directory.
    """
    if as_file_like:
        path = None
    else:
        path = biotite.temp_dir()
    if common_name:
        db_name = "Protein"
    else:
        db_name = "protein"

    fetched = entrez.fetch(
        "1L2Y_A", path, "fa", db_name, "fasta", overwrite=True
    )
    parsed = fasta.FastaFile()
    parsed.read(fetched)
    # The test passes if the sequence can be extracted without an error
    fasta.get_sequence(parsed)
def test_fetch_invalid():
    """
    Expect a `ValueError` when fetching the ID ``"xxxx"`` from Entrez.
    """
    target_dir = biotite.temp_dir()
    with pytest.raises(ValueError):
        entrez.fetch(
            "xxxx", target_dir, "fa", "protein", "fasta", overwrite=True
        )
def test_fetch():
    """
    Fetch a protein FASTA entry from Entrez into the temporary
    directory and parse its sequence.
    """
    fetched = entrez.fetch(
        "1L2Y_A", biotite.temp_dir(), "fa", "protein", "fasta",
        overwrite=True
    )
    parsed = fasta.FastaFile()
    parsed.read(fetched)
    # The test passes if the sequence can be extracted without an error
    fasta.get_sequence(parsed)
def fetch_gb_annotation(pdb_chain: str):
    """
    Fetch the GenBank file of a protein chain and extract its
    secondary structure annotation.

    Parameters
    ----------
    pdb_chain : str
        The identifier passed to Entrez, e.g. ``"6FRH_A"``.

    Returns
    -------
    annotation
        The result of ``gb.get_annotation()``, restricted to
        ``SecStr`` features.
    """
    # The file is downloaded from the 'protein' database
    # into the Biotite temporary directory
    file_name = entrez.fetch(
        pdb_chain, biotite.temp_dir(), "gb", "protein", "gb"
    )
    gb_file = gb.GenBankFile()
    gb_file.read(file_name)
    return gb.get_annotation(gb_file, include_only=["SecStr"])
def get_diameter(pdb_id):
    """
    Compute the largest distance between any two amino acid atoms
    of the structure with the given PDB ID.

    The structure is fetched as MMTF file into the Biotite temporary
    directory.
    """
    structure_path = rcsb.fetch(pdb_id, "mmtf", biotite.temp_dir())
    structure = strucio.load_structure(structure_path)
    # Only atoms of amino acid residues take part in the measurement
    amino_acid_mask = struc.filter_amino_acids(structure)
    positions = structure[amino_acid_mask].coord
    # Broadcasting yields the difference vector for each pair of atoms
    displacement = positions[:, np.newaxis, :] - positions[np.newaxis, :, :]
    squared_distances = np.sum(displacement * displacement, axis=-1)
    # The diameter is the maximum pairwise distance
    return np.sqrt(np.max(squared_distances))
def test_fetch(format, as_file_like):
    """
    Fetch the entry ``1l2y`` from the RCSB in the given format and
    parse a structure from it.

    `as_file_like` passes ``None`` as target path instead of the
    temporary directory. Formats other than ``"pdb"``, ``"pdbx"`` and
    ``"mmtf"`` are only fetched, not parsed.
    """
    if as_file_like:
        path = None
    else:
        path = biotite.temp_dir()
    fetched = rcsb.fetch("1l2y", format, path, overwrite=True)

    # Select the file object and the module matching the format
    if format == "pdb":
        structure_file, io_module = pdb.PDBFile(), pdb
    elif format == "pdbx":
        structure_file, io_module = pdbx.PDBxFile(), pdbx
    elif format == "mmtf":
        structure_file, io_module = mmtf.MMTFFile(), mmtf
    else:
        return

    structure_file.read(fetched)
    io_module.get_structure(structure_file)
def test_fetch(format, as_file_like):
    """
    Fetch the entry ``1l2y`` from the RCSB in the given format and
    parse its content.

    `as_file_like` passes ``None`` as target path instead of the
    temporary directory. Structure formats are parsed into a
    structure; a FASTA file must contain at least one sequence.
    """
    if as_file_like:
        path = None
    else:
        path = biotite.temp_dir()
    fetched = rcsb.fetch("1l2y", format, path, overwrite=True)

    if format == "fasta":
        sequence_file = fasta.FastaFile.read(fetched)
        # Test if the file contains any sequences
        assert len(fasta.get_sequences(sequence_file)) > 0
        return

    # Select the file class and the module matching the format
    if format == "pdb":
        file_class, io_module = pdb.PDBFile, pdb
    elif format == "pdbx":
        file_class, io_module = pdbx.PDBxFile, pdbx
    elif format == "mmtf":
        file_class, io_module = mmtf.MMTFFile, mmtf
    else:
        return

    io_module.get_structure(file_class.read(fetched))
def test_doctest(package_name, context_package_names):
    """
    Run the doctests of every module in the given package.

    The public attributes (``__all__``) of all modules of the package
    itself and of the context packages are available as globals in the
    doctests, together with some fixed paths and example objects.
    """
    # The package itself is also used as context
    context_modules = []
    for context_name in context_package_names + [package_name]:
        context_modules.extend(
            _list_modules(import_module(context_name), False)
        )

    # Collect all public attributes as globals for the doctests
    globs = {}
    for context_module_name in context_modules:
        context_module = import_module(context_module_name)
        for attr in context_module.__all__:
            globs[attr] = getattr(context_module, attr)

    # Add fixed names for certain paths
    globs["path_to_directory"] = biotite.temp_dir()
    globs["path_to_structures"] = "./tests/structure/data/"
    globs["path_to_sequences"] = "./tests/sequence/data/"
    # Add frequently used modules
    globs["np"] = np
    # Add frequently used objects
    globs["atom_array_stack"] = strucio.load_structure(
        "./tests/structure/data/1l2y.mmtf"
    )
    globs["atom_array"] = globs["atom_array_stack"][0]

    # Adjust NumPy print formatting
    np.set_printoptions(precision=3, floatmode="maxprec_equal")

    # Run doctests
    flags = doctest.ELLIPSIS | doctest.REPORT_ONLY_FIRST_FAILURE
    for tested_name in _list_modules(import_module(package_name), False):
        mod = import_module(tested_name)
        results = doctest.testmod(
            mod,
            extraglobs=globs,
            optionflags=flags,
            verbose=False,
            report=False,
        )
        try:
            assert results.failed == 0
        except AssertionError:
            # Name the offending module before propagating the failure
            print(f"Failing doctest in module {mod}")
            raise
def _pad_with_coil(chars, codes, coil_char, coil_code):
    """Make a pattern start and end with the coil state."""
    if chars[0] != coil_char:
        chars = coil_char + chars
        codes = [coil_code] + codes
    if chars[-1] != coil_char:
        chars = chars + coil_char
        codes = codes + [coil_code]
    return chars, codes


def build_patterns(structfam, folder):
    """
    Determine the most frequent secondary structure pattern of a
    structure family, both for 3 and for 8 secondary structure states.

    Parameters
    ----------
    structfam : iterable of tuple
        Each element is unpacked into ``(pdb, c, start, end)``:
        the PDB ID, the chain ID and the slice of the per-residue
        secondary structure (``end`` is inclusive).
    folder : str
        The most frequent patterns are handed to ``push`` for the file
        ``{folder}/data.pt``.

    Returns
    -------
    c_pattern3, n_pattern3, c_pattern8, n_pattern8, occ_sum3, occ_sum8
        The most frequent pattern as string and as list of codes for
        3 and 8 states, followed by the occurrence count of every
        observed pattern.
        If no pattern could be built, all six values are ``None``.
    """
    patterns = []
    for pdb, c, start, end in tqdm(structfam):
        file_name = rcsb.fetch(pdb, "mmtf", biotite.temp_dir())
        mmtf_file = mmtf.MMTFFile()
        mmtf_file.read(file_name)
        array = mmtf.get_structure(mmtf_file, model=1)
        tk_dimer = array[struc.filter_amino_acids(array)]
        # The chain ID corresponding to each residue
        # NOTE(review): the start indices refer to the filtered array
        # but index the unfiltered one - presumably this relies on the
        # amino acids leading the structure; confirm
        chain_id_per_res = array.chain_id[struc.get_residue_starts(tk_dimer)]
        sse = mmtf_file["secStructList"]
        sse = sse[:chain_id_per_res.shape[0]][chain_id_per_res == c]
        sse = np.array(sse[start:end + 1])
        sse = np.array(
            [sec_struct_codes[code % 8] for code in sse], dtype="U1"
        )

        # Pattern for 8 states:
        # differences of consecutive one-hot rows mark the positions
        # where an element starts (1) or ends (-1)
        sse8 = to_onehot([dssp_codes[x] for x in sse], (None, 8))
        dss8 = (sse8[1:] - sse8[:-1])
        cls = to_onehot(np.where(dss8 == -1)[1], (None, 8)).T
        bbox = np.array(
            [np.where(dss8 == 1)[0], np.where(dss8 == -1)[0], *cls]
        ).T
        pat8 = np.argmax(bbox[:, 2:], 1)

        # Pattern for 3 states, built the same way
        sse3 = to_onehot([abc_codes[dssp_to_abc[x]] for x in sse], (None, 3))
        dss3 = (sse3[1:] - sse3[:-1])
        cls = to_onehot(np.where(dss3 == -1)[1], (None, 3)).T
        bbox = np.array(
            [np.where(dss3 == 1)[0], np.where(dss3 == -1)[0], *cls]
        ).T
        pat3 = np.argmax(bbox[:, 2:], 1)

        patterns.append((pat3, pat8))

    if len(patterns) == 0:
        print("No pattern find")
        # Same number of values as the regular return,
        # so callers can always unpack six values
        return None, None, None, None, None, None

    occ_sum8 = dict()
    occ_sum3 = dict()
    correspondings8 = dict()
    correspondings3 = dict()
    for pat3, pat8 in patterns:
        c8 = "".join([sec_struct_codes[x] for x in pat8])
        n8 = list(pat8)
        c3 = "".join(["abc"[x] for x in pat3])
        n3 = list(pat3)
        if len(c3) == 0:
            continue
        # Every pattern is framed by the coil state
        c3, n3 = _pad_with_coil(c3, n3, "c", 2)
        c8, n8 = _pad_with_coil(c8, n8, "C", 7)
        if c8 not in occ_sum8:
            occ_sum8[c8] = 0
            correspondings8[c8] = c8, n8
        occ_sum8[c8] += 1
        if c3 not in occ_sum3:
            occ_sum3[c3] = 0
            correspondings3[c3] = c3, n3
        occ_sum3[c3] += 1

    if not occ_sum3:
        # All patterns were empty: there is no maximum to select
        print("No pattern find")
        return None, None, None, None, None, None

    # On ties the pattern that was observed first is selected
    c_pattern8, n_pattern8 = correspondings8[max(occ_sum8, key=occ_sum8.get)]
    c_pattern3, n_pattern3 = correspondings3[max(occ_sum3, key=occ_sum3.get)]
    push(
        f"{folder}/data.pt", "pattern",
        (c_pattern3, n_pattern3, c_pattern8, n_pattern8)
    )
    return c_pattern3, n_pattern3, c_pattern8, n_pattern8, occ_sum3, occ_sum8
Since we want to perform a six-frame translation we have to look at the complementary strand of the genome as well. """ # Code source: Patrick Kunzmann # License: BSD 3 clause import biotite import biotite.sequence as seq import biotite.sequence.io.fasta as fasta import biotite.database.entrez as entrez import matplotlib.pyplot as plt # Download Porcine circovirus genome file_name = entrez.fetch("KP282147", biotite.temp_dir(), "fa", "nuccore", "fasta") fasta_file = fasta.FastaFile() fasta_file.read(file_name) genome = fasta.get_sequence(fasta_file) # Perform translation for forward strand proteins, positions = genome.translate() print("Forward strand:") for i in range(len(proteins)): print("{:4d} - {:4d}: {:}".format(positions[i][0], positions[i][1], str(proteins[i]))) print("\n") # Perform translation for complementary strand genome_rev = genome.reverse().complement() proteins, positions = genome_rev.translate() print("Reverse strand:")
# License: BSD 3 clause import biotite import biotite.structure as struc import biotite.structure.io as strucio import biotite.structure.io.pdbx as pdbx import biotite.database.rcsb as rcsb import numpy as np # The output file names # Modify these values for actual file output ku_dna_file = biotite.temp_file("ku_dna.cif") ku_file = biotite.temp_file("ku.cif") # Download and parse structure files file = rcsb.fetch("1JEY", "mmtf", biotite.temp_dir()) ku_dna = strucio.load_structure(file) file = rcsb.fetch("1JEQ", "mmtf", biotite.temp_dir()) ku = strucio.load_structure(file) # Remove DNA and water ku_dna = ku_dna[(ku_dna.chain_id == "A") | (ku_dna.chain_id == "B")] ku_dna = ku_dna[~struc.filter_solvent(ku_dna)] ku = ku[~struc.filter_solvent(ku)] # The structures have a differing amount of atoms missing # at the start and end of the structure # -> Find common structure ku_dna_common = ku_dna[struc.filter_intersection(ku_dna, ku)] ku_common = ku[struc.filter_intersection(ku, ku_dna)] # Superimpose ku_superimposed, transformation = struc.superimpose( ku_dna_common, ku_common, (ku_common.atom_name == "CA"))
Downloading structure files from the *RCSB PDB* is quite easy: Simply specify the PDB ID, the file format and the target directory for the :func:`fetch()` function and you are done. The function even returns the path to the downloaded file, so you can just load it via the other *Biotite* subpackages (more on this later). We will download a protein structure of the miniprotein *TC5b* (PDB: 1L2Y) into a temporary directory. """ from os.path import relpath import biotite import biotite.database.rcsb as rcsb file_path = rcsb.fetch("1l2y", "pdb", biotite.temp_dir()) print(relpath(file_path)) ######################################################################## # In case you want to download multiple files, you are able to specify a # list of PDB IDs, which in return gives you a list of file paths. # Download files in the more modern mmCIF format file_paths = rcsb.fetch(["1l2y", "1aki"], "cif", biotite.temp_dir()) print([relpath(file_path) for file_path in file_paths]) ######################################################################## # By default :func:`fetch()` checks whether the file to be fetched # already exists in the directory, and downloads it, if it does not # exist yet. # If you want to download files irrespectively, set :obj:`overwrite` to
[0.40, -81.83, 4.91, -100.59, 85.50, -71.65, 130.78, 84.98], [119.14, -102.58, 130.83, -67.91, 121.55, 76.25, -2.95, -90.88], [130.68, -56.92, 119.26, 77.85, 10.42, -99.43, 141.40, -98.01], [114.32, -121.47, 118.14, 82.88, -150.05, -83.81, 23.35, -85.82], [117.16, -95.41, 140.40, -59.35, -29.23, -72.39, -25.08, -76.16], [139.20, -55.96, -32.70, -68.51, -26.09, -74.44, -22.60, -71.74], [-39.62, -64.73, -39.52, -65.54, -38.88, -66.89, -37.76, -70.19], [-35.34, -65.03, -38.12, -66.34, -29.51, -89.10, -2.91, 77.90], [-45.29, -67.44, -27.72, -87.27, 5.13, 77.49, 30.71, -93.23], [-27.09, -86.14, 0.30, 59.85, 21.51, -96.30, 132.67, -92.91], ]) # Fetch animal lysoyzme structures lyso_files = rcsb.fetch(["1REX", "1AKI", "1DKJ", "1GD6"], format="mmtf", target_path=biotite.temp_dir()) organisms = ["H. sapiens", "G. gallus", "C. viginianus", "B. mori"] # Create a PB sequence from each structure pb_seqs = [] for file_name in lyso_files: file = mmtf.MMTFFile() file.read(file_name) # Take only the first model into account array = mmtf.get_structure(file, model=1) # Remove everything but the first protein chain array = array[struc.filter_amino_acids(array)] array = array[array.chain_id == array.chain_id[0]] # Calculate backbone dihedral angles, # as the PBs are determined from them
""" # Code source: Patrick Kunzmann # License: BSD 3 clause import biotite import biotite.structure as struc import biotite.structure.io as strucio import biotite.database.rcsb as rcsb import matplotlib.pyplot as plt import numpy as np from matplotlib import colors import scipy.stats as sts # Download and parse file file = rcsb.fetch("3vkh", "cif", biotite.temp_dir()) atom_array = strucio.load_structure(file) # Calculate backbone dihedral angles # from one of the two identical chains in the asymmetric unit phi, psi, omega = struc.dihedral_backbone( atom_array[atom_array.chain_id == "A"]) # Conversion from radians into degree phi *= 180 / np.pi psi *= 180 / np.pi # Remove invalid values (NaN) at first and last position phi = phi[1:-1] psi = psi[1:-1] # Plot density figure = plt.figure() ax = figure.add_subplot(111)
and the selectivity filter of the channel protein KcsA (PDB: 2KB1). The structure was resolved using NMR, so multiple models are present in the structure. Hence, we can also calculate the frequency of each bond. """ # Code source: Daniel Bauer # License: BSD 3 clause import biotite import matplotlib.pyplot as plt import biotite.structure as struc import biotite.structure.io as strucio import biotite.database.rcsb as rcsb file_name = rcsb.fetch("2KB1", "mmtf", biotite.temp_dir()) stack = strucio.load_structure(file_name) # Four identical chains, consider only chain A chain_a = stack[:, stack.chain_id == "A"] # Selection for p-helix p_helix = (chain_a.res_id >= 40) & (chain_a.res_id <= 52) # Selection for selectivity filter sf = (chain_a.res_id >= 53) & (chain_a.res_id <= 58) # Calculate the hydrogen bonds and the frequency of each bond triplets, mask = struc.hbond(chain_a, selection1=p_helix, selection2=sf) freq = struc.hbond_frequency(mask) # Create names of bonds label = "{d_resid}{d_resnm}-{d_a} -- {a_resid}{a_resnm}-{a_a}" names = [label.format(
# For our purpose, we will work on a protein structure as small as # possible, namely the miniprotein *TC5b* (PDB: ``1L2Y``). # The structure of this 20-residue protein (304 atoms) has been # elucidated via NMR. # Thus, the corresponding PDB file consists of multiple (namely 38) # models, each showing another conformation. # # .. currentmodule:: biotite.structure.io.pdb # # At first we load the structure from a PDB file via the class # :class:`PDBFile` in the subpackage :mod:`biotite.structure.io.pdb`. import biotite import biotite.structure.io.pdb as pdb import biotite.database.rcsb as rcsb pdb_file_path = rcsb.fetch("1l2y", "pdb", biotite.temp_dir()) file = pdb.PDBFile() file.read(pdb_file_path) tc5b = file.get_structure() print(type(tc5b).__name__) print(tc5b.stack_depth()) print(tc5b.array_length()) ######################################################################## # The method :func:`PDBFile.get_structure()` returns an atom array stack # unless the :obj:`model` parameter is specified, # even if the file contains only one model. # Alternatively, the module level function :func:`get_structure()` # can be used. # The following example # shows how to write an array or stack back into a PDB file:
######################################################################## # As test case a structure of a *cysteine knot* protein is used, # specifically the squash trypsin inhibitor *EETI-II* # (PDB: `2IT7 <http://www.rcsb.org/structure/2IT7>`_). # This motif is famous for its three characteristic disulfide bridges # forming a 'knot'. # However, the loaded MMTF file already has information about the # covalent bonds - including the disulfide bridges. # To have a proper test case, all disulfide bonds are removed from the # structure and we pretend that the structure never had information # about the disulfide bonds. # For later verification that the implemented function works correctly, # the disulfide bonds, that are removed, are printed out. mmtf_file = mmtf.MMTFFile() mmtf_file.read(rcsb.fetch("2IT7", "mmtf", biotite.temp_dir())) knottin = mmtf.get_structure(mmtf_file, include_bonds=True, model=1) sulfide_indices = np.where((knottin.res_name == "CYS") & (knottin.atom_name == "SG"))[0] for i, j, _ in knottin.bonds.as_array(): if i in sulfide_indices and j in sulfide_indices: print(knottin[i]) print(knottin[j]) print() knottin.bonds.remove_bond(i, j) ######################################################################## # Now the sanitized structure is put into the disulfide detection # function. # The detected bonds are printed out and we expect to see the same # bonds, that were removed in the code snippet above.
######################################################################## # As test case a structure of a *cysteine knot* protein is used, # specifically the squash trypsin inhibitor *EETI-II* # (PDB: `2IT7 <http://www.rcsb.org/structure/2IT7>`_). # This motif is famous for its three characteristic disulfide bridges # forming a 'knot'. # However, the loaded MMTF file already has information about the # covalent bonds - including the disulfide bridges. # To have a proper test case, all disulfide bonds are removed from the # structure and we pretend that the structure never had information # about the disulfide bonds. # For later verification that the implemented function works correctly, # the disulfide bonds, that are removed, are printed out. mmtf_file = mmtf.MMTFFile.read( rcsb.fetch("2IT7", "mmtf", biotite.temp_dir()) ) knottin = mmtf.get_structure(mmtf_file, include_bonds=True, model=1) sulfide_indices = np.where( (knottin.res_name == "CYS") & (knottin.atom_name == "SG") )[0] for i, j, _ in knottin.bonds.as_array(): if i in sulfide_indices and j in sulfide_indices: print(knottin[i]) print(knottin[j]) print() knottin.bonds.remove_bond(i,j) ######################################################################## # Now the sanitized structure is put into the disulfide detection # function.
""" # Code source: Patrick Kunzmann # License: BSD 3 clause import biotite import biotite.sequence as seq import biotite.sequence.graphics as graphics import biotite.sequence.io.genbank as gb import biotite.database.entrez as entrez import numpy as np import matplotlib.pyplot as plt # Download E. coli BL21 genome file_name = entrez.fetch("CP001509", biotite.temp_dir(), suffix="gb", db_name="nuccore", ret_type="gb") gb_file = gb.GenBankFile() gb_file.read(file_name) _, seq_length, _, _, _, _ = gb.get_locus(gb_file) annotation = gb.get_annotation(gb_file, include_only=["gene"]) # Find the minimum and maximum locations of lac genes min_loc = seq_length max_loc = 1 for feature in annotation: for loc in feature.locs: # Ignore if feature is only a pseudo-gene (e.g. gene fragment) # and check if feature is lacA gene (begin of lac operon) if "gene" in feature.qual \
""" # Code source: Patrick Kunzmann # License: BSD 3 clause import biotite import biotite.sequence as seq import biotite.sequence.io.fasta as fasta import biotite.sequence.graphics as graphics import biotite.application.muscle as muscle import biotite.application.blast as blast import biotite.database.entrez as entrez import matplotlib.pyplot as plt # Download sequence of Streptococcus pyogenes Cas9 file_name = entrez.fetch("Q99ZW2", biotite.temp_dir(), "fa", "protein", "fasta") file = fasta.FastaFile.read(file_name) ref_seq = fasta.get_sequence(file) # Find homologous proteins using NCBI Blast # Search only the UniProt/SwissProt database blast_app = blast.BlastWebApp("blastp", ref_seq, "swissprot", obey_rules=False) blast_app.start() blast_app.join() alignments = blast_app.get_alignments() # Get hit IDs for hits with score > 200 hits = [] for ali in alignments: if ali.score > 200: hits.append(ali.hit_id) # Get the sequences from hit IDs hit_seqs = []
multi_line=False, loc_range=(1, 100), # Register our drawing functions feature_plotters=[HelixPlotter(), SheetPlotter()]) fig.tight_layout() ######################################################################## # Now let us do some serious application. # We want to visualize the secondary structure of one monomer of the # homodimeric transketolase (PDB: 1QGD). # The simplest way to do that, is to fetch the corresponding GenBank # file, extract an `Annotation` object from the file and draw the # annotation. # Fetch GenBank files of the TK's first chain and extract annotation file_name = entrez.fetch("1QGD_A", biotite.temp_dir(), "gb", "protein", "gb") gb_file = gb.GenBankFile() gb_file.read(file_name) annotation = gb.get_annotation(gb_file, include_only=["SecStr"]) # Length of the sequence _, length, _, _, _, _ = gb.get_locus(gb_file) fig = plt.figure(figsize=(8.0, 3.0)) ax = fig.add_subplot(111) graphics.plot_feature_map( ax, annotation, symbols_per_line=150, show_numbers=True, show_line_position=True, # 'loc_range' takes exclusive stop -> length+1 is required
[119.14, -102.58, 130.83, -67.91, 121.55, 76.25, -2.95, -90.88], [130.68, -56.92, 119.26, 77.85, 10.42, -99.43, 141.40, -98.01], [114.32, -121.47, 118.14, 82.88, -150.05, -83.81, 23.35, -85.82], [117.16, -95.41, 140.40, -59.35, -29.23, -72.39, -25.08, -76.16], [139.20, -55.96, -32.70, -68.51, -26.09, -74.44, -22.60, -71.74], [-39.62, -64.73, -39.52, -65.54, -38.88, -66.89, -37.76, -70.19], [-35.34, -65.03, -38.12, -66.34, -29.51, -89.10, -2.91, 77.90], [-45.29, -67.44, -27.72, -87.27, 5.13, 77.49, 30.71, -93.23], [-27.09, -86.14, 0.30, 59.85, 21.51, -96.30, 132.67, -92.91], ]) # Fetch animal lysoyzme structures lyso_files = rcsb.fetch( ["1REX", "1AKI", "1DKJ", "1GD6"], format="mmtf", target_path=biotite.temp_dir() ) organisms = ["H. sapiens", "G. gallus", "C. viginianus", "B. mori"] # Create a PB sequence from each structure pb_seqs = [] for file_name in lyso_files: file = mmtf.MMTFFile.read(file_name) # Take only the first model into account array = mmtf.get_structure(file, model=1) # Remove everything but the first protein chain array = array[struc.filter_amino_acids(array)] array = array[array.chain_id == array.chain_id[0]] # Calculate backbone dihedral angles, # as the PBs are determined from them
The values in the adjacency matrix ``m`` are ``m[i,j] = 1 if distance(i,j) <= threshold else 0``. """ # Code source: Patrick Kunzmann # License: BSD 3 clause import biotite import biotite.structure as struc import biotite.structure.io as strucio import biotite.database.rcsb as rcsb import numpy as np import matplotlib.pyplot as plt from matplotlib.colors import ListedColormap file_name = rcsb.fetch("1aki", "mmtf", biotite.temp_dir()) array = strucio.load_structure(file_name) # We only consider CA atoms ca = array[array.atom_name == "CA"] # 7 Angstrom adjacency threshold threshold = 7 # Create cell list of the CA atom array # for efficient measurement of adjacency cell_list = struc.CellList(ca, cell_size=threshold) adjacency_matrix = cell_list.create_adjacency_matrix(threshold) figure = plt.figure() ax = figure.add_subplot(111) cmap = ListedColormap(["white", biotite.colors["dimgreen"]]) #ax.matshow(adjacency_matrix, cmap=cmap, origin="lower") ax.pcolormesh(ca.res_id, ca.res_id, adjacency_matrix, cmap=cmap)
def analyze_chirality(array):
    """
    Determine the enantiomer value for each amino acid residue.

    Only the backbone atoms and the CB atom of amino acid residues are
    considered. For a residue with exactly four of these atoms the
    value is computed by ``get_enantiomer()`` from their coordinates;
    any other residue (e.g. glycine, which has no CB) gets ``0``.

    Returns an integer array with one value per residue.
    """
    # Filter backbone + CB
    array = array[struc.filter_amino_acids(array)]
    array = array[(array.atom_name == "CB") | (struc.filter_backbone(array))]
    res_ids, _ = struc.get_residues(array)
    # Zero-initialized:
    # residues without exactly four selected atoms keep the value 0
    enantiomers = np.zeros(len(res_ids), dtype=int)
    for index, res_id in enumerate(res_ids):
        residue_coord = array.coord[array.res_id == res_id]
        if len(residue_coord) == 4:
            enantiomers[index] = get_enantiomer(*residue_coord)
    return enantiomers


# Fetch and parse structure file
file = rcsb.fetch("1l2y", "mmtf", biotite.temp_dir())
stack = strucio.load_structure(file)
# Get first model
array = stack[0]
# Get enantiomers
print("1l2y ", analyze_chirality(array))
# Reflected structures have opposite enantiomers
# Test via reflection at x-y-plane, z -> -z
array_reflect = array.copy()
array_reflect.coord[:, 2] *= -1
print("1l2y (reflected)", analyze_chirality(array_reflect))
def test_fetch_invalid(format):
    """
    Expect a `RequestError` when fetching the ID ``"xxxx"`` from the
    RCSB in the given format.
    """
    target_dir = biotite.temp_dir()
    with pytest.raises(RequestError):
        rcsb.fetch("xxxx", format, target_dir, overwrite=True)
# Code source: Patrick Kunzmann # License: BSD 3 clause import biotite import biotite.sequence as seq import biotite.sequence.io.fasta as fasta import biotite.sequence.io.genbank as gb import biotite.sequence.graphics as graphics import biotite.sequence.align as align import biotite.database.entrez as entrez import numpy as np import matplotlib.pyplot as plt # Download E. coli BL21 genome file_name = entrez.fetch("CP001509", biotite.temp_dir(), "gb", "nuccore", "gb") gb_file = gb.GenBankFile() gb_file.read(file_name) annot_seq = gb_file.get_annotated_sequence(include_only=["gene"]) # Find leuL gene for feature in annot_seq.annotation: if "gene" in feature.qual and feature.qual["gene"] == "leuL": leul_feature = feature # Get leuL sequence leul_seq = annot_seq[leul_feature] # Download Salmonella enterica genome without annotations file_name = entrez.fetch("CP019649", biotite.temp_dir(), "fa", "nuccore", "fasta") fasta_file = fasta.FastaFile() fasta_file.read(file_name)
""" # Code source: Patrick Kunzmann # License: BSD 3 clause import itertools import numpy as np import biotite import biotite.sequence as seq import biotite.sequence.io.genbank as gb import biotite.sequence.io.fasta as fasta import biotite.database.entrez as entrez # Get the E. coli K-12 genome as annotated sequence gb_file = gb.GenBankFile.read( entrez.fetch("U00096", biotite.temp_dir(), "gb", "nuccore", "gb")) # We are only interested in CDS features k12_genome = gb.get_annotated_sequence(gb_file, include_only=["CDS"]) # This dictionary will count how often each codon occurs in the genome # For increased performance the dictionary uses symbol codes ([0 3 2]) # instead of symbols (['A' 'T' 'G']) as keys codon_counter = { codon: 0 for codon in itertools.product( *([range(len(k12_genome.sequence.alphabet))] * 3)) } # For demonstration purposes print the 64 codons in symbol code form print(list(codon_counter.keys())) ########################################################################
It is basically very similar to using normal functions. In the following sections you will get an overview over the mentioned subpackages, so go and grab some tea and cookies and let us begin. Preliminary note ---------------- The files used in this tutorial will be stored in a temporary directory. The top level package :mod:`biotite` provides functionality to create a temporary directory, called ``.biotitetemp`` in your current working directory. You can either obtain the path to this directory via :func:`temp_dir` or directly create an unambiguous file name in this directory using :func:`temp_file`. In the end of the session the temporary directory and all its contents will be automatically deleted, so make sure to put the files, you want to keep, somewhere else. """ from os.path import relpath import biotite # Create temporary directory dir_path = biotite.temp_dir() print(relpath(dir_path)) # Get a path to a temporary FASTA file # This would also create the temporary directory, # if it was not created, yet file_path = biotite.temp_file("fasta") print(relpath(file_path))
def plot_gaps(pdb_id, chain_id, ax):
    """
    Draw a bar that shows for each residue ID of a chain whether the
    residue is missing, present as ``UNK`` or present.

    Parameters
    ----------
    pdb_id : str
        The PDB ID of the structure, fetched as MMTF file into the
        Biotite temporary directory.
    chain_id : str
        Only atoms with this chain ID are considered.
    ax
        The Matplotlib axes to draw on.
    """
    # Download and parse structure file
    path = rcsb.fetch(pdb_id, "mmtf", biotite.temp_dir())
    atom_array = strucio.load_structure(path)
    # Consider only one chain
    atom_array = atom_array[atom_array.chain_id == chain_id]

    # Array for saving the 'green', 'yellow' and 'red' state;
    # index i refers to the residue ID i+1
    states = np.zeros(atom_array.res_id[-1], dtype=int)
    for i in range(len(states)):
        # Get array for only one residue ID
        residue = atom_array[atom_array.res_id == i + 1]
        if len(residue) == 0:
            # not existing
            states[i] = 0
        elif residue.res_name[0] == "UNK":
            # existing but polyalanine
            states[i] = 1
        else:
            # existing
            states[i] = 2

    # Find the intervals (start, exclusive stop, state) for each state
    state_intervals = []
    curr_start = 0
    for i in range(1, len(states) + 1):
        # An interval ends where the state changes or the chain ends;
        # using len(states) as final stop includes the last residue
        if i == len(states) or states[i] != states[curr_start]:
            state_intervals.append((curr_start, i, states[curr_start]))
            curr_start = i

    # Draw the state intervals as colored rectangles
    state_colors = {0: "firebrick", 1: "gold", 2: "forestgreen"}
    for start, stop, state in state_intervals:
        ax.add_patch(
            Rectangle(
                # Each residue spans from ID - 0.5 to ID + 0.5
                (start + 1 - 0.5, 0), stop - start, 1,
                edgecolor="None", facecolor=state_colors[int(state)]
            )
        )

    # Some other visual stuff
    for side in ("left", "bottom", "right", "top"):
        ax.spines[side].set_visible(False)
    ax.yaxis.set_visible(False)
    ax.set_xlim(0.5, len(states) + 0.5)
    ax.set_ylim(0, 2)