Python SimpleQuery Examples, biotite.database.entrez.SimpleQuery Python Examples

Example #1

0

Show file

import biotite.sequence.io.genbank as gb
import biotite.database.entrez as entrez

# Sigma factor labels (LaTeX-style) mapped to the corresponding gene names
genes = OrderedDict([
    (r"$\sigma^{70}$", "rpoD"),
    (r"$\sigma^{24}$", "rpoE"),
    (r"$\sigma^{28}$", "rpoF"),
    (r"$\sigma^{32}$", "rpoH"),
    (r"$\sigma^{38}$", "rpoS"),
])

# Collect the UIDs of the SwissProt entries for these genes from the
# NCBI Entrez protein database
uids = []
for name, gene in genes.items():
    query = (
        entrez.SimpleQuery(gene, "Gene Name")
        & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
        & entrez.SimpleQuery("Escherichia coli K-12", "Organism")
    )
    ids = entrez.search(query, "protein")
    # Exactly one entry per gene is expected in E. coli K-12
    assert len(ids) == 1
    uids.extend(ids)
# Fetch all entries at once as a single, merged GenBank file
file_name = entrez.fetch_single_file(
    uids, biotite.temp_file("gb"), "protein", ret_type="gb"
)

# Array that will hold for each of the genes and each of the 4 domains
# the first and last position
# The array is initially filled with -1, as the value -1 will indicate

Example #2

0

Show file

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import biotite
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import biotite.database.entrez as entrez

# Generate example alignment
# (the same as in the bacterial luciferase example)
query = (
    entrez.SimpleQuery("luxA", "Gene Name")
    & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
)
uids = entrez.search(query, db_name="protein")
file_name = entrez.fetch_single_file(
    uids, biotite.temp_file("fasta"), db_name="protein", ret_type="fasta"
)
fasta_file = fasta.FastaFile.read(file_name)
# Every FASTA entry is converted into a protein sequence object
sequences = list(map(seq.ProteinSequence, fasta_file.values()))
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, _, _ = align.align_multiple(sequences, matrix)
# Order the alignment according to the guide tree,
# then restrict it to the slice 220:300 along the first axis
alignment = alignment[:, order][220:300]

# Get color scheme names

Example #3

0

Show file

File: bionigma_alignment.py Project: thomasnevolianis/biotite

                            show_numbers=show_numbers,
                            number_size=number_size,
                            number_functions=number_functions,
                            labels=labels,
                            label_size=label_size,
                            show_line_position=show_line_position,
                            spacing=spacing)

    twin = axes.get_shared_x_axes().get_siblings(axes)[0]
    for ax in (axes, twin):
        ax.set_yticklabels(ax.get_yticklabels(), fontdict={"color": "white"})
    axes.get_figure().patch.set_facecolor("#181818")


# Using cyclotide sequences as example
# NOTE: '&' binds tighter than '^' in Python, so the three '&'-linked
# terms are grouped first and the 'Precursor' term is applied to the
# combined result; the inner parentheses only make this explicit
query = (
    (
        entrez.SimpleQuery("Cyclotide")
        & entrez.SimpleQuery("cter")
        & entrez.SimpleQuery("srcdb_swiss-prot", field="Properties")
    )
    ^ entrez.SimpleQuery("Precursor")
)
uids = entrez.search(query, db_name="protein")
fasta_file = fasta.FastaFile.read(
    entrez.fetch_single_file(
        uids, file_name=None, db_name="protein", ret_type="fasta"
    )
)
sequence_dict = fasta.get_sequences(fasta_file)
headers = list(sequence_dict.keys())
sequences = list(sequence_dict.values())
# Each sequence is labeled with the final character of its header
labels = [title[-1] for title in headers]

# Perform a multiple sequence alignment
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, _, _ = align.align_multiple(sequences, matrix)
# Order alignment according to guide tree
alignment = alignment[:, order.tolist()]

Example #4

0

Show file

# Download two protein sequences into a temporary FASTA file.
# The context manager closes (and thereby removes) the temporary file
# even if the download raises an exception.
with NamedTemporaryFile(suffix=".fasta") as temp_file:
    file_path = entrez.fetch_single_file(
        ["1L2Y_A", "1AKI_A"],
        temp_file.name,
        db_name="protein",
        ret_type="fasta",
    )
    print(file_path)

########################################################################
# Similar to the *RCSB PDB*, you can also search every
# `field <https://www.ncbi.nlm.nih.gov/books/NBK49540/>`_
# of the *NCBI Entrez* database.

# Search in all fields
all_fields_query = entrez.SimpleQuery("BL21 genome")
print(all_fields_query)
# Search in the 'Organism' field
organism_field_query = entrez.SimpleQuery(
    "Escherichia coli", field="Organism"
)
print(organism_field_query)

########################################################################
# You can also combine multiple :class:`Query` objects in any way you
# like using the binary operators ``|``, ``&`` and ``^``,
# that represent ``OR``, ``AND`` and ``NOT`` linkage, respectively.

# Sequence length term, combined below with either of the two organisms
length_query = entrez.SimpleQuery("50:100", field="Sequence Length")
either_organism_query = (
    entrez.SimpleQuery("Escherichia coli", field="Organism")
    | entrez.SimpleQuery("Bacillus subtilis", field="Organism")
)
composite_query = length_query & either_organism_query
print(composite_query)

########################################################################
# Finally, the query is given to the :func:`search()` function to obtain

Example #5

0

Show file

File: thca_synthase_polymorphism.py Project: Discngine/biotite

# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.io.genbank as gb
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import biotite.database.entrez as entrez
import biotite.application.clustalo as clustalo


# Search for DNA sequences that belong to the cited article,
# identified by its journal, volume and page range
query = (
    entrez.SimpleQuery("Forensic Sci. Int.", "Journal")
    & entrez.SimpleQuery("159", "Volume")
    & entrez.SimpleQuery("132-140", "Page Number")
)
uids = entrez.search(query, db_name="nuccore")

# Download and read file containing the Genbank records for the THCA
# synthase genes
# NOTE(review): with file_name=None the records are presumably not
# written to a named file on disk - confirm against the biotite docs
multi_file = gb.MultiFile.read(
    entrez.fetch_single_file(
        uids, file_name=None, db_name="nuccore", ret_type="gb"
    )
)


# This dictionary maps the strain ID to the protein sequence
sequences = {}

for gb_file in multi_file:

Example #6

0

Show file

    "T": -0.7,
    "S": -0.8,
    "W": -0.9,
    "Y": -1.3,
    "P": -1.6,
    "H": -3.2,
    "E": -3.5,
    "Q": -3.5,
    "D": -3.5,
    "N": -3.5,
    "K": -3.9,
    "R": -4.5
}

# Look for the Swiss-Prot entry containing the human HCN1 channel
query = (
    entrez.SimpleQuery("HCN1", "Gene Name")
    & entrez.SimpleQuery("homo sapiens", "Organism")
    & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
)
uids = entrez.search(query, db_name="protein")
# Only the first hit is downloaded, using the 'gp' return type
file_name = entrez.fetch(uids[0],
                         biotite.temp_dir(),
                         "gp",
                         db_name="protein",
                         ret_type="gp")

# Read the downloaded file and extract the protein sequence from it
gp_file = gb.GenBankFile.read(file_name)
hcn1 = seq.ProteinSequence(gb.get_sequence(gp_file, format="gp"))
print(hcn1)

########################################################################
# The positional hydropathy is calculated and smoothened using