Exemple #1
0
def test_fetch_single_file(as_file_like):
    """Fetch two protein entries as one FASTA file and check both parse.

    When ``as_file_like`` is truthy, ``None`` is passed as the file name
    to ``entrez.fetch_single_file()``; otherwise the value returned by
    ``biotite.temp_file("fa")`` is used as the download target.
    """
    if as_file_like:
        target = None
    else:
        target = biotite.temp_file("fa")
    fetched = entrez.fetch_single_file(
        ["1L2Y_A", "3O5R_A"], target, "protein", "fasta"
    )
    parsed = fasta.FastaFile.read(fetched)
    # One sequence per requested UID is expected
    assert len(fasta.get_sequences(parsed)) == 2
Exemple #2
0
def test_fetch_single_file():
    """Download two protein entries into one FASTA file and count them."""
    uids = ["1L2Y_A", "3O5R_A"]
    target = biotite.temp_file("fa")
    file = entrez.fetch_single_file(uids, target, "protein", "fasta")
    # This snippet uses the instance-based reading API: create an empty
    # file object first, then fill it from the downloaded file
    fasta_file = fasta.FastaFile()
    fasta_file.read(file)
    # One sequence per requested UID is expected
    assert len(fasta.get_sequences(fasta_file)) == 2
Exemple #3
0
def test_fetch_single_file(as_file_like):
    """Fetch two protein entries as one FASTA file and check both parse.

    When ``as_file_like`` is truthy, ``None`` is passed as the file name
    to ``entrez.fetch_single_file()``. Otherwise a named temporary file
    with the ``.fa`` suffix is created and its path is used as the
    download target.

    The temporary file is closed in a ``finally`` clause, so it is
    released even if the download, the parsing or the assertion fails.
    """
    temp = None
    if not as_file_like:
        temp = tempfile.NamedTemporaryFile("r", suffix=".fa")
    try:
        file_name = None if temp is None else temp.name
        downloaded_file_name = entrez.fetch_single_file(
            ["1L2Y_A", "3O5R_A"], file_name, "protein", "fasta"
        )
        fasta_file = fasta.FastaFile.read(downloaded_file_name)
        prot_seqs = fasta.get_sequences(fasta_file)
        # One sequence per requested UID is expected
        assert len(prot_seqs) == 2
    finally:
        # Closing the temporary file also deletes it (default behaviour
        # of NamedTemporaryFile)
        if temp is not None:
            temp.close()
                            show_line_position=show_line_position,
                            spacing=spacing)

    twin = axes.get_shared_x_axes().get_siblings(axes)[0]
    for ax in (axes, twin):
        ax.set_yticklabels(ax.get_yticklabels(), fontdict={"color": "white"})
    axes.get_figure().patch.set_facecolor("#181818")


# Using cyclotide sequences as example
# `&` binds tighter than `^`, so the explicit inner parentheses only
# spell out the grouping Python already applies
query = (
    (
        entrez.SimpleQuery("Cyclotide")
        & entrez.SimpleQuery("cter")
        & entrez.SimpleQuery("srcdb_swiss-prot", field="Properties")
    )
    ^ entrez.SimpleQuery("Precursor")
)
uids = entrez.search(query, "protein")
fasta_file = fasta.FastaFile.read(
    entrez.fetch_single_file(uids, None, "protein", "fasta")
)
sequence_dict = fasta.get_sequences(fasta_file)
headers = list(sequence_dict)
sequences = list(sequence_dict.values())
# The label of each sequence is the last character of its header
labels = [name[-1] for name in headers]

# Perform a multiple sequence alignment
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, _, _ = align.align_multiple(sequences, matrix)
# Order alignment and labels according to guide tree
alignment = alignment[:, order.tolist()]
labels = [labels[index] for index in order]

# Visualize the alignment using the new alignment plotter
fig = plt.figure(figsize=(8.0, 3.7))
ax = fig.add_subplot(111)
Exemple #5
0
# A list of valid database, retrieval type and mode combinations can
# be found
# `here <https://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/?report=objectonly>`_.
# Furthermore, :func:`get_database_name()` can be helpful to get the
# required database name by the more commonly known names.

print(entrez.get_database_name("Nucleotide"))

########################################################################
# The *Entrez* database allows for packing data for multiple UIDs into a
# single file. This is achieved with the :func:`fetch_single_file()`
# function.

# The context manager closes the temporary file even if the download
# raises an exception
with NamedTemporaryFile(suffix=".fasta") as temp_file:
    file_path = entrez.fetch_single_file(["1L2Y_A", "1AKI_A"],
                                         temp_file.name,
                                         db_name="protein",
                                         ret_type="fasta")
    print(file_path)

########################################################################
# Similar to the *RCSB PDB*, you can also search every
# `field <https://www.ncbi.nlm.nih.gov/books/NBK49540/>`_
# of the *NCBI Entrez* database.

# Search in all fields
print(entrez.SimpleQuery("BL21 genome"))
# Search in the 'Organism' field
print(entrez.SimpleQuery("Escherichia coli", field="Organism"))

########################################################################
Exemple #6
0
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import matplotlib.pyplot as plt
import biotite
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez
import biotite.sequence.graphics as graphics

# Download protein sequences of avidin and streptavidin as one file
file_name = entrez.fetch_single_file(
    ["CAC34569", "ACL82594"],
    biotite.temp_file("sequences.fasta"),
    "protein",
    "fasta",
)
file = fasta.FastaFile.read(file_name)
# Assign each entry to its protein via the UID contained in the header
for name, sequence in file.items():
    if "CAC34569" in name:
        avidin_seq = seq.ProteinSequence(sequence)
        continue
    if "ACL82594" in name:
        streptavidin_seq = seq.ProteinSequence(sequence)
# Get BLOSUM62 matrix
matrix = align.SubstitutionMatrix.std_protein_matrix()
# Perform pairwise sequence alignment with affine gap penalty
# Terminal gaps are not penalized
alignments = align.align_optimal(avidin_seq,
                                 streptavidin_seq,
                                 matrix,
                                 gap_penalty=(-10, -1),
Exemple #7
0
                            show_line_position=show_line_position,
                            spacing=spacing)

    twin = axes.get_shared_x_axes().get_siblings(axes)[0]
    for ax in (axes, twin):
        ax.set_yticklabels(ax.get_yticklabels(), fontdict={"color": "white"})
    axes.get_figure().patch.set_facecolor("#181818")


# Using cyclotide sequences as example
# `&` binds tighter than `^`, so the explicit inner parentheses only
# spell out the grouping Python already applies
query = (
    (
        entrez.SimpleQuery("Cyclotide")
        & entrez.SimpleQuery("cter")
        & entrez.SimpleQuery("srcdb_swiss-prot", field="Properties")
    )
    ^ entrez.SimpleQuery("Precursor")
)
uids = entrez.search(query, "protein")
fasta_file = fasta.FastaFile.read(
    entrez.fetch_single_file(
        uids, biotite.temp_file("fa"), "protein", "fasta"
    )
)
sequence_dict = fasta.get_sequences(fasta_file)
headers = list(sequence_dict)
sequences = list(sequence_dict.values())
# The label of each sequence is the last character of its header
labels = [name[-1] for name in headers]

# Perform a multiple sequence alignment
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, _, _ = align.align_multiple(sequences, matrix)
# Order alignment and labels according to guide tree
alignment = alignment[:, order.tolist()]
labels = [labels[index] for index in order]

# Visualize the alignment using the new alignment plotter
fig = plt.figure(figsize=(8.0, 3.7))
ax = fig.add_subplot(111)
Exemple #8
0
########################################################################
# A list of valid database, retrieval type and mode combinations can
# be found
# `here <https://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/?report=objectonly>`_.
# Furthermore, :func:`get_database_name()` can be helpful to get the
# required database name by the more commonly known names.

print(entrez.get_database_name("Nucleotide"))

########################################################################
# The *Entrez* database allows for packing data for multiple UIDs into a
# single file. This is achieved with the :func:`fetch_single_file()`
# function.

file_path = entrez.fetch_single_file(
    ["1L2Y_A", "1AKI_A"],
    biotite.temp_file("fa"),
    db_name="protein",
    ret_type="fasta",
)
print(relpath(file_path))

########################################################################
# Similar to the *RCSB PDB*, you can also search in the *NCBI Entrez*
# database, but in an even more powerful manner:
# Due to the simple design of the search queries accepted by
# *NCBI Entrez*, you can search in every
# `field <https://www.ncbi.nlm.nih.gov/books/NBK49540/>`_
# of the database.

# Search in all fields
print(entrez.SimpleQuery("BL21 genome"))
# Search in the 'Organism' field
print(entrez.SimpleQuery("Escherichia coli", field="Organism"))
Exemple #9
0
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import biotite
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.io.genbank as gb
import biotite.sequence.graphics as seqgraphics
import biotite.sequence.align as align
import biotite.database.entrez as entrez
import biotite.structure.graphics as strucgraphics
import biotite.application.viennarna as viennarna

# Download Escherichia coli BL21 and Salmonella enterica genome
gb_file = gb.MultiFile.read(
    entrez.fetch_single_file(
        ["CP001509", "CP019649"], None, "nuccore", "gb"
    )
)
# Exactly two entries are expected, in the order of the requested UIDs
ec_file, se_file = gb_file

annot_seq = gb.get_annotated_sequence(ec_file, include_only=["ncRNA"])
# Find M1 gene in E. coli genome via its annotation
for feature in annot_seq.annotation:
    if "product" not in feature.qual:
        continue
    if "RNase P" in feature.qual["product"]:
        m1_sequence = annot_seq[feature]

# Get S. enterica genome sequence
se_genome = gb.get_sequence(se_file)

# We want to search in the genome sequence and its reverse complement
genomic_seqs = [se_genome, se_genome.reverse().complement()]

########################################################################
Exemple #10
0
    r"$\sigma^{32}$": "rpoH",
    r"$\sigma^{38}$": "rpoS",
})

# Find SwissProt entries for these genes in NCBI Entrez protein database
uids = []
for name, gene in genes.items():
    query = (
        entrez.SimpleQuery(gene, "Gene Name")
        & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
        & entrez.SimpleQuery("Escherichia coli K-12", "Organism")
    )
    ids = entrez.search(query, "protein")
    # Only one entry per gene in E. coli K-12 is expected
    assert len(ids) == 1
    uids.extend(ids)
# Download corresponding GenBank files as single, merged file
file = entrez.fetch_single_file(uids, None, "protein", ret_type="gb")

# Array that will hold for each of the genes and each of the 4 domains
# the first and last position
# The array is initially filled with -1, as the value -1 will indicate
# that the domain does not exist in the sigma factor
domain_pos = np.full((len(genes), 4, 2), -1, dtype=int)
# Array that will hold the total sequence length of each sigma factor
seq_lengths = np.zeros(len(genes), dtype=int)
# Read the merged file containing multiple GenBank entries
multi_file = gb.MultiFile.read(file)
# Iterate over each GenBank entry
for i, gb_file in enumerate(multi_file):
    _, length, _, _, _, _ = gb.get_locus(gb_file)
    seq_lengths[i] = length
    annotation = gb.get_annotation(gb_file)
Exemple #11
0
import biotite.sequence.graphics as graphics

# Maps the displayed channel name to its UniProt ID
UNIPROT_IDS = {
    "hHCN1": "O60741",
    "hHCN2": "Q9UL51",
    "hHCN3": "Q9P1Z3",
    "hHCN4": "Q9Y3Q4",
    "spHCN": "O76977",
    "hEAG1": "O95259",
    "hERG1": "Q12809",
    "KAT1": "Q39128",
}

### fetch sequences for UniProt IDs from NCBI Entrez
fasta_file = fasta.FastaFile.read(
    entrez.fetch_single_file(
        list(UNIPROT_IDS.values()), None, "protein", "fasta"
    )
)
# Names and downloaded entries are paired by position
sequences = {
    label: seq.ProteinSequence(residues)
    for label, residues in zip(UNIPROT_IDS, fasta_file.values())
}

### create a simple phylogenetic tree
# create MSA
alignment = clustalo.ClustalOmegaApp.align(list(sequences.values()))
# build simple tree based on deviation from sequence identity
distances = 1 - align.get_pairwise_sequence_identity(
    alignment, mode="shortest"
)
tree = phylo.upgma(distances)

### plot the tree
fig, ax = plt.subplots(1, 1, figsize=(8, 5))
Exemple #12
0
import numpy as np
import matplotlib.pyplot as plt
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.io.genbank as gb
import biotite.sequence.graphics as graphics
import biotite.application.clustalo as clustalo
import biotite.database.entrez as entrez
# Search for protein products of LexA gene in UniProtKB/Swiss-Prot database
query = (
    entrez.SimpleQuery("lexA", "Gene Name")
    & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
)
# Search for the first 200 hits
# More than 200 UIDs are not recommended for the EFetch service
# for a single fetch
uids = entrez.search(query, db_name="protein", number=200)
file = entrez.fetch_single_file(
    uids, None, db_name="protein", ret_type="gp"
)
# The file contains multiple concatenated GenPept files
# -> Usage of MultiFile
multi_file = gb.MultiFile.read(file)
# Separate MultiFile into single GenBankFile instances
files = list(multi_file)
# Only the first 20 entries are printed
print("Definitions:")
for file in files[:20]:
    print(gb.get_definition(file))
print()
print("Sources:")
for file in files[:20]:
    print(gb.get_source(file))

########################################################################
# The names of the sources are too long to be properly displayed later
from scipy.stats import linregress
import biotite
import biotite.sequence as seq
import biotite.sequence.align as align
from biotite.sequence.align.alignment import score
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez
import biotite.sequence.graphics as graphics


GAP_PENALTY = (-12, -1)


# Download and parse protein sequences of avidin and streptavidin
fasta_file = fasta.FastaFile.read(
    entrez.fetch_single_file(
        ["CAC34569", "ACL82594"], None, "protein", "fasta"
    )
)
# Assign each entry to its role via the UID contained in the header
for name, sequence in fasta_file.items():
    if "CAC34569" in name:
        query_seq = seq.ProteinSequence(sequence)
        continue
    if "ACL82594" in name:
        hit_seq = seq.ProteinSequence(sequence)


# Get BLOSUM62 matrix
matrix = align.SubstitutionMatrix.std_protein_matrix()

# Perform pairwise sequence alignment with affine gap penalty
# Terminal gaps are not penalized
alignment = align.align_optimal(
    query_seq, hit_seq, matrix,
Exemple #14
0
import biotite.sequence.graphics as graphics
import biotite.database.entrez as entrez
import biotite.application.clustalo as clustalo

# Search for DNA sequences that belong to the cited article
query = (
    entrez.SimpleQuery("Forensic Sci. Int.", "Journal")
    & entrez.SimpleQuery("159", "Volume")
    & entrez.SimpleQuery("132-140", "Page Number")
)
uids = entrez.search(query, db_name="nuccore")

# Download and read file containing the Genbank records for the THCA
# synthase genes
# This snippet uses the instance-based reading API: create an empty
# MultiFile first, then fill it from the download
multi_file = gb.MultiFile()
multi_file.read(
    entrez.fetch_single_file(
        uids, file_name=None, db_name="nuccore", ret_type="gb"
    )
)

# This dictionary maps the strain ID to the protein sequence
sequences = {}

for gb_file in multi_file:
    annotation = gb.get_annotation(gb_file)

    # Find ID of strain in 'source' feature
    strain = None
    for feature in annotation:
        if feature.key == "source":
            strain = int(feature.qual["strain"])
    assert strain is not None
Exemple #15
0
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import biotite
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import biotite.database.entrez as entrez

# Generate example alignment
# (the same as in the bacterial luciferase example)
query = (
    entrez.SimpleQuery("luxA", "Gene Name")
    & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
)
uids = entrez.search(query, db_name="protein")
file_name = entrez.fetch_single_file(
    uids, biotite.temp_file("fasta"), db_name="protein", ret_type="fasta"
)
fasta_file = fasta.FastaFile.read(file_name)
sequences = [
    seq.ProteinSequence(residues) for residues in fasta_file.values()
]
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, _, _ = align.align_multiple(sequences, matrix)
# Order alignment according to the guide tree
alignment = alignment[:, order]
# Restrict the alignment to the slice 220:300
alignment = alignment[220:300]

# Get color scheme names
alphabet = seq.ProteinSequence.alphabet
schemes = [
    "rainbow", "clustalx", "flower", "blossom", "spring", "wither", "autumn",
    "sunset", "ocean", "zappo", "taylor", "buried", "hydrophobicity",
    "prophelix", "propstrand", "propturn"
Exemple #16
0
        if SPECIES in line:
            # Uniprot/NCBI ID in second column, surrounded by brackets
            ncbi_id = line.split()[1].replace("(", "").replace(")", "")
            # Gene is surrounded by square brackets
            gene = line[gene_start : gene_end+1] \
                   .replace("[","").replace("]","")
            # Sometimes alternative gene names are separated via a
            # semicolon -> Choose the first gene name
            gene = gene.split(";")[0].strip()
            genes.append(gene)
            ids.append(ncbi_id)

# Download the sequences into a file-like object and read them from it
fasta_file = fasta.FastaFile.read(
    entrez.fetch_single_file(
        ids, file_name=None, db_name="protein", ret_type="fasta"
    )
)
sequences = [
    seq.ProteinSequence(residues) for residues in fasta_file.values()
]
# Create multiple sequence alignment with Clustal Omega
alignment = clustalo.ClustalOmegaApp.align(sequences)

# The distance measure required for the tree calculation is the
# percentage of non-identical amino acids in the respective two
# sequences
distances = 1 - align.get_pairwise_sequence_identity(
    alignment, mode="shortest"
)
# Create tree via neighbor joining
tree = phylo.neighbor_joining(distances)
# Convert to NetworkX graph
# For the graph visualization, the edge directions are unnecessary
graph = tree.as_graph().to_undirected()
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import biotite.database.entrez as entrez
import biotite.application.clustalo as clustalo


# Search for DNA sequences that belong to the cited article
query = (
    entrez.SimpleQuery("Forensic Sci. Int.", "Journal")
    & entrez.SimpleQuery("159", "Volume")
    & entrez.SimpleQuery("132-140", "Page Number")
)
uids = entrez.search(query, db_name="nuccore")

# Download and read file containing the Genbank records for the THCA
# synthase genes
multi_file = gb.MultiFile.read(
    entrez.fetch_single_file(
        uids, file_name=None, db_name="nuccore", ret_type="gb"
    )
)


# This dictionary maps the strain ID to the protein sequence
sequences = {}

for gb_file in multi_file:
    annotation = gb.get_annotation(gb_file)
    
    # Find ID of strain in 'source' feature
    strain = None
    for feature in annotation:
        if feature.key == "source":
            strain = int(feature.qual["strain"])
    assert strain is not None
Exemple #18
0
    r"$\sigma^{38}$": "rpoS",
})

# Find SwissProt entries for these genes in NCBI Entrez protein database
uids = []
for name, gene in genes.items():
    query = (
        entrez.SimpleQuery(gene, "Gene Name")
        & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
        & entrez.SimpleQuery("Escherichia coli K-12", "Organism")
    )
    ids = entrez.search(query, "protein")
    # Only one entry per gene in E. coli K-12 is expected
    assert len(ids) == 1
    uids.extend(ids)
# Download corresponding GenBank files as single, merged file
file_name = entrez.fetch_single_file(
    uids, biotite.temp_file("gb"), "protein", ret_type="gb"
)

# Array that will hold for each of the genes and each of the 4 domains
# the first and last position
# The array is initially filled with -1, as the value -1 will indicate
# that the domain does not exist in the sigma factor
domain_pos = np.full((len(genes), 4, 2), -1, dtype=int)
# Array that will hold the total sequence length of each sigma factor
seq_lengths = np.zeros(len(genes), dtype=int)
# Read the merged file containing multiple GenBank entries
# (instance-based reading API: create an empty MultiFile, then fill it)
multi_file = gb.MultiFile()
multi_file.read(file_name)
# Iterate over each GenBank entry
for i, gb_file in enumerate(multi_file):
    _, length, _, _, _, _ = gb.get_locus(gb_file)
Exemple #19
0
with streptavidin (*Streptomyces lavendulae*).
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import matplotlib.pyplot as plt
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez
import biotite.sequence.graphics as graphics

# Download and parse protein sequences of avidin and streptavidin
fasta_file = fasta.FastaFile.read(
    entrez.fetch_single_file(
        ["CAC34569", "ACL82594"], None, "protein", "fasta"
    )
)
# Assign each entry to its protein via the UID contained in the header
for name, sequence in fasta_file.items():
    if "CAC34569" in name:
        avidin_seq = seq.ProteinSequence(sequence)
        continue
    if "ACL82594" in name:
        streptavidin_seq = seq.ProteinSequence(sequence)

# Get BLOSUM62 matrix
matrix = align.SubstitutionMatrix.std_protein_matrix()
# Perform pairwise sequence alignment with affine gap penalty
# Terminal gaps are not penalized
alignments = align.align_optimal(
    avidin_seq,
    streptavidin_seq,
    matrix,
    gap_penalty=(-10, -1),
    terminal_penalty=False,
)