Ejemplo n.º 1
0
def test_fetch(common_name, as_file_like):
    """Fetch the FASTA record ``1L2Y_A`` from Entrez and parse it.

    ``common_name`` selects the spelling of the database name
    (``"Protein"`` when truthy, ``"protein"`` otherwise).
    ``as_file_like`` selects the download target: ``None`` when truthy,
    otherwise the biotite temp directory.
    """
    # NOTE(review): a ``None`` target presumably makes fetch() hand back
    # an in-memory file-like object instead of a path -- confirm.
    if as_file_like:
        target = None
    else:
        target = biotite.temp_dir()

    if common_name:
        database = "Protein"
    else:
        database = "protein"

    fetched = entrez.fetch(
        "1L2Y_A", target, "fa", database, "fasta", overwrite=True
    )
    parsed = fasta.FastaFile()
    parsed.read(fetched)
    prot_seq = fasta.get_sequence(parsed)
Ejemplo n.º 2
0
def test_fetch_invalid():
    """Fetching the bogus UID ``"xxxx"`` must raise ``ValueError``."""
    bogus_uid = "xxxx"
    with pytest.raises(ValueError):
        # The temp directory is resolved inside the context, exactly as
        # part of the call under test.
        file = entrez.fetch(
            bogus_uid, biotite.temp_dir(), "fa", "protein", "fasta",
            overwrite=True,
        )
Ejemplo n.º 3
0
def test_fetch():
    """Fetch the FASTA record ``1L2Y_A`` into the temp dir and parse it."""
    downloaded = entrez.fetch(
        "1L2Y_A", biotite.temp_dir(), "fa", "protein", "fasta",
        overwrite=True,
    )
    parsed = fasta.FastaFile()
    parsed.read(downloaded)
    prot_seq = fasta.get_sequence(parsed)
Ejemplo n.º 4
0
def fetch_gb_annotation(pdb_chain: str):
    """Return the secondary-structure annotation of a protein chain.

    The GenBank record for ``pdb_chain`` is fetched from the Entrez
    ``protein`` database into the biotite temp directory and parsed; only
    features with the key ``SecStr`` are kept.

    Parameters
    ----------
    pdb_chain : str
        UID passed to ``entrez.fetch``, e.g. ``"6FRH_A"``.

    Returns
    -------
    annotation
        Whatever ``gb.get_annotation`` returns for the ``SecStr`` features.
    """
    # The parameter used to read ``pdb_chain=str``, which made the *type*
    # ``str`` the default value; it is a type annotation now.

    # Fetch the GenBank file of the chain and extract the annotation
    file_name = entrez.fetch(
        pdb_chain, biotite.temp_dir(), "gb", "protein", "gb"
    )
    gb_file = gb.GenBankFile()
    gb_file.read(file_name)
    return gb.get_annotation(gb_file, include_only=["SecStr"])
Ejemplo n.º 5
0
def make_feature_maps(gene):
    """Save one feature-map image per feature key of a GenBank record.

    The record for ``gene`` is fetched from the Entrez ``nuccore`` database
    in GenBank format into the system temp directory and parsed.  For each
    distinct feature key found in its annotation, a feature map of the
    features with that key is drawn and written to
    ``<cwd>/app/static/images/<key>.png``.

    Parameters
    ----------
    gene
        Identifier passed unchanged to ``entrez.fetch`` as the UID.

    Returns
    -------
    None
        Always.  Results are reported through side effects only: an error
        message is flashed when fetching or parsing fails, and
        ``session['valid_gene']`` is set to ``True`` after each saved image.
    """
    try:
        find_id = entrez.fetch(gene,
                               gettempdir(),
                               suffix="gb",
                               db_name="nuccore",
                               ret_type="gb")
        read_file = gb.GenBankFile.read(find_id)
        file_annotation = gb.get_annotation(read_file)
    except Exception:
        # Deliberately broad (download, parsing, ... may all fail), but no
        # longer a bare ``except`` that also swallowed KeyboardInterrupt
        # and SystemExit.
        flash('The entered gene could not found. Please try again.', 'error')
        return None

    # ``None`` lets plot_feature_map fall back to the range spanned by the
    # plotted features; previously a record without a "source" feature
    # crashed with a NameError further down.
    loc_range = None
    key_list = []
    for feature in file_annotation:
        key_list.append(feature.key)
        if feature.key == "source":
            # loc_range has exclusive stop
            loc = list(feature.locs)[0]
            loc_range = (loc.first, loc.last + 1)

    image_dir = os.path.join(os.getcwd(), 'app', 'static', 'images')

    # np.unique yields every distinct key once, in sorted order
    for key in np.unique(key_list):
        fig, ax = plt.subplots(figsize=(8.0, 2.0))
        try:
            graphics.plot_feature_map(ax,
                                      seq.Annotation([
                                          feature
                                          for feature in file_annotation
                                          if feature.key == key
                                      ]),
                                      multi_line=False,
                                      loc_range=loc_range,
                                      show_line_position=True)

            plt.title('This plot is for {} features'.format(key))
            plt.savefig(os.path.join(image_dir, '{}.png'.format(key)),
                        dpi=300)
            session['valid_gene'] = True
        finally:
            # Release the figure; pyplot otherwise keeps every figure
            # alive, so memory grows with each call.
            plt.close(fig)

    return None
Ejemplo n.º 6
0
# Code source: Patrick Kunzmann
# License: BSD 3 clause

import biotite
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.io.genbank as gb
import biotite.sequence.graphics as graphics
import biotite.sequence.align as align
import biotite.database.entrez as entrez
import numpy as np
import matplotlib.pyplot as plt

# Download and read E. coli BL21 genome
# NOTE(review): the ``None`` target path presumably keeps the download in
# memory instead of writing a file -- confirm against entrez.fetch().
gb_file = gb.GenBankFile.read(
    entrez.fetch("CP001509", None, "gb", "nuccore", "gb"))
# Keep only the features with the key "gene"
annot_seq = gb.get_annotated_sequence(gb_file, include_only=["gene"])
# Find leuL gene
# There is no ``break``: if several features match, the last one wins;
# if none matches, ``leul_feature`` stays undefined and the indexing
# below raises a NameError.
for feature in annot_seq.annotation:
    if "gene" in feature.qual and feature.qual["gene"] == "leuL":
        leul_feature = feature
# Get leuL sequence
leul_seq = annot_seq[leul_feature]

# Download and read Salmonella enterica genome without annotations
fasta_file = fasta.FastaFile.read(
    entrez.fetch("CP019649", None, "fa", "nuccore", "fasta"))
se_genome = fasta.get_sequence(fasta_file)
# Find leuL in genome by local alignment
# Substitution matrix for the alignment that follows
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
# Use general gap penalty to save RAM
Ejemplo n.º 7
0
    multi_line=False,
    loc_range=(1, 100),
    # Register our drawing functions
    feature_plotters=[HelixPlotter(), SheetPlotter()])
fig.tight_layout()

########################################################################
# Now let us do some serious application.
# We want to visualize the secondary structure of one monomer of the
# homodimeric transketolase (PDB: 1QGD).
# The simplest way to do that, is to fetch the corresponding GenBank
# file, extract an `Annotation` object from the file and draw the
# annotation.

# Fetch the GenBank file of the TK's first chain and extract the annotation
file_name = entrez.fetch("1QGD_A", biotite.temp_dir(), "gb", "protein", "gb")
gb_file = gb.GenBankFile()
gb_file.read(file_name)
# Keep only the features with the key "SecStr"
annotation = gb.get_annotation(gb_file, include_only=["SecStr"])
# Length of the sequence: only the second of the six locus fields is used
_, length, _, _, _, _ = gb.get_locus(gb_file)

# Single-axes figure that the feature map below is drawn into
fig = plt.figure(figsize=(8.0, 3.0))
ax = fig.add_subplot(111)
graphics.plot_feature_map(
    ax,
    annotation,
    symbols_per_line=150,
    show_numbers=True,
    show_line_position=True,
    # 'loc_range' takes exclusive stop -> length+1 is required
Ejemplo n.º 8
0
# Code source: Patrick Kunzmann
# License: BSD 3 clause

import biotite
import biotite.sequence as seq
import biotite.sequence.graphics as graphics
import biotite.sequence.io.genbank as gb
import biotite.database.entrez as entrez
import numpy as np
import matplotlib.pyplot as plt

# Download E. coli BL21 genome (GenBank format) into the biotite temp dir
file_name = entrez.fetch("CP001509",
                         biotite.temp_dir(),
                         suffix="gb",
                         db_name="nuccore",
                         ret_type="gb")
gb_file = gb.GenBankFile()
gb_file.read(file_name)
# Only the second of the six locus fields is used; it is treated as the
# sequence length below
_, seq_length, _, _, _, _ = gb.get_locus(gb_file)
# Keep only the features with the key "gene"
annotation = gb.get_annotation(gb_file, include_only=["gene"])
# Find the minimum and maximum locations of lac genes
# The bounds start inverted (minimum at the sequence end, maximum at 1)
# so that the search below can only tighten them
min_loc = seq_length
max_loc = 1
for feature in annotation:
    for loc in feature.locs:
        # Ignore if feature is only a pseudo-gene (e.g. gene fragment)
        # and check if feature is lacA gene (begin of lac operon)
        if "gene" in feature.qual \
            and  "pseudo" not in feature.qual \
Ejemplo n.º 9
0
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import biotite
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.graphics as graphics
import biotite.application.muscle as muscle
import biotite.application.blast as blast
import biotite.database.entrez as entrez
import matplotlib.pyplot as plt

# Download sequence of Streptococcus pyogenes Cas9
file_name = entrez.fetch("Q99ZW2", biotite.temp_dir(), "fa", "protein", "fasta")
file = fasta.FastaFile.read(file_name)
ref_seq = fasta.get_sequence(file)
# Find homologous proteins using NCBI Blast
# Search only the UniProt/SwissProt database
# NOTE(review): ``obey_rules=False`` presumably switches off the NCBI
# usage-rule throttling -- confirm against the BlastWebApp documentation.
blast_app = blast.BlastWebApp("blastp", ref_seq, "swissprot", obey_rules=False)
# NOTE(review): start()/join() presumably submit the search and wait for
# its completion before the results are read -- confirm.
blast_app.start()
blast_app.join()
alignments = blast_app.get_alignments()
# Get hit IDs for hits with score > 200
# (a score of exactly 200 is excluded)
hits = []
for ali in alignments:
    if ali.score > 200:
        hits.append(ali.hit_id)
# Get the sequences from hit IDs
# Starts empty; it is filled by code that follows this section
hit_seqs = []
Ejemplo n.º 10
0
# the respective sequence strings.
# Actually you can cast the  :class:`FastaFile` object into a
# :class:`dict`.
# Let's demonstrate this on the genome of the *lambda* phage
# (Accession: ``NC_001416``).
# After downloading the FASTA file from the NCBI Entrez database,
# we can load the contents in the following way:

import biotite
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez

# Download the FASTA record of accession NC_001416 from the "nuccore"
# database into the biotite temp directory
file_path = entrez.fetch("NC_001416",
                         biotite.temp_dir(),
                         suffix="fa",
                         db_name="nuccore",
                         ret_type="fasta")
file = fasta.FastaFile()
file.read(file_path)
# Iterate over the (header, sequence string) pairs of the file
for header, string in file.items():
    print("Header:", header)
    # The length is printed twice: bare here, labelled at the end
    print(len(string))
    # Only the first 50 characters of the sequence string are shown
    print("Sequence:", string[:50], "...")
    print("Sequence length:", len(string))

########################################################################
# Since there is only a single sequence in the file, the loop is run
# only one time.
# As the sequence string is very long, only the first 50 bp are printed.
# Now this string could be used as input parameter for creation of a
Ejemplo n.º 11
0
#
# Read mapping
# ------------
#
# In the next step we map each read to its respective position
# in the reference genome.
# An additional challenge is to find the correct sense of the read:
# During library preparation, both sense and complementary DNA are
# produced from the virus RNA.
# For this reason we need to create a complementary copy for each read
# and map both strands to the reference genome.
# Later the *wrong* strand is discarded.

# Download and read the reference SARS-CoV-2 genome
# (GenBank format, stored in the system temp directory)
orig_genome_file = entrez.fetch(
    "NC_045512", tempfile.gettempdir(), "gb",
    db_name="Nucleotide", ret_type="gb"
)
orig_genome = seqio.load_sequence(orig_genome_file)

# Create complementary reads
# The result is one flat list that alternates each read with its
# reversed-and-complemented copy: [read0, rc0, read1, rc1, ...], i.e.
# twice as many entries as ``reads``.
# NOTE(review): the meaning of the ``False`` passed to reverse() is not
# visible here -- confirm against the Sequence.reverse() documentation.
compl_reads = list(itertools.chain(
    *[(read, read.reverse(False).complement()) for read in reads]
))

########################################################################
# To map the reads to their corresponding positions in the reference
# genome, we need to align them to it.
# Although we could use :func:`align_optimal()`
# (Needleman-Wunsch algorithm [4]_) for this purpose, aligning this
# large number of reads to even a small virus genome would take hours.
#
Ejemplo n.º 12
0
def test_fetch_invalid():
    """Fetching the bogus UID ``"xxxx"`` must raise ``RequestError``."""
    bogus_uid = "xxxx"
    with pytest.raises(RequestError):
        # The temp directory is resolved inside the context, exactly as
        # part of the call under test.
        file = entrez.fetch(
            bogus_uid,
            tempfile.gettempdir(),
            "fa",
            "protein",
            "fasta",
            overwrite=True,
        )
Ejemplo n.º 13
0
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import tempfile
import itertools
import numpy as np
import biotite.sequence as seq
import biotite.sequence.io.genbank as gb
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez

# Get the E. coli K-12 genome as annotated sequence
# (GenBank record U00096, downloaded into the system temp directory)
gb_file = gb.GenBankFile.read(
    entrez.fetch("U00096", tempfile.gettempdir(), "gb", "nuccore", "gb"))
# We are only interested in CDS features
k12_genome = gb.get_annotated_sequence(gb_file, include_only=["CDS"])

# This dictionary will count how often each codon occurs in the genome
# For increased performance the dictionary uses symbol codes ([0 3 2])
# instead of symbols (['A' 'T' 'G']) as keys
# The keys are all 3-tuples over range(len(alphabet)), i.e. the Cartesian
# product of the symbol code range with itself three times; every count
# starts at 0
codon_counter = {
    codon: 0
    for codon in itertools.product(
        *([range(len(k12_genome.sequence.alphabet))] * 3))
}
# For demonstration purposes print the 64 codons in symbol code form
print(list(codon_counter.keys()))

########################################################################
Ejemplo n.º 14
0
import matplotlib.pyplot as plt
import biotite
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez
import biotite.sequence.graphics as graphics

# Download the Covid and MERS records as FASTA files.
# Both are fetched from the "nuccore" database and are turned into
# nucleotide sequences below, i.e. they are not protein sequences.
# NOTE(review): the target directory "myresult_dir" is a relative path;
# whether fetch() creates it when missing is not visible here -- confirm.
covid_file_path = entrez.fetch("NC_045512",
                               "myresult_dir",
                               suffix="fa",
                               db_name="nuccore",
                               ret_type="fasta")
mers_file_path = entrez.fetch("NC_019843.3",
                              "myresult_dir",
                              suffix="fa",
                              db_name="nuccore",
                              ret_type="fasta")
# Read the downloaded files
c_file = fasta.FastaFile()
c_file.read(covid_file_path)
m_file = fasta.FastaFile()
m_file.read(mers_file_path)
# Display each header and sequence string of the Covid file
# ``covid_seq`` is overwritten on every iteration, so it ends up holding
# the last entry of the file
for h, s in c_file.items():
    print(h)
    print(s)
    covid_seq = seq.NucleotideSequence(s)
for h, s in m_file.items():
    print(h)
Ejemplo n.º 15
0
sequence alignment of the hit sequences afterwards, using MUSCLE.
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause
from tempfile import gettempdir
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.graphics as graphics
import biotite.application.muscle as muscle
import biotite.application.blast as blast
import biotite.database.entrez as entrez
import matplotlib.pyplot as plt

# Download sequence of Streptococcus pyogenes Cas9
# (FASTA record Q99ZW2, stored in the system temp directory)
file_name = entrez.fetch("Q99ZW2", gettempdir(), "fa", "protein", "fasta")
fasta_file = fasta.FastaFile.read(file_name)
ref_seq = fasta.get_sequence(fasta_file)
# Find homologous proteins using NCBI Blast
# Search only the UniProt/SwissProt database
# NOTE(review): ``obey_rules=False`` presumably switches off the NCBI
# usage-rule throttling -- confirm against the BlastWebApp documentation.
blast_app = blast.BlastWebApp("blastp", ref_seq, "swissprot", obey_rules=False)
# NOTE(review): start()/join() presumably submit the search and wait for
# its completion before the results are read -- confirm.
blast_app.start()
blast_app.join()
alignments = blast_app.get_alignments()
# Get hit IDs for hits with score > 200
# (a score of exactly 200 is excluded)
hits = []
for ali in alignments:
    if ali.score > 200:
        hits.append(ali.hit_id)
# Get the sequences from hit IDs
# Starts empty; it is filled by code that follows this section
hit_seqs = []
Ejemplo n.º 16
0
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import itertools
import numpy as np
import biotite
import biotite.sequence as seq
import biotite.sequence.io.genbank as gb
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez

# Get the E. coli K-12 genome as annotated sequence
# (GenBank record U00096, downloaded into the biotite temp directory)
gb_file = gb.GenBankFile.read(
    entrez.fetch("U00096", biotite.temp_dir(), "gb", "nuccore", "gb"))
# We are only interested in CDS features
k12_genome = gb.get_annotated_sequence(gb_file, include_only=["CDS"])

# This dictionary will count how often each codon occurs in the genome
# For increased performance the dictionary uses symbol codes ([0 3 2])
# instead of symbols (['A' 'T' 'G']) as keys
# The keys are all 3-tuples over range(len(alphabet)), i.e. the Cartesian
# product of the symbol code range with itself three times; every count
# starts at 0
codon_counter = {
    codon: 0
    for codon in itertools.product(
        *([range(len(k12_genome.sequence.alphabet))] * 3))
}
# For demonstration purposes print the 64 codons in symbol code form
print(list(codon_counter.keys()))

########################################################################
This script creates a feature map for the region around the *lac* operon
in the E. coli BL21 genome.
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import biotite.sequence as seq
import biotite.sequence.graphics as graphics
import biotite.sequence.io.genbank as gb
import biotite.database.entrez as entrez
import numpy as np
import matplotlib.pyplot as plt

# Download E. coli BL21 genome (GenBank format)
# NOTE(review): the ``None`` target path presumably keeps the download in
# memory instead of writing a file -- confirm against entrez.fetch().
file = entrez.fetch("CP001509", None, suffix="gb",
                         db_name="nuccore", ret_type="gb")
gb_file = gb.GenBankFile.read(file)
# Only the second of the six locus fields is used; it is treated as the
# sequence length below
_, seq_length, _, _, _, _ = gb.get_locus(gb_file)
# Keep only the features with the key "gene"
annotation = gb.get_annotation(gb_file, include_only=["gene"])
# Find the minimum and maximum locations of lac genes
# The bounds start inverted (minimum at the sequence end, maximum at 1)
# so that the search below can only tighten them
min_loc = seq_length
max_loc = 1
for feature in annotation:
    for loc in feature.locs:
        # Ignore if feature is only a pseudo-gene (e.g. gene fragment)
        # and check if feature is lacA gene (begin of lac operon)
        if "gene" in feature.qual \
            and  "pseudo" not in feature.qual \
            and feature.qual["gene"] == "lacA":
                if min_loc > loc.first:
                    min_loc = loc.first
Ejemplo n.º 18
0
Since we want to perform a six-frame translation we have to look at
the complementary strand of the genome as well.
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import biotite
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez
import matplotlib.pyplot as plt

# Fetch the Porcine circovirus genome (FASTA) and parse it
file_name = entrez.fetch(
    "KP282147", biotite.temp_dir(), "fa", "nuccore", "fasta"
)
fasta_file = fasta.FastaFile()
fasta_file.read(file_name)
genome = fasta.get_sequence(fasta_file)

# Forward strand: translate and list each protein with its two positions
proteins, positions = genome.translate()
print("Forward strand:")
for i, protein in enumerate(proteins):
    start, stop = positions[i][0], positions[i][1]
    print("{:4d} - {:4d}:   {:}".format(start, stop, str(protein)))
print("\n")

# Complementary strand: translate the reversed, complemented genome
genome_rev = genome.reverse().complement()
proteins, positions = genome_rev.translate()
print("Reverse strand:")
for i in range(len(proteins)):
Ejemplo n.º 19
0
is described as 3 integers instead of 3 letters.
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import itertools
import numpy as np
import biotite.sequence as seq
import biotite.sequence.io.genbank as gb
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez

# Get the E. coli K-12 genome as annotated sequence
# NOTE(review): the ``None`` target path presumably keeps the download in
# memory instead of writing a file -- confirm against entrez.fetch().
gb_file = gb.GenBankFile.read(
    entrez.fetch("U00096", None, "gb", "nuccore", "gb"))
# We are only interested in CDS features
k12_genome = gb.get_annotated_sequence(gb_file, include_only=["CDS"])

# This dictionary will count how often each codon occurs in the genome
# For increased performance the dictionary uses symbol codes ([0 3 2])
# instead of symbols (['A' 'T' 'G']) as keys
# The keys are all 3-tuples over range(len(alphabet)), i.e. the Cartesian
# product of the symbol code range with itself three times; every count
# starts at 0
codon_counter = {
    codon: 0
    for codon in itertools.product(
        *([range(len(k12_genome.sequence.alphabet))] * 3))
}
# For demonstration purposes print the 64 codons in symbol code form
print(list(codon_counter.keys()))

########################################################################
Ejemplo n.º 20
0
import tempfile
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from matplotlib.ticker import MultipleLocator
import biotite
import biotite.sequence as seq
import biotite.sequence.io as seqio
import biotite.sequence.io.genbank as gb
import biotite.sequence.align as align
import biotite.database.entrez as entrez
import biotite.application.tantan as tantan

# Download the first genome (accession NC_000932) as FASTA into the system
# temp directory and load it as the chloroplast sequence
fasta_file = entrez.fetch("NC_000932",
                          tempfile.gettempdir(),
                          "fasta",
                          db_name="Nucleotide",
                          ret_type="fasta")
chloroplast_seq = seqio.load_sequence(fasta_file)

# Same procedure for the second genome (accession NC_000911);
# ``fasta_file`` is reused, so the first path is no longer referenced
fasta_file = entrez.fetch("NC_000911",
                          tempfile.gettempdir(),
                          "fasta",
                          db_name="Nucleotide",
                          ret_type="fasta")
bacterium_seq = seqio.load_sequence(fasta_file)

########################################################################
# For the *k-mer* matching step the genome of the cyanobacterium is
# indexed into a :class:`KmerTable`.
# As homologous regions between both genomes may also appear on the
Ejemplo n.º 21
0
    "E": -3.5,
    "Q": -3.5,
    "D": -3.5,
    "N": -3.5,
    "K": -3.9,
    "R": -4.5
}

# Look for the Swiss-Prot entry containing the human HCN1 channel.
# The three criteria (gene name, organism, source database) are combined
# with ``&``, so an entry has to satisfy all of them.
# The organism term previously read "h**o sapiens" (censored/garbled text)
# and therefore could not match the intended organism.
query = (
    entrez.SimpleQuery("HCN1", "Gene Name")
    & entrez.SimpleQuery("homo sapiens", "Organism")
    & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
)
uids = entrez.search(query, db_name="protein")
# Only the first search hit is downloaded (GenPept format, "gp");
# an empty search result raises an IndexError here
file_name = entrez.fetch(uids[0],
                         biotite.temp_dir(),
                         "gp",
                         db_name="protein",
                         ret_type="gp")

gp_file = gb.GenBankFile.read(file_name)
hcn1 = seq.ProteinSequence(gb.get_sequence(gp_file, format="gp"))
print(hcn1)

########################################################################
# The positional hydropathy is calculated and smoothed using
# a moving average for clearer visualization.

# One value per symbol of ``hcn1``, looked up in ``hydropathy_dict``;
# a symbol that is missing from the dictionary raises a KeyError
hydropathies = np.array([hydropathy_dict[symbol] for symbol in hcn1])


def moving_average(data_set, window_size):
Ejemplo n.º 22
0
# *NCBI Entrez* database, which is commonly known as *the NCBI*.
# It provides a myriad of information, ranging from sequences and
# sequence features to scientific articles.
# Fetching files from NCBI Entrez works analogous to the RCSB interface.
# This time we have to provide the UIDs (Accession or GI) instead of
# PDB IDs to the :func:`fetch()` function.
# Furthermore, we need to specify the database to retrieve the data
# from and the retrieval type.

from tempfile import gettempdir, NamedTemporaryFile
import biotite.database.entrez as entrez

# A single UID is passed as a plain string ...
file_path = entrez.fetch(
    "NC_001416",
    gettempdir(),
    suffix="fa",
    db_name="nuccore",
    ret_type="fasta",
)
print(file_path)
# ... whereas several UIDs are passed as a list
file_paths = entrez.fetch(
    ["1L2Y_A", "1AKI_A"],
    gettempdir(),
    suffix="fa",
    db_name="protein",
    ret_type="fasta",
)
print(list(file_paths))

########################################################################
# A list of valid database, retrieval type and mode combinations can
# be found
# `here <https://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/?report=objectonly>`_.
Ejemplo n.º 23
0
since domestic pigs are the host of the virus.

Since we want to perform a six-frame translation we have to look at
the complementary strand of the genome as well.
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez
import matplotlib.pyplot as plt

# Fetch the Porcine circovirus genome (FASTA) and parse it
# NOTE(review): the ``None`` target path presumably keeps the download in
# memory instead of writing a file -- confirm against entrez.fetch().
file = entrez.fetch("KP282147", None, "fa", "nuccore", "fasta")
fasta_file = fasta.FastaFile.read(file)
genome = fasta.get_sequence(fasta_file)

# Forward strand: translate and list each protein with its two positions
proteins, positions = genome.translate()
print("Forward strand:")
for i, protein in enumerate(proteins):
    start, stop = positions[i][0], positions[i][1]
    print("{:4d} - {:4d}:   {:}".format(start, stop, str(protein)))
print("\n")

# Complementary strand: translate the reversed, complemented genome
genome_rev = genome.reverse().complement()
proteins, positions = genome_rev.translate()
print("Reverse strand:")
for i in range(len(proteins)):
    print("{:5d} - {:5d}:   {:}"
Ejemplo n.º 24
0
# Code source: Patrick Kunzmann
# License: BSD 3 clause

import biotite
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.io.genbank as gb
import biotite.sequence.graphics as graphics
import biotite.sequence.align as align
import biotite.database.entrez as entrez
import numpy as np
import matplotlib.pyplot as plt

# Download E. coli BL21 genome (GenBank format) into the biotite temp dir
file_name = entrez.fetch("CP001509", biotite.temp_dir(), "gb", "nuccore", "gb")
gb_file = gb.GenBankFile()
gb_file.read(file_name)
# Keep only the features with the key "gene"
annot_seq = gb_file.get_annotated_sequence(include_only=["gene"])
# Find leuL gene
# There is no ``break``: if several features match, the last one wins;
# if none matches, ``leul_feature`` stays undefined and the indexing
# below raises a NameError.
for feature in annot_seq.annotation:
    if "gene" in feature.qual and feature.qual["gene"] == "leuL":
        leul_feature = feature
# Get leuL sequence
leul_seq = annot_seq[leul_feature]

# Download Salmonella enterica genome without annotations
# ``file_name`` is reused for the second download
file_name = entrez.fetch("CP019649", biotite.temp_dir(), "fa", "nuccore",
                         "fasta")
fasta_file = fasta.FastaFile()
fasta_file.read(file_name)
Ejemplo n.º 25
0
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import matplotlib.ticker as ticker
import biotite
import biotite.sequence as seq
import biotite.sequence.io.genbank as gb
import biotite.sequence.graphics as graphics
import biotite.database.entrez as entrez
import biotite.application.muscle as muscle

# Number of bases upstream of the start codon that count as the 5' UTR
UTR_LENGTH = 20

### Get the E. coli K-12 genome as annotated sequence

# NOTE(review): ``tempfile`` is not among the imports visible directly
# above -- make sure it is imported earlier in the file.
gb_file = gb.GenBankFile.read(
    entrez.fetch("U00096", tempfile.gettempdir(), "gb", "nuccore", "gb"))
# We are only interested in CDS features
# NOTE(review): the variable is called ``bl21_genome`` although the header
# above describes the fetched record as the K-12 genome -- the name looks
# like a leftover; confirm before renaming.
bl21_genome = gb.get_annotated_sequence(gb_file, include_only=["CDS"])

### Extract sequences for 5' untranslated regions (UTRs)

# In this case we define the untranslated region, as the sequence
# up to UTR_LENGTH bases upstream from the start codon
utrs = []
for cds in bl21_genome.annotation:
    # Expect a single location for the feature,
    # since no splicing can occur
    # Ignore special cases like ribosomal slippage sites, etc.
    # for simplicity
    if len(cds.locs) != 1:
        continue