def test_conversion_highlevel(path):
    """
    Test whether the high-level GenBank interface can properly read
    the locus, annotation and sequence from a GenBank file and write
    these properties to a file, without data changing.

    Parameters
    ----------
    path : str
        Path of the file to be read.
        Its last two characters are passed as ``format`` argument to
        :func:`get_annotated_sequence()`.
    """
    # The format is derived from the last two characters of the path
    suffix = path[-2:]
    gb_file = gb.GenBankFile.read(path)
    ref_locus = gb.get_locus(gb_file)
    ref_annot_seq = gb.get_annotated_sequence(gb_file, format=suffix)

    # Write the reference data into a fresh file object...
    gb_file = gb.GenBankFile()
    gb.set_locus(gb_file, *ref_locus)
    gb.set_annotated_sequence(gb_file, ref_annot_seq)
    # The context manager closes the temporary file, even if writing
    # or re-reading raises an exception
    with TemporaryFile("w+") as temp:
        gb_file.write(temp)
        # ...and read it again from the beginning of the file
        temp.seek(0)
        gb_file = gb.GenBankFile.read(temp)

    test_locus = gb.get_locus(gb_file)
    test_annot_seq = gb.get_annotated_sequence(gb_file, format=suffix)
    assert test_locus == ref_locus
    assert test_annot_seq.sequence       == ref_annot_seq.sequence
    assert test_annot_seq.annotation     == ref_annot_seq.annotation
    assert test_annot_seq.sequence_start == ref_annot_seq.sequence_start
Exemple #2
0
# coding region (at the terminator signal),
# hence ``BEYOND_RIGHT`` is applied.
# These two defects are also reflected in the *mRNA* feature.
#
# Annotated sequences
# ^^^^^^^^^^^^^^^^^^^
#
# Now that you have understood what annotations are, we proceed to the
# next topic: annotated sequences.
# An :class:`AnnotatedSequence` is like an annotation, but the sequence
# is included this time.
# Since our GenBank file contains the
# sequence corresponding to the feature table, we can directly obtain the
# :class:`AnnotatedSequence`.

# Obtain the sequence together with its annotation
annot_seq = gb.get_annotated_sequence(file)
same_annotation = annotation == annot_seq.annotation
print("Same annotation as before?", same_annotation)
# Only the first 60 symbols are shown
first_symbols = annot_seq.sequence[:60]
print(first_symbols, "...")

########################################################################
# When indexing an :class:`AnnotatedSequence` with a slice,
# the index is applied to the :class:`Annotation` and the
# :class:`Sequence`.
# While the :class:`Annotation` handles the index as shown before,
# the :class:`Sequence` is indexed based on the sequence start
# value (usually *1*).

print("Sequence start before indexing:", annot_seq.sequence_start)
for feature in annot_seq.annotation:
    if feature.key == "regulatory" \
        and feature.qual["regulatory_class"] == "polyA_signal_sequence":
def test_reverse_complement():
    """
    Test whether applying the reverse complement twice to an annotated
    sequence restores the original annotated sequence.
    """
    genome_path = join(data_dir("sequence"), "ec_bl21.gb")
    annot_seq = gb.get_annotated_sequence(gb.GenBankFile.read(genome_path))
    restored = annot_seq.reverse_complement().reverse_complement()
    assert annot_seq == restored
Exemple #4
0
# So we use a set to store the source name of sequences we already
# listed and ignore all further occurrences of that source species.

# Sequences of the found DNA-binding sites
binding_sites = []
# Source species, in the same order as the binding sites
sources = []
# Source species that are already listed and hence ignored from now on
listed_sources = set()
for file, source in zip(files, all_sources):
    if source in listed_sources:
        # This species is already represented
        continue
    bind_feature = None
    annot_seq = gb.get_annotated_sequence(
        file, include_only=["Site"], format="gp"
    )
    # Search for the DNA-binding site, a helix-turn-helix motif;
    # if multiple features match, the last one is kept
    for feature in annot_seq.annotation:
        if "site_type" not in feature.qual:
            continue
        if feature.qual["site_type"] != "DNA binding":
            continue
        if "H-T-H motif" in feature.qual["note"]:
            bind_feature = feature
    if bind_feature is None:
        # No suitable feature: the species stays unlisted
        continue
    # Store the sequence slice that is defined by the feature...
    binding_sites.append(annot_seq[bind_feature])
    # ...along with the respective source species
    sources.append(source)
    listed_sources.add(source)
Exemple #5
0
)[0]
identity = align.get_sequence_identity(genome_alignment, "all")
print(f"Sequence identity: {identity * 100:.2f} %")

########################################################################
# Now we would like to have a closer look at the mutation locations.
# To contextualize the locations we plot the mutation frequency along
# with the gene locations.
# The genomic coordinates for each gene can be extracted from the
# already downloaded *GenBank* file of the reference genome.

N_BINS = 50

# Get genomic coordinates for all SARS-CoV-2 genes
gb_file = gb.GenBankFile.read(orig_genome_file)
annot_seq = gb.get_annotated_sequence(gb_file, include_only=["gene"])

# Calculate the sequence identity within each bin
bin_identities = np.zeros(N_BINS)
# N_BINS + 1 equidistant edges delimit N_BINS bins over the genome length
edges = np.linspace(0, len(orig_genome), N_BINS+1)
# This trace column is the same for every bin,
# so it is extracted only once instead of in each iteration
orig_genome_trace = genome_alignment.trace[:,1]
for i, (bin_start, bin_stop) in enumerate(zip(edges[:-1], edges[1:])):
    # Select the alignment columns whose trace value lies within the bin
    excerpt = genome_alignment[
        (orig_genome_trace >= bin_start) & (orig_genome_trace < bin_stop)
    ]
    bin_identities[i] = align.get_sequence_identity(excerpt, "all")


fig, (deviation_ax, feature_ax) = plt.subplots(nrows=2, figsize=(8.0, 5.0))

# Plot the deviation = 1 - sequence identity
Exemple #6
0
# Code source: Patrick Kunzmann
# License: BSD 3 clause

import itertools
import numpy as np
import biotite
import biotite.sequence as seq
import biotite.sequence.io.genbank as gb
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez

# Fetch the E. coli K-12 genome and load it as annotated sequence
gb_file = gb.GenBankFile.read(
    entrez.fetch("U00096", biotite.temp_dir(), "gb", "nuccore", "gb")
)
# Only the CDS features are of interest
k12_genome = gb.get_annotated_sequence(gb_file, include_only=["CDS"])

# This dictionary keeps track of the number of occurrences of each
# codon in the genome
# For increased performance the keys are symbol codes ([0 3 2])
# instead of symbols (['A' 'T' 'G'])
# Every possible triplet of symbol codes starts with a count of zero
codon_counter = dict.fromkeys(
    itertools.product(range(len(k12_genome.sequence.alphabet)), repeat=3),
    0
)
# For demonstration purposes print the 64 codons in symbol code form
print(list(codon_counter))

########################################################################
# As expected the dictionary encodes each codon as tuple of 3 numbers,
# where ``0`` represents ``A``, ``1`` ``C``, ``2`` ``G`` and ``3`` ``T``.
Exemple #7
0
import biotite
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.io.genbank as gb
import biotite.sequence.graphics as seqgraphics
import biotite.sequence.align as align
import biotite.database.entrez as entrez
import biotite.structure.graphics as strucgraphics
import biotite.application.viennarna as viennarna

# Fetch the Escherichia coli BL21 and Salmonella enterica genomes
# in a single file
gb_file = gb.MultiFile.read(
    entrez.fetch_single_file(
        ["CP001509", "CP019649"], None, "nuccore", "gb"
    )
)
ec_file, se_file = gb_file

annot_seq = gb.get_annotated_sequence(ec_file, include_only=["ncRNA"])
# The M1 gene in the E. coli genome is identified via its annotation
for feature in annot_seq.annotation:
    if "product" not in feature.qual:
        continue
    if "RNase P" in feature.qual["product"]:
        m1_sequence = annot_seq[feature]

# Get S. enterica genome sequence
se_genome = gb.get_sequence(se_file)

# The search covers the genome sequence as well as its
# reverse complement
se_genome_rev_comp = se_genome.reverse().complement()
genomic_seqs = [se_genome, se_genome_rev_comp]

########################################################################
# In an initial fast matching step, we look for matching *k-mers*
# between *M1* and the *S. enterica* genome.
# A matching *k-mer* is a length *k* subsequence, that appears in both