def test_genbank_utility_gp():
    """
    Check whether the high-level utility functions return the expected
    content of a known GenPept file.
    """
    gp_file = gb.GenBankFile.read(join(data_dir("sequence"), "bt_lysozyme.gp"))
    assert gb.get_locus(gp_file) \
        == ("AAC37312", 147, "", False, "MAM", "27-APR-1993")
    assert gb.get_definition(gp_file) == "lysozyme [Bos taurus]."
    assert gb.get_version(gp_file) == "AAC37312.1"
    assert gb.get_gi(gp_file) == 163334
    annotation = gb.get_annotation(gp_file)
    feature = seq.Feature(
        "Site",
        [seq.Location(start, stop) for start, stop in zip(
            [52,55,62,76,78,81,117,120,125],
            [53,55,62,76,78,81,117,120,126]
        )],
        {"note": "lysozyme catalytic cleft [active]", "site_type": "active"}
    )
    # The expected feature must appear in the annotation with the same
    # key and locations and with (at least) the expected qualifiers.
    # Each '(key, val)' pair is looked up in the qualifier items;
    # the parentheses matter: '(key, val in items)' would merely build
    # a non-empty tuple, which is always truthy
    in_annotation = any(
        f.key == feature.key
        and f.locs == feature.locs
        and all(
            (key, val) in f.qual.items()
            for key, val in feature.qual.items()
        )
        for f in annotation
    )
    assert in_annotation
    assert len(gb.get_sequence(gp_file, format="gp")) == 147
Exemple #2
0
def test_genbank_consistency(path):
    """
    Test whether the same annotation (if reasonable) can be read from a
    GFF3 file and a GenBank file.

    `path` is the name of a GenBank file in the "sequence" test data
    directory; the GFF3 counterpart is expected next to it, with the
    last three characters of `path` replaced by ``.gff3``.
    """
    gb_file = gb.GenBankFile.read(join(data_dir("sequence"), path))
    ref_annot = gb.get_annotation(gb_file)

    gff_file = gff.GFFFile.read(join(data_dir("sequence"), path[:-3] + ".gff3"))
    test_annot = gff.get_annotation(gff_file)

    # Remove qualifiers, since they will be different
    # in GFF3 and GenBank
    ref_annot = seq.Annotation(
        [seq.Feature(feature.key, feature.locs) for feature in ref_annot]
    )
    test_annot = seq.Annotation(
        [seq.Feature(feature.key, feature.locs) for feature in test_annot]
    )
    for feature in test_annot:
        # Only CDS, gene, intron and exon should be equal
        # in GenBank and GFF3
        if feature.key in ["CDS", "gene", "intron", "exon"]:
            try:
                # Compare against the GenBank reference:
                # looking the feature up in 'test_annot' itself
                # would be trivially true
                assert feature in ref_annot
            except AssertionError:
                # Report the offending feature before failing
                print(feature.key)
                for loc in feature.locs:
                    print(loc)
                raise
def test_genbank_utility_gb():
    """
    Check whether the high-level utility functions return the expected
    content of a known GenBank file.
    """
    gb_file = gb.GenBankFile.read(join(data_dir("sequence"), "ec_bl21.gb"))
    assert gb.get_locus(gb_file) \
        == ("CP001509", 4558953, "DNA", True, "BCT", "16-FEB-2017")
    assert gb.get_definition(gb_file) \
        == ("Escherichia coli BL21(DE3), complete genome.")
    assert gb.get_version(gb_file) == "CP001509.3"
    assert gb.get_gi(gb_file) == 296142109
    assert gb.get_db_link(gb_file) \
        == {"BioProject" : "PRJNA20713", "BioSample" : "SAMN02603478"}
    annotation = gb.get_annotation(gb_file, include_only=["CDS"])
    feature = seq.Feature(
        "CDS",
        [seq.Location(5681, 6457, seq.Location.Strand.REVERSE)],
        {"gene": "yaaA", "transl_table": "11"}
    )
    # The expected feature must appear in the annotation with the same
    # key and locations and with (at least) the expected qualifiers.
    # Each '(key, val)' pair is looked up in the qualifier items;
    # the parentheses matter: '(key, val in items)' would merely build
    # a non-empty tuple, which is always truthy
    in_annotation = any(
        f.key == feature.key
        and f.locs == feature.locs
        and all(
            (key, val) in f.qual.items()
            for key, val in feature.qual.items()
        )
        for f in annotation
    )
    assert in_annotation
    assert len(gb.get_sequence(gb_file, format="gb")) == 4558953
Exemple #4
0
def fetch_gb_annotation(pdb_chain=str):
    """
    Fetch the GenBank entry of `pdb_chain` from the "protein" database
    and return only its "SecStr" features as annotation.

    The downloaded file is stored in Biotite's temporary directory.
    """
    # NOTE(review): the default value is the type object `str`, which
    # looks like a mistyped annotation (`pdb_chain: str`) -- confirm
    # with the callers before changing the signature
    #
    # Example identifier that was used for debugging: "6FRH_A"
    gb_path = entrez.fetch(
        pdb_chain, biotite.temp_dir(),
        suffix="gb", db_name="protein", ret_type="gb"
    )
    gb_file = gb.GenBankFile()
    gb_file.read(gb_path)
    return gb.get_annotation(gb_file, include_only=["SecStr"])
Exemple #5
0
def make_feature_maps(gene):
    """
    Plot one feature map image for each feature key of a GenBank entry.

    The entry is fetched via ``entrez.fetch`` from the "nuccore"
    database into the system's temporary directory.
    For each distinct feature key in its annotation a plot is saved as
    ``<current working directory>/app/static/images/<key>.png`` and
    ``session['valid_gene']`` is set to ``True``.
    If fetching or parsing the entry fails, an error message is flashed
    and nothing is plotted.

    Parameters
    ----------
    gene
        The identifier of the entry, passed on to ``entrez.fetch``.

    Returns
    -------
    None
    """
    try:
        find_id = entrez.fetch(gene,
                               gettempdir(),
                               suffix="gb",
                               db_name="nuccore",
                               ret_type="gb")
        read_file = gb.GenBankFile.read(find_id)
        file_annotation = gb.get_annotation(read_file)
    except Exception:
        # Any download or parsing problem is reported to the user;
        # 'Exception' (instead of a bare 'except') lets
        # KeyboardInterrupt and SystemExit propagate
        flash('The entered gene could not found. Please try again.', 'error')
        return None

    # Without a 'source' feature there is no explicit range:
    # 'None' lets 'plot_feature_map' fall back to its default range
    # instead of failing with a NameError
    loc_range = None
    key_list = []
    for feature in file_annotation:
        key_list.append(feature.key)
        if feature.key == "source":
            # loc_range has exclusive stop
            loc = list(feature.locs)[0]
            loc_range = (loc.first, loc.last + 1)

    pwd = os.getcwd()

    for key in np.unique(key_list):
        fig, ax = plt.subplots(figsize=(8.0, 2.0))
        graphics.plot_feature_map(ax,
                                  seq.Annotation([
                                      feature for feature in file_annotation
                                      if feature.key == key
                                  ]),
                                  multi_line=False,
                                  loc_range=loc_range,
                                  show_line_position=True)

        plt.title('This plot is for {} features'.format(key))
        plt.savefig(pwd + '/app/static/images/{}.png'.format(key), dpi=300)
        # Release the figure, otherwise each call accumulates open
        # figures in memory
        plt.close(fig)
        session['valid_gene'] = True

    return None
Exemple #6
0
# Array that will hold for each of the genes and each of the 4 domains
# the first and last position
# The array is initially filled with -1, as the value -1 will indicate
# that the domain does not exist in the sigma factor
domain_pos = np.full((len(genes), 4, 2), -1, dtype=int)
# Array that will hold the total sequence length of each sigma factor
seq_lengths = np.zeros(len(genes), dtype=int)
# Read the merged file containing multiple GenBank entries
multi_file = gb.MultiFile()
multi_file.read(file_name)
# Iterate over each GenBank entry
for i, gb_file in enumerate(multi_file):
    _, length, _, _, _, _ = gb.get_locus(gb_file)
    seq_lengths[i] = length
    annotation = gb.get_annotation(gb_file)
    # Find features, that represent a sigma factor domain
    for feature in annotation:
        if feature.key == "Region" and "note" in feature.qual \
           and "Sigma-70 factor domain" in feature.qual["note"]:
            # Extract the domain number
            # and decrement for 0-based indexing
            #
            # e.g. 'Sigma-70 factor domain-2.' => 1
            #                              ^
            # The pattern is a raw string, so that '\d' reaches the
            # regex engine instead of being an invalid string escape
            domain_index = int(
                re.findall(r"(?<=Sigma-70 factor domain-)\d+",
                           feature.qual["note"])[0]) - 1
            # Expect a single contiguous location of the domain
            assert len(feature.locs) == 1
            loc = list(feature.locs)[0]
import numpy as np
import biotite
import biotite.sequence.io.genbank as gb
import biotite.sequence.graphics as graphics
import biotite.database.entrez as entrez


PLASMID_URL = "https://media.addgene.org/snapgene-media/" \
              "v1.6.2-0-g4b4ed87/sequences/67/17/246717/" \
              "addgene-plasmid-26094-sequence-246717.gbk"

response = requests.get(PLASMID_URL)
# Fail with a clear HTTP error, instead of trying to parse
# the body of an error response as GenBank file
response.raise_for_status()
gb_file = gb.GenBankFile.read(io.StringIO(response.text))
# Restrict the annotation to the feature types of interest
annotation = gb.get_annotation(gb_file,
                               include_only=[
                                   "promoter", "terminator", "protein_bind",
                                   "RBS", "CDS", "rep_origin", "primer_bind"
                               ])
_, seq_length, _, _, _, _ = gb.get_locus(gb_file)
# AddGene stores the plasmid name in the 'KEYWORDS' field
# [0][0][0] ->
# The first (and only) 'KEYWORDS' field
# The first entry in the tuple
# The first (and only) line in the field
plasmid_name = gb_file.get_fields("KEYWORDS")[0][0][0]


def custom_feature_formatter(feature):
    # AddGene stores the feature label in the '\label' qualifier
    label = feature.qual.get("label")
    if feature.key == "promoter":
import biotite.sequence.graphics as graphics
import biotite.sequence.io.genbank as gb
import biotite.database.entrez as entrez
import numpy as np
import matplotlib.pyplot as plt

# Fetch the GenBank file of the E. coli BL21 genome
file_name = entrez.fetch(
    "CP001509", biotite.temp_dir(),
    suffix="gb", db_name="nuccore", ret_type="gb"
)
gb_file = gb.GenBankFile()
gb_file.read(file_name)
# The second element of the locus tuple is used as sequence length
seq_length = gb.get_locus(gb_file)[1]
annotation = gb.get_annotation(gb_file, include_only=["gene"])
# Determine the smallest first and the largest last position
# covered by the lacA gene (begin of lac operon)
min_loc = seq_length
max_loc = 1
for feature in annotation:
    # Skip pseudo-genes (e.g. gene fragments)
    # and every gene that is not lacA
    if "pseudo" in feature.qual or feature.qual.get("gene") != "lacA":
        continue
    for loc in feature.locs:
        min_loc = min(min_loc, loc.first)
        max_loc = max(max_loc, loc.last)
# Extend the location range by 1000 (arbitrary) in each direction
# green, respectively.

N_COL = 4
MAX_NAME_LENGTH = 30
EXCERPT_SIZE = 3000

# Color for each plotted feature key:
# both RNA feature types share the same color
COLORS = dict(
    CDS=biotite.colors["dimgreen"],
    tRNA=biotite.colors["orange"],
    rRNA=biotite.colors["orange"],
)

# Fetch features of the chloroplast genome
# NOTE(review): presumably 'None' as target path makes 'entrez.fetch'
# return an in-memory file -- confirm against its documentation
genbank_source = entrez.fetch(
    "NC_000932", None,
    suffix="gb", db_name="Nucleotide", ret_type="gb"
)
gb_file = gb.GenBankFile.read(genbank_source)
annotation = gb.get_annotation(gb_file, include_only=["CDS", "rRNA", "tRNA"])


def draw_arrow(ax, feature, loc):
    x = loc.first
    dx = loc.last - loc.first + 1
    if loc.strand == seq.Location.Strand.FORWARD:
        x = loc.first
        dx = loc.last - loc.first + 1
    else:
        x = loc.last
        dx = loc.first - loc.last + 1

    # Create head with 90 degrees tip -> head width/length ratio = 1/2
    ax.add_patch(
        biotite.AdaptiveFancyArrow(x,
Exemple #10
0
# An annotation is the collection of features corresponding to one
# sequence (the sequence itself is not included, though).
# In case of *Biotite* we can get an :class:`Annotation` object from the
# :class:`GenBankFile`.
# This :class:`Annotation` can be iterated in order to obtain single
# :class:`Feature` objects.
# Each :class:`Feature` contains 3 pieces of information: Its feature
# key (e.g. *regulatory* or *CDS*), a dictionary of qualifiers and one
# or multiple locations on the corresponding sequence.
# A :class:`Location` in turn, contains its starting and its ending
# base/residue position, the strand it is on (only for DNA) and possible
# *location defects* (defects will be discussed later).
# In the next example we will print the keys of the features and their
# locations:

annotation = gb.get_annotation(file)
for feature in annotation:
    # Order the locations by their first position and convert each
    # one into its string representation for a more readable output
    ordered_locs = sorted(
        feature.locs, key=lambda location: location.first
    )
    locs = list(map(str, ordered_locs))
    print(f"{feature.key:12}   {locs}")

########################################################################
# The ``'>'`` characters in the string representations of a location
# indicate that the location is on the forward strand.
# Most of the features have only one location, except the *mRNA* and
# *CDS* features, which each consist of 4 joined locations.
# When we look at the rest of the features, this makes sense: The gene
# has 4 exons.
# Therefore, the mRNA (and consequently the CDS) is composed of
# these exons.
#
Exemple #11
0
    feature_plotters=[HelixPlotter(), SheetPlotter()])
fig.tight_layout()

########################################################################
# Now let us do some serious application.
# We want to visualize the secondary structure of one monomer of the
# homodimeric transketolase (PDB: 1QGD).
# The simplest way to do that is to fetch the corresponding GenBank
# file, extract an `Annotation` object from the file and draw the
# annotation.

# Fetch the GenBank file of the TK's first chain
# and extract the secondary structure annotation
file_name = entrez.fetch(
    "1QGD_A", biotite.temp_dir(),
    suffix="gb", db_name="protein", ret_type="gb"
)
gb_file = gb.GenBankFile()
gb_file.read(file_name)
annotation = gb.get_annotation(gb_file, include_only=["SecStr"])
# Length of the sequence (second element of the locus tuple)
length = gb.get_locus(gb_file)[1]

# A figure with a single axes for the feature map
fig, ax = plt.subplots(figsize=(8.0, 3.0))
graphics.plot_feature_map(
    ax,
    annotation,
    # 'loc_range' takes exclusive stop -> length+1 is required
    loc_range=(1, length + 1),
    symbols_per_line=150,
    show_numbers=True,
    show_line_position=True,
    feature_plotters=[HelixPlotter(), SheetPlotter()],
)
fig.tight_layout()
Exemple #12
0
figure = plt.figure(figsize=(8.0, 4.0))
ax = figure.add_subplot(111)

# Sequence positions of the moving average values:
# both curves are plotted against the same positions
positions = np.arange(1 + ma_radius, len(hcn1) - ma_radius + 1)

# Plot hydropathy
ax.plot(positions,
        hydropathies,
        color=biotite.colors["dimorange"])
ax.axhline(0, color="gray", linewidth=0.5)
ax.set_xlim(1, len(hcn1) + 1)
ax.set_xlabel("HCN1 sequence position")
ax.set_ylabel("Hydropathy (15 residues moving average)")

# Draw boxes for annotated transmembrane helices for comparison
# with hydropathy plot
annotation = gb.get_annotation(gp_file, include_only=["Region"])
# 'get()' skips 'Region' features without a 'region_name' qualifier
# instead of raising a KeyError
transmembrane_annotation = seq.Annotation([
    feature for feature in annotation
    if feature.qual.get("region_name") == "Transmembrane region"
])
for feature in transmembrane_annotation:
    first, last = feature.get_location_range()
    ax.axvspan(first, last, color=(0.0, 0.0, 0.0, 0.2), linewidth=0)

# Plot similarity score as measure for conservation
# on a second y-axis sharing the same x-axis
ax2 = ax.twinx()
ax2.plot(positions,
         scores,
         color=biotite.colors["brightorange"])
ax2.set_ylabel("Similarity score (15 residues moving average)")