def test_genbank_utility_gp():
    """
    Check whether the high-level utility functions return the expected
    content of a known GenPept file.
    """
    gp_file = gb.GenBankFile.read(
        join(data_dir("sequence"), "bt_lysozyme.gp")
    )
    assert gb.get_locus(gp_file) \
        == ("AAC37312", 147, "", False, "MAM", "27-APR-1993")
    assert gb.get_definition(gp_file) == "lysozyme [Bos taurus]."
    assert gb.get_version(gp_file) == "AAC37312.1"
    assert gb.get_gi(gp_file) == 163334

    annotation = gb.get_annotation(gp_file)
    feature = seq.Feature(
        "Site",
        [seq.Location(start, stop) for start, stop in zip(
            [52, 55, 62, 76, 78, 81, 117, 120, 125],
            [53, 55, 62, 76, 78, 81, 117, 120, 126]
        )],
        {"note": "lysozyme catalytic cleft [active]", "site_type": "active"}
    )
    # The expected feature must be present with equal key and locations;
    # its qualifiers must be a subset of the parsed feature's qualifiers.
    # Each (key, value) pair is tested for membership individually:
    # a tuple like '(key, val in ...)' would always be truthy
    assert any(
        f.key == feature.key
        and f.locs == feature.locs
        and all(
            (key, val) in f.qual.items()
            for key, val in feature.qual.items()
        )
        for f in annotation
    )

    assert len(gb.get_sequence(gp_file, format="gp")) == 147
def test_genbank_consistency(path):
    """
    Test whether the same annotation (if reasonable) can be read from a
    GFF3 file and a GenBank file.

    Parameters
    ----------
    path : str
        File name of the GenBank file inside the ``sequence`` data
        directory.
        The name of the corresponding GFF3 file is derived by replacing
        the last three characters of `path` with ``.gff3``.
    """
    gb_file = gb.GenBankFile.read(join(data_dir("sequence"), path))
    ref_annot = gb.get_annotation(gb_file)

    gff_file = gff.GFFFile.read(
        join(data_dir("sequence"), path[:-3] + ".gff3")
    )
    test_annot = gff.get_annotation(gff_file)

    # Remove qualifiers, since they will be different
    # in GFF3 and GenBank
    ref_annot = seq.Annotation(
        [seq.Feature(feature.key, feature.locs) for feature in ref_annot]
    )
    test_annot = seq.Annotation(
        [seq.Feature(feature.key, feature.locs) for feature in test_annot]
    )

    for feature in test_annot:
        # Only CDS, gene, intron and exon should be equal
        # in GenBank and GFF3
        if feature.key in ["CDS", "gene", "intron", "exon"]:
            try:
                # Compare against the GenBank reference:
                # checking membership in 'test_annot' itself
                # could never fail
                assert feature in ref_annot
            except AssertionError:
                # Show the offending feature before re-raising
                print(feature.key)
                for loc in feature.locs:
                    print(loc)
                raise
def test_genbank_utility_gb():
    """
    Check whether the high-level utility functions return the expected
    content of a known GenBank file.
    """
    gb_file = gb.GenBankFile.read(join(data_dir("sequence"), "ec_bl21.gb"))
    assert gb.get_locus(gb_file) \
        == ("CP001509", 4558953, "DNA", True, "BCT", "16-FEB-2017")
    assert gb.get_definition(gb_file) \
        == ("Escherichia coli BL21(DE3), complete genome.")
    assert gb.get_version(gb_file) == "CP001509.3"
    assert gb.get_gi(gb_file) == 296142109
    assert gb.get_db_link(gb_file) \
        == {"BioProject" : "PRJNA20713", "BioSample" : "SAMN02603478"}

    annotation = gb.get_annotation(gb_file, include_only=["CDS"])
    feature = seq.Feature(
        "CDS",
        [seq.Location(5681, 6457, seq.Location.Strand.REVERSE)],
        {"gene": "yaaA", "transl_table": "11"}
    )
    # The expected feature must be present with equal key and locations;
    # its qualifiers must be a subset of the parsed feature's qualifiers.
    # Each (key, value) pair is tested for membership individually:
    # a tuple like '(key, val in ...)' would always be truthy
    assert any(
        f.key == feature.key
        and f.locs == feature.locs
        and all(
            (key, val) in f.qual.items()
            for key, val in feature.qual.items()
        )
        for f in annotation
    )

    assert len(gb.get_sequence(gb_file, format="gb")) == 4558953
def fetch_gb_annotation(pdb_chain=str):
    """
    Fetch the GenBank record of a protein chain and return its
    secondary structure annotation.

    Parameters
    ----------
    pdb_chain : str
        Identifier passed to ``entrez.fetch()``, e.g. ``"6FRH_A"``.
        The default value is the ``str`` type itself (presumably a
        mistyped annotation); it is kept for signature compatibility
        but is rejected, so an identifier must always be given.

    Returns
    -------
    annotation
        The result of ``gb.get_annotation()`` restricted to ``SecStr``
        features.

    Raises
    ------
    TypeError
        If no identifier is given.
    """
    # The placeholder default is not a usable identifier: fail early
    # with a clear message instead of handing a class to the fetch call
    if pdb_chain is str:
        raise TypeError(
            "fetch_gb_annotation() requires a chain identifier, "
            "e.g. '6FRH_A'"
        )
    # Fetch the GenBank file of the chain and extract the annotation
    file_name = entrez.fetch(
        pdb_chain, biotite.temp_dir(), "gb", "protein", "gb"
    )
    gb_file = gb.GenBankFile()
    gb_file.read(file_name)
    annotation = gb.get_annotation(gb_file, include_only=["SecStr"])
    return annotation
def make_feature_maps(gene):
    """
    Render one feature map image per feature key of a GenBank entry.

    The entry is fetched from the ``nuccore`` database.
    For each distinct feature key an image
    ``<cwd>/app/static/images/<key>.png`` is written.
    On success ``session['valid_gene']`` is set to ``True``;
    if fetching or parsing fails, an error message is flashed instead.

    Parameters
    ----------
    gene : str
        Identifier passed to ``entrez.fetch()``.

    Returns
    -------
    None
    """
    try:
        find_id = entrez.fetch(
            gene, gettempdir(), suffix="gb",
            db_name="nuccore", ret_type="gb"
        )
        read_file = gb.GenBankFile.read(find_id)
        file_annotation = gb.get_annotation(read_file)
    except Exception:
        # 'Exception' instead of a bare 'except', so that
        # KeyboardInterrupt/SystemExit are not swallowed
        flash('The entered gene could not found. Please try again.', 'error')
        return None

    # Group the features by key in a single pass
    features_by_key = {}
    # Stays None if the entry has no 'source' feature
    # NOTE(review): None is expected to make plot_feature_map() fall
    # back to its default range - confirm against the Biotite docs
    loc_range = None
    for feature in file_annotation:
        features_by_key.setdefault(feature.key, []).append(feature)
        if feature.key == "source":
            # loc_range has exclusive stop
            loc = list(feature.locs)[0]
            loc_range = (loc.first, loc.last + 1)

    pwd = os.getcwd()
    # Sorted iteration keeps the order of the plots deterministic
    for key in sorted(features_by_key):
        fig, ax = plt.subplots(figsize=(8.0, 2.0))
        try:
            graphics.plot_feature_map(
                ax, seq.Annotation(features_by_key[key]),
                multi_line=False, loc_range=loc_range,
                show_line_position=True
            )
            plt.title('This plot is for {} features'.format(key))
            plt.savefig(
                pwd + '/app/static/images/{}.png'.format(key), dpi=300
            )
        finally:
            # Release the figure, otherwise figures accumulate
            # with every call
            plt.close(fig)
    session['valid_gene'] = True
    return None
# Array that will hold for each of the genes and each of the 4 domains
# the first and last position
# The array is initially filled with -1, as the value -1 will indicate
# that the domain does not exist in the sigma factor
domain_pos = np.full((len(genes), 4, 2), -1, dtype=int)
# Array that will hold the total sequence length of each sigma factor
seq_lengths = np.zeros(len(genes), dtype=int)
# Read the merged file containing multiple GenBank entries
multi_file = gb.MultiFile()
multi_file.read(file_name)
# Iterate over each GenBank entry
for i, gb_file in enumerate(multi_file):
    _, length, _, _, _, _ = gb.get_locus(gb_file)
    seq_lengths[i] = length
    annotation = gb.get_annotation(gb_file)
    # Find features, that represent a sigma factor domain
    for feature in annotation:
        if feature.key == "Region" and "note" in feature.qual \
           and "Sigma-70 factor domain" in feature.qual["note"]:
            # Extract the domain number
            # and decrement for 0-based indexing
            #
            # e.g. 'Sigma-70 factor domain-2.' => 1
            #                              ^
            # Raw string, so that '\d' reaches the regex engine
            # without being treated as a string escape sequence
            domain_index = int(
                re.findall(
                    r"(?<=Sigma-70 factor domain-)\d+",
                    feature.qual["note"]
                )[0]
            ) - 1
            # Expect a single contiguous location of the domain
            assert len(feature.locs) == 1
            loc = list(feature.locs)[0]
import numpy as np import biotite import biotite.sequence.io.genbank as gb import biotite.sequence.graphics as graphics import biotite.database.entrez as entrez PLASMID_URL = "https://media.addgene.org/snapgene-media/" \ "v1.6.2-0-g4b4ed87/sequences/67/17/246717/" \ "addgene-plasmid-26094-sequence-246717.gbk" response = requests.get(PLASMID_URL) gb_file = gb.GenBankFile.read(io.StringIO(response.text)) annotation = gb.get_annotation(gb_file, include_only=[ "promoter", "terminator", "protein_bind", "RBS", "CDS", "rep_origin", "primer_bind" ]) _, seq_length, _, _, _, _ = gb.get_locus(gb_file) # AddGene stores the plasmid name in the 'KEYWORDS' field # [0][0][0] -> # The first (and only) 'KEYWORDS' field # The first entry in the tuple # The first (and only) line in the field plasmid_name = gb_file.get_fields("KEYWORDS")[0][0][0] def custom_feature_formatter(feature): # AddGene stores the feature label in the '\label' qualifier label = feature.qual.get("label") if feature.key == "promoter":
import biotite.sequence.graphics as graphics
import biotite.sequence.io.genbank as gb
import biotite.database.entrez as entrez
import numpy as np
import matplotlib.pyplot as plt

# Download the GenBank record of the E. coli BL21 genome
file_name = entrez.fetch(
    "CP001509", biotite.temp_dir(),
    suffix="gb", db_name="nuccore", ret_type="gb"
)
gb_file = gb.GenBankFile()
gb_file.read(file_name)
_, seq_length, _, _, _, _ = gb.get_locus(gb_file)
annotation = gb.get_annotation(gb_file, include_only=["gene"])

# Determine the smallest first and the largest last position
# over all locations of the matching gene features
min_loc = seq_length
max_loc = 1
for feature in annotation:
    qualifiers = feature.qual
    # Skip features without gene name, pseudo-genes (e.g. gene
    # fragments) and every gene other than lacA
    if "gene" not in qualifiers \
       or "pseudo" in qualifiers \
       or qualifiers["gene"] != "lacA":
        continue
    for loc in feature.locs:
        if loc.first < min_loc:
            min_loc = loc.first
        if loc.last > max_loc:
            max_loc = loc.last
# Extend the location range by 1000 (arbitrary) in each direction
# green, respectively. N_COL = 4 MAX_NAME_LENGTH = 30 EXCERPT_SIZE = 3000 COLORS = { "CDS": biotite.colors["dimgreen"], "tRNA": biotite.colors["orange"], "rRNA": biotite.colors["orange"] } # Fetch features of the chloroplast genome gb_file = gb.GenBankFile.read( entrez.fetch("NC_000932", None, "gb", db_name="Nucleotide", ret_type="gb")) annotation = gb.get_annotation(gb_file, include_only=["CDS", "rRNA", "tRNA"]) def draw_arrow(ax, feature, loc): x = loc.first dx = loc.last - loc.first + 1 if loc.strand == seq.Location.Strand.FORWARD: x = loc.first dx = loc.last - loc.first + 1 else: x = loc.last dx = loc.first - loc.last + 1 # Create head with 90 degrees tip -> head width/length ratio = 1/2 ax.add_patch( biotite.AdaptiveFancyArrow(x,
# An annotation is the collection of features corresponding to one
# sequence (the sequence itself is not included, though).
# In case of *Biotite* we can get an :class:`Annotation` object from the
# :class:`GenBankFile`.
# This :class:`Annotation` can be iterated in order to obtain single
# :class:`Feature` objects.
# Each :class:`Feature` contains 3 pieces of information: its feature
# key (e.g. *regulatory* or *CDS*), a dictionary of qualifiers and one
# or multiple locations on the corresponding sequence.
# A :class:`Location`, in turn, contains its starting and its ending
# base/residue position, the strand it is on (only for DNA) and possible
# *location defects* (defects will be discussed later).
# In the next example we will print the keys of the features and their
# locations:

annotation = gb.get_annotation(file)
for feature in annotation:
    # Order the locations by their first position and convert them
    # into a more readable format
    ordered_locs = sorted(feature.locs, key=lambda loc: loc.first)
    locs = list(map(str, ordered_locs))
    print(f"{feature.key:12} {locs}")

########################################################################
# The ``'>'`` characters in the string representations of a location
# indicate that the location is on the forward strand.
# Most of the features have only one location, except the *mRNA* and
# *CDS* feature, which have 4 locations joined.
# When we look at the rest of the features, this makes sense: The gene
# has 4 exons.
# Therefore, the mRNA (and consequently the CDS) is composed of
# these exons.
#
feature_plotters=[HelixPlotter(), SheetPlotter()]) fig.tight_layout() ######################################################################## # Now let us do some serious application. # We want to visualize the secondary structure of one monomer of the # homodimeric transketolase (PDB: 1QGD). # The simplest way to do that, is to fetch the corresponding GenBank # file, extract an `Annotation` object from the file and draw the # annotation. # Fetch GenBank files of the TK's first chain and extract annotatation file_name = entrez.fetch("1QGD_A", biotite.temp_dir(), "gb", "protein", "gb") gb_file = gb.GenBankFile() gb_file.read(file_name) annotation = gb.get_annotation(gb_file, include_only=["SecStr"]) # Length of the sequence _, length, _, _, _, _ = gb.get_locus(gb_file) fig = plt.figure(figsize=(8.0, 3.0)) ax = fig.add_subplot(111) graphics.plot_feature_map( ax, annotation, symbols_per_line=150, show_numbers=True, show_line_position=True, # 'loc_range' takes exclusive stop -> length+1 is required loc_range=(1, length + 1), feature_plotters=[HelixPlotter(), SheetPlotter()]) fig.tight_layout()
figure = plt.figure(figsize=(8.0, 4.0))
ax = figure.add_subplot(111)
# Sequence positions shared by the hydropathy and the similarity curve
positions = np.arange(1 + ma_radius, len(hcn1) - ma_radius + 1)

# Plot hydropathy
ax.plot(positions, hydropathies, color=biotite.colors["dimorange"])
ax.axhline(0, color="gray", linewidth=0.5)
ax.set_xlim(1, len(hcn1) + 1)
ax.set_xlabel("HCN1 sequence position")
ax.set_ylabel("Hydropathy (15 residues moving average)")

# Draw boxes for annotated transmembrane helices for comparison
# with hydropathy plot
annotation = gb.get_annotation(gp_file, include_only=["Region"])
# '.get()' is used, so that a 'Region' feature without a 'region_name'
# qualifier is ignored instead of raising a KeyError
transmembrane_annotation = seq.Annotation([
    feature for feature in annotation
    if feature.qual.get("region_name") == "Transmembrane region"
])
for feature in transmembrane_annotation:
    first, last = feature.get_location_range()
    ax.axvspan(first, last, color=(0.0, 0.0, 0.0, 0.2), linewidth=0)

# Plot similarity score as measure for conservation
ax2 = ax.twinx()
ax2.plot(positions, scores, color=biotite.colors["brightorange"])
ax2.set_ylabel("Similarity score (15 residues moving average)")