def test_fetch(common_name, as_file_like):
    """
    Fetch chain A of PDB entry 1L2Y from Entrez and parse it as FASTA.

    Parameters
    ----------
    common_name : bool
        If true, the database is addressed as ``"Protein"``, otherwise
        as ``"protein"``.
    as_file_like : bool
        If true, ``None`` is passed as target path instead of the
        temporary directory (presumably making ``fetch()`` return a
        file-like object, as the parameter name suggests).
    """
    path = None if as_file_like else biotite.temp_dir()
    db_name = "Protein" if common_name else "protein"
    file = entrez.fetch(
        "1L2Y_A", path, "fa", db_name, "fasta", overwrite=True
    )
    fasta_file = fasta.FastaFile()
    fasta_file.read(file)
    prot_seq = fasta.get_sequence(fasta_file)
    # Without a check on the parsed result the test would also pass
    # for an empty or wrong download; 1L2Y chain A has 20 residues
    assert len(prot_seq) == 20
def test_fetch_invalid():
    """Fetching the nonexistent UID ``"xxxx"`` must raise a ``ValueError``."""
    with pytest.raises(ValueError):
        # The return value is irrelevant, only the raised exception counts
        entrez.fetch(
            "xxxx",
            biotite.temp_dir(),
            "fa",
            "protein",
            "fasta",
            overwrite=True,
        )
def test_fetch():
    """
    Fetch chain A of PDB entry 1L2Y from the Entrez ``protein``
    database into the temporary directory and parse it as FASTA.
    """
    file = entrez.fetch(
        "1L2Y_A", biotite.temp_dir(), "fa", "protein", "fasta",
        overwrite=True
    )
    fasta_file = fasta.FastaFile()
    fasta_file.read(file)
    prot_seq = fasta.get_sequence(fasta_file)
    # Without a check on the parsed result the test would also pass
    # for an empty or wrong download; 1L2Y chain A has 20 residues
    assert len(prot_seq) == 20
def fetch_gb_annotation(pdb_chain: str):
    """
    Fetch the secondary structure annotation of a protein chain.

    The GenBank record of the chain is downloaded from the Entrez
    ``protein`` database into the temporary directory and its
    ``SecStr`` features are extracted.

    Parameters
    ----------
    pdb_chain : str
        UID of the chain, e.g. ``"6FRH_A"``.

    Returns
    -------
    annotation
        The result of :func:`gb.get_annotation()`, restricted to
        ``SecStr`` features.
    """
    # Fetch the GenBank file of the chain and extract the annotation
    file_name = entrez.fetch(
        pdb_chain, biotite.temp_dir(), "gb", "protein", "gb"
    )
    gb_file = gb.GenBankFile()
    gb_file.read(file_name)
    return gb.get_annotation(gb_file, include_only=["SecStr"])
def make_feature_maps(gene):
    """
    Save one feature map image per feature key of a GenBank record.

    The GenBank record of `gene` is fetched from the ``nuccore``
    database into the system's temporary directory.
    For every distinct feature key in its annotation a feature map of
    all features having this key is written to
    ``<cwd>/app/static/images/<key>.png``.
    Afterwards ``session['valid_gene']`` is set to ``True``.
    If the record cannot be fetched or parsed, an error message is
    flashed instead and nothing is plotted.

    Parameters
    ----------
    gene : str
        The UID passed on to :func:`entrez.fetch()`.

    Returns
    -------
    None
    """
    try:
        gb_path = entrez.fetch(
            gene, gettempdir(),
            suffix="gb", db_name="nuccore", ret_type="gb"
        )
        gb_file = gb.GenBankFile.read(gb_path)
        file_annotation = gb.get_annotation(gb_file)
    except Exception:
        # Deliberately best-effort: any download/parsing failure is
        # reported to the user.
        # In contrast to a bare 'except', KeyboardInterrupt and
        # SystemExit are not swallowed
        flash('The entered gene could not found. Please try again.', 'error')
        return None

    # Fall back to 'None' (range is derived from the plotted annotation)
    # if the record has no 'source' feature,
    # instead of failing with a NameError later on
    loc_range = None
    feature_keys = set()
    for feature in file_annotation:
        feature_keys.add(feature.key)
        if feature.key == "source":
            # loc_range has exclusive stop
            loc = next(iter(feature.locs))
            loc_range = (loc.first, loc.last + 1)

    pwd = os.getcwd()
    # Sorted for a deterministic plotting order
    for key in sorted(feature_keys):
        fig, ax = plt.subplots(figsize=(8.0, 2.0))
        graphics.plot_feature_map(
            ax,
            seq.Annotation([
                feature for feature in file_annotation
                if feature.key == key
            ]),
            multi_line=False,
            loc_range=loc_range,
            show_line_position=True
        )
        ax.set_title('This plot is for {} features'.format(key))
        fig.savefig(pwd + '/app/static/images/{}.png'.format(key), dpi=300)
        # Release the figure, otherwise every call accumulates
        # open figures in pyplot's global state
        plt.close(fig)

    session['valid_gene'] = True
    return None
# Code source: Patrick Kunzmann
# License: BSD 3 clause

import biotite
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.io.genbank as gb
import biotite.sequence.graphics as graphics
import biotite.sequence.align as align
import biotite.database.entrez as entrez
import numpy as np
import matplotlib.pyplot as plt


# Download and read E. coli BL21 genome
gb_file = gb.GenBankFile.read(
    entrez.fetch(
        "CP001509", None, "gb", "nuccore", "gb"
    )
)
annot_seq = gb.get_annotated_sequence(
    gb_file, include_only=["gene"]
)
# Find leuL gene:
# The last feature whose 'gene' qualifier equals 'leuL' is kept
for feature in annot_seq.annotation:
    if feature.qual.get("gene") == "leuL":
        leul_feature = feature
# Get leuL sequence by indexing the annotated sequence with the feature
leul_seq = annot_seq[leul_feature]

# Download and read Salmonella enterica genome without annotations
fasta_file = fasta.FastaFile.read(
    entrez.fetch(
        "CP019649", None, "fa", "nuccore", "fasta"
    )
)
se_genome = fasta.get_sequence(fasta_file)
# Find leuL in genome by local alignment
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
# Use general gap penalty to save RAM
multi_line=False, loc_range=(1, 100), # Register our drawing functions feature_plotters=[HelixPlotter(), SheetPlotter()]) fig.tight_layout() ######################################################################## # Now let us do some serious application. # We want to visualize the secondary structure of one monomer of the # homodimeric transketolase (PDB: 1QGD). # The simplest way to do that, is to fetch the corresponding GenBank # file, extract an `Annotation` object from the file and draw the # annotation. # Fetch GenBank files of the TK's first chain and extract annotatation file_name = entrez.fetch("1QGD_A", biotite.temp_dir(), "gb", "protein", "gb") gb_file = gb.GenBankFile() gb_file.read(file_name) annotation = gb.get_annotation(gb_file, include_only=["SecStr"]) # Length of the sequence _, length, _, _, _, _ = gb.get_locus(gb_file) fig = plt.figure(figsize=(8.0, 3.0)) ax = fig.add_subplot(111) graphics.plot_feature_map( ax, annotation, symbols_per_line=150, show_numbers=True, show_line_position=True, # 'loc_range' takes exclusive stop -> length+1 is required
# Code source: Patrick Kunzmann # License: BSD 3 clause import biotite import biotite.sequence as seq import biotite.sequence.graphics as graphics import biotite.sequence.io.genbank as gb import biotite.database.entrez as entrez import numpy as np import matplotlib.pyplot as plt # Download E. coli BL21 genome file_name = entrez.fetch("CP001509", biotite.temp_dir(), suffix="gb", db_name="nuccore", ret_type="gb") gb_file = gb.GenBankFile() gb_file.read(file_name) _, seq_length, _, _, _, _ = gb.get_locus(gb_file) annotation = gb.get_annotation(gb_file, include_only=["gene"]) # Find the minimum and maximum locations of lac genes min_loc = seq_length max_loc = 1 for feature in annotation: for loc in feature.locs: # Ignore if feature is only a pseudo-gene (e.g. gene fragment) # and check if feature is lacA gene (begin of lac operon) if "gene" in feature.qual \ and "pseudo" not in feature.qual \
""" # Code source: Patrick Kunzmann # License: BSD 3 clause import biotite import biotite.sequence as seq import biotite.sequence.io.fasta as fasta import biotite.sequence.graphics as graphics import biotite.application.muscle as muscle import biotite.application.blast as blast import biotite.database.entrez as entrez import matplotlib.pyplot as plt # Download sequence of Streptococcus pyogenes Cas9 file_name = entrez.fetch("Q99ZW2", biotite.temp_dir(), "fa", "protein", "fasta") file = fasta.FastaFile.read(file_name) ref_seq = fasta.get_sequence(file) # Find homologous proteins using NCBI Blast # Search only the UniProt/SwissProt database blast_app = blast.BlastWebApp("blastp", ref_seq, "swissprot", obey_rules=False) blast_app.start() blast_app.join() alignments = blast_app.get_alignments() # Get hit IDs for hits with score > 200 hits = [] for ali in alignments: if ali.score > 200: hits.append(ali.hit_id) # Get the sequences from hit IDs hit_seqs = []
# the respective sequence strings. # Actually you can cast the :class:`FastaFile` object into a # :class:`dict`. # Let's demonstrate this on the genome of the *lambda* phage # (Accession: ``NC_001416```). # After downloading the FASTA file from the NCBI Entrez database, # we can load the contents in the following way: import biotite import biotite.sequence as seq import biotite.sequence.io.fasta as fasta import biotite.database.entrez as entrez file_path = entrez.fetch("NC_001416", biotite.temp_dir(), suffix="fa", db_name="nuccore", ret_type="fasta") file = fasta.FastaFile() file.read(file_path) for header, string in file.items(): print("Header:", header) print(len(string)) print("Sequence:", string[:50], "...") print("Sequence length:", len(string)) ######################################################################## # Since there is only a single sequence in the file, the loop is run # only one time. # As the sequence string is very long, only the first 50 bp are printed. # Now this string could be used as input parameter for creation of a
#
# Read mapping
# ------------
#
# In the next step we map each read to its respective position
# in the reference genome.
# An additional challenge is to find the correct sense of the read:
# In the library preparation both sense and complementary DNA are
# produced from the virus RNA.
# For this reason we need to create a complementary copy for each read
# and map both strands to the reference genome.
# Later the *wrong* strand is discarded.

# Download and read the reference SARS-CoV-2 genome
orig_genome_file = entrez.fetch(
    "NC_045512",
    tempfile.gettempdir(),
    "gb",
    db_name="Nucleotide",
    ret_type="gb",
)
orig_genome = seqio.load_sequence(orig_genome_file)

# Create complementary reads:
# Each read is directly followed by its reverse complement
compl_reads = list(
    itertools.chain.from_iterable(
        (read, read.reverse(False).complement()) for read in reads
    )
)

########################################################################
# To map the reads to their corresponding positions in the reference
# genome, we need to align them to it.
# Although we could use :func:`align_optimal()`
# (Needleman-Wunsch algorithm [4]_) for this purpose, aligning this
# large number of reads to even a small virus genome would take hours.
#
def test_fetch_invalid():
    """Fetching the nonexistent UID ``"xxxx"`` must raise a ``RequestError``."""
    with pytest.raises(RequestError):
        # The return value is irrelevant, only the raised exception counts
        entrez.fetch(
            "xxxx",
            tempfile.gettempdir(),
            "fa",
            "protein",
            "fasta",
            overwrite=True,
        )
""" # Code source: Patrick Kunzmann # License: BSD 3 clause import tempfile import itertools import numpy as np import biotite.sequence as seq import biotite.sequence.io.genbank as gb import biotite.sequence.io.fasta as fasta import biotite.database.entrez as entrez # Get the E. coli K-12 genome as annotated sequence gb_file = gb.GenBankFile.read( entrez.fetch("U00096", tempfile.gettempdir(), "gb", "nuccore", "gb")) # We are only interested in CDS features k12_genome = gb.get_annotated_sequence(gb_file, include_only=["CDS"]) # This dictionary will count how often each codon occurs in the genome # For increased performance the dictionary uses symbol codes ([0 3 2]) # instead of symbols (['A' 'T' 'G']) as keys codon_counter = { codon: 0 for codon in itertools.product( *([range(len(k12_genome.sequence.alphabet))] * 3)) } # For demonstration purposes print the 64 codons in symbol code form print(list(codon_counter.keys())) ########################################################################
import matplotlib.pyplot as plt import biotite import biotite.sequence as seq import biotite.sequence.align as align import biotite.sequence.io.fasta as fasta import biotite.database.entrez as entrez import biotite.sequence.graphics as graphics # Download and parse protein sequences of Covid and Mers covid_file_path = entrez.fetch("NC_045512", "myresult_dir", suffix="fa", db_name="nuccore", ret_type="fasta") mers_file_path = entrez.fetch("NC_019843.3", "myresult_dir", suffix="fa", db_name="nuccore", ret_type="fasta") # Read the file c_file = fasta.FastaFile() c_file.read(covid_file_path) m_file = fasta.FastaFile() m_file.read(mers_file_path) # Display for h, s in c_file.items(): print(h) print(s) covid_seq = seq.NucleotideSequence(s) for h, s in m_file.items(): print(h)
sequence alignment of the hit sequences afterwards, using MUSCLE. """ # Code source: Patrick Kunzmann # License: BSD 3 cl from tempfile import gettempdir import biotite.sequence as seq import biotite.sequence.io.fasta as fasta import biotite.sequence.graphics as graphics import biotite.application.muscle as muscle import biotite.application.blast as blast import biotite.database.entrez as entrez import matplotlib.pyplot as plt # Download sequence of Streptococcus pyogenes Cas9 file_name = entrez.fetch("Q99ZW2", gettempdir(), "fa", "protein", "fasta") fasta_file = fasta.FastaFile.read(file_name) ref_seq = fasta.get_sequence(fasta_file) # Find homologous proteins using NCBI Blast # Search only the UniProt/SwissProt database blast_app = blast.BlastWebApp("blastp", ref_seq, "swissprot", obey_rules=False) blast_app.start() blast_app.join() alignments = blast_app.get_alignments() # Get hit IDs for hits with score > 200 hits = [] for ali in alignments: if ali.score > 200: hits.append(ali.hit_id) # Get the sequences from hit IDs hit_seqs = []
""" # Code source: Patrick Kunzmann # License: BSD 3 clause import itertools import numpy as np import biotite import biotite.sequence as seq import biotite.sequence.io.genbank as gb import biotite.sequence.io.fasta as fasta import biotite.database.entrez as entrez # Get the E. coli K-12 genome as annotated sequence gb_file = gb.GenBankFile.read( entrez.fetch("U00096", biotite.temp_dir(), "gb", "nuccore", "gb")) # We are only interested in CDS features k12_genome = gb.get_annotated_sequence(gb_file, include_only=["CDS"]) # This dictionary will count how often each codon occurs in the genome # For increased performance the dictionary uses symbol codes ([0 3 2]) # instead of symbols (['A' 'T' 'G']) as keys codon_counter = { codon: 0 for codon in itertools.product( *([range(len(k12_genome.sequence.alphabet))] * 3)) } # For demonstration purposes print the 64 codons in symbol code form print(list(codon_counter.keys())) ########################################################################
This script creates a feature map for the region around the *lac* operon in the E. coli BL21 genome. """ # Code source: Patrick Kunzmann # License: BSD 3 clause import biotite.sequence as seq import biotite.sequence.graphics as graphics import biotite.sequence.io.genbank as gb import biotite.database.entrez as entrez import numpy as np import matplotlib.pyplot as plt # Download E. coli BL21 genome file = entrez.fetch("CP001509", None, suffix="gb", db_name="nuccore", ret_type="gb") gb_file = gb.GenBankFile.read(file) _, seq_length, _, _, _, _ = gb.get_locus(gb_file) annotation = gb.get_annotation(gb_file, include_only=["gene"]) # Find the minimum and maximum locations of lac genes min_loc = seq_length max_loc = 1 for feature in annotation: for loc in feature.locs: # Ignore if feature is only a pseudo-gene (e.g. gene fragment) # and check if feature is lacA gene (begin of lac operon) if "gene" in feature.qual \ and "pseudo" not in feature.qual \ and feature.qual["gene"] == "lacA": if min_loc > loc.first: min_loc = loc.first
Since we want to perform a six-frame translation we have to look at the complementary strand of the genome as well. """ # Code source: Patrick Kunzmann # License: BSD 3 clause import biotite import biotite.sequence as seq import biotite.sequence.io.fasta as fasta import biotite.database.entrez as entrez import matplotlib.pyplot as plt # Download Porcine circovirus genome file_name = entrez.fetch("KP282147", biotite.temp_dir(), "fa", "nuccore", "fasta") fasta_file = fasta.FastaFile() fasta_file.read(file_name) genome = fasta.get_sequence(fasta_file) # Perform translation for forward strand proteins, positions = genome.translate() print("Forward strand:") for i in range(len(proteins)): print("{:4d} - {:4d}: {:}".format(positions[i][0], positions[i][1], str(proteins[i]))) print("\n") # Perform translation for complementary strand genome_rev = genome.reverse().complement() proteins, positions = genome_rev.translate() print("Reverse strand:") for i in range(len(proteins)):
is described as 3 integers instead of 3 letters. """ # Code source: Patrick Kunzmann # License: BSD 3 clause import itertools import numpy as np import biotite.sequence as seq import biotite.sequence.io.genbank as gb import biotite.sequence.io.fasta as fasta import biotite.database.entrez as entrez # Get the E. coli K-12 genome as annotated sequence gb_file = gb.GenBankFile.read( entrez.fetch("U00096", None, "gb", "nuccore", "gb")) # We are only interested in CDS features k12_genome = gb.get_annotated_sequence(gb_file, include_only=["CDS"]) # This dictionary will count how often each codon occurs in the genome # For increased performance the dictionary uses symbol codes ([0 3 2]) # instead of symbols (['A' 'T' 'G']) as keys codon_counter = { codon: 0 for codon in itertools.product( *([range(len(k12_genome.sequence.alphabet))] * 3)) } # For demonstration purposes print the 64 codons in symbol code form print(list(codon_counter.keys())) ########################################################################
import tempfile
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from matplotlib.ticker import MultipleLocator
import biotite
import biotite.sequence as seq
import biotite.sequence.io as seqio
import biotite.sequence.io.genbank as gb
import biotite.sequence.align as align
import biotite.database.entrez as entrez
import biotite.application.tantan as tantan


# Download the chloroplast genome as FASTA file into the system's
# temporary directory and load its sequence
fasta_file = entrez.fetch("NC_000932", tempfile.gettempdir(), "fasta", db_name="Nucleotide", ret_type="fasta")
chloroplast_seq = seqio.load_sequence(fasta_file)

# Same procedure for the bacterial genome;
# note that 'fasta_file' is rebound to the second download
fasta_file = entrez.fetch("NC_000911", tempfile.gettempdir(), "fasta", db_name="Nucleotide", ret_type="fasta")
bacterium_seq = seqio.load_sequence(fasta_file)

########################################################################
# For the *k-mer* matching step the genome of the cyanobacterium is
# indexed into a :class:`KmerTable`.
# As homologous regions between both genomes may also appear on the
"E": -3.5, "Q": -3.5, "D": -3.5, "N": -3.5, "K": -3.9, "R": -4.5 } # Look for the Swiss-Prot entry contaning the human HCN1 channel query = entrez.SimpleQuery("HCN1", "Gene Name") \ & entrez.SimpleQuery("h**o sapiens", "Organism") \ & entrez.SimpleQuery("srcdb_swiss-prot", "Properties") uids = entrez.search(query, db_name="protein") file_name = entrez.fetch(uids[0], biotite.temp_dir(), "gp", db_name="protein", ret_type="gp") gp_file = gb.GenBankFile.read(file_name) hcn1 = seq.ProteinSequence(gb.get_sequence(gp_file, format="gp")) print(hcn1) ######################################################################## # The positional hydropathy is calculated and smoothened using # a moving average for clearer visualization. hydropathies = np.array([hydropathy_dict[symbol] for symbol in hcn1]) def moving_average(data_set, window_size):
# *NCBI Entrez* database, which is commonly known as *the NCBI*. # It provides a myriad of information, ranging from sequences and # sequence features to scientific articles. # Fetching files from NCBI Entrez works analogous to the RCSB interface. # This time we have to provide the UIDs (Accession or GI) instead of # PDB IDs to the :func:`fetch()` function. # Furthermore, we need to specifiy the database to retrieve the data # from and the retrieval type. from tempfile import gettempdir, NamedTemporaryFile import biotite.database.entrez as entrez # Fetch a single UID ... file_path = entrez.fetch("NC_001416", gettempdir(), suffix="fa", db_name="nuccore", ret_type="fasta") print(file_path) # ... or multiple UIDs file_paths = entrez.fetch(["1L2Y_A", "1AKI_A"], gettempdir(), suffix="fa", db_name="protein", ret_type="fasta") print([file_path for file_path in file_paths]) ######################################################################## # A list of valid database, retrieval type and mode combinations can # be found # `here <https://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/?report=objectonly>`_.
since domestic pigs are the host of the virus. Since we want to perform a six-frame translation we have to look at the complementary strand of the genome as well. """ # Code source: Patrick Kunzmann # License: BSD 3 clause import biotite.sequence as seq import biotite.sequence.io.fasta as fasta import biotite.database.entrez as entrez import matplotlib.pyplot as plt # Download Porcine circovirus genome file = entrez.fetch("KP282147", None, "fa", "nuccore", "fasta") fasta_file = fasta.FastaFile.read(file) genome = fasta.get_sequence(fasta_file) # Perform translation for forward strand proteins, positions = genome.translate() print("Forward strand:") for i in range(len(proteins)): print("{:4d} - {:4d}: {:}" .format(positions[i][0], positions[i][1], str(proteins[i]))) print("\n") # Perform translation for complementary strand genome_rev = genome.reverse().complement() proteins, positions = genome_rev.translate() print("Reverse strand:") for i in range(len(proteins)): print("{:5d} - {:5d}: {:}"
# Code source: Patrick Kunzmann
# License: BSD 3 clause

import biotite
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.io.genbank as gb
import biotite.sequence.graphics as graphics
import biotite.sequence.align as align
import biotite.database.entrez as entrez
import numpy as np
import matplotlib.pyplot as plt


# Download E. coli BL21 genome
file_name = entrez.fetch(
    "CP001509", biotite.temp_dir(), "gb", "nuccore", "gb"
)
gb_file = gb.GenBankFile()
gb_file.read(file_name)
annot_seq = gb_file.get_annotated_sequence(
    include_only=["gene"]
)
# Find leuL gene:
# The last feature whose 'gene' qualifier equals 'leuL' is kept
for feature in annot_seq.annotation:
    if feature.qual.get("gene") == "leuL":
        leul_feature = feature
# Get leuL sequence by indexing the annotated sequence with the feature
leul_seq = annot_seq[leul_feature]

# Download Salmonella enterica genome without annotations
# ('file_name' is rebound to the second download)
file_name = entrez.fetch(
    "CP019649", biotite.temp_dir(), "fa", "nuccore", "fasta"
)
fasta_file = fasta.FastaFile()
fasta_file.read(file_name)
import matplotlib.pyplot as plt from matplotlib.patches import Patch import matplotlib.ticker as ticker import biotite import biotite.sequence as seq import biotite.sequence.io.genbank as gb import biotite.sequence.graphics as graphics import biotite.database.entrez as entrez import biotite.application.muscle as muscle UTR_LENGTH = 20 ### Get the E. coli K-12 genome as annotated sequence gb_file = gb.GenBankFile.read( entrez.fetch("U00096", tempfile.gettempdir(), "gb", "nuccore", "gb")) # We are only interested in CDS features bl21_genome = gb.get_annotated_sequence(gb_file, include_only=["CDS"]) ### Extract sequences for 5' untranslated regions (UTRs) # In this case we define the untranslated region, as the sequence # up to UTR_LENGTH bases upstream from the start codon utrs = [] for cds in bl21_genome.annotation: # Expect a single location for the feature, # since no splicing can occur # Ignore special cases like ribosomal slippage sites, etc. # for simplicity if len(cds.locs) != 1: continue