def test_sequence_conversion():
    """
    Check the conversion between FASTA files and sequence objects.

    The test covers reading a single nucleotide sequence, a round trip
    of all sequences of a file through a new ``FastaFile``, writing a
    single sequence, reading a protein sequence (a ``UserWarning`` is
    expected there) and the rejection of an invalid sequence.
    """
    path = os.path.join(data_dir("sequence"), "nuc.fasta")
    file = fasta.FastaFile.read(path)
    assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(file)

    seq_dict = fasta.get_sequences(file)
    file2 = fasta.FastaFile()
    fasta.set_sequences(file2, seq_dict)
    seq_dict2 = fasta.get_sequences(file2)
    # zip() stops at the shorter input, hence a round trip that loses
    # entries would pass the loop below unnoticed
    assert len(seq_dict) == len(seq_dict2)
    # Cannot compare dicts directly, since the original RNA sequence is
    # now guessed as protein sequence
    for seq1, seq2 in zip(seq_dict.values(), seq_dict2.values()):
        assert str(seq1) == str(seq2)

    file3 = fasta.FastaFile()
    fasta.set_sequence(file3, seq.NucleotideSequence("AACCTTGG"))
    assert file3["sequence"] == "AACCTTGG"

    path = os.path.join(data_dir("sequence"), "prot.fasta")
    file4 = fasta.FastaFile.read(path)
    # Expect a warning for selenocysteine conversion
    with pytest.warns(UserWarning):
        assert seq.ProteinSequence("YAHCGFRTGS") == fasta.get_sequence(file4)

    path = os.path.join(data_dir("sequence"), "invalid.fasta")
    file5 = fasta.FastaFile.read(path)
    with pytest.raises(ValueError):
        seq.NucleotideSequence(fasta.get_sequence(file5))
def test_sequence_conversion():
    """
    Exercise reading and writing of sequences via ``FastaFile`` objects.

    A nucleotide file is read and compared to a reference, all of its
    sequences are copied into a fresh file and read back, a single
    sequence is written, a protein file is read and finally an invalid
    file is expected to raise a ``ValueError``.
    """
    # Single nucleotide sequence
    nuc_file = fasta.FastaFile()
    nuc_file.read(os.path.join(data_dir, "nuc.fasta"))
    assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(nuc_file)

    # Round trip of all sequences through a new file
    original_seqs = fasta.get_sequences(nuc_file)
    round_trip_file = fasta.FastaFile()
    fasta.set_sequences(round_trip_file, original_seqs)
    restored_seqs = fasta.get_sequences(round_trip_file)
    assert original_seqs == restored_seqs

    # Writing a single sequence
    single_seq_file = fasta.FastaFile()
    fasta.set_sequence(single_seq_file, seq.NucleotideSequence("AACCTTGG"))
    assert single_seq_file["sequence"] == "AACCTTGG"

    # Protein sequence
    prot_file = fasta.FastaFile()
    prot_file.read(os.path.join(data_dir, "prot.fasta"))
    assert seq.ProteinSequence("YAHGFRTGS") == fasta.get_sequence(prot_file)

    # Invalid content must be rejected
    invalid_file = fasta.FastaFile()
    invalid_file.read(os.path.join(data_dir, "invalid.fasta"))
    with pytest.raises(ValueError):
        seq.NucleotideSequence(fasta.get_sequence(invalid_file))
def test_fetch(as_file_like):
    """
    Fetch one FASTA entry per UniProt data set and check its length.

    If `as_file_like` is true, ``None`` is given as target path,
    otherwise the system's temporary directory is used.
    """
    if as_file_like:
        path = None
    else:
        path = tempfile.gettempdir()

    # (entry ID, expected sequence length)
    expected_lengths = (
        ("P12345", 430),           # UniProtKB
        ("UniRef90_P99999", 105),  # UniRef
        ("UPI000000001F", 551),    # UniParc
    )
    for entry_id, expected_length in expected_lengths:
        fetched = uniprot.fetch(entry_id, "fasta", path, overwrite=True)
        parsed = fasta.FastaFile.read(fetched)
        prot_seq = fasta.get_sequence(parsed)
        assert len(prot_seq) == expected_length
def test_fetch(common_name, as_file_like):
    """
    Fetch the entry ``1L2Y_A`` as FASTA file from Entrez and parse it.

    `common_name` selects the capitalized spelling of the database
    name, `as_file_like` selects ``None`` instead of the temporary
    directory as target path.
    The parsed sequence is not checked any further here.
    """
    if as_file_like:
        path = None
    else:
        path = biotite.temp_dir()
    if common_name:
        db_name = "Protein"
    else:
        db_name = "protein"

    fetched = entrez.fetch(
        "1L2Y_A", path, "fa", db_name, "fasta", overwrite=True
    )
    fasta_file = fasta.FastaFile()
    fasta_file.read(fetched)
    prot_seq = fasta.get_sequence(fasta_file)
def test_fetch():
    """
    Fetch the entry ``1L2Y_A`` as FASTA file from the Entrez
    ``protein`` database into the temporary directory and parse it.

    The parsed sequence is not checked any further here.
    """
    target_dir = biotite.temp_dir()
    fetched = entrez.fetch(
        "1L2Y_A", target_dir, "fa", "protein", "fasta", overwrite=True
    )
    fasta_file = fasta.FastaFile()
    fasta_file.read(fetched)
    prot_seq = fasta.get_sequence(fasta_file)
def test_search_sequence():
    """
    Check that a sequence search only returns sufficiently similar hits.

    The first sequence of the ``1l2y.cif`` test file is used as query
    with a minimum identity cutoff.
    Each hit is fetched as FASTA file and aligned to the query;
    the sequence identity of the alignment must reach the cutoff.
    """
    IDENTITY_CUTOFF = 0.9

    pdbx_file = pdbx.PDBxFile.read(join(data_dir("structure"), "1l2y.cif"))
    ref_sequence = pdbx.get_sequence(pdbx_file)[0]
    query = rcsb.SequenceQuery(
        ref_sequence, "protein", min_identity=IDENTITY_CUTOFF
    )
    test_ids = rcsb.search(query)
    # Without any hit the loop below would pass vacuously.
    # The query originates from a deposited structure,
    # so at least one hit is expected
    assert len(test_ids) > 0

    # The matrix does not depend on the hit -> create it only once
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    for pdb_id in test_ids:
        fasta_file = fasta.FastaFile.read(rcsb.fetch(pdb_id, "fasta"))
        test_sequence = fasta.get_sequence(fasta_file)
        alignment = align.align_optimal(
            ref_sequence, test_sequence, matrix, terminal_penalty=False
        )[0]
        identity = align.get_sequence_identity(alignment, mode="shortest")
        assert identity >= IDENTITY_CUTOFF
# Read the GenBank file and keep only the 'gene' features
# NOTE(review): 'file_name' is assigned before this excerpt
gb_file = gb.GenBankFile()
gb_file.read(file_name)
annot_seq = gb_file.get_annotated_sequence(include_only=["gene"])
# Find leuL gene
# If multiple features match, the last one is kept;
# if none matches, 'leul_feature' is not assigned by this loop
for feature in annot_seq.annotation:
    if "gene" in feature.qual and feature.qual["gene"] == "leuL":
        leul_feature = feature
# Get leuL sequence
leul_seq = annot_seq[leul_feature]

# Download Salmonella enterica genome without annotations
# ('file_name' is reused for the downloaded FASTA file)
file_name = entrez.fetch("CP019649", biotite.temp_dir(), "fa",
                         "nuccore", "fasta")
fasta_file = fasta.FastaFile()
fasta_file.read(file_name)
se_genome = fasta.get_sequence(fasta_file)
# Find leuL in genome by local alignment
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
# Use general gap penalty to save RAM
alignments = align.align_optimal(leul_seq, se_genome, matrix,
                                 gap_penalty=-7, local=True)
# Do the same for reverse complement genome
se_genome_rev = se_genome.reverse().complement()
rev_alignments = align.align_optimal(leul_seq, se_genome_rev, matrix,
                                     gap_penalty=-7, local=True)
# Code source: Patrick Kunzmann # License: BSD 3 clause import biotite import biotite.sequence as seq import biotite.sequence.io.fasta as fasta import biotite.sequence.graphics as graphics import biotite.application.muscle as muscle import biotite.application.blast as blast import biotite.database.entrez as entrez import matplotlib.pyplot as plt # Download sequence of Streptococcus pyogenes Cas9 file_name = entrez.fetch("Q99ZW2", biotite.temp_dir(), "fa", "protein", "fasta") file = fasta.FastaFile.read(file_name) ref_seq = fasta.get_sequence(file) # Find homologous proteins using NCBI Blast # Search only the UniProt/SwissProt database blast_app = blast.BlastWebApp("blastp", ref_seq, "swissprot", obey_rules=False) blast_app.start() blast_app.join() alignments = blast_app.get_alignments() # Get hit IDs for hits with score > 200 hits = [] for ali in alignments: if ali.score > 200: hits.append(ali.hit_id) # Get the sequences from hit IDs hit_seqs = [] for hit in hits: file_name = entrez.fetch(hit, biotite.temp_dir(), "fa", "protein", "fasta")
# Print each entry of the FASTA file: its header, the first 50
# characters of the sequence string and the length of the string
# NOTE(review): 'file' is created before this excerpt
for header, string in file.items():
    print("Header:", header)
    print(len(string))
    print("Sequence:", string[:50], "...")
    print("Sequence length:", len(string))

########################################################################
# Since there is only a single sequence in the file, the loop is run
# only one time.
# As the sequence string is very long, only the first 50 bp are printed.
# Now this string could be used as input parameter for creation of a
# :class:`NucleotideSequence`.
# But we want to spare ourselves some unnecessary work, there is already
# a convenience function for that:

dna_seq = fasta.get_sequence(file)
print(type(dna_seq).__name__)
print(dna_seq[:50])

########################################################################
# In this form :func:`get_sequence()` returns the first sequence in the
# file, which is also the only sequence in most cases.
# If you want the sequence corresponding to a specific header, you have
# to specify the :obj:`header` parameter.
# The function even automatically recognizes whether the file contains a
# DNA or protein sequence and returns a :class:`NucleotideSequence` or
# :class:`ProteinSequence` instance, respectively.
# Actually, it just tries to create a :class:`NucleotideSequence`,
# and if this fails, a :class:`ProteinSequence` is created instead.
#
# Sequences can be written into FASTA files in a similar way: either via
# Map each amino acid code to its most frequent codon code
# NOTE(review): 'table' and 'codon_counter' are created before this
# excerpt; presumably the codon table and the codon usage counts
opt_codons = {}
for amino_acid_code in range(20):
    codon_codes_for_aa = table[amino_acid_code]
    # Find codon with maximum frequency
    # Due to the strict '>' the first codon wins in case of a tie
    # and 'best_codon_code' stays 'None',
    # if no codon has a count above zero
    max_freq = 0
    best_codon_code = None
    for codon_code in codon_codes_for_aa:
        if codon_counter[codon_code] > max_freq:
            max_freq = codon_counter[codon_code]
            best_codon_code = codon_code
    # Map the amino acid to the codon with maximum frequency
    opt_codons[amino_acid_code] = best_codon_code

# Fetch the streptavidin protein sequence from Streptomyces avidinii
fasta_file = fasta.FastaFile.read(
    entrez.fetch("P22629", biotite.temp_dir(), "fasta", "protein", "fasta"))
strep_prot_seq = fasta.get_sequence(fasta_file)
# Create a DNA sequence from the protein sequence
# using the optimal codons
# NOTE(review): this requires 'opt_codons' to contain an entry
# for every symbol code in the protein sequence - confirm
strep_dna_seq = seq.NucleotideSequence()
strep_dna_seq.code = np.concatenate(
    [opt_codons[amino_acid_code] for amino_acid_code in strep_prot_seq.code])
# Add stop codon
strep_dna_seq += seq.NucleotideSequence("TAA")
# Put the DNA sequence into a FASTA file
fasta_file = fasta.FastaFile()
fasta_file["Codon optimized streptavidin"] = str(strep_dna_seq)
# Print the contents of the created FASTA file
print(fasta_file)
# In a real application it would be written onto the hard drive via
# fasta_file.write("some_file.fasta")