def sequences():
    """
    Return the Cas9 protein sequences (10, per the original note)
    stored in ``cas9.fasta`` inside the test data directory.

    Returns
    -------
    list of ProteinSequence
        One sequence object per entry of the FASTA file, in file order.
    """
    cas9_path = join(data_dir, "cas9.fasta")
    cas9_file = fasta.FastaFile()
    cas9_file.read(cas9_path)
    proteins = []
    for seq_string in cas9_file.values():
        proteins.append(seq.ProteinSequence(seq_string))
    return proteins
def test_fetch(common_name, as_file_like):
    """
    Fetch the protein entry ``1L2Y_A`` from Entrez in FASTA format and
    check that it can be read and converted into a sequence.

    Parameters
    ----------
    common_name : bool
        If true, the database is addressed as ``"Protein"`` instead of
        ``"protein"``.
    as_file_like : bool
        If true, ``None`` is given as target directory
        (presumably this makes the fetch return a file-like object).
    """
    if as_file_like:
        target_dir = None
    else:
        target_dir = biotite.temp_dir()
    database = "Protein" if common_name else "protein"
    fetched = entrez.fetch(
        "1L2Y_A", target_dir, "fa", database, "fasta", overwrite=True
    )
    parsed_file = fasta.FastaFile()
    parsed_file.read(fetched)
    # The conversion itself is the check: it must not raise
    prot_seq = fasta.get_sequence(parsed_file)
def test_write_iter(chars_per_line, n_sequences):
    """
    Test whether :meth:`FastaFile.write()` and
    :meth:`FastaFile.write_iter()` produce the same output file for
    random sequences.

    Parameters
    ----------
    chars_per_line : int
        Line length passed to both writing routines.
    n_sequences : int
        Number of random sequences to generate.
    """
    LENGTH_RANGE = (50, 150)

    # Generate random nucleotide sequences;
    # the fixed seed keeps the test reproducible
    np.random.seed(0)
    sequences = []
    for _ in range(n_sequences):
        seq_length = np.random.randint(*LENGTH_RANGE)
        code = np.random.randint(
            len(seq.NucleotideSequence.alphabet_unamb), size=seq_length
        )
        sequence = seq.NucleotideSequence()
        sequence.code = code
        sequences.append(sequence)

    # Reference output: fill a FastaFile object and write it as a whole
    fasta_file = fasta.FastaFile(chars_per_line)
    for i, sequence in enumerate(sequences):
        fasta_file[f"seq_{i}"] = str(sequence)
    ref_file = io.StringIO()
    fasta_file.write(ref_file)

    # Test output: stream the same (header, sequence) pairs
    # from a generator, without an intermediate FastaFile object
    test_file = io.StringIO()
    fasta.FastaFile.write_iter(
        test_file,
        (
            (f"seq_{i}", str(sequence))
            for i, sequence in enumerate(sequences)
        ),
        chars_per_line
    )

    assert test_file.getvalue() == ref_file.getvalue()
def test_rna_conversion():
    """
    Check that :func:`set_sequence()` writes ``T`` as ``U`` when
    ``as_rna`` is set and leaves the sequence untouched otherwise.
    """
    dna = seq.NucleotideSequence("ACGT")
    written = fasta.FastaFile()
    for header, rna_flag in (("seq1", False), ("seq2", True)):
        fasta.set_sequence(written, dna, header, as_rna=rna_flag)
    assert written["seq1"] == "ACGT"
    assert written["seq2"] == "ACGU"
def test_fetch_single_file():
    """
    Fetch two protein entries into one FASTA file and check that
    both sequences are found when the file is parsed.
    """
    accessions = ["1L2Y_A", "3O5R_A"]
    target = biotite.temp_file("fa")
    fetched = entrez.fetch_single_file(
        accessions, target, "protein", "fasta"
    )
    parsed_file = fasta.FastaFile()
    parsed_file.read(fetched)
    assert len(fasta.get_sequences(parsed_file)) == 2
def test_fetch_single_file(as_file_like):
    """
    Fetch two protein entries into one FASTA file and check that
    both sequences are found when the file is parsed.

    Parameters
    ----------
    as_file_like : bool
        If true, ``None`` is given as target file name
        (presumably this makes the fetch return a file-like object).
    """
    if as_file_like:
        target = None
    else:
        target = biotite.temp_file("fa")
    accessions = ["1L2Y_A", "3O5R_A"]
    fetched = entrez.fetch_single_file(
        accessions, target, "protein", "fasta"
    )
    parsed_file = fasta.FastaFile()
    parsed_file.read(fetched)
    assert len(fasta.get_sequences(parsed_file)) == 2
def show_example(ax, colors):
    """
    Plot an excerpt of the example alignment with a given color scheme.

    Parameters
    ----------
    ax : Axes
        The axes the alignment is drawn onto.
    colors
        The color scheme, forwarded as ``color_scheme`` to
        :func:`plot_alignment_type_based()`.
    """
    example_file = fasta.FastaFile()
    example_file.read(EXAMPLE_FILE_NAME)
    # Only the slice ``[:60]`` of the alignment is shown
    excerpt = fasta.get_alignment(example_file)[:60]
    # 'symbols_per_line' equals the excerpt length,
    # presumably to keep the entire excerpt on a single line
    graphics.plot_alignment_type_based(
        ax,
        excerpt,
        spacing=2.0,
        symbols_per_line=len(excerpt),
        color_scheme=colors,
    )
def test_fetch():
    """
    Fetch the protein entry ``1L2Y_A`` from Entrez in FASTA format into
    the temporary directory and check that it can be read and
    converted into a sequence.
    """
    target_dir = biotite.temp_dir()
    fetched = entrez.fetch(
        "1L2Y_A", target_dir, "fa", "protein", "fasta", overwrite=True
    )
    parsed_file = fasta.FastaFile()
    parsed_file.read(fetched)
    # The conversion itself is the check: it must not raise
    prot_seq = fasta.get_sequence(parsed_file)
def plot_pb_scheme_alignment():
    """
    Create a figure showing the example alignment of sequences over the
    16-letter alphabet ``a``-``p``, colored with a freshly generated
    color scheme.

    A substitution matrix is written into a temporary file and handed
    to ``gecli.main()`` together with the alphabet; the scheme file it
    writes is loaded and used for plotting.

    Returns
    -------
    fig : Figure
        The figure containing the alignment plot.
    """
    # Fixed seed, presumably to make the scheme generation reproducible
    random.seed(1)
    scheme_file = biotite.temp_file("json")
    mat_file = biotite.temp_file("mat")
    with open(mat_file, "w") as file:
        # PB substitution matrix, adapted from PBxplore
        # NOTE(review): the matrix text appears on a single line here;
        # the matrix parser presumably expects one row per line -
        # confirm the literal's line breaks against the original file
        file.write(""" a b c d e f g h i j k l m n o p a 516 -59 113 -105 -411 -177 -27 -361 47 -103 -644 -259 -599 -372 -124 -83 b -59 541 -146 -210 -155 -310 -97 90 182 -128 -30 29 -745 -242 -165 22 c 113 -146 360 -14 -333 -240 49 -438 -269 -282 -688 -682 -608 -455 -147 6 d -105 -210 -14 221 5 -131 -349 -278 -253 -173 -585 -670 -1573 -1048 -691 -497 e -411 -155 -333 5 520 185 186 138 -378 -70 -112 -514 -1136 -469 -617 -632 f -177 -310 -240 -131 185 459 -99 -45 -445 83 -214 -88 -547 -629 -406 -552 g -27 -97 49 -349 186 -99 665 -99 -89 -118 -409 -138 -124 172 128 254 h -361 90 -438 -278 138 -45 -99 632 -205 316 192 -108 -712 -359 95 -399 i 47 182 -269 -253 -378 -445 -89 -205 696 186 8 15 -709 -269 -169 226 j -103 -128 -282 -173 -70 83 -118 316 186 768 196 5 -398 -340 -117 -104 k -644 -30 -688 -585 -112 -214 -409 192 8 196 568 -65 -270 -231 -471 -382 l -259 29 -682 -670 -514 -88 -138 -108 15 5 -65 533 -131 8 -11 -316 m -599 -745 -608 -1573 -1136 -547 -124 -712 -709 -398 -270 -131 241 -4 -190 -155 n -372 -242 -455 -1048 -469 -629 172 -359 -269 -340 -231 8 -4 703 88 146 o -124 -165 -147 -691 -617 -406 128 95 -169 -117 -471 -11 -190 88 716 58 p -83 22 6 -497 -632 -552 254 -399 226 -104 -382 -316 -155 146 58 609 """)
    # Generate the color scheme for the alphabet and matrix;
    # the result is written to 'scheme_file' (option '-f')
    gecli.main(args=[
        "--alphabet", "abcdefghijklmnop",
        "--matrix", mat_file,
        "--contrast", "300",
        "--lmin", "65",
        "--lmax", "70",
        "-f", scheme_file
    ])
    colors = graphics.load_color_scheme(scheme_file)["colors"]

    fig = plt.figure(figsize=(8.0, 5.0))
    ax = fig.gca()
    pb_alphabet = seq.LetterAlphabet("abcdefghijklmnop")
    fasta_file = fasta.FastaFile()
    fasta_file.read(PB_EXAMPLE_FILE_NAME)
    # The strings in the file still contain the gap characters ('-')
    seq_strings = list(fasta_file.values())
    # The sequence objects are created from the ungapped strings...
    sequences = [
        seq.GeneralSequence(pb_alphabet, seq_str.replace("-", ""))
        for seq_str in seq_strings
    ]
    # ...while the alignment trace is derived from the gapped strings
    trace = align.Alignment.trace_from_strings(seq_strings)
    alignment = align.Alignment(sequences, trace, score=None)
    graphics.plot_alignment_type_based(ax, alignment, symbols_per_line=60,
                                       spacing=2, color_scheme=colors)
    fig.tight_layout()
    return fig
def test_alignment_conversion():
    """
    Check that an alignment read from a FASTA file has the expected
    string representation and survives a round trip through
    :func:`set_alignment()` and :func:`get_alignment()`.
    """
    fasta_path = os.path.join(data_dir("sequence"), "alignment.fasta")
    source_file = fasta.FastaFile.read(fasta_path)
    ref_alignment = fasta.get_alignment(source_file)
    expected_repr = (
        "ADTRCGTARDCGTR-DRTCGRAGD\n"
        "ADTRCGT---CGTRADRTCGRAGD\n"
        "ADTRCGTARDCGTRADR--GRAGD"
    )
    assert str(ref_alignment) == expected_repr

    # Round trip: alignment -> new FASTA file -> alignment
    roundtrip_file = fasta.FastaFile()
    names = ["seq1", "seq2", "seq3"]
    fasta.set_alignment(roundtrip_file, ref_alignment, seq_names=names)
    roundtrip_alignment = fasta.get_alignment(roundtrip_file)
    assert str(ref_alignment) == str(roundtrip_alignment)
def test_sequence_conversion():
    """
    Check the conversion between FASTA files and sequence objects for
    nucleotide, protein and invalid input.
    """
    def data_path(file_name):
        # Path of a file in the 'sequence' test data directory
        return os.path.join(data_dir("sequence"), file_name)

    # The first entry of the nucleotide file is the one being returned
    nuc_file = fasta.FastaFile.read(data_path("nuc.fasta"))
    assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(nuc_file)

    # Round trip of all sequences through a new file
    ref_sequences = fasta.get_sequences(nuc_file)
    roundtrip_file = fasta.FastaFile()
    fasta.set_sequences(roundtrip_file, ref_sequences)
    roundtrip_sequences = fasta.get_sequences(roundtrip_file)
    assert ref_sequences == roundtrip_sequences

    # Without an explicit header the entry is stored as 'sequence'
    single_file = fasta.FastaFile()
    fasta.set_sequence(single_file, seq.NucleotideSequence("AACCTTGG"))
    assert single_file["sequence"] == "AACCTTGG"

    prot_file = fasta.FastaFile.read(data_path("prot.fasta"))
    assert seq.ProteinSequence("YAHGFRTGS") == fasta.get_sequence(prot_file)

    # Invalid sequence content must be rejected
    invalid_file = fasta.FastaFile.read(data_path("invalid.fasta"))
    with pytest.raises(ValueError):
        seq.NucleotideSequence(fasta.get_sequence(invalid_file))
def test_access():
    """
    Check the dictionary-like access to the entries of a
    :class:`FastaFile`: reading, overwriting, deleting and adding.
    """
    nuc_path = os.path.join(data_dir, "nuc.fasta")
    nuc_file = fasta.FastaFile()
    nuc_file.read(nuc_path)

    # Entries as they are stored in the file
    initial_entries = {
        "dna sequence" : "ACGCTACGT",
        "another dna sequence" : "A",
        "third dna sequence" : "ACGT",
    }
    assert nuc_file["dna sequence"] == "ACGCTACGT"
    assert nuc_file["another dna sequence"] == "A"
    assert nuc_file["third dna sequence"] == "ACGT"
    assert dict(nuc_file) == initial_entries

    # Overwrite one entry, delete one and append a new one
    nuc_file["another dna sequence"] = "AA"
    del nuc_file["dna sequence"]
    nuc_file["yet another sequence"] = "ACGT"
    modified_entries = {
        "another dna sequence" : "AA",
        "third dna sequence" : "ACGT",
        "yet another sequence" : "ACGT",
    }
    assert dict(nuc_file) == modified_entries
def ivalue(self, structures, alignment):
    """
    Score a given alignment of two structures by running the
    executable with the ``--ivalue`` option.

    Both structures and the alignment are written into a temporary
    working directory, the executable is called on these files and its
    standard output is parsed.

    Parameters
    ----------
    structures: [array like, array like]
        the two protein structures; from each one the atoms matching
        ``self.protein_selector`` are written to a PDB file
    alignment: biotite.alignment
        alignment of the given two sequences; its ``items()`` must
        yield pairs of header and sequence string

    Returns
    -------
    dict
        As returned by ``._parse_scoring(output)``.
        According to the original documentation:

        - ``scores`` (dict):
            - ``rmsd`` (float): RMSD value of the alignment
            - ``score`` (float): ivalue of the alignment
            - ``coverage`` (float): coverage of the alignment
    """
    pdb_names = ("structure1.pdb", "structure2.pdb")
    alignment_name = "temp_alignment.afasta"
    with enter_temp_directory() as (_cwd, _tmpdir):
        for index, pdb_name in enumerate(pdb_names):
            selection = structures[index].select_atoms(self.protein_selector)
            selection.write(pdb_name)

        alignment_file = fasta.FastaFile()
        for header, string in alignment.items():
            alignment_file[header] = string
        alignment_file.write(alignment_name)
        self._edit_fasta(alignment_name)

        command = [
            self.executable,
            pdb_names[0],
            pdb_names[1],
            "--ivalue",
            alignment_name,
        ]
        output = subprocess.check_output(command)
        # Parsing happens while still inside the temporary directory,
        # as the temporary files must be accessible at parse time
        return self._parse_scoring(output.decode())
def test_fetch(format, as_file_like):
    """
    Fetch the entry ``1l2y`` from the RCSB in the given format and
    check that the result can be parsed.

    Parameters
    ----------
    format : str
        The file format to fetch; ``"pdb"``, ``"pdbx"``, ``"mmtf"`` and
        ``"fasta"`` are parsed, any other value is only fetched.
    as_file_like : bool
        If true, ``None`` is given as target directory
        (presumably this makes the fetch return a file-like object).
    """
    target_dir = None if as_file_like else biotite.temp_dir()
    fetched = rcsb.fetch("1l2y", format, target_dir, overwrite=True)

    if format == "fasta":
        seq_file = fasta.FastaFile()
        seq_file.read(fetched)
        # Test if the file contains any sequences
        assert len(fasta.get_sequences(seq_file)) > 0
        return

    # Structure formats: parsing into a structure must not raise
    if format == "pdb":
        structure_file = pdb.PDBFile()
        structure_file.read(fetched)
        pdb.get_structure(structure_file)
    elif format == "pdbx":
        structure_file = pdbx.PDBxFile()
        structure_file.read(fetched)
        pdbx.get_structure(structure_file)
    elif format == "mmtf":
        structure_file = mmtf.MMTFFile()
        structure_file.read(fetched)
        mmtf.get_structure(structure_file)
def _parse_metadata(self, output):
    """
    Retrieves RMSD, score and metadata from the output of the
    MMLigner subprocess.

    The alignment itself is not taken from ``output``, but read from
    the file ``temp__1.afasta`` in the current working directory.

    Parameters
    ----------
    output: str
        string containing the stdout of the mmligner call

    Returns
    -------
    dict
        - ``scores`` (dict):
            - ``rmsd`` (float): RMSD value of the alignment
            - ``score`` (float): ivalue of the alignment
            - ``coverage`` (float): coverage of the alignment
        - ``metadata`` (dict):
            - ``alignment``: (FastaFile): the alignment file read from
              ``temp__1.afasta``
            - ``rotation``: (list of list of float): the three rows
              following the rotation matrix marker
            - ``translation``: (np.array): fixed center of mass minus
              moving center of mass
            - ``quaternion``: (list of list of float): the four rows
              following the quaternion matrix marker

    Raises
    ------
    UnboundLocalError
        If one of the expected values does not appear in ``output``.
    """
    def floats_in(text):
        # Convert a whitespace separated row of numbers
        return [float(token) for token in text.split()]

    remaining = iter(output.splitlines())
    for current in remaining:
        if current.startswith("RMSD"):
            rmsd = float(current.split()[2])
        elif current.startswith("Coverage"):
            coverage = float(current.split()[2])
        elif current.startswith("I(A & <S,T>)"):
            ivalue = float(current.split()[4])
        # For the markers below, the values are located in the line(s)
        # after the marker, hence the shared iterator is advanced
        elif "Print Centers of Mass of moving set:" in current:
            moving_com = np.array(floats_in(next(remaining)))
        elif "Print Centers of Mass of fixed set:" in current:
            fixed_com = np.array(floats_in(next(remaining)))
        elif "Print Rotation matrix" in current:
            rotation = [floats_in(next(remaining)) for _ in range(3)]
        elif "Print Quaternion matrix" in current:
            quaternion = [floats_in(next(remaining)) for _ in range(4)]

    # fixed_com, moving_com, rotation and quaternion can only be obtained
    # if the patched mmligner is used (check /devtools/conda-recipes/mmligner)
    # -- this will fail in CI for now --
    translation = fixed_com - moving_com

    alignment = fasta.FastaFile()
    alignment.read("temp__1.afasta")

    scores = {
        "rmsd": rmsd,
        "score": ivalue,
        "coverage": coverage,
    }
    metadata = {
        "alignment": alignment,
        "rotation": rotation,
        "translation": translation,
        "quaternion": quaternion,
    }
    return {"scores": scores, "metadata": metadata}
# Download E. coli BL21 genome file_name = entrez.fetch("CP001509", biotite.temp_dir(), "gb", "nuccore", "gb") gb_file = gb.GenBankFile() gb_file.read(file_name) annot_seq = gb_file.get_annotated_sequence(include_only=["gene"]) # Find leuL gene for feature in annot_seq.annotation: if "gene" in feature.qual and feature.qual["gene"] == "leuL": leul_feature = feature # Get leuL sequence leul_seq = annot_seq[leul_feature] # Download Salmonella enterica genome without annotations file_name = entrez.fetch("CP019649", biotite.temp_dir(), "fa", "nuccore", "fasta") fasta_file = fasta.FastaFile() fasta_file.read(file_name) se_genome = fasta.get_sequence(fasta_file) # Find leuL in genome by local alignment matrix = align.SubstitutionMatrix.std_nucleotide_matrix() # Use general gap penalty to save RAM alignments = align.align_optimal(leul_seq, se_genome, matrix, gap_penalty=-7, local=True) # Do the same for reverse complement genome se_genome_rev = se_genome.reverse().complement() rev_alignments = align.align_optimal(leul_seq, se_genome_rev, matrix,
import biotite.database.entrez as entrez
import biotite.sequence.graphics as graphics

# Download the nucleotide records of Covid and Mers
# from the 'nuccore' database in FASTA format
covid_file_path = entrez.fetch("NC_045512", "myresult_dir", suffix="fa",
                               db_name="nuccore", ret_type="fasta")
mers_file_path = entrez.fetch("NC_019843.3", "myresult_dir", suffix="fa",
                              db_name="nuccore", ret_type="fasta")

# Read the files
c_file = fasta.FastaFile()
c_file.read(covid_file_path)
m_file = fasta.FastaFile()
m_file.read(mers_file_path)

# Display the header and sequence string of each entry
# and convert the string into a nucleotide sequence
# NOTE(review): each file presumably holds a single record;
# with several records only the last one ends up in the variable
for h, s in c_file.items():
    print(h)
    print(s)
    covid_seq = seq.NucleotideSequence(s)

for h, s in m_file.items():
    print(h)
    print(s)
    mers_seq = seq.NucleotideSequence(s)

# Restrict both sequences to their first 100 symbols
mini_covid_seq = covid_seq[0:100]
mini_mers_seq = mers_seq[0:100]
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
# of course no assigned Phred score. # For the purpose of this example script we simply define as threshold: # At least 60 % of all reads covering a certain location must call a # deletion for this location, otherwise the deletion is rejected DELETION_THRESHOLD = 0.6 var_genome = seq.NucleotideSequence() var_genome.code = most_probable_symbol_codes # A deletion is called, if either enough reads include this deletion # or the sequence position is not covered by any read at all deletion_mask = (deletion_number > sequencing_depth * DELETION_THRESHOLD) \ | (sequencing_depth == 0) var_genome = var_genome[~deletion_mask] # Write the assembled genome into a FASTA file out_file = fasta.FastaFile() fasta.set_sequence( out_file, var_genome, header="SARS-CoV-2 B.1.1.7", as_rna=True ) out_file.write(tempfile.NamedTemporaryFile("w")) ######################################################################## # We have done it, the genome of the B.1.1.7 variant is assembled! # Now we would like to have a closer look on the difference between the # original and the B.1.1.7 genome. # # Mutations in the B.1.1.7 variant # -------------------------------- # # To get an rough overview about the overall sequence identity between # the genomes and the locations of mutations in the B.1.1.7 variant,
# Let's demonstrate this on the genome of the *lambda* phage # (Accession: ``NC_001416```). # After downloading the FASTA file from the NCBI Entrez database, # we can load the contents in the following way: import biotite import biotite.sequence as seq import biotite.sequence.io.fasta as fasta import biotite.database.entrez as entrez file_path = entrez.fetch("NC_001416", biotite.temp_dir(), suffix="fa", db_name="nuccore", ret_type="fasta") file = fasta.FastaFile() file.read(file_path) for header, string in file.items(): print("Header:", header) print(len(string)) print("Sequence:", string[:50], "...") print("Sequence length:", len(string)) ######################################################################## # Since there is only a single sequence in the file, the loop is run # only one time. # As the sequence string is very long, only the first 50 bp are printed. # Now this string could be used as input parameter for creation of a # :class:`NucleotideSequence`. # But we want to spare ourselves some unnecessary work, there is already # a convenience function for that: