def test_conversion(chars_per_line):
    """
    Check that FASTQ entries survive a write/read round trip.

    The reference file is read, its entries are copied into a fresh
    :class:`FastqFile`, which is written to a temporary file and read
    back. Every re-read sequence and score array must equal its
    reference counterpart.

    Parameters
    ----------
    chars_per_line
        Forwarded unchanged as ``chars_per_line`` to every
        :class:`FastqFile` created or read in this test.
    """
    path = os.path.join(data_dir("sequence"), "random.fastq")
    ref_file = fastq.FastqFile.read(
        path, offset=33, chars_per_line=chars_per_line
    )
    ref_content = dict(ref_file.items())

    # Copy all entries into a new, initially empty file object
    out_file = fastq.FastqFile(offset=33, chars_per_line=chars_per_line)
    for identifier, (sequence, scores) in ref_content.items():
        out_file[identifier] = sequence, scores

    # The context manager closes the temporary file even if writing or
    # re-reading raises, so a failing test does not leak the handle
    with TemporaryFile("w+") as temp:
        out_file.write(temp)
        # Rewind, otherwise reading would start at the end of the file
        temp.seek(0)
        reread_file = fastq.FastqFile.read(
            temp, offset=33, chars_per_line=chars_per_line
        )
        content = dict(reread_file.items())

    for identifier, (ref_sequence, ref_scores) in ref_content.items():
        test_sequence, test_scores = content[identifier]
        assert test_sequence == ref_sequence
        assert np.array_equal(test_scores, ref_scores)
def test_rna_conversion():
    """
    Check that the ``as_rna`` argument of :func:`set_sequence()` decides
    whether the stored sequence string uses ``T`` or ``U``.
    """
    dna = seq.NucleotideSequence("ACGT")
    zero_scores = np.array([0, 0, 0, 0])
    fastq_file = fastq.FastqFile(offset="Sanger")

    # identifier -> (as_rna flag, expected stored sequence string)
    cases = {
        "seq1": (False, "ACGT"),
        "seq2": (True, "ACGU"),
    }
    for identifier, (as_rna, _) in cases.items():
        fastq.set_sequence(
            fastq_file, dna, zero_scores, identifier, as_rna=as_rna
        )

    for identifier, (_, expected) in cases.items():
        # Index 0 of an entry is the sequence string, index 1 the scores
        assert fastq_file[identifier][0] == expected
def test_conversion(chars_per_line):
    """
    Check that FASTQ entries survive a write/read round trip.

    The reference file is read, its entries are copied into a second
    :class:`FastqFile`, which is written to a temporary file. That
    temporary file is then read back and compared entry by entry
    against the reference content.

    Parameters
    ----------
    chars_per_line
        Forwarded unchanged as ``chars_per_line`` to every
        :class:`FastqFile` created in this test.
    """
    path = os.path.join(data_dir, "random.fastq")
    file1 = fastq.FastqFile(offset=33, chars_per_line=chars_per_line)
    file1.read(path)
    ref_content = dict(file1.items())

    file2 = fastq.FastqFile(offset=33, chars_per_line=chars_per_line)
    for identifier, (sequence, scores) in ref_content.items():
        file2[identifier] = sequence, scores
    # Keep the temporary file name, so the written file can be read back
    temp_file_name = biotite.temp_file("fastq")
    file2.write(temp_file_name)

    file3 = fastq.FastqFile(offset=33, chars_per_line=chars_per_line)
    # Read the file that was just written, not the original input;
    # otherwise the written output would never be checked
    file3.read(temp_file_name)
    content = dict(file3.items())

    for identifier, (ref_sequence, ref_scores) in ref_content.items():
        sequence, scores = content[identifier]
        assert ref_sequence == sequence
        assert np.array_equal(ref_scores, scores)
def test_write_iter(offset, chars_per_line, n_sequences):
    """
    Test whether :class:`FastqFile.write()` and
    :class:`FastqFile.write_iter()` produce the same output file for
    random sequences and scores.
    """
    LENGTH_RANGE = (50, 150)
    SCORE_RANGE = (10, 60)

    # Fixed seed -> the same random sequences and scores on every run
    np.random.seed(0)
    sequences, scores = [], []
    for _ in range(n_sequences):
        seq_length = np.random.randint(*LENGTH_RANGE)
        # Random symbol codes drawn from the unambiguous alphabet
        code = np.random.randint(
            len(seq.NucleotideSequence.alphabet_unamb), size=seq_length
        )
        sequence = seq.NucleotideSequence()
        sequence.code = code
        sequences.append(sequence)
        scores.append(np.random.randint(*SCORE_RANGE, size=seq_length))

    def iter_entries():
        # Yield the (identifier, (sequence string, scores)) items that
        # both writing approaches are fed with
        for i, (sequence, score) in enumerate(zip(sequences, scores)):
            yield f"seq_{i}", (str(sequence), score)

    # Reference: fill a file object entry by entry, then write it
    fastq_file = fastq.FastqFile(offset, chars_per_line)
    for identifier, entry in iter_entries():
        fastq_file[identifier] = entry
    ref_file = io.StringIO()
    fastq_file.write(ref_file)

    # Test: write the same entries directly from an iterator
    test_file = io.StringIO()
    fastq.FastqFile.write_iter(
        test_file, iter_entries(), offset, chars_per_line
    )

    assert test_file.getvalue() == ref_file.getvalue()
def test_access(chars_per_line):
    """
    Check the dictionary-style interface of :class:`FastqFile`:
    length, key order, deletion, iteration over values and
    setting/getting an entry.
    """
    path = os.path.join(data_dir, "random.fastq")
    fastq_file = fastq.FastqFile(offset=33, chars_per_line=chars_per_line)
    fastq_file.read(path)

    # The test file is expected to hold 'Read:01' ... 'Read:20' in order
    all_ids = [f"Read:{i+1:02d}" for i in range(20)]
    assert len(fastq_file) == 20
    assert list(fastq_file.keys()) == all_ids

    # Deleting one entry must leave the remaining ones in order
    del fastq_file["Read:05"]
    remaining_ids = [
        f"Read:{i+1:02d}" for i in range(20) if i + 1 != 5
    ]
    assert len(fastq_file) == 19
    assert list(fastq_file.keys()) == remaining_ids

    for entry_sequence, entry_scores in fastq_file.values():
        assert len(entry_sequence) == len(entry_scores)
        assert (entry_scores >= 0).all()

    # An entry that is set must be returned unchanged
    ref_sequence = seq.NucleotideSequence("ACTCGGT")
    ref_scores = np.array([10, 12, 20, 11, 0, 80, 42])
    fastq_file["test"] = ref_sequence, ref_scores
    test_sequence, test_scores = fastq_file["test"]
    assert ref_sequence == test_sequence
    assert np.array_equal(ref_scores, test_scores)
from io import StringIO
import numpy as np
import matplotlib.pyplot as plt
import biotite
import biotite.sequence as seq
import biotite.sequence.io.fastq as fastq


# Sample FASTQ file from https://en.wikipedia.org/wiki/FASTQ_format
# A FASTQ record consists of four separate lines: '@' + identifier,
# the sequence, a '+' separator line and the quality string.
# Hence, the line breaks inside this literal are significant.
fastq_content = StringIO("""
@SEQ_ID
GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT
+
!''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65
""")

# Parse the in-memory file using the Sanger score offset
fastq_file = fastq.FastqFile(offset="Sanger")
fastq_file.read(fastq_content)
sequence = fastq_file.get_sequence("SEQ_ID")
scores = fastq_file.get_quality("SEQ_ID")

# Draw one bar per sequence position, its height being the score
figure, ax = plt.subplots(figsize=(8.0, 2.0))
ax.bar(
    x=np.arange(len(sequence)), height=scores,
    color=biotite.colors["orange"],
    width=1.0, linewidth=1, edgecolor="white"
)
# -1 to put space between Y-axis and sequence
ax.set_xlim(-1, len(sequence))
# The range of Phred scores
ax.set_ylim(0, 40)