Beispiel #1
0
def test_conversion(chars_per_line):
    """
    Round-trip test for FASTQ writing and reading.

    The reference file ``random.fastq`` is read, its entries are copied
    into a new :class:`FastqFile`, that file is written to a temporary
    file and read back.
    Every entry of the reference file must come back with an equal
    sequence and equal scores.
    """
    path = os.path.join(data_dir("sequence"), "random.fastq")
    ref_file = fastq.FastqFile.read(
        path, offset=33, chars_per_line=chars_per_line
    )
    ref_content = dict(ref_file.items())

    written_file = fastq.FastqFile(offset=33, chars_per_line=chars_per_line)
    for identifier, (sequence, scores) in ref_content.items():
        written_file[identifier] = sequence, scores

    # The context manager closes the temporary file even if writing or
    # re-reading raises, so a failing test does not leak the handle
    with TemporaryFile("w+") as temp:
        written_file.write(temp)
        # Rewind, so the subsequent read starts at the first record
        temp.seek(0)
        reread_file = fastq.FastqFile.read(
            temp, offset=33, chars_per_line=chars_per_line
        )
        # Materialize the entries while the file is still open
        content = dict(reread_file.items())

    for identifier, (ref_sequence, ref_scores) in ref_content.items():
        test_sequence, test_scores = content[identifier]
        assert test_sequence == ref_sequence
        assert np.array_equal(test_scores, ref_scores)
Beispiel #2
0
def test_rna_conversion():
    """
    Store the same nucleotide sequence twice via
    :func:`fastq.set_sequence()`, once with ``as_rna=False`` and once
    with ``as_rna=True``, and check the sequence string kept in the
    file for each entry.
    """
    dna = seq.NucleotideSequence("ACGT")
    zero_scores = np.array([0, 0, 0, 0])
    fastq_file = fastq.FastqFile(offset="Sanger")

    # identifier -> (value of 'as_rna', expected stored sequence string)
    cases = {
        "seq1": (False, "ACGT"),
        "seq2": (True, "ACGU"),
    }

    # First store all entries, then check them
    for identifier, (as_rna, _) in cases.items():
        fastq.set_sequence(
            fastq_file, dna, zero_scores, identifier, as_rna=as_rna
        )
    for identifier, (_, expected) in cases.items():
        assert fastq_file[identifier][0] == expected
Beispiel #3
0
def test_conversion(chars_per_line):
    """
    Round-trip test for FASTQ writing and reading.

    The reference file ``random.fastq`` is read, its entries are copied
    into a second :class:`FastqFile`, which is written to a temporary
    file.
    This temporary file is read again and every entry of the reference
    file must come back with an equal sequence and equal scores.
    """
    path = os.path.join(data_dir, "random.fastq")
    file1 = fastq.FastqFile(offset=33, chars_per_line=chars_per_line)
    file1.read(path)
    ref_content = dict(file1.items())

    file2 = fastq.FastqFile(offset=33, chars_per_line=chars_per_line)
    for identifier, (sequence, scores) in ref_content.items():
        file2[identifier] = sequence, scores
    # Keep the temporary file name: it is needed to read the written
    # file back in
    temp_path = biotite.temp_file("fastq")
    file2.write(temp_path)

    file3 = fastq.FastqFile(offset=33, chars_per_line=chars_per_line)
    # Read the file that was just written, not the reference file,
    # otherwise the written output would never be checked
    file3.read(temp_path)
    content = dict(file3.items())

    for identifier, (ref_sequence, ref_scores) in ref_content.items():
        sequence, scores = content[identifier]
        assert ref_sequence == sequence
        assert np.array_equal(ref_scores, scores)
Beispiel #4
0
def test_write_iter(offset, chars_per_line, n_sequences):
    """
    Check that :meth:`FastqFile.write()` and
    :meth:`FastqFile.write_iter()` emit identical file content when
    given the same randomly generated sequences and scores.
    """
    min_length, max_length = 50, 150
    min_score, max_score = 10, 60
    alphabet_size = len(seq.NucleotideSequence.alphabet_unamb)

    # Create reproducible random (sequence, scores) records;
    # the order of the random draws per record is
    # length -> sequence code -> scores
    np.random.seed(0)
    records = []
    for _ in range(n_sequences):
        length = np.random.randint(min_length, max_length)
        sequence = seq.NucleotideSequence()
        sequence.code = np.random.randint(alphabet_size, size=length)
        quality = np.random.randint(min_score, max_score, size=length)
        records.append((sequence, quality))

    def entries():
        # Yield '(identifier, (sequence string, scores))' for each record
        for index, (sequence, quality) in enumerate(records):
            yield f"seq_{index}", (str(sequence), quality)

    # Reference: fill a FastqFile entry by entry and write it
    fastq_file = fastq.FastqFile(offset, chars_per_line)
    for identifier, entry in entries():
        fastq_file[identifier] = entry
    expected_output = io.StringIO()
    fastq_file.write(expected_output)

    # Test: write the same entries directly from an iterator
    actual_output = io.StringIO()
    fastq.FastqFile.write_iter(
        actual_output, entries(), offset, chars_per_line
    )

    assert actual_output.getvalue() == expected_output.getvalue()
Beispiel #5
0
def test_access(chars_per_line):
    """
    Exercise the dictionary-like interface of :class:`FastqFile`:
    length, key order, deletion, iteration over values and
    setting/getting an entry.
    """
    all_ids = [f"Read:{number:02d}" for number in range(1, 21)]
    deleted_id = "Read:05"

    fastq_file = fastq.FastqFile(offset=33, chars_per_line=chars_per_line)
    fastq_file.read(os.path.join(data_dir, "random.fastq"))

    # The reference file is expected to hold 20 reads in numbered order
    assert len(fastq_file) == 20
    assert list(fastq_file.keys()) == all_ids

    # Deleting one entry must keep the remaining ones in order
    del fastq_file[deleted_id]
    assert len(fastq_file) == 19
    remaining_ids = [name for name in all_ids if name != deleted_id]
    assert list(fastq_file.keys()) == remaining_ids

    for read_sequence, read_scores in fastq_file.values():
        assert len(read_sequence) == len(read_scores)
        assert (read_scores >= 0).all()

    # An entry that is set must be returned unchanged
    new_sequence = seq.NucleotideSequence("ACTCGGT")
    new_scores = np.array([10, 12, 20, 11, 0, 80, 42])
    fastq_file["test"] = new_sequence, new_scores
    stored_sequence, stored_scores = fastq_file["test"]
    assert new_sequence == stored_sequence
    assert np.array_equal(new_scores, stored_scores)
Beispiel #6
0
from io import StringIO
import numpy as np
import matplotlib.pyplot as plt
import biotite
import biotite.sequence as seq
import biotite.sequence.io.fastq as fastq

# Sample FASTQ file from https://en.wikipedia.org/wiki/FASTQ_format
# The content is wrapped in a StringIO, so it can be read like a file
fastq_content = StringIO("""
@SEQ_ID
GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT
+
!''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65
""")

# Parse the in-memory FASTQ content
# NOTE(review): offset="Sanger" presumably selects the Sanger score
# encoding for decoding the quality line -- confirm in the biotite docs
fastq_file = fastq.FastqFile(offset="Sanger")
fastq_file.read(fastq_content)
# Fetch sequence and quality scores of the only record in the sample
sequence = fastq_file.get_sequence("SEQ_ID")
scores = fastq_file.get_quality("SEQ_ID")

# Bar plot with one bar per sequence position,
# the bar height is the score at that position
figure, ax = plt.subplots(figsize=(8.0, 2.0))
ax.bar(x=np.arange(len(sequence)),
       height=scores,
       color=biotite.colors["orange"],
       width=1.0,
       linewidth=1,
       edgecolor="white")
# -1 to put space between Y-axis and sequence
ax.set_xlim(-1, len(sequence))
# The range of Phred scores
ax.set_ylim(0, 40)