Exemple #1
0
def test_rna_conversion():
    """
    Check the ``as_rna`` switch of ``set_sequence()``.

    The same nucleotide sequence is stored twice in one FASTA file:
    the entry written with ``as_rna=False`` is expected to keep ``T``,
    the entry written with ``as_rna=True`` is expected to contain ``U``
    instead.
    """
    dna = seq.NucleotideSequence("ACGT")
    out_file = fasta.FastaFile()
    # Write both entries first, then inspect the stored strings
    for header, as_rna in (("seq1", False), ("seq2", True)):
        fasta.set_sequence(out_file, dna, header, as_rna=as_rna)
    assert out_file["seq1"] == "ACGT"
    assert out_file["seq2"] == "ACGU"
Exemple #2
0
def test_sequence_conversion():
    """
    Check the conversion between FASTA files and sequence objects.

    The test covers, in this order:

    - reading a single nucleotide sequence from ``nuc.fasta``,
    - a round trip of all sequences of that file through a new
      ``FastaFile``,
    - the header ``"sequence"`` that is found in the file when
      ``set_sequence()`` is called without an explicit header,
    - reading a protein sequence from ``prot.fasta``, which is expected
      to emit a ``UserWarning``,
    - the ``ValueError`` expected for ``invalid.fasta``.
    """
    path = os.path.join(data_dir("sequence"), "nuc.fasta")
    file = fasta.FastaFile.read(path)
    assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(file)

    seq_dict = fasta.get_sequences(file)
    file2 = fasta.FastaFile()
    fasta.set_sequences(file2, seq_dict)
    seq_dict2 = fasta.get_sequences(file2)
    # Cannot compare dicts directly, since the original RNA sequence is
    # now guessed as protein sequence
    # -> Compare the string representation of each entry instead.
    # The headers are checked first, so that entries lost in the
    # round trip cannot go unnoticed
    assert seq_dict2.keys() == seq_dict.keys()
    for header, original in seq_dict.items():
        assert str(original) == str(seq_dict2[header])

    file3 = fasta.FastaFile()
    fasta.set_sequence(file3, seq.NucleotideSequence("AACCTTGG"))
    assert file3["sequence"] == "AACCTTGG"

    path = os.path.join(data_dir("sequence"), "prot.fasta")
    file4 = fasta.FastaFile.read(path)
    # Expect a warning for selenocysteine conversion
    with pytest.warns(UserWarning):
        assert seq.ProteinSequence("YAHCGFRTGS") == fasta.get_sequence(file4)

    path = os.path.join(data_dir("sequence"), "invalid.fasta")
    file5 = fasta.FastaFile.read(path)
    with pytest.raises(ValueError):
        seq.NucleotideSequence(fasta.get_sequence(file5))
Exemple #3
0
def test_sequence_conversion():
    """
    Check the conversion between FASTA files and sequence objects.

    Nucleotide and protein sequences are read from the test data
    directory, a set of sequences is copied into a new file and read
    back, and an invalid file is expected to cause a ``ValueError``.
    """
    def read_fasta(file_name):
        # Load a FASTA file from the test data directory
        path = os.path.join(data_dir, file_name)
        fasta_file = fasta.FastaFile()
        fasta_file.read(path)
        return fasta_file

    nuc_file = read_fasta("nuc.fasta")
    expected_nuc = seq.NucleotideSequence("ACGCTACGT")
    assert expected_nuc == fasta.get_sequence(nuc_file)

    # Round trip: sequences -> new file -> sequences
    original_seqs = fasta.get_sequences(nuc_file)
    copied_file = fasta.FastaFile()
    fasta.set_sequences(copied_file, original_seqs)
    restored_seqs = fasta.get_sequences(copied_file)
    assert original_seqs == restored_seqs

    # Without an explicit header the entry is found under "sequence"
    single_file = fasta.FastaFile()
    fasta.set_sequence(single_file, seq.NucleotideSequence("AACCTTGG"))
    assert single_file["sequence"] == "AACCTTGG"

    prot_file = read_fasta("prot.fasta")
    expected_prot = seq.ProteinSequence("YAHGFRTGS")
    assert expected_prot == fasta.get_sequence(prot_file)

    # Reading must succeed, only the conversion is expected to fail
    invalid_file = read_fasta("invalid.fasta")
    with pytest.raises(ValueError):
        seq.NucleotideSequence(fasta.get_sequence(invalid_file))
Exemple #4
0
# :class:`ProteinSequence`, instance respectively.
# Actually, it just tries to create a :class:`NucleotideSequence`,
# and if this fails, a :class:`ProteinSequence` is created instead.
#
# Sequences can be written into FASTA files in a similar way: either via
# dictionary-like access or using the :func:`set_sequence()`
# convenience function.

# Create a new, empty FASTA file
file = fasta.FastaFile()
# Two arbitrary DNA sequences that serve as example entries
# (PROTIP: Let your cat walk over the keyboard)
dna_seq1 = seq.NucleotideSequence("ATCGGATCTATCGATGCTAGCTACAGCTAT")
dna_seq2 = seq.NucleotideSequence("ACGATCTACTAGCTGATGTCGTGCATGTACG")
# Append entries to the file...
# ... via the set_sequence() convenience function
fasta.set_sequence(file, dna_seq1, header="gibberish")
# ... or dictionary style: the header is the key,
# the sequence string is the value
file["more gibberish"] = str(dna_seq2)
# Print the file, so that the new entries can be inspected
print(file)
# Write the file to the location given by biotite.temp_file()
# (presumably a temporary file with the suffix 'fa')
file.write(biotite.temp_file("fa"))

########################################################################
# As you see, our file contains our new ``'gibberish'`` and
# ``'more gibberish'`` sequences now.
#
# In a similar manner sequences and sequence quality scores can be read
# from FASTQ files. For further reference, have a look at the
# :mod:`biotite.sequence.io.fastq` subpackage.
#
# Alternatively, a sequence can also be loaded from GenBank or GenPept
# files, using the :class:`GenBankFile` class (more on this later).
Exemple #5
0
# More than 60 % of all reads covering a certain location must call a
# deletion for this location, otherwise the deletion is rejected

DELETION_THRESHOLD = 0.6

# Build the variant genome from the symbol codes determined above
var_genome = seq.NucleotideSequence()
var_genome.code = most_probable_symbol_codes
# A deletion is called, if either enough reads include this deletion
# or the sequence position is not covered by any read at all
deletion_mask = (deletion_number > sequencing_depth * DELETION_THRESHOLD) \
                | (sequencing_depth == 0)
# Keep only those positions where no deletion is called
var_genome = var_genome[~deletion_mask]
# Write the assembled genome into a FASTA file
out_file = fasta.FastaFile()
fasta.set_sequence(
    out_file, var_genome, header="SARS-CoV-2 B.1.1.7", as_rna=True
)
# The context manager closes (and thereby removes) the temporary file
# deterministically instead of leaving this to the garbage collector
with tempfile.NamedTemporaryFile("w") as temp_file:
    out_file.write(temp_file)

########################################################################
# We have done it, the genome of the B.1.1.7 variant is assembled!
# Now we would like to have a closer look at the differences between
# the original and the B.1.1.7 genome.
#
# Mutations in the B.1.1.7 variant
# --------------------------------
#
# To get a rough overview of the overall sequence identity between
# the genomes and the locations of mutations in the B.1.1.7 variant,
# we need to align the original genome to our assembled one.
# As both genomes are expected to be highly similar, we can use a banded