Python FastaFile 예제들, biotite.sequence.io.fasta.FastaFile Python 예제들

예제 #1

0

파일 보기

def sequences():
    """
    10 Cas9 sequences.

    Reads ``cas9.fasta`` from ``data_dir`` and converts every sequence
    string stored in the file into a ``ProteinSequence``.

    Returns
    -------
    list
        One ``ProteinSequence`` per entry of the FASTA file.
    """
    cas9_file = fasta.FastaFile()
    cas9_file.read(join(data_dir, "cas9.fasta"))
    proteins = []
    for seq_string in cas9_file.values():
        proteins.append(seq.ProteinSequence(seq_string))
    return proteins

예제 #2

0

파일 보기

def test_fetch(common_name, as_file_like):
    """
    Fetch the entry ``1L2Y_A`` in FASTA format via Entrez and parse it.

    Parameters
    ----------
    common_name
        If truthy, the database name ``"Protein"`` is used,
        otherwise ``"protein"``.
    as_file_like
        If truthy, ``None`` is passed as target path,
        otherwise the directory from ``biotite.temp_dir()``.
    """
    if as_file_like:
        target = None
    else:
        target = biotite.temp_dir()
    database = "Protein" if common_name else "protein"
    fetched = entrez.fetch(
        "1L2Y_A", target, "fa", database, "fasta", overwrite=True
    )
    parsed = fasta.FastaFile()
    parsed.read(fetched)
    # The result is not inspected: the call only has to succeed
    fasta.get_sequence(parsed)

예제 #3

0

파일 보기

def test_write_iter(chars_per_line, n_sequences):
    """
    Test whether :meth:`FastaFile.write()` and
    :meth:`FastaFile.write_iter()` produce the same output file for
    random sequences.

    Parameters
    ----------
    chars_per_line
        Passed to both the ``FastaFile`` constructor and
        ``FastaFile.write_iter()``.
    n_sequences : int
        Number of random sequences to generate.
    """
    LENGTH_RANGE = (50, 150)

    # Generate random sequences; the fixed seed keeps the input reproducible
    np.random.seed(0)
    sequences = []
    for _ in range(n_sequences):
        seq_length = np.random.randint(*LENGTH_RANGE)
        code = np.random.randint(
            len(seq.NucleotideSequence.alphabet_unamb), size=seq_length
        )
        sequence = seq.NucleotideSequence()
        sequence.code = code
        sequences.append(sequence)

    # Reference output: fill a FastaFile entry by entry, then write it
    fasta_file = fasta.FastaFile(chars_per_line)
    for i, sequence in enumerate(sequences):
        fasta_file[f"seq_{i}"] = str(sequence)
    ref_file = io.StringIO()
    fasta_file.write(ref_file)

    # Test output: pass the same (header, sequence) pairs as an iterator
    test_file = io.StringIO()
    fasta.FastaFile.write_iter(
        test_file,
        ((f"seq_{i}", str(sequence)) for i, sequence in enumerate(sequences)),
        chars_per_line
    )

    assert test_file.getvalue() == ref_file.getvalue()

예제 #4

0

파일 보기

def test_rna_conversion():
    """
    Check the ``as_rna`` switch of ``fasta.set_sequence()``: the same
    nucleotide sequence must be stored as ``"ACGT"`` with
    ``as_rna=False`` and as ``"ACGU"`` with ``as_rna=True``.
    """
    dna = seq.NucleotideSequence("ACGT")
    out_file = fasta.FastaFile()
    for header, rna_flag in (("seq1", False), ("seq2", True)):
        fasta.set_sequence(out_file, dna, header, as_rna=rna_flag)
    expected = {"seq1": "ACGT", "seq2": "ACGU"}
    for header, string in expected.items():
        assert out_file[header] == string

예제 #5

0

파일 보기

def test_fetch_single_file():
    """
    Fetch two entries into a single temporary FASTA file and check that
    exactly two sequences are parsed from it.
    """
    uids = ["1L2Y_A", "3O5R_A"]
    target = biotite.temp_file("fa")
    fetched = entrez.fetch_single_file(uids, target, "protein", "fasta")
    parsed = fasta.FastaFile()
    parsed.read(fetched)
    assert len(fasta.get_sequences(parsed)) == 2

예제 #6

0

파일 보기

def test_fetch_single_file(as_file_like):
    """
    Fetch two entries with a single request and check that exactly two
    sequences are parsed from the result.

    Parameters
    ----------
    as_file_like
        If truthy, ``None`` is passed as file name,
        otherwise a path from ``biotite.temp_file("fa")``.
    """
    if as_file_like:
        target = None
    else:
        target = biotite.temp_file("fa")
    fetched = entrez.fetch_single_file(
        ["1L2Y_A", "3O5R_A"], target, "protein", "fasta"
    )
    parsed = fasta.FastaFile()
    parsed.read(fetched)
    assert len(fasta.get_sequences(parsed)) == 2

예제 #7

0

파일 보기

def show_example(ax, colors):
    """
    Plot the ``[:60]`` slice of the example alignment on the given axes.

    Parameters
    ----------
    ax
        Forwarded as first argument to
        ``graphics.plot_alignment_type_based()``.
    colors
        Forwarded as ``color_scheme``.
    """
    example_file = fasta.FastaFile()
    example_file.read(EXAMPLE_FILE_NAME)
    excerpt = fasta.get_alignment(example_file)[:60]

    # ``symbols_per_line`` is set to the length of the sliced alignment
    graphics.plot_alignment_type_based(
        ax,
        excerpt,
        spacing=2.0,
        symbols_per_line=len(excerpt),
        color_scheme=colors,
    )

예제 #8

0

파일 보기

def test_fetch():
    """
    Fetch the entry ``1L2Y_A`` in FASTA format into the temporary
    directory and parse the downloaded file.
    """
    target_dir = biotite.temp_dir()
    fetched = entrez.fetch(
        "1L2Y_A", target_dir, "fa", "protein", "fasta", overwrite=True
    )
    parsed = fasta.FastaFile()
    parsed.read(fetched)
    # The result is not inspected: the call only has to succeed
    fasta.get_sequence(parsed)

예제 #9

0

파일 보기

def plot_pb_scheme_alignment():
    """
    Plot the PB example alignment using a color scheme that ``gecli``
    generates from a PB substitution matrix.

    The matrix is written to a temporary file, handed to ``gecli.main()``
    together with the 16-letter alphabet ``a``-``p``, and the resulting
    JSON scheme is loaded and used for the plot.

    Returns
    -------
    The figure created via ``plt.figure()``.
    """
    # NOTE(review): presumably fixes the scheme generation result;
    # confirm that gecli draws from the ``random`` module
    random.seed(1)
    scheme_file = biotite.temp_file("json")
    mat_file = biotite.temp_file("mat")
    with open(mat_file, "w") as file:
        # PB substitution matrix, adapted from PBxplore
        file.write("""
                a     b     c     d     e     f     g     h     i     j     k     l     m     n     o     p
            a  516   -59   113  -105  -411  -177   -27  -361    47  -103  -644  -259  -599  -372  -124   -83
            b  -59   541  -146  -210  -155  -310   -97    90   182  -128   -30    29  -745  -242  -165    22
            c  113  -146   360   -14  -333  -240    49  -438  -269  -282  -688  -682  -608  -455  -147     6
            d -105  -210   -14   221     5  -131  -349  -278  -253  -173  -585  -670 -1573 -1048  -691  -497
            e -411  -155  -333     5   520   185   186   138  -378   -70  -112  -514 -1136  -469  -617  -632
            f -177  -310  -240  -131   185   459   -99   -45  -445    83  -214   -88  -547  -629  -406  -552
            g  -27   -97    49  -349   186   -99   665   -99   -89  -118  -409  -138  -124   172   128   254
            h -361    90  -438  -278   138   -45   -99   632  -205   316   192  -108  -712  -359    95  -399
            i   47   182  -269  -253  -378  -445   -89  -205   696   186     8    15  -709  -269  -169   226
            j -103  -128  -282  -173   -70    83  -118   316   186   768   196     5  -398  -340  -117  -104
            k -644   -30  -688  -585  -112  -214  -409   192     8   196   568   -65  -270  -231  -471  -382
            l -259    29  -682  -670  -514   -88  -138  -108    15     5   -65   533  -131     8   -11  -316
            m -599  -745  -608 -1573 -1136  -547  -124  -712  -709  -398  -270  -131   241    -4  -190  -155
            n -372  -242  -455 -1048  -469  -629   172  -359  -269  -340  -231     8    -4   703    88   146
            o -124  -165  -147  -691  -617  -406   128    95  -169  -117  -471   -11  -190    88   716    58
            p  -83    22     6  -497  -632  -552   254  -399   226  -104  -382  -316  -155   146    58   609
            """)
    # Run the gecli command line entry point; ``-f`` receives the path
    # that is loaded as color scheme right below
    gecli.main(args=[
        "--alphabet", "abcdefghijklmnop", "--matrix", mat_file, "--contrast",
        "300", "--lmin", "65", "--lmax", "70", "-f", scheme_file
    ])

    colors = graphics.load_color_scheme(scheme_file)["colors"]
    fig = plt.figure(figsize=(8.0, 5.0))
    ax = fig.gca()

    pb_alphabet = seq.LetterAlphabet("abcdefghijklmnop")
    fasta_file = fasta.FastaFile()
    fasta_file.read(PB_EXAMPLE_FILE_NAME)
    # The strings from the file still contain '-' characters:
    # the sequences are built from the strings without them,
    # the trace is derived from the unmodified strings
    seq_strings = list(fasta_file.values())
    sequences = [
        seq.GeneralSequence(pb_alphabet, seq_str.replace("-", ""))
        for seq_str in seq_strings
    ]
    trace = align.Alignment.trace_from_strings(seq_strings)
    alignment = align.Alignment(sequences, trace, score=None)

    graphics.plot_alignment_type_based(ax,
                                       alignment,
                                       symbols_per_line=60,
                                       spacing=2,
                                       color_scheme=colors)

    fig.tight_layout()
    return fig

예제 #10

0

파일 보기

def test_alignment_conversion():
    """
    Read an alignment from ``alignment.fasta``, compare its string
    representation with the expected one and check that writing it into
    a new ``FastaFile`` and reading it back gives the same representation.
    """
    expected = ("ADTRCGTARDCGTR-DRTCGRAGD\n"
                "ADTRCGT---CGTRADRTCGRAGD\n"
                "ADTRCGTARDCGTRADR--GRAGD")
    source_path = os.path.join(data_dir("sequence"), "alignment.fasta")
    source_file = fasta.FastaFile.read(source_path)
    alignment = fasta.get_alignment(source_file)
    assert str(alignment) == expected

    # Round trip through a fresh file object
    roundtrip_file = fasta.FastaFile()
    fasta.set_alignment(
        roundtrip_file, alignment, seq_names=["seq1", "seq2", "seq3"]
    )
    roundtrip = fasta.get_alignment(roundtrip_file)
    assert str(alignment) == str(roundtrip)

예제 #11

0

파일 보기

def test_sequence_conversion():
    """
    Check the conversion between ``FastaFile`` entries and sequence
    objects: reading a nucleotide and a protein sequence, a round trip
    of multiple sequences, the header ``"sequence"`` that is used when
    ``set_sequence()`` gets no header, and the ``ValueError`` raised for
    the content of ``invalid.fasta``.
    """
    nuc_path = os.path.join(data_dir("sequence"), "nuc.fasta")
    nuc_file = fasta.FastaFile.read(nuc_path)
    assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(nuc_file)

    # Round trip of all sequences through a fresh file object
    original = fasta.get_sequences(nuc_file)
    copy_file = fasta.FastaFile()
    fasta.set_sequences(copy_file, original)
    restored = fasta.get_sequences(copy_file)
    assert original == restored

    # No header given -> entry is stored under "sequence"
    single_file = fasta.FastaFile()
    fasta.set_sequence(single_file, seq.NucleotideSequence("AACCTTGG"))
    assert single_file["sequence"] == "AACCTTGG"

    prot_path = os.path.join(data_dir("sequence"), "prot.fasta")
    prot_file = fasta.FastaFile.read(prot_path)
    assert seq.ProteinSequence("YAHGFRTGS") == fasta.get_sequence(prot_file)

    invalid_path = os.path.join(data_dir("sequence"), "invalid.fasta")
    invalid_file = fasta.FastaFile.read(invalid_path)
    with pytest.raises(ValueError):
        seq.NucleotideSequence(fasta.get_sequence(invalid_file))

예제 #12

0

파일 보기

def test_access():
    """
    Check the dictionary-like access of ``FastaFile``: item lookup,
    conversion via ``dict()``, item assignment and item deletion.
    """
    nuc_path = os.path.join(data_dir, "nuc.fasta")
    nuc_file = fasta.FastaFile()
    nuc_file.read(nuc_path)

    initial = {
        "dna sequence": "ACGCTACGT",
        "another dna sequence": "A",
        "third dna sequence": "ACGT",
    }
    for header, string in initial.items():
        assert nuc_file[header] == string
    assert dict(nuc_file) == initial

    # Overwrite one entry, remove one and add a new one
    nuc_file["another dna sequence"] = "AA"
    del nuc_file["dna sequence"]
    nuc_file["yet another sequence"] = "ACGT"
    modified = {
        "another dna sequence": "AA",
        "third dna sequence": "ACGT",
        "yet another sequence": "ACGT",
    }
    assert dict(nuc_file) == modified

예제 #13

0

파일 보기

    def ivalue(self, structures, alignment):
        """
        Score a given alignment of two structures by running the
        configured executable with the ``--ivalue`` option.

        Both structures and the alignment are written into a temporary
        working directory, the executable is run on these files and its
        standard output is parsed.

        Parameters
        ----------
        structures: [array like, array like]
            The two structures; ``structures[0]`` and ``structures[1]``
            are each filtered with ``self.protein_selector`` and written
            as PDB file.
        alignment: mapping
            Header/sequence string pairs (read via ``.items()``) that
            are written into the alignment FASTA file.

        Returns
        -------
        dict
            As returned by ``._parse_scoring(output)``.
            According to the original documentation:

            - ``scores`` (dict):
                - ``rmsd`` (float): RMSD value of the alignment
                - ``score`` (float): ivalue of the alignment
                - ``coverage`` (float): coverage of the alignment
        """
        alignment_path = "temp_alignment.afasta"

        with enter_temp_directory() as (cwd, tmpdir):
            pdb_paths = ("structure1.pdb", "structure2.pdb")
            for index, pdb_path in enumerate(pdb_paths):
                selection = structures[index].select_atoms(
                    self.protein_selector
                )
                selection.write(pdb_path)

            aligned = fasta.FastaFile()
            for header, string in alignment.items():
                aligned[header] = string
            aligned.write(alignment_path)
            self._edit_fasta(alignment_path)

            command = [
                self.executable, pdb_paths[0], pdb_paths[1], "--ivalue",
                alignment_path
            ]
            raw_output = subprocess.check_output(command)
            # We need access to the temporary files at parse time!
            result = self._parse_scoring(raw_output.decode())

        return result

예제 #14

0

파일 보기

def test_fetch(format, as_file_like):
    """
    Fetch the entry ``1l2y`` from RCSB in the given format and parse the
    result with the reader belonging to that format.

    Parameters
    ----------
    format : str
        One of ``"pdb"``, ``"pdbx"``, ``"mmtf"`` or ``"fasta"``;
        for any other value nothing is parsed after the download.
    as_file_like
        If truthy, ``None`` is passed as target path,
        otherwise the directory from ``biotite.temp_dir()``.
    """
    target = None if as_file_like else biotite.temp_dir()
    fetched = rcsb.fetch("1l2y", format, target, overwrite=True)

    if format == "pdb":
        parsed = pdb.PDBFile()
        parsed.read(fetched)
        pdb.get_structure(parsed)
        return
    if format == "pdbx":
        parsed = pdbx.PDBxFile()
        parsed.read(fetched)
        pdbx.get_structure(parsed)
        return
    if format == "mmtf":
        parsed = mmtf.MMTFFile()
        parsed.read(fetched)
        mmtf.get_structure(parsed)
        return
    if format == "fasta":
        parsed = fasta.FastaFile()
        parsed.read(fetched)
        # Test if the file contains any sequences
        assert len(fasta.get_sequences(parsed)) > 0

예제 #15

0

파일 보기

    def _parse_metadata(self, output):
        """
        Retrieve RMSD, score and metadata from the output of the
        MMLigner subprocess.

        Besides parsing `output`, the alignment is read from the file
        ``temp__1.afasta`` (relative path).

        Parameters
        ----------
        output: str
            The stdout of the MMLigner call.

        Returns
        -------
        dict
            - ``scores`` (dict):
                - ``rmsd`` (float): value parsed from the ``RMSD`` line
                - ``score`` (float): value parsed from the
                  ``I(A & <S,T>)`` line
                - ``coverage`` (float): value parsed from the
                  ``Coverage`` line
            - ``metadata`` (dict):
                - ``alignment`` (FastaFile): content of ``temp__1.afasta``
                - ``rotation`` (list of list of float): the 3 lines
                  following ``Print Rotation matrix``
                - ``translation`` (np.array): fixed center of mass minus
                  moving center of mass
                - ``quaternion`` (list of list of float): the 4 lines
                  following ``Print Quaternion matrix``
        """
        def _floats(text):
            # All whitespace separated tokens of a line as floats
            return [float(token) for token in text.split()]

        remaining = iter(output.splitlines())
        for line in remaining:
            if line.startswith("RMSD"):
                rmsd = float(line.split()[2])
            elif line.startswith("Coverage"):
                coverage = float(line.split()[2])
            elif line.startswith("I(A & <S,T>)"):
                ivalue = float(line.split()[4])
            # The values of the following sections are located on the
            # line(s) after the section title, so these lines are
            # consumed from the shared iterator
            elif "Print Centers of Mass of moving set:" in line:
                moving_com = np.array(_floats(next(remaining)))
            elif "Print Centers of Mass of fixed set:" in line:
                fixed_com = np.array(_floats(next(remaining)))
            elif "Print Rotation matrix" in line:
                rotation = [_floats(next(remaining)) for _ in range(3)]
            elif "Print Quaternion matrix" in line:
                quaternion = [_floats(next(remaining)) for _ in range(4)]

        # fixed_com, moving_com, rotation and quaternion can only be obtained
        # if the patched mmligner is used (check /devtools/conda-recipes/mmligner)
        # -- this will fail in CI for now --
        translation = fixed_com - moving_com

        alignment = fasta.FastaFile()
        alignment.read("temp__1.afasta")

        scores = {
            "rmsd": rmsd,
            "score": ivalue,
            "coverage": coverage
        }
        metadata = {
            "alignment": alignment,
            "rotation": rotation,
            "translation": translation,
            "quaternion": quaternion,
        }
        return {"scores": scores, "metadata": metadata}

예제 #16

0

파일 보기

# Download E. coli BL21 genome
file_name = entrez.fetch("CP001509", biotite.temp_dir(), "gb", "nuccore", "gb")
gb_file = gb.GenBankFile()
gb_file.read(file_name)
# Only "gene" features are included in the annotation
annot_seq = gb_file.get_annotated_sequence(include_only=["gene"])
# Find leuL gene
# NOTE(review): the loop has no ``break``, so the last matching feature
# is kept, and ``leul_feature`` stays undefined if no feature matches
for feature in annot_seq.annotation:
    if "gene" in feature.qual and feature.qual["gene"] == "leuL":
        leul_feature = feature
# Get leuL sequence
leul_seq = annot_seq[leul_feature]

# Download Salmonella enterica genome without annotations
file_name = entrez.fetch("CP019649", biotite.temp_dir(), "fa", "nuccore",
                         "fasta")
fasta_file = fasta.FastaFile()
fasta_file.read(file_name)
se_genome = fasta.get_sequence(fasta_file)
# Find leuL in genome by local alignment
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()
# Use general gap penalty to save RAM
alignments = align.align_optimal(leul_seq,
                                 se_genome,
                                 matrix,
                                 gap_penalty=-7,
                                 local=True)
# Do the same for reverse complement genome
se_genome_rev = se_genome.reverse().complement()
rev_alignments = align.align_optimal(leul_seq,
                                     se_genome_rev,
                                     matrix,

예제 #17

0

파일 보기

import biotite.database.entrez as entrez
import biotite.sequence.graphics as graphics

# Download the nucleotide sequences of Covid and Mers
# (fetched from the "nuccore" database in FASTA format)
# NOTE(review): ``fasta``, ``seq`` and ``align`` are used below, but are
# not imported in the visible part of this script
covid_file_path = entrez.fetch("NC_045512",
                               "myresult_dir",
                               suffix="fa",
                               db_name="nuccore",
                               ret_type="fasta")
mers_file_path = entrez.fetch("NC_019843.3",
                              "myresult_dir",
                              suffix="fa",
                              db_name="nuccore",
                              ret_type="fasta")
# Read the files
c_file = fasta.FastaFile()
c_file.read(covid_file_path)
m_file = fasta.FastaFile()
m_file.read(mers_file_path)
# Print each header and sequence string;
# the sequence of the last entry of each file is kept as
# NucleotideSequence
for h, s in c_file.items():
    print(h)
    print(s)
    covid_seq = seq.NucleotideSequence(s)
for h, s in m_file.items():
    print(h)
    print(s)
    mers_seq = seq.NucleotideSequence(s)
# Restrict both sequences to their first 100 symbols
mini_covid_seq = covid_seq[0:100]
mini_mers_seq = mers_seq[0:100]
matrix = align.SubstitutionMatrix.std_nucleotide_matrix()

예제 #18

0

파일 보기

파일: genome_assembly.py 프로젝트: ebetica/biotite

# of course no assigned Phred score.
# For the purpose of this example script we simply define as threshold:
# At least 60 % of all reads covering a certain location must call a
# deletion for this location, otherwise the deletion is rejected

# Fraction of the sequencing depth that the number of deletion calls
# must strictly exceed at a position
DELETION_THRESHOLD = 0.6

# Create an empty sequence and set its symbol codes directly
var_genome = seq.NucleotideSequence()
var_genome.code = most_probable_symbol_codes
# A deletion is called, if either enough reads include this deletion
# or the sequence position is not covered by any read at all
deletion_mask = (deletion_number > sequencing_depth * DELETION_THRESHOLD) \
                | (sequencing_depth == 0)
# Keep only the positions that are not masked as deletion
var_genome = var_genome[~deletion_mask]
# Write the assembled genome into a FASTA file
out_file = fasta.FastaFile()
fasta.set_sequence(
    out_file, var_genome, header="SARS-CoV-2 B.1.1.7", as_rna=True
)
# NOTE(review): the temporary file object is not bound to a name;
# presumably the written file is not meant to be kept - confirm
out_file.write(tempfile.NamedTemporaryFile("w"))

########################################################################
# We have done it, the genome of the B.1.1.7 variant is assembled!
# Now we would like to have a closer look on the difference between the
# original and the B.1.1.7 genome.
#
# Mutations in the B.1.1.7 variant
# --------------------------------
#
# To get a rough overview of the overall sequence identity between
# the genomes and the locations of mutations in the B.1.1.7 variant,

예제 #19

0

파일 보기

# Let's demonstrate this on the genome of the *lambda* phage
# (Accession: ``NC_001416``).
# After downloading the FASTA file from the NCBI Entrez database,
# we can load the contents in the following way:

import biotite
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez

# Fetch the entry as FASTA file into the temporary directory
file_path = entrez.fetch("NC_001416",
                         biotite.temp_dir(),
                         suffix="fa",
                         db_name="nuccore",
                         ret_type="fasta")
file = fasta.FastaFile()
file.read(file_path)
# Each item is a pair of header and sequence string
for header, string in file.items():
    print("Header:", header)
    # The bare length printed here is repeated with a label below
    print(len(string))
    # Only the first 50 characters of the sequence string are shown
    print("Sequence:", string[:50], "...")
    print("Sequence length:", len(string))

########################################################################
# Since there is only a single sequence in the file, the loop is run
# only one time.
# As the sequence string is very long, only the first 50 bp are printed.
# Now this string could be used as input parameter for creation of a
# :class:`NucleotideSequence`.
# But we want to spare ourselves some unnecessary work, there is already
# a convenience function for that: