Beispiel #1
0
Reference
---------
.. [#] ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/usrman.txt
'''

from skbio.io import create_format, FileFormatError
from skbio.sequence import Sequence, DNA, RNA, Protein
from skbio.io.format._base import (_line_generator, _get_nth_sequence,
                                   _too_many_blanks)


class EMBLFormatError(FileFormatError):
    """Format error type for the EMBL file format.

    A plain subclass of ``FileFormatError``; it adds no attributes or
    methods of its own.
    """


embl = create_format('embl')

# This list is ordered. From EMBL specification
_HEADERS = [
    'ID',  # identification            (begins each entry; 1 per entry)
    'AC',  # accession number          (>=1 per entry)
    'PR',  # project identifier        (0 or 1 per entry)
    'DT',  # date                      (2 per entry)
    'DE',  # description               (>=1 per entry)
    'KW',  # keyword                   (>=1 per entry)
    'OS',  # organism species          (>=1 per entry)
    'OC',  # organism classification   (>=1 per entry)
    'OG',  # organelle                 (0 or 1 per entry)
    'RN',  # reference number          (>=1 per entry)
    'RC',  # reference comment         (>=0 per entry)
    'RP',  # reference positions       (>=1 per entry)
Beispiel #2
0
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from future.builtins import zip, range

from skbio.io import create_format, QSeqFormatError
from skbio.io.format._base import _decode_qual_to_phred, _get_nth_sequence
from skbio.alignment import SequenceCollection
from skbio.sequence import Sequence, DNA, RNA, Protein

_default_phred_offset = None
_default_variant = None
_will_filter = True

qseq = create_format('qseq')


@qseq.sniffer()
def _qseq_sniffer(fh):
    """Report whether the leading lines of ``fh`` parse as QSeq records.

    At most the first 10 lines of ``fh`` are passed to ``_record_parser``.

    Parameters
    ----------
    fh : iterable of str
        Source of lines to inspect.

    Returns
    -------
    tuple of (bool, dict)
        ``True`` when at least one line was read and no inspected line
        raised ``QSeqFormatError``; ``False`` otherwise. The dict is always
        empty.
    """
    parsed_any = False
    try:
        # Pairing with range(10) caps the number of lines pulled from fh.
        for _, raw_line in zip(range(10), fh):
            _record_parser(raw_line)
            parsed_any = True
    except QSeqFormatError:
        return False, {}
    return parsed_any, {}


@qseq.reader(None)
Beispiel #3
0
"""

# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

import pandas as pd

from skbio.io import create_format, BLAST7FormatError
from skbio.io.format._blast import _parse_blast_data

blast7 = create_format('blast+7')

column_converter = {
    'query id': 'qseqid',
    'query gi': 'qgi',
    'query acc.': 'qacc',
    'query acc.ver': 'qaccver',
    'query length': 'qlen',
    'subject id': 'sseqid',
    'subject ids': 'sallseqid',
    'subject gi': 'sgi',
    'subject gis': 'sallgi',
    'subject acc.': 'sacc',
    'subject acc.ver': 'saccver',
    'subject accs.': 'sallacc',
    'subject length': 'slen',
Beispiel #4
0
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import functools

import pandas as pd

from skbio.io import create_format

blast6 = create_format('blast+6')

_possible_columns = {'qseqid': str, 'qgi': float, 'qacc': str, 'qaccver': str,
                     'qlen': float, 'sseqid': str, 'sallseqid': str,
                     'sgi': float, 'sallgi': float, 'sacc': str,
                     'saccver': str, 'sallacc': str, 'slen': float,
                     'qstart': float, 'qend': float, 'sstart': float,
                     'send': float, 'qseq': str, 'sseq': str,
                     'evalue': float, 'bitscore': float, 'score': float,
                     'length': float, 'pident': float, 'nident': float,
                     'mismatch': float, 'positive': float, 'gapopen': float,
                     'gaps': float, 'ppos': float, 'frames': str,
                     'qframe': float, 'sframe': float, 'btop': float,
                     'staxids': str, 'sscinames': str, 'scomnames': str,
                     'sblastnames': str, 'sskingdoms': str, 'stitle': str,
                     'salltitles': str, 'sstrand': str, 'qcovs': float,
Beispiel #5
0
# ----------------------------------------------------------------------------
# Copyright (c) 2015--, micronota development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

import re

from skbio.metadata import IntervalMetadata
from skbio.io import create_format

from ..util import split, split_head

transtermhp = create_format('transtermhp')


@transtermhp.reader(None)
def _generator(fh):
    '''Parse the annotation and add it to interval metadata.

    Parameters
    ----------
    f : str
        the file path from prediction

    Yield
    -----
    tuple of str and IntervalMetadata
        seq_id and interval metadata
Beispiel #6
0
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from skbio.alignment import Alignment
from skbio.sequence import Sequence
from skbio.io import create_format, PhylipFormatError
from skbio.util._misc import chunk_str


phylip = create_format('phylip')


@phylip.sniffer()
def _phylip_sniffer(fh):
    # Strategy:
    #   Read the header and a single sequence; verify that the sequence length
    #   matches the header information.  Do not verify that the total number of
    #   lines matches the header information, since that would require reading
    #   the whole file.
    try:
        header = next(_line_generator(fh))
        _, seq_len = _validate_header(header)
        line = next(_line_generator(fh))
        _validate_line(line, seq_len)
    except (StopIteration, PhylipFormatError):
Beispiel #7
0
# ----------------------------------------------------------------------------
# Copyright (c) 2015--, micronota development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

from skbio.metadata import IntervalMetadata
from skbio.io import create_format

from ..util import split, split_head

tandem_repeats_finder = create_format('tandem_repeats_finder')


@tandem_repeats_finder.reader(None)
def _generator(fh):
    '''Parse the annotation and add it to interval metadata.

    Parameters
    ----------
    fp : str
        the file path from Tandem Repeat Finder prediction

    Yield
    -----
    tuple of str and IntervalMetadata
        seq_id and interval metadata
    '''
    splitter = split(split_head, is_head=lambda line: line.startswith('@'))
Beispiel #8
0
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import pandas as pd

from skbio.io import create_format
from skbio.io.format._blast import _parse_blast_data, _possible_columns

blast6 = create_format('blast+6')

# Ordered column names; presumably the set applied when the reader is called
# with ``default_columns=True`` -- confirm against the reader body.
_default_columns = [
    'qseqid',
    'sseqid',
    'pident',
    'length',
    'mismatch',
    'gapopen',
    'qstart',
    'qend',
    'sstart',
    'send',
    'evalue',
    'bitscore',
]


@blast6.reader(pd.DataFrame, monkey_patch=False)
def _blast6_to_data_frame(fh, columns=None, default_columns=False):
    if default_columns and columns is not None:
        raise ValueError("`columns` and `default_columns` cannot both be"
                         " provided.")
    if not default_columns and columns is None:
        raise ValueError("Either `columns` or `default_columns` must be"
                         " provided.")
Beispiel #9
0
import re

from skbio.io import create_format
from skbio.metadata import IntervalMetadata

from ..util import split, split_head

aragorn = create_format('aragorn')


@aragorn.reader(None)
def _generator(fh):
    '''Yield a sequence ID and parsed record for each section of ``fh``.

    ``fh`` is divided into groups of lines by ``split(split_head)``. Iteration
    stops at the first group whose first line is the aragorn summary line.

    Yield
    -----
    tuple
        The sequence ID (str) and the value returned by ``_parse_record``
        for the group's lines after the first two.
    '''
    # aragorn closes its output with a summary line such as
    #   >end    5 sequences 97 tRNA genes 1 tmRNA genes
    # It is not a record, so it ends the iteration instead of being parsed.
    summary_line = re.compile(
        r'>end\s+\d+ sequences \d+ tRNA genes \d+ tmRNA genes')
    split_records = split(split_head)
    for record_lines in split_records(fh):
        header = record_lines[0]
        if summary_line.match(header):
            return
        # The ID is the header's first whitespace-delimited token with its
        # leading character dropped.
        first_token = header.split(None, 1)[0]
        yield first_token[1:], _parse_record(record_lines[2:])


def _parse_record(lines):
    '''Return interval metadata.'''
    imd = IntervalMetadata(None)
    for line in lines:
        bounds, md = _parse_line(line)
        imd.add(bounds, metadata=md)
Beispiel #10
0
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
from future.builtins import zip

import numpy as np
import pandas as pd

from skbio._base import OrdinationResults
from skbio.io import create_format, OrdinationFormatError

ordination = create_format('ordination')


@ordination.sniffer()
def _ordination_sniffer(fh):
    # Smells an ordination file if *all* of the following lines are present
    # *from the beginning* of the file:
    #   - eigvals header (minimally parsed)
    #   - another line (contents ignored)
    #   - a whitespace-only line
    #   - proportion explained header (minimally parsed)
    try:
        _parse_header(fh, 'Eigvals', 1)
        next_line = next(fh, None)

        if next_line is not None:
Beispiel #11
0
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

from __future__ import absolute_import, division, print_function, unicode_literals

from future.builtins import zip, range

from skbio.io import create_format, QSeqFormatError
from skbio.io.format._base import _decode_qual_to_phred, _get_nth_sequence
from skbio.sequence import Sequence, DNA, RNA, Protein

_default_phred_offset = None
_default_variant = None
_will_filter = True

qseq = create_format("qseq")


@qseq.sniffer()
def _qseq_sniffer(fh):
    """Decide whether ``fh`` looks like a QSeq file from its first lines.

    Up to 10 lines are read from ``fh`` and each is given to
    ``_record_parser``.

    Returns
    -------
    tuple of (bool, dict)
        ``(True, {})`` if at least one line was read and none raised
        ``QSeqFormatError``; ``(False, {})`` if ``fh`` yielded no lines or a
        line raised ``QSeqFormatError``.
    """
    saw_record = False
    try:
        # zip() stops once range(10) is exhausted, bounding the lines read.
        for _, candidate in zip(range(10), fh):
            _record_parser(candidate)
            saw_record = True
    except QSeqFormatError:
        return False, {}
    return saw_record, {}


@qseq.reader(None)
Beispiel #12
0
# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

from __future__ import absolute_import, division, print_function, unicode_literals

from skbio.alignment import Alignment
from skbio.io import create_format, PhylipFormatError
from skbio.util._misc import chunk_str

phylip = create_format("phylip")


@phylip.writer(Alignment)
def _alignment_to_phylip(obj, fh):

    if obj.is_empty():
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if there is at " "least one sequence in the alignment."
        )

    sequence_length = obj.sequence_length()
    if sequence_length == 0:
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if there is at " "least one position in the alignment."
        )
Beispiel #13
0
|Yes   |No    |generator of :mod:`skbio.sequence.Sequence` objects            |
+------+------+---------------------------------------------------------------+


Reference
---------
.. [#] https://samtools.github.io/hts-specs/SAMv1.pdf
'''

from skbio.io import create_format
from skbio.sequence import Sequence, DNA, RNA, Protein
from skbio.io.format._base import (
    _line_generator, _get_nth_sequence, _too_many_blanks)


sam = create_format('sam')

# Alignment headers
_REQUIRED_FIELDS = [
    'QNAME',  # query template name
    'FLAG',   # combination of bitwise flags
    'RNAME',  # reference sequence name of the alignment
    'POS',    # 1-based leftmost mapping position of the first base
    'MAPQ',   # mapping quality, -10log10(P_err)
    'CIGAR',  # CIGAR string
    'RNEXT',  # reference sequence name of the primary alignment of NEXT
    'PNEXT',  # position of the primary alignment of the NEXT read
    'TLEN',   # signed observed template length
    'SEQ',    # segment sequence
    'QUAL',   # ASCII of base quality
]
Beispiel #14
0
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

import re
from functools import partial

from skbio.io import create_format, GenBankFormatError
from skbio.io.format._base import (_get_nth_sequence, _line_generator,
                                   _too_many_blanks)
from skbio.util._misc import chunk_str
from skbio.sequence import Sequence, DNA, RNA, Protein
from skbio.io.format._sequence_feature_vocabulary import (
    _yield_section, _parse_section_default, _serialize_section_default,
    _parse_feature_table, _serialize_feature_table)

genbank = create_format('genbank')

# This list is ordered
# used to read and write genbank file.
_HEADERS = [
    'LOCUS',
    'DEFINITION',
    'ACCESSION',
    'VERSION',
    'DBSOURCE',
    'DBLINK',
    'KEYWORDS',
    'SOURCE',
    'REFERENCE',
    'COMMENT',
    'FEATURES',
    'ORIGIN',
]


@genbank.sniffer()
def _genbank_sniffer(fh):
    # check the 1st real line is a valid LOCUS line
    if _too_many_blanks(fh, 5):
        return False, {}
    try:
Beispiel #15
0
# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

import numpy as np
import pandas as pd

from skbio._base import OrdinationResults
from skbio.io import create_format, OrdinationFormatError

ordination = create_format('ordination')


@ordination.sniffer()
def _ordination_sniffer(fh):
    # Smells an ordination file if *all* of the following lines are present
    # *from the beginning* of the file:
    #   - eigvals header (minimally parsed)
    #   - another line (contents ignored)
    #   - a whitespace-only line
    #   - proportion explained header (minimally parsed)
    try:
        _parse_header(fh, 'Eigvals', 1)
        next_line = next(fh, None)

        if next_line is not None:
Beispiel #16
0
.. [2] http://evolution.genetics.washington.edu/phylip/newicktree.html

"""

# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

from skbio.io import create_format, NewickFormatError
from skbio.tree import TreeNode

newick = create_format('newick')


@newick.sniffer()
def _newick_sniffer(fh):
    # Strategy:
    #   The following conditions preclude a file from being newick:
    #       * It is an empty file.
    #       * There is whitespace inside of a label (handled by tokenizer)
    #       * : is followed by anything that is an operator
    #       * ( is not preceded immediately by , or another (
    #       * The parens are unbalanced when ; is found.
    #   If 100 tokens (or less if EOF occurs earlier) then it is probably
    #   newick, or at least we can't prove it isn't.
    operators = set(",;:()")
    empty = True
Beispiel #17
0
# ----------------------------------------------------------------------------

import re
from collections import Iterable

from skbio.sequence import DNA, Sequence
from skbio.io import create_format, GFF3FormatError
from skbio.metadata import IntervalMetadata
from skbio.io.format._base import (_line_generator, _too_many_blanks,
                                   _get_nth_sequence)
from skbio.io.format.fasta import _fasta_to_generator
from skbio.io.format._sequence_feature_vocabulary import (_vocabulary_change,
                                                          _vocabulary_skip)
from skbio.io import write

gff3 = create_format('gff3')


@gff3.sniffer()
def _gff3_sniffer(fh):
    # check the 1st real line is a valid ID line
    if _too_many_blanks(fh, 5):
        return False, {}

    try:
        line = next(_line_generator(fh, skip_blanks=True, strip=False))
    except StopIteration:
        return False, {}

    if re.match(r'##gff-version\s+3', line):
        return True, {}
Beispiel #18
0
# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

from collections import OrderedDict

from skbio.alignment import TabularMSA
from skbio.sequence._grammared_sequence import GrammaredSequence
from skbio.io import create_format, StockholmFormatError

stockholm = create_format('stockholm')


@stockholm.sniffer()
def _stockholm_sniffer(fh):
    # Smells a Stockholm file if the following conditions are met:
    # - File isn't empty
    # - File contains correct header
    try:
        line = next(fh)
    except StopIteration:
        return False, {}

    if _is_header(line):
        return True, {}
Beispiel #19
0
Reference
---------
.. [#] ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/usrman.txt
'''

from skbio.io import create_format, FileFormatError
from skbio.sequence import Sequence, DNA, RNA, Protein
from skbio.io.format._base import (
    _line_generator, _get_nth_sequence, _too_many_blanks)


class EMBLFormatError(FileFormatError):
    """Error type for problems with EMBL-formatted data.

    Derives from ``FileFormatError`` and defines nothing beyond it.
    """


embl = create_format('embl')

# This list is ordered. From EMBL specification
_HEADERS = ['ID',  # identification            (begins each entry; 1 per entry)
            'AC',  # accession number          (>=1 per entry)
            'PR',  # project identifier        (0 or 1 per entry)
            'DT',  # date                      (2 per entry)
            'DE',  # description               (>=1 per entry)
            'KW',  # keyword                   (>=1 per entry)
            'OS',  # organism species          (>=1 per entry)
            'OC',  # organism classification   (>=1 per entry)
            'OG',  # organelle                 (0 or 1 per entry)
            'RN',  # reference number          (>=1 per entry)
            'RC',  # reference comment         (>=0 per entry)
            'RP',  # reference positions       (>=1 per entry)
            'RX',  # reference cross-reference (>=0 per entry)
Beispiel #20
0
import textwrap

import numpy as np

from skbio.io import create_format, FASTAFormatError, QUALFormatError
from skbio.io.registry import FileSentinel
from skbio.io.format._base import (_get_nth_sequence,
                                   _parse_fasta_like_header,
                                   _format_fasta_like_records, _line_generator,
                                   _too_many_blanks)
from skbio.util._misc import chunk_str
from skbio.alignment import TabularMSA
from skbio.sequence import Sequence, DNA, RNA, Protein


fasta = create_format('fasta')


@fasta.sniffer()
def _fasta_sniffer(fh):
    # Strategy:
    #   Ignore up to 5 blank/whitespace-only lines at the beginning of the
    #   file. Read up to 10 records. If at least one record is read (i.e.
    #   the file isn't empty) and no errors are thrown during reading, assume
    #   the file is in FASTA format. If a record appears to be QUAL, do *not*
    #   identify the file as FASTA since we don't want to sniff QUAL files as
    #   FASTA (technically they can be read as FASTA since the sequences may
    #   not be validated but it probably isn't what the user wanted). Also, if
    #   we add QUAL as its own file format in the future, we wouldn't want the
    #   FASTA and QUAL sniffers to both positively identify a QUAL file.
    if _too_many_blanks(fh, 5):
Beispiel #21
0
"""

# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

import pandas as pd

from skbio.io import create_format
from skbio.io.format._blast import _parse_blast_data, _possible_columns

blast6 = create_format("blast+6")

# Ordered list of twelve column names.
_default_columns = [
    "qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
    "qstart", "qend", "sstart", "send", "evalue", "bitscore",
]
Beispiel #22
0
**Has Sniffer: Yes**

Format Specification
--------------------
An empty file consists of only whitespace characters.

"""

# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from skbio.io import create_format

emptyfile = create_format('<emptyfile>')


@emptyfile.sniffer()
def _empty_file_sniffer(fh):
    """Report whether ``fh`` contains nothing but whitespace.

    Lines are read from ``fh`` until one is found that is non-empty after
    ``str.strip()``, or until ``fh`` is exhausted.

    Returns
    -------
    tuple of (bool, dict)
        ``(True, {})`` when every line is blank or whitespace-only (including
        when there are no lines at all); ``(False, {})`` otherwise.
    """
    # all() stops at the first line with visible content, as the explicit
    # loop-and-return form would.
    only_whitespace = all(not line.strip() for line in fh)
    return only_whitespace, {}
Beispiel #23
0
# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

import csv

import numpy as np

from skbio.stats.distance import DissimilarityMatrix, DistanceMatrix
from skbio.io import create_format, LSMatFormatError

lsmat = create_format('lsmat')


@lsmat.sniffer()
def _lsmat_sniffer(fh):
    header = _find_header(fh)

    if header is not None:
        try:
            dialect = csv.Sniffer().sniff(header)
            delimiter = dialect.delimiter

            ids = _parse_header(header, delimiter)
            first_id, _ = next(_parse_data(fh, delimiter), (None, None))

            if first_id is not None and first_id == ids[0]:
Beispiel #24
0
import re
from collections import Iterable

from skbio.sequence import DNA, Sequence
from skbio.io import create_format, GFF3FormatError
from skbio.metadata import IntervalMetadata
from skbio.io.format._base import (
    _line_generator, _too_many_blanks, _get_nth_sequence)
from skbio.io.format.fasta import _fasta_to_generator
from skbio.io.format._sequence_feature_vocabulary import (
    _vocabulary_change, _vocabulary_skip)
from skbio.io import write


gff3 = create_format('gff3')


@gff3.sniffer()
def _gff3_sniffer(fh):
    # check the 1st real line is a valid ID line
    if _too_many_blanks(fh, 5):
        return False, {}

    try:
        line = next(_line_generator(fh, skip_blanks=True, strip=False))
    except StopIteration:
        return False, {}

    if re.match(r'##gff-version\s+3', line):
        return True, {}
Beispiel #25
0
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

from __future__ import absolute_import, division, print_function, unicode_literals

from collections import OrderedDict

from skbio.alignment import TabularMSA
from skbio.sequence._iupac_sequence import IUPACSequence
from skbio.io import create_format, StockholmFormatError

stockholm = create_format("stockholm")


@stockholm.sniffer()
def _stockholm_sniffer(fh):
    # Smells a Stockholm file if the following conditions are met:
    # - File isn't empty
    # - File contains correct header
    try:
        line = next(fh)
    except StopIteration:
        return False, {}

    if _is_header(line):
        return True, {}
Beispiel #26
0
# ----------------------------------------------------------------------------
# Copyright (c) 2015--, micronota development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

from logging import getLogger

from skbio.metadata import IntervalMetadata
from skbio.io import create_format

from ..util import split, SplitterID

rnammer = create_format('rnammer')


@rnammer.reader(None)
def _generator(fh):
    '''Parse the annotation and add it to interval metadata.

    Parameters
    ----------
    fn : str
        the file name from RNAmmer prediction

    Yield
    -----
    tuple of str and IntervalMetadata
        seq_id and interval metadata
Beispiel #27
0
from six.moves import zip_longest

import textwrap

import numpy as np

from skbio.io import create_format, FASTAFormatError, QUALFormatError
from skbio.io.registry import FileSentinel
from skbio.io.format._base import (_get_nth_sequence, _parse_fasta_like_header,
                                   _format_fasta_like_records, _line_generator,
                                   _too_many_blanks)
from skbio.util._misc import chunk_str
from skbio.alignment import SequenceCollection, Alignment
from skbio.sequence import Sequence, DNA, RNA, Protein

fasta = create_format('fasta')


@fasta.sniffer()
def _fasta_sniffer(fh):
    # Strategy:
    #   Ignore up to 5 blank/whitespace-only lines at the beginning of the
    #   file. Read up to 10 records. If at least one record is read (i.e.
    #   the file isn't empty) and no errors are thrown during reading, assume
    #   the file is in FASTA format. If a record appears to be QUAL, do *not*
    #   identify the file as FASTA since we don't want to sniff QUAL files as
    #   FASTA (technically they can be read as FASTA since the sequences may
    #   not be validated but it probably isn't what the user wanted). Also, if
    #   we add QUAL as its own file format in the future, we wouldn't want the
    #   FASTA and QUAL sniffers to both positively identify a QUAL file.
    if _too_many_blanks(fh, 5):
Beispiel #28
0
# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

import numpy as np
import pandas as pd

from skbio.stats.ordination import OrdinationResults
from skbio.io import create_format, OrdinationFormatError

ordination = create_format("ordination")


@ordination.sniffer()
def _ordination_sniffer(fh):
    # Smells an ordination file if *all* of the following lines are present
    # *from the beginning* of the file:
    #   - eigvals header (minimally parsed)
    #   - another line (contents ignored)
    #   - a whitespace-only line
    #   - proportion explained header (minimally parsed)
    try:
        _parse_header(fh, "Eigvals", 1)
        next_line = next(fh, None)

        if next_line is not None:
Beispiel #29
0

Reference
---------
.. [1] Eric P. Nawrocki and Sean R. Eddy, "Infernal 1.1: 100-fold faster RNA
       homology searches",  Bioinformatics 2013,
       doi: 10.1093/bioinformatics/btt509

'''

from skbio.io import create_format, FileFormatError
from skbio.metadata import IntervalMetadata, Feature
from skbio.io.format._base import (_line_generator, _too_many_blanks)
from skbio.io.format._base import _get_nth_sequence as _get_nth_record

cmscan = create_format('cmscan')

# Column headers, in order.
_COLUMNS = [
    'MODEL_NAME',
    'MODEL_ACCESSION',
    'SEQUENCE_NAME',
    'SEQUENCE_ACCESSION',
    'TYPE_OF_MODEL',
    'MODEL_START_POSITION',
    'MODEL_END_POSITION',
    'SEQUENCE_START_POSITION',
    'SEQUENCE_END_POSITION',
    'STRAND',
    'TRUNCATED',
    'PASS',
    'GC_CONTENT',
    'BIAS',
    'BITSCORE',
    'EVALUE',
    'INC',
    'DESCRIPTION',
]


class CmscanFormatError(FileFormatError):
    """Format error type for the cmscan file format.

    A plain subclass of ``FileFormatError``; it adds no attributes or
    methods of its own.
    """


def _construct(record, constructor=None, **kwargs):
    if constructor is None:
Beispiel #30
0
"""

# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

from skbio.alignment import TabularMSA
from skbio.io import create_format, PhylipFormatError
from skbio.util._misc import chunk_str

phylip = create_format('phylip')


@phylip.sniffer()
def _phylip_sniffer(fh):
    # Strategy:
    #   Read the header and a single sequence; verify that the sequence length
    #   matches the header information.  Do not verify that the total number of
    #   lines matches the header information, since that would require reading
    #   the whole file.
    try:
        header = next(_line_generator(fh))
        _, seq_len = _validate_header(header)
        line = next(_line_generator(fh))
        _validate_line(line, seq_len)
    except (StopIteration, PhylipFormatError):
Beispiel #31
0
import re

import numpy as np

from skbio.io import create_format, FASTQFormatError
from skbio.io.format._base import (
    _decode_qual_to_phred, _encode_phred_to_qual, _get_nth_sequence,
    _parse_fasta_like_header, _format_fasta_like_records, _line_generator,
    _too_many_blanks)
from skbio.alignment import SequenceCollection, Alignment
from skbio.sequence import Sequence, DNA, RNA, Protein

_whitespace_regex = re.compile(r'\s')


fastq = create_format('fastq')


@fastq.sniffer()
def _fastq_sniffer(fh):
    # Strategy:
    #   Ignore up to 5 blank/whitespace-only lines at the beginning of the
    #   file. Read up to 10 records. If at least one record is read (i.e. the
    #   file isn't empty) and the quality scores are in printable ASCII range,
    #   assume the file is FASTQ.
    if _too_many_blanks(fh, 5):
        return False, {}

    try:
        not_empty = False
        for _ in zip(range(10), _fastq_to_generator(fh, phred_offset=33)):
Beispiel #32
0
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import pandas as pd

from skbio.io import create_format, BLAST7FormatError
from skbio.io.format._blast import _parse_blast_data

blast7 = create_format('blast+7')

column_converter = {'query id': 'qseqid', 'query gi': 'qgi',
                    'query acc.': 'qacc', 'query acc.ver': 'qaccver',
                    'query length': 'qlen', 'subject id': 'sseqid',
                    'subject ids': 'sallseqid', 'subject gi': 'sgi',
                    'subject gis': 'sallgi', 'subject acc.': 'sacc',
                    'subject acc.ver': 'saccver', 'subject accs.': 'sallacc',
                    'subject length': 'slen', 'q. start': 'qstart',
                    'q. end': 'qend', 's. start': 'sstart', 's. end': 'send',
                    'query seq': 'qseq', 'subject seq': 'sseq',
                    'evalue': 'evalue', 'bit score': 'bitscore',
                    'score': 'score', 'alignment length': 'length',
                    '% identity': 'pident', 'identical': 'nident',
                    'mismatches': 'mismatch', 'positives': 'positive',
                    'gap opens': 'gapopen', 'gaps': 'gaps',
Beispiel #33
0
import re

import numpy as np

from skbio.io import create_format, FASTQFormatError
from skbio.io.format._base import (_decode_qual_to_phred,
                                   _encode_phred_to_qual, _get_nth_sequence,
                                   _parse_fasta_like_header,
                                   _format_fasta_like_records, _line_generator,
                                   _too_many_blanks)
from skbio.alignment import SequenceCollection, Alignment
from skbio.sequence import Sequence, DNA, RNA, Protein

# Precompiled pattern that matches a single whitespace character.
_whitespace_regex = re.compile(r'\s')

# Format object for 'fastq'; its ``sniffer()`` decorator is applied to
# ``_fastq_sniffer`` below.
fastq = create_format('fastq')


@fastq.sniffer()
def _fastq_sniffer(fh):
    # Strategy:
    #   Ignore up to 5 blank/whitespace-only lines at the beginning of the
    #   file. Read up to 10 records. If at least one record is read (i.e. the
    #   file isn't empty) and the quality scores are in printable ASCII range,
    #   assume the file is FASTQ.
    if _too_many_blanks(fh, 5):
        return False, {}

    try:
        not_empty = False
        for _ in zip(range(10), _fastq_to_generator(fh, phred_offset=33)):
Beispiel #34
0
.. [2] http://evolution.genetics.washington.edu/phylip/newicktree.html

"""

# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

from skbio.io import create_format, NewickFormatError
from skbio.tree import TreeNode

newick = create_format('newick')


@newick.sniffer()
def _newick_sniffer(fh):
    # Strategy:
    #   The following conditions preclude a file from being newick:
    #       * It is an empty file.
    #       * There is whitespace inside of a label (handled by tokenizer)
    #       * : is followed by anything that is an operator
    #       * ( is not preceded immediately by , or another (
    #       * The parens are unbalanced when ; is found.
    #   If 100 tokens (or fewer if EOF occurs earlier) pass these checks,
    #   then it is probably newick, or at least we can't prove it isn't.
    operators = set(",;:()")
    empty = True
Beispiel #35
0
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

import csv

import numpy as np

from skbio.stats.distance import DissimilarityMatrix, DistanceMatrix
from skbio.io import create_format, LSMatFormatError


lsmat = create_format('lsmat')


@lsmat.sniffer()
def _lsmat_sniffer(fh):
    header = _find_header(fh)

    if header is not None:
        try:
            dialect = csv.Sniffer().sniff(header)
            delimiter = dialect.delimiter

            ids = _parse_header(header, delimiter)
            first_id, _ = next(_parse_data(fh, delimiter), (None, None))

            if first_id is not None and first_id == ids[0]:
Beispiel #36
0
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

import re
import numpy as np
import pandas as pd
from functools import partial

from skbio.io import create_format, GenBankFormatError
from skbio.io.format._base import (
    _get_nth_sequence, _line_generator, _too_many_blanks)
from skbio.util._misc import chunk_str
from skbio.sequence import Sequence, DNA, RNA, Protein


genbank = create_format('genbank')

# Section keywords of a genbank file. The order of this list matters: it is
# used both when reading and when writing genbank files.
_HEADERS = [
    'LOCUS',
    'DEFINITION',
    'ACCESSION',
    'VERSION',
    'DBSOURCE',
    'DBLINK',
    'KEYWORDS',
    'SOURCE',
    'REFERENCE',
    'COMMENT',
    'FEATURES',
    'ORIGIN',
]
Beispiel #37
0
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from skbio.io import create_format, ClustalFormatError
from skbio.sequence import Sequence
from skbio.alignment import Alignment


clustal = create_format('clustal')


def _label_line_parser(record, strict=True):
    """Returns dict mapping list of data to labels, plus list with field order.

    Field order contains labels in order encountered in file.

    NOTE: doesn't care if lines are out of order in different blocks. This
    should never happen anyway, but it's possible that this behavior should
    be changed to tighten up validation.
    """
    labels = []
    result = {}
    for line in record:
        split_line = line.strip().rsplit(None, 1)
Beispiel #38
0
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from future.builtins import zip, range

from skbio.io import create_format, QSeqFormatError
from skbio.io.format._base import _decode_qual_to_phred, _get_nth_sequence
from skbio.alignment import SequenceCollection
from skbio.sequence import Sequence, DNA, RNA, Protein

# Module-level default settings.
# NOTE(review): presumably these serve as default argument values for the
# qseq readers defined later in this module -- confirm against the readers.
_default_phred_offset = None
_default_variant = None
_will_filter = True

# Format object for 'qseq'; its ``sniffer()`` and ``reader()`` decorators are
# applied to the functions below.
qseq = create_format('qseq')


@qseq.sniffer()
def _qseq_sniffer(fh):
    """Guess whether ``fh`` contains qseq data.

    At most the first 10 lines of ``fh`` are passed to ``_record_parser``.

    Returns
    -------
    tuple of (bool, dict)
        ``True`` if at least one line was read and no examined line made
        ``_record_parser`` raise ``QSeqFormatError``; ``False`` otherwise.
        The dict is always empty.
    """
    parsed_any = False
    try:
        # zip() against range(10) caps the number of lines examined.
        for _, raw_line in zip(range(10), fh):
            _record_parser(raw_line)
            parsed_any = True
    except QSeqFormatError:
        # A single malformed line rules the format out.
        return False, {}
    return parsed_any, {}


@qseq.reader(None)
Beispiel #39
0
+------+------+---------------------------------------------------------------+
|Yes   |No    |generator of :mod:`skbio.sequence.Sequence` objects            |
+------+------+---------------------------------------------------------------+


Reference
---------
.. [#] https://samtools.github.io/hts-specs/SAMv1.pdf
'''

from skbio.io import create_format
from skbio.sequence import Sequence, DNA, RNA, Protein
from skbio.io.format._base import (_line_generator, _get_nth_sequence,
                                   _too_many_blanks)

sam = create_format('sam')

# Alignment headers: each entry is preceded by a short description of the
# field it names.
_REQUIRED_FIELDS = [
    # Query template NAME.
    'QNAME',
    # Combination of bitwise FLAGs
    'FLAG',
    # Reference sequence NAME of the alignment
    'RNAME',
    # 1-based leftmost mapping position of the first base
    'POS',
    # Mapping quality. -10log10(P_err).
    'MAPQ',
    # CIGAR string
    'CIGAR',
    # Reference sequence name of the primary alignment of NEXT
    'RNEXT',
    # Position of the primary alignment of the NEXT read
    'PNEXT',
    # signed observed template length
    'TLEN',
    # segment sequence
    'SEQ',
    # ASCII of base quality
    'QUAL',
]