예제 #1
0
    def __init__(self, target, mode="w"):
        """Set up the writer on an open stream or on a file path.

        Arguments:
         - target - output stream, or a path to the file to write
         - mode - "w" for text output (default) or "wb" for binary output

        ``target`` is probed by writing an empty string (or empty bytes for
        "wb") to it. If that works, ``target`` itself becomes the handle. A
        TypeError from the probe is reported as a StreamModeError (the stream
        was opened in the wrong mode); an AttributeError is taken to mean
        that ``target`` is a path, which is then opened with ``mode``.

        Any other ``mode`` raises RuntimeError.
        """
        if mode == "w":
            probe = ""
            complaint = "File must be opened in text mode."
        elif mode == "wb":
            probe = b""
            complaint = "File must be opened in binary mode."
        else:
            raise RuntimeError("Unknown mode '%s'" % mode)

        try:
            target.write(probe)
        except TypeError:
            # the stream refused the probe: it has the other text/binary mode
            raise StreamModeError(complaint) from None
        except AttributeError:
            # not a stream, so treat target as a path
            opened = open(target, mode)
        else:
            opened = target

        self._target = target
        self.handle = opened
예제 #2
0
    def __init__(self, target, mode="w"):
        """Set up the writer, optionally without any output stream.

        Arguments:
         - target - output stream, a path to the file to write, or None
         - mode - "w" for text output (default) or "wb" for binary output

        When ``target`` is None the writer is only used to format strings;
        no stream is opened and ``mode`` is not inspected.

        Otherwise ``target`` is probed by writing an empty string (or empty
        bytes for "wb") to it. If that works, ``target`` itself becomes the
        stream. A TypeError from the probe is reported as a StreamModeError;
        an AttributeError is taken to mean that ``target`` is a path, which
        is then opened with ``mode``. Any other ``mode`` raises RuntimeError.
        """
        if target is None:
            # formatting-only writer: nothing to open or validate
            self._target = target
            return

        if mode == "w":
            probe = ""
            complaint = "File must be opened in text mode."
        elif mode == "wb":
            probe = b""
            complaint = "File must be opened in binary mode."
        else:
            raise RuntimeError("Unknown mode '%s'" % mode)

        try:
            target.write(probe)
        except TypeError:
            # the stream refused the probe: it has the other text/binary mode
            raise StreamModeError(complaint) from None
        except AttributeError:
            # not a stream, so treat target as a path
            opened = open(target, mode)
        else:
            opened = target

        self.stream = opened
        self._target = target
예제 #3
0
def SimpleFastaParser(source):
    """Yield (title, sequence) string pairs from FASTA formatted input.

    Arguments:
     - source - input stream opened in text mode, or a path to a file

    The title is the whole '>' line with the leading '>' and any trailing
    whitespace removed; it is not split into an identifier and a
    description. The sequence is the concatenation of the following lines
    with trailing whitespace, spaces and carriage returns removed.

    Anything before the first '>' line is skipped. A file opened here from a
    path is closed when the generator finishes; a stream passed in by the
    caller is left open.

    >>> with open("Fasta/dups.fasta") as handle:
    ...     for values in SimpleFastaParser(handle):
    ...         print(values)
    ...
    ('alpha', 'ACGTA')
    ('beta', 'CGTC')
    ('gamma', 'CCGCC')
    ('alpha (again - this is a duplicate entry to test the indexing code)', 'ACGTA')
    ('delta', 'CGCGC')

    """
    try:
        stream = open(source)
    except TypeError:
        # open() rejected it as a path, so assume an already open stream
        stream = source
        if stream.read(0) != "":
            raise StreamModeError(
                "Fasta files must be opened in text mode") from None

    try:
        # Look for the first record, ignoring anything in front of it
        # (e.g. blank lines, comments)
        title = None
        for line in stream:
            if line[0] == ">":
                title = line[1:].rstrip()
                break
        if title is None:
            # never saw a '>' line - probably an empty file
            return

        # Collect sequence lines until the next title line. Trailing
        # whitespace is dropped per line; internal spaces and any embedded
        # \r (possible in mangled files not read in universal newlines
        # mode) are dropped when the record is emitted.
        pieces = []
        for line in stream:
            if line[0] != ">":
                pieces.append(line.rstrip())
                continue
            yield title, "".join(pieces).replace(" ", "").replace("\r", "")
            title = line[1:].rstrip()
            pieces = []

        # the last record has no following '>' line to trigger it
        yield title, "".join(pieces).replace(" ", "").replace("\r", "")

    finally:
        if stream is not source:
            stream.close()
예제 #4
0
    def __init__(self, source, alphabet=None, mode="t", fmt=None):
        """Create a SequenceIterator object.

        Arguments:
        - source - input file stream, or path to input file
        - alphabet - no longer used, should be None
        - mode - "t" if the parser needs text input (default), "b" if it
          needs binary input
        - fmt - format name, only used in the StreamModeError message

        If ``source`` can be opened as a path, the file is opened here in
        mode ``"r" + mode`` and will be closed by this object. Otherwise
        ``source`` is assumed to be a stream; ``source.read(0)`` is used to
        verify that it was opened in the requested mode, and the caller
        keeps responsibility for closing it.

        Raises ValueError if ``alphabet`` is not None or if ``mode`` is
        neither "t" nor "b" for a stream, and StreamModeError if a stream
        was opened in the wrong mode.

        This method MAY be overridden by any subclass.

        Note when subclassing:
        - there should be a single non-optional argument, the source.
        - you do not have to require an alphabet.
        - you can add additional optional arguments.
        """
        if alphabet is not None:
            raise ValueError("The alphabet argument is no longer supported")
        try:
            self.stream = open(source, "r" + mode)
            self.should_close_stream = True
        except TypeError:  # not a path, assume we received a stream
            if mode == "t":
                if source.read(0) != "":
                    raise StreamModeError(
                        "%s files must be opened in text mode." % fmt
                    ) from None
            elif mode == "b":
                if source.read(0) != b"":
                    raise StreamModeError(
                        "%s files must be opened in binary mode." % fmt
                    ) from None
            else:
                # "from None" keeps the TypeError raised by open() out of
                # the traceback, like the StreamModeError cases above.
                raise ValueError("Unknown mode '%s'" % mode) from None
            self.stream = source
            self.should_close_stream = False
        try:
            self.records = self.parse(self.stream)
        except Exception:
            # do not leak a file we opened ourselves if parsing setup fails
            if self.should_close_stream:
                self.stream.close()
            raise
예제 #5
0
파일: IgIO.py 프로젝트: wenh06/biopython
def IgIterator(source, alphabet=single_letter_alphabet):
    """Yield the records of an IntelliGenetics file (as SeqRecord objects).

    Arguments:
     - source - file-like object opened in text mode, or a path to a file
     - alphabet - optional alphabet

    The optional free format file header lines (which start with two
    semi-colons) are ignored.

    The free format commentary lines at the start of each record (which
    start with a semi-colon) are recorded as a single string with embedded
    new line characters in the SeqRecord's annotations dictionary under the
    key 'comment'.

    A file opened here from a path is closed when the generator finishes; a
    stream passed in by the caller is left open.

    Examples
    --------
    >>> with open("IntelliGenetics/TAT_mase_nuc.txt") as handle:
    ...     for record in IgIterator(handle):
    ...         print("%s length %i" % (record.id, len(record)))
    ...
    A_U455 length 303
    B_HXB2R length 306
    C_UG268A length 267
    D_ELI length 309
    F_BZ163A length 309
    O_ANT70 length 342
    O_MVP5180 length 348
    CPZGAB length 309
    CPZANT length 309
    A_ROD length 390
    B_EHOA length 420
    D_MM251 length 390
    STM_STM length 387
    VER_AGM3 length 354
    GRI_AGM677 length 264
    SAB_SAB1C length 219
    SYK_SYK length 330

    """
    try:
        stream = open(source)
    except TypeError:
        # open() rejected it as a path, so assume an already open stream
        stream = source
        if stream.read(0) != "":
            raise StreamModeError(
                "IntelliGenetics files must be opened in text mode.") from None

    try:
        # the actual parsing is delegated to the module level helper
        yield from _parse(stream, alphabet)
    finally:
        if stream is not source:
            stream.close()
예제 #6
0
    def __init__(self, source, mode="t", fmt=None):
        """Create an AlignmentIterator object.

        Arguments:
        - source - input file stream, or path to input file
        - mode - "t" if the parser needs text input (default), "b" if it
          needs binary input
        - fmt - format name, only used in the StreamModeError message

        If ``source`` can be opened as a path, the file is opened here in
        mode ``"r" + mode`` and will be closed by this object. Otherwise
        ``source`` is assumed to be a stream whose mode is verified with
        ``source.read(0)``; the caller remains responsible for closing it.

        This method MAY be overridden by any subclass.

        Note when subclassing:
        - there should be a single non-optional argument, the source.
        - you can add additional optional arguments.
        """
        try:
            self.stream = open(source, "r" + mode)
            self.should_close_stream = True
        except TypeError:  # not a path, assume we received a stream
            # Work out what an empty read must return for the requested mode
            if mode == "t":
                expected = ""
                template = "%s files must be opened in text mode."
            elif mode == "b":
                expected = b""
                template = "%s files must be opened in binary mode."
            else:
                raise ValueError("Unknown mode '%s'" % mode) from None
            if source.read(0) != expected:
                raise StreamModeError(template % fmt) from None
            self.stream = source
            self.should_close_stream = False

        try:
            self.alignments = self.parse(self.stream)
        except Exception:
            # do not leak a file we opened ourselves if parsing setup fails
            if self.should_close_stream:
                self.stream.close()
            raise
예제 #7
0
 def __init__(self, stream_or_path, namespace=None):
     """Create the object, set up the XML parser and read the file header.

     ``stream_or_path`` is either a path, which is opened here in binary
     mode and closed again if reading the header fails, or a stream that
     the caller opened in binary mode. ``namespace`` is accepted but not
     used in this method.

     The input is fed to the SAX parser block by block until the content
     handler reports the seqXML version; the header attributes found by
     the content handler are then copied onto this object.

     Raises StreamModeError for a stream opened in text mode, and
     ValueError if the input ends before the seqXML version was seen.
     """
     self.parser = sax.make_parser()
     xml_handler = ContentHandler()
     self.parser.setContentHandler(xml_handler)
     self.parser.setFeature(handler.feature_namespaces, True)
     try:
         opened = open(stream_or_path, "rb")
     except TypeError:  # not a path, assume we received a stream
         # Make sure we got a binary handle. If we got a text handle, then
         # the parser will still run but unicode characters will be garbled
         # if the text handle was opened with a different encoding than the
         # one specified in the XML file. With a binary handle, the correct
         # encoding is picked up by the parser from the XML file.
         if stream_or_path.read(0) != b"":
             raise StreamModeError(
                 "SeqXML files should be opened in binary mode") from None
         self.handle, self.should_close_handle = stream_or_path, False
     else:  # we received a path
         self.handle, self.should_close_handle = opened, True

     # Keep feeding blocks to the parser until the seqXML element with
     # the seqXMLversion has been seen
     block_size = self.BLOCK
     try:
         version = None
         while version is None:
             chunk = self.handle.read(block_size)
             if not chunk:
                 # ran out of input before finding the version
                 if xml_handler.startElementNS is None:
                     raise ValueError("Empty file.")
                 raise ValueError("XML file contains no data.")
             self.parser.feed(chunk)
             version = xml_handler.seqXMLversion
     except Exception:
         if self.should_close_handle:
             self.handle.close()
         raise

     self.seqXMLversion = version
     self.source = xml_handler.source
     self.sourceVersion = xml_handler.sourceVersion
     self.ncbiTaxID = xml_handler.ncbiTaxID
     self.speciesName = xml_handler.speciesName
예제 #8
0
def SnapGeneIterator(source):
    """Parse a SnapGene file and yield its single SeqRecord.

    Argument source is a file-like object opened in binary mode, or a path
    to a file.

    The first packet must be the SnapGene cookie packet (type 0x09). Every
    later packet is passed to the handler registered for its type; packets
    without a registered handler are skipped.

    Note that a SnapGene file can only contain one sequence, so this
    iterator will always return a single record. The whole file is read,
    and a file opened here is closed, before that record is yielded.
    """
    try:
        stream = open(source, "rb")
    except TypeError:
        # open() rejected it as a path, so assume an already open stream
        stream = source
        if stream.read(0) != b"":
            raise StreamModeError(
                "SnapGene files must be opened in binary mode.") from None

    record = SeqRecord(None)

    try:
        packets = _iterate(stream)
        try:
            first_type, first_length, first_data = next(packets)
        except StopIteration:
            raise ValueError("Empty file.") from None

        if first_type != 0x09:
            raise ValueError(
                "The file does not start with a SnapGene cookie packet")
        _parse_cookie_packet(first_length, first_data, record)

        for packet_type, length, data in packets:
            parse_packet = _packet_handlers.get(packet_type)
            if parse_packet is None:
                # no handler registered for this packet type
                continue
            parse_packet(length, data, record)

    finally:
        if stream is not source:
            stream.close()

    if not record.seq:
        raise ValueError("No DNA packet in file")

    yield record
예제 #9
0
파일: GckIO.py 프로젝트: wenh06/biopython
def GckIterator(source):
    """Parse a GCK file and yield its SeqRecord.

    Argument source is a file-like object opened in binary mode, or a path
    to a file.

    Note that a GCK file can only contain one sequence, so this
    iterator will always return a single record.

    A file opened here from a path is closed when the generator finishes; a
    stream passed in by the caller is left open.
    """
    try:
        stream = open(source, "rb")
    except TypeError:
        # open() rejected it as a path, so assume an already open stream
        stream = source
        if stream.read(0) != b"":
            raise StreamModeError(
                "GCK files must be opened in binary mode.") from None

    opened_here = stream is not source
    try:
        # the actual parsing is delegated to the module level helper
        yield from _parse(stream)
    finally:
        if opened_here:
            stream.close()
예제 #10
0
def FastaTwoLineParser(source):
    """Yield (title, sequence) string pairs from strict two-line FASTA input.

    Arguments:
     - source - input stream opened in text mode, or a path to a file

    Functionally the same as SimpleFastaParser but with a strict
    interpretation of the FASTA format as exactly two lines per
    record, the greater-than-sign identifier with description,
    and the sequence with no line wrapping.

    Any line wrapping will raise an exception, as will excess blank
    lines (other than the special case of a zero-length sequence
    as the second line of a record).

    Examples
    --------
    This file uses two lines per FASTA record:

    >>> with open("Fasta/aster_no_wrap.pro") as handle:
    ...     for title, seq in FastaTwoLineParser(handle):
    ...         print("%s = %s..." % (title, seq[:3]))
    ...
    gi|3298468|dbj|BAA31520.1| SAMIPF = GGH...

    This equivalent file uses line wrapping:

    >>> with open("Fasta/aster.pro") as handle:
    ...     for title, seq in FastaTwoLineParser(handle):
    ...         print("%s = %s..." % (title, seq[:3]))
    ...
    Traceback (most recent call last):
       ...
    ValueError: Expected FASTA record starting with '>' character. Perhaps this file is using FASTA line wrapping? Got: 'MTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI'

    """
    try:
        stream = open(source)
    except TypeError:
        # open() rejected it as a path, so assume an already open stream
        stream = source
        if stream.read(0) != "":
            raise StreamModeError("Fasta files must be opened in text mode") from None

    read_any_line = False
    expecting_title = True  # lines alternate: title, sequence, title, ...
    try:
        for line in stream:
            read_any_line = True
            if expecting_title:
                if line[0] != ">":
                    raise ValueError(
                        "Expected FASTA record starting with '>' character. "
                        "Perhaps this file is using FASTA line wrapping? "
                        f"Got: '{line}'"
                    )
                title = line[1:].rstrip()
            else:
                if line[0] == ">":
                    raise ValueError(
                        "Two '>' FASTA lines in a row. Missing sequence line "
                        "if this is strict two-line-per-record FASTA format. "
                        f"Have '>{title}' and '{line}'"
                    )
                yield title, line.strip()
            expecting_title = not expecting_title

        if not expecting_title:
            # the input stopped right after a title line
            raise ValueError(
                "Missing sequence line at end of file if this is strict "
                f"two-line-per-record FASTA format. Have title line '{line}'"
            )
        elif read_any_line:
            # the input stopped after a sequence line, which was checked above
            assert line[0] != ">", "line[0] == '>' ; this should be impossible!"
        # otherwise the file was empty and there is nothing to report
    finally:
        if stream is not source:
            stream.close()
예제 #11
0
파일: XdnaIO.py 프로젝트: wenh06/biopython
def XdnaIterator(source):
    """Parse a Xdna file and yield its single SeqRecord.

    Argument source is a file-like object in binary mode or a path to a file.

    The fixed 112 byte header is unpacked and checked, then the sequence
    and the comment are read and decoded as ASCII. The first
    space-separated word of the comment is used as both the record name
    and id. If any data follows the comment, the two overhang entries are
    skipped and the features are added to the record.

    Note that this is an "iterator" in name only since an Xdna file always
    contain a single sequence.

    """
    try:
        stream = open(source, "rb")
    except TypeError:
        # open() rejected it as a path, so assume an already open stream
        stream = source
        if stream.read(0) != b"":
            raise StreamModeError(
                "Xdna files must be opened in binary mode.") from None

    try:
        # Parse fixed-size header and do some rudimentary checks
        header = stream.read(112)
        if not header:
            raise ValueError("Empty file.")
        if len(header) < 112:
            raise ValueError(
                "Improper header, cannot read 112 bytes from handle")

        # The "neg_length" value is the length of the part of the sequence
        # before the nucleotide considered as the "origin" (nucleotide
        # number 1, which in DNA Strider is not always the first
        # nucleotide). Biopython's SeqRecord has no such concept of a
        # sequence origin as far as I know, so we ignore that value.
        # SerialCloner has no such concept either and always generates
        # files with a neg_length of zero.
        fields = unpack(">BBB25xII60xI12x", header)
        version, seq_type, topology, length, _neg_length, com_length = fields
        if version != 0:
            raise ValueError("Unsupported XDNA version")
        if seq_type not in _seq_types:
            raise ValueError("Unknown sequence type")

        # Read actual sequence and comment found in all XDNA files
        sequence = _read(stream, length).decode("ASCII")
        comment = _read(stream, com_length).decode("ASCII")

        # Try to derive a name from the first "word" of the comment
        name = comment.partition(" ")[0]

        # Create record object
        record = SeqRecord(
            Seq(sequence, _seq_types[seq_type]),
            description=comment,
            name=name,
            id=name,
        )
        if topology in _seq_topologies:
            record.annotations["topology"] = _seq_topologies[topology]

        has_annotation_section = len(stream.read(1)) == 1
        if has_annotation_section:
            # This is an XDNA file with an optional annotation section.

            # Skip the overhangs as I don't know how to represent
            # them in the SeqRecord model.
            _read_overhang(stream)  # right-side overhang
            _read_overhang(stream)  # left-side overhang

            # Read the features; their count is stored in a single byte
            num_features = unpack(">B", _read(stream, 1))[0]
            for _ in range(num_features):
                _read_feature(stream, record)

        yield record

    finally:
        if stream is not source:
            stream.close()
예제 #12
0
파일: NibIO.py 프로젝트: wenh06/biopython
def NibIterator(source, alphabet=None):
    """Read a nib file and yield its single SeqRecord.

        - source - a file-like object or a path to a file in the nib file
          format as defined by UCSC; the file must be opened in binary mode.
        - alphabet - must be None; any other value raises ValueError.

    The file starts with a 4 byte signature that also reveals the byte
    order, followed by a 4 byte sequence length and then the sequence data,
    where each hexadecimal digit of the data encodes one nucleotide.

    Note that a nib file always contains only one sequence record.
    The sequence of the resulting SeqRecord object should match the sequence
    generated by Jim Kent's nibFrag utility run with the -masked option.

    This function is used internally via the Bio.SeqIO functions:

    >>> from Bio import SeqIO
    >>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib")
    >>> print("%s %i" % (record.seq, len(record)))
    nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50

    You can also call it directly:

    >>> with open("Nib/test_even_bigendian.nib", "rb") as handle:
    ...     for record in NibIterator(handle):
    ...         print("%s %i" % (record.seq, len(record)))
    ...
    nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50

    """
    if alphabet is not None:
        raise ValueError("Alphabets are ignored.")

    try:
        stream = open(source, "rb")
    except TypeError:
        # open() rejected it as a path, so assume an already open stream
        stream = source
        if stream.read(0) != b"":
            raise StreamModeError(
                "nib files must be opened in binary mode.") from None

    try:
        magic = stream.read(4)
        if not magic:
            raise ValueError("Empty file.")
        # the order of the signature bytes tells us the byte order
        byteorder = {
            "3a3de96b": "little",  # little-endian
            "6be93d3a": "big",  # big-endian
        }.get(magic.hex())
        if byteorder is None:
            raise ValueError("unexpected signature in nib header")

        length = int.from_bytes(stream.read(4), byteorder)
        indices = stream.read().hex()
        # Two hex digits per byte: an odd length leaves one padding digit
        if len(indices) != length + length % 2:
            raise ValueError("Unexpected file size")
        indices = indices[:length]

        if set(indices) - set("0123489abc"):
            raise ValueError("Unexpected sequence data found in file")
        nucleotides = indices.translate(
            str.maketrans("0123489abc", "TCAGNtcagn"))
        yield SeqRecord(Seq(nucleotides))
    finally:
        if stream is not source:
            stream.close()
예제 #13
0
def PirIterator(source):
    """Iterate over a PIR file and yield SeqRecord objects.

    Arguments:
     - source - file-like object opened in text mode, or a path to a file

    Anything before the first line starting with '>' is skipped. Each
    record consists of a '>XX;identifier' line (XX being the sequence
    type), a description line, and sequence lines ending with a '*'
    terminator, which is stripped from the sequence. The sequence type is
    stored in the record's annotations under the key 'PIR-type'.

    Raises StreamModeError if a stream opened in binary mode is given, and
    ValueError if a record header is malformed or a sequence has no '*'
    terminator (including a record without any sequence lines).

    A file opened here from a path is closed when the generator finishes; a
    stream passed in by the caller is left open.

    Examples
    --------
    >>> with open("NBRF/DMB_prot.pir") as handle:
    ...    for record in PirIterator(handle):
    ...        print("%s length %i" % (record.id, len(record)))
    HLA:HLA00489 length 263
    HLA:HLA00490 length 94
    HLA:HLA00491 length 94
    HLA:HLA00492 length 80
    HLA:HLA00493 length 175
    HLA:HLA01083 length 188

    """
    try:
        handle = open(source)
    except TypeError:
        # open() rejected it as a path, so assume an already open stream
        handle = source
        if handle.read(0) != "":
            # PIR is parsed as text, so the complaint must name text mode
            raise StreamModeError(
                "PIR files must be opened in text mode.") from None

    try:
        # Skip any text before the first record (e.g. blank lines, comments)
        for line in handle:
            if line[0] == ">":
                break
        else:
            return  # Premature end of file, or just empty?

        # Here "line" always holds the header line of the next record
        while True:
            pir_type = line[1:3]
            # Slice rather than index so that a truncated header is reported
            # as a format error instead of an IndexError
            if pir_type not in _pir_alphabets or line[3:4] != ";":
                raise ValueError(
                    "Records should start with '>XX;' where XX is a valid sequence type"
                )
            identifier = line[4:].strip()
            description = handle.readline().strip()

            lines = []
            for line in handle:
                if line[0] == ">":
                    break
                # Remove trailing whitespace, and any internal spaces
                lines.append(line.rstrip().replace(" ", ""))
            else:
                # reached the end of the file: this is the last record
                line = None
            seq = "".join(lines)
            # endswith also copes with a record that has no sequence at all
            if not seq.endswith("*"):
                # Note the * terminator is present on nucleotide sequences too,
                # it is not a stop codon!
                raise ValueError(
                    "Sequences in PIR files should include a * terminator!")

            # Return the record and then continue...
            record = SeqRecord(
                Seq(seq[:-1], _pir_alphabets[pir_type]),
                id=identifier,
                name=identifier,
                description=description,
            )
            record.annotations["PIR-type"] = pir_type
            yield record

            if line is None:
                return  # StopIteration
    finally:
        if handle is not source:
            handle.close()
예제 #14
0
def PdbSeqresIterator(source):
    """Yield one SeqRecord per chain found in the header of a PDB file.

    Arguments:
     - source - input stream opened in text mode, or a path to a file

    The sequences are derived from the SEQRES lines in the
    PDB file header, not the atoms of the 3D structure.

    Only the SEQRES and DBREF records are read here; SEQADV and MODRES are
    not handled yet.

    See: http://www.wwpdb.org/documentation/format23/sect3.html

    This gets called internally via Bio.SeqIO for the SEQRES based interpretation
    of the PDB file format:

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("PDB/1A8O.pdb", "pdb-seqres"):
    ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...     print(record.dbxrefs)
    ...
    Record id 1A8O:A, chain A
    ['UNP:P12497', 'UNP:POL_HV1N5']

    Equivalently,

    >>> with open("PDB/1A8O.pdb") as handle:
    ...     for record in PdbSeqresIterator(handle):
    ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...         print(record.dbxrefs)
    ...
    Record id 1A8O:A, chain A
    ['UNP:P12497', 'UNP:POL_HV1N5']

    Note the chain is recorded in the annotations dictionary, and any PDB DBREF
    lines are recorded in the database cross-references list.
    """
    # Late-binding import to avoid circular dependency on SeqIO in Bio.SeqUtils
    from Bio.SeqUtils import seq1

    residues_by_chain = collections.defaultdict(list)
    dbrefs_by_chain = collections.defaultdict(list)
    try:
        stream = open(source)
    except TypeError:
        # open() rejected it as a path, so assume an already open stream
        stream = source
        if stream.read(0) != "":
            raise StreamModeError("PDB files must be opened in text mode.") from None

    try:
        # First pass: collect residues and database references per chain
        rec_name = None
        for line in stream:
            rec_name = line[0:6].strip()
            if rec_name == "SEQRES":
                # Column layout of a SEQRES line (only the chain identifier
                # and the residues are needed here):
                #   line[8:10]   serial number of the SEQRES record for the
                #                current chain; starts at 1, increments by
                #                one each line, reset to 1 for each chain
                #   line[11]     chain identifier; any single legal
                #                character, including a blank which is used
                #                if there is only one chain
                #   line[13:17]  number of residues in the chain (repeated
                #                on every record)
                #   line[19:]    the residue names
                chain_id = line[11]
                one_letter_codes = [
                    seq1(name, custom_map=protein_letters_3to1)
                    for name in line[19:].split()
                ]
                residues_by_chain[chain_id].extend(one_letter_codes)
            elif rec_name == "DBREF":
                # Column layout of a DBREF line (unused fields are listed
                # for reference only):
                #   line[7:11]   ID code of this entry (PDB ID)
                #   line[12]     chain identifier
                #   line[14:18]  initial sequence number of the PDB
                #                sequence segment
                #   line[18]     initial insertion code of that segment
                #   line[20:24]  ending sequence number of that segment
                #   line[24]     ending insertion code of that segment
                #   line[26:32]  sequence database name
                #   line[33:41]  sequence database accession code
                #   line[42:54]  sequence database identification code
                #   line[55:60]  initial sequence number of the database
                #                segment
                #   line[60]     insertion code of the initial residue of
                #                the segment, if PDB is the reference
                #   line[62:67]  ending sequence number of the database
                #                segment
                #   line[67]     insertion code of the ending residue of
                #                the segment, if PDB is the reference
                pdb_id = line[7:11]
                chain_id = line[12]
                reference = {
                    "pdb_id": pdb_id,
                    "database": line[26:32].strip(),
                    "db_acc": line[33:41].strip(),
                    "db_id_code": line[42:54].strip(),
                }
                dbrefs_by_chain[chain_id].append(reference)
            # ENH: 'SEQADV' 'MODRES'

        if rec_name is None:
            # the loop body never ran
            raise ValueError("Empty file.")

        # Second pass: build one record per chain, in sorted chain order
        for chain_id, residues in sorted(residues_by_chain.items()):
            record = SeqRecord(Seq("".join(residues), generic_protein))
            record.annotations = {"chain": chain_id}
            if chain_id not in dbrefs_by_chain:
                # no DBREF line for this chain, fall back to the chain id
                record.id = chain_id
                yield record
                continue

            references = dbrefs_by_chain[chain_id]
            # id, name and description come from the first DBREF line
            first = references[0]
            record.id = record.name = "%s:%s" % (first["pdb_id"], chain_id)
            record.description = "%s:%s %s" % (
                first["database"],
                first["db_acc"],
                first["db_id_code"],
            )
            for ref in references:
                record.dbxrefs.extend(
                    [
                        "%s:%s" % (ref["database"], ref["db_acc"]),
                        "%s:%s" % (ref["database"], ref["db_id_code"]),
                    ]
                )
            yield record
    finally:
        if stream is not source:
            stream.close()
예제 #15
0
def AbiIterator(source, alphabet=None, trim=False):
    """Return an iterator for the Abi file format.

    Arguments:
     - source - file-like object opened in binary mode, or a path to a file
     - alphabet - optional alphabet; protein and RNA alphabets are rejected.
       If None, ambiguous_dna is used when the base-called sequence contains
       any of the letters "KYWMRS", otherwise unambiguous_dna.
     - trim - if true, the record is passed through _abi_trim before being
       yielded (never done for files without PBAS1/PBAS2 tags, which are
       treated as fsa files)

    A single SeqRecord is yielded. All tag data is kept in the annotations
    under the key "abif_raw".

    Raises StreamModeError for a stream opened in text mode, ValueError for
    an empty file or an unsupported alphabet, and OSError if the file does
    not start with the b"ABIF" marker.

    A file opened here from a path is closed when the generator finishes; a
    stream passed in by the caller is left open.
    """
    # raise an exception if the alphabet is a protein or RNA alphabet
    if alphabet is not None:
        if isinstance(Alphabet._get_base_alphabet(alphabet), Alphabet.ProteinAlphabet):
            raise ValueError("Invalid alphabet, ABI files do not hold proteins.")
        if isinstance(Alphabet._get_base_alphabet(alphabet), Alphabet.RNAAlphabet):
            raise ValueError("Invalid alphabet, ABI files do not hold RNA.")

    try:
        handle = open(source, "rb")
    except TypeError:
        # open() rejected it as a path, so assume an already open stream
        handle = source
        if handle.read(0) != b"":
            raise StreamModeError("ABI files must be opened in binary mode.") from None

    try:

        # check if input file is a valid Abi file
        marker = handle.read(4)
        if not marker:
            # handle empty file gracefully
            raise ValueError("Empty file.")

        if marker != b"ABIF":
            raise OSError("File should start ABIF, not %r" % marker)

        # dirty hack for handling time information
        times = {"RUND1": "", "RUND2": "", "RUNT1": "", "RUNT2": ""}

        # initialize annotations: every name in _EXTRACT starts out as None
        annot = dict(zip(_EXTRACT.values(), [None] * len(_EXTRACT)))

        # parse header and extract data from directories
        header = struct.unpack(_HEADFMT, handle.read(struct.calcsize(_HEADFMT)))

        # Set default sample ID value, which we expect to be present in most
        # cases in the SMPL1 tag, but may be missing.
        sample_id = "<unknown id>"

        # every tag seen, keyed by tag name + tag number (e.g. "PBAS2")
        raw = {}
        for tag_name, tag_number, tag_data in _abi_parse_header(header, handle):
            key = tag_name + str(tag_number)

            raw[key] = tag_data

            # PBAS2 is base-called sequence, only available in 3530
            if key == "PBAS2":
                seq = tag_data.decode()
                ambigs = "KYWMRS"
                if alphabet is None:
                    if set(seq).intersection(ambigs):
                        alphabet = ambiguous_dna
                    else:
                        alphabet = unambiguous_dna
            # PCON2 is quality values of base-called sequence
            elif key == "PCON2":
                qual = [ord(val) for val in tag_data.decode()]
            # SMPL1 is sample id entered before sequencing run, it must be
            # a string.
            elif key == "SMPL1":
                sample_id = _get_string_tag(tag_data)
            elif key in times:
                times[key] = tag_data
            else:
                if key in _EXTRACT:
                    annot[_EXTRACT[key]] = tag_data

        # set time annotations
        annot["run_start"] = "%s %s" % (times["RUND1"], times["RUNT1"])
        annot["run_finish"] = "%s %s" % (times["RUND2"], times["RUNT2"])

        # raw data (for advanced end users benefit)
        annot["abif_raw"] = raw

        # fsa check: a file with neither PBAS1 nor PBAS2 is treated as fsa
        is_fsa_file = all(tn not in raw for tn in ("PBAS1", "PBAS2"))

        if is_fsa_file:
            # use the file name as SeqRecord.name if available
            try:
                file_name = basename(handle.name).replace(".fsa", "")
            except AttributeError:
                file_name = ""

            # LIMS1 replaces the sample id if present, else it is kept as is
            sample_id = _get_string_tag(raw.get("LIMS1"), sample_id)
            description = _get_string_tag(raw.get("CTID1"), "<unknown description>")
            record = SeqRecord(
                Seq(""),
                id=sample_id,
                name=file_name,
                description=description,
                annotations=annot,
            )

        else:
            # NOTE(review): seq and qual are only assigned when the PBAS2 and
            # PCON2 tags were seen in the loop above; a file with PBAS1 but
            # without PBAS2 (or without PCON2) looks like it would fail here
            # on an unbound local - confirm.
            # use the file name as SeqRecord.name if available
            try:
                file_name = basename(handle.name).replace(".ab1", "")
            except AttributeError:
                file_name = ""
            record = SeqRecord(
                Seq(seq, alphabet),
                id=sample_id,
                name=file_name,
                description="",
                annotations=annot,
                letter_annotations={"phred_quality": qual},
            )

        # trimming is only applied to records that hold a sequence
        if not trim or is_fsa_file:
            yield record
        else:
            yield _abi_trim(record)

    finally:
        if handle is not source:
            handle.close()
예제 #16
0
def TabIterator(source, alphabet=single_letter_alphabet):
    """Yield a SeqRecord for each line of a tab separated file.

    Each line of the file should contain one tab only, dividing the line
    into an identifier and the full sequence.

    Arguments:
     - source - file-like object opened in text mode, or a path to a file
     - alphabet - optional alphabet

    The first field is taken as the record's .id and .name (regardless of
    any spaces within the text) and the second field is the sequence.

    Any blank lines are ignored. Any other line without exactly one tab
    raises ValueError.

    Examples
    --------
    >>> with open("GenBank/NC_005816.tsv") as handle:
    ...     for record in TabIterator(handle):
    ...         print("%s length %i" % (record.id, len(record)))
    gi|45478712|ref|NP_995567.1| length 340
    gi|45478713|ref|NP_995568.1| length 260
    gi|45478714|ref|NP_995569.1| length 64
    gi|45478715|ref|NP_995570.1| length 123
    gi|45478716|ref|NP_995571.1| length 145
    gi|45478717|ref|NP_995572.1| length 357
    gi|45478718|ref|NP_995573.1| length 138
    gi|45478719|ref|NP_995574.1| length 312
    gi|45478720|ref|NP_995575.1| length 99
    gi|45478721|ref|NP_995576.1| length 90

    """
    try:
        stream = open(source)
    except TypeError:
        # open() rejected it as a path, so assume an already open stream
        stream = source
        if stream.read(0) != "":
            raise StreamModeError(
                "Tab-separated plain-text files must be opened in text mode."
            ) from None

    try:
        for line in stream:
            try:
                # unpacking fails unless there is exactly one tab
                raw_title, raw_seq = line.split("\t")
            except ValueError:
                if not line.strip():
                    # It's a blank line, ignore it
                    continue
                tab_count = line.count("\t")
                raise ValueError(
                    (
                        "Each line should have one tab separating the"
                        " title and sequence, this line has %i tabs: %r"
                    )
                    % (tab_count, line)
                ) from None
            title = raw_title.strip()
            sequence = raw_seq.strip()  # removes the trailing new line
            yield SeqRecord(
                Seq(sequence, alphabet), id=title, name=title, description=""
            )
    finally:
        if stream is not source:
            stream.close()