Ejemplo n.º 1
0
def XdnaIterator(handle):
    """Read an Xdna file and yield its content as a SeqRecord object.

    This is a generator for API reasons only: an Xdna file always holds
    a single sequence, so exactly one record is ever produced.
    """
    # About the "neg_length" header field: it gives the length of the part
    # of the sequence located before the nucleotide considered as the
    # "origin" (nucleotide number 1, which in DNA Strider is not always the
    # first nucleotide). Biopython's SeqRecord has no such concept of a
    # sequence origin as far as I know, so that value is unpacked but
    # ignored. SerialCloner has no such concept either and always generates
    # files with a neg_length of zero.

    with as_handle(handle, "rb") as stream:

        # Fixed-size (112 bytes) header, with some rudimentary checks.
        header_fields = unpack(">BBB25xII60xI12x", _read_header(stream, 112))
        (version, seq_type, topology, seq_length, _neg_length,
         comment_length) = header_fields
        if version != 0:
            raise ValueError("Unsupported XDNA version")
        if seq_type not in _seq_types:
            raise ValueError("Unknown sequence type")

        # The sequence and then the comment are present in all XDNA files.
        sequence = _read(stream, seq_length).decode("ASCII")
        comment = _read(stream, comment_length).decode("ASCII")

        # The first "word" of the comment doubles as name and identifier.
        first_word = comment.partition(" ")[0]

        record = SeqRecord(
            Seq(sequence, _seq_types[seq_type]),
            description=comment,
            name=first_word,
            id=first_word,
        )
        if topology in _seq_topologies:
            record.annotations["topology"] = _seq_topologies[topology]

        # One more byte after the comment means that the optional
        # annotation section is present.
        has_annotations = len(stream.read(1)) == 1
        if has_annotations:
            # The overhangs are skipped as I don't know how to represent
            # them in the SeqRecord model.
            _read_overhang(stream)  # right-side overhang
            _read_overhang(stream)  # left-side overhang

            # A one-byte feature count precedes the features themselves.
            feature_count = unpack(">B", _read(stream, 1))[0]
            for _ in range(feature_count):
                _read_feature(stream, record)

        yield record
def GckIterator(handle):
    """Parse a GCK file and yield a single SeqRecord object.

    Note that a GCK file can only contain one sequence, so this
    iterator will always return a single record.

    Arguments:
     - handle - the input, passed to ``as_handle`` in ``"rb"`` mode; every
       read is done on the object ``as_handle`` returns, so anything
       ``as_handle`` accepts (presumably an open binary handle or a path)
       is supported.

    Raises ValueError if the length fields of the file disagree with each
    other, or if a packet size does not match its declared item count.
    """
    with as_handle(handle, "rb") as stream:
        # Skip file header
        # GCK files start with a 24-bytes header. Bytes 4 and 8 seem to
        # always be 12, maybe this could act as a magic cookie. Bytes
        # 17-20 and 21-24 contain variable values of unknown meaning.
        _read_header(stream, 24)

        # Read the actual sequence data
        packet, length = _read_packet(stream)
        # The body of the sequence packet starts with a 32-bit integer
        # representing the length of the sequence.
        seq_length = unpack(">I", packet[:4])[0]
        # This length should not be larger than the length of the
        # sequence packet.
        if seq_length > length - 4:
            raise ValueError("Conflicting sequence length values")
        sequence = packet[4:].decode("ASCII")
        record = SeqRecord(Seq(sequence, alphabet=Alphabet.generic_dna))

        # Skip unknown packet
        _read_packet(stream)

        # Read features packet
        packet, length = _read_packet(stream)
        (seq_length, num_features) = unpack(">IH", packet[:6])
        # Check that length in the features packet matches the actual
        # length of the sequence
        if seq_length != len(record):
            raise ValueError("Conflicting sequence length values")
        # Each feature is stored in a 92-bytes structure.
        if length - 6 != num_features * 92:
            raise ValueError(
                "Features packet size inconsistent with number of features")
        for i in range(num_features):
            offset = 6 + i * 92
            feature_data = packet[offset:offset + 92]

            # There's probably more stuff to unpack in that structure,
            # but those values are the only ones I understand.
            (start, end, type_code, strand_code, has_name, has_comment,
             version) = unpack(">II6xH14xB17xII35xB", feature_data)

            # A strand code of 1 means reverse strand. Other possible
            # values are 0 (no strand specified), 2 (forward strand),
            # and 3 (both strands). All are treated as a forward strand.
            strand = -1 if strand_code == 1 else 1
            location = FeatureLocation(start, end, strand=strand)

            # It looks like any value > 0 indicates a CDS...
            feature_type = "CDS" if type_code > 0 else "misc_feature"

            # Each feature may have a name and a comment, which are then
            # stored immediately after the features packet. Names are
            # stored as Pascal strings (1 length byte followed by the
            # string itself), comments are stored as "32-bit Pascal strings"
            # (4 length bytes followed by the string).
            # They must be consumed even for features that are discarded
            # below, otherwise the stream position would be wrong.
            qualifiers = {}
            if has_name > 0:
                qualifiers["label"] = [_read_pstring(stream)]
            if has_comment > 0:
                qualifiers["note"] = [_read_p4string(stream)]

            # Each feature may exist in several "versions". We keep only
            # the most recent version.
            if version > 0:
                continue

            feature = SeqFeature(location, type=feature_type,
                                 qualifiers=qualifiers)
            record.features.append(feature)

        # Read restriction sites packet
        # We are not interested in restriction sites, but we must still read
        # that packet so that we can skip the names and comments for each
        # site, which are stored after that packet in a similar way as for
        # the features above.
        packet, length = _read_packet(stream)
        (_, num_sites) = unpack(">IH", packet[:6])
        # Each site is stored in a 88-bytes structure
        if length - 6 != num_sites * 88:
            raise ValueError(
                "Sites packet size inconsistent with number of sites")
        for i in range(num_sites):
            offset = 6 + i * 88
            site_data = packet[offset:offset + 88]

            # The first two fields (start and end) are of no use here.
            (_, _, has_name,
             has_comment) = unpack(">II24xII48x", site_data)

            # Skip names and comments
            if has_name:
                _read_pstring(stream)
            if has_comment:
                _read_p4string(stream)

        # Skip unknown packet
        _read_packet(stream)

        # Next in the file are "version packets".
        # However they are not properly formatted "packets" as they are not
        # preceded by an integer giving their size. Instead we have a
        # short integer indicating how many versions are there, and then
        # as many 260-bytes block as we have versions.
        num_versions = unpack(">H", _read(stream, 2))[0]
        versions = _read(stream, num_versions * 260)
        for i in range(num_versions):
            offset = i * 260
            version_data = versions[offset:offset + 260]

            # Each version may have a comment, which is then stored
            # after all the "version packets". The flag is the last
            # 32-bit integer of the version block.
            has_comment = unpack(">I", version_data[-4:])[0]
            if has_comment > 0:
                _read_p4string(stream)

        # Skip unknown fixed-size block
        # Whatever this block contains, it is not preceded by any length
        # indicator, so I hope its size is indeed constant in all files...
        _read(stream, 706)

        # Read the construct's name; its first word is used as both the
        # record name and identifier.
        name = _read_pstring(stream)
        record.name = record.id = name.split(" ")[0]
        record.description = name

        # Circularity byte
        # There may be other flags in that block, but their meaning
        # is unknown to me. Only the last of the 17 bytes is used.
        flags = _read(stream, 17)
        circularity = unpack(">16xB", flags)[0]
        if circularity > 0:
            record.annotations["topology"] = "circular"
        else:
            record.annotations["topology"] = "linear"

        yield record