Example #1
0
    def __iter__(self):
        """Iterate over the file handle; yields key, start offset, and length.

        The key is taken from the line that starts with ``self.qresult_start``
        and one item is yielded for each line starting with
        ``self.qresult_end``. The length element is always reported as 0.
        """
        handle = self._handle
        handle.seek(0)
        block_start = handle.tell()
        id_pattern = re.compile(_as_bytes(r'Query\s*(?:sequence|HMM)?:\s*(.*)'))

        # the first line read tells us whether this is hmmsearch output
        line = read_forward(handle)
        from_hmmsearch = line.startswith(_as_bytes('hmmsearch'))

        while True:
            # position just after the line currently being examined
            line_end = handle.tell()

            if line.startswith(self.qresult_start):
                key = id_pattern.search(line).group(1).strip()
                # the query result begins where this start-mark line begins
                block_start = line_end - len(line)
            elif line.startswith(self.qresult_end):
                yield _bytes_to_string(key), block_start, 0
                block_start = line_end
            elif not line:
                # HACK: hmmsearch output holds a single query result, so it
                # is emitted once the end of the file is reached
                if from_hmmsearch:
                    yield _bytes_to_string(key), block_start, 0
                break

            line = read_forward(handle)
Example #2
0
 def __iter__(self):
     """Yield (key, start offset, length) for each record in the handle.

     A record starts at a line matched by ``self._marker_re`` and runs up to
     the next such line or the end of the file.
     """
     skip = len(self._marker)
     is_marker = self._marker_re.match
     handle = self._handle
     handle.seek(0)
     # Step over any header lines that precede the first record
     record_start = handle.tell()
     line = handle.readline()
     while line and not is_marker(line):
         record_start = handle.tell()
         line = handle.readline()
     # Now positioned on the first marker line, or at the end of the file
     while is_marker(line):
         # The key is the first word following the marker. This is
         # generally fine... but not for GenBank, EMBL, Swiss
         key = line[skip:].strip().split(None, 1)[0]
         # Sum line lengths explicitly, since file offsets cannot be
         # subtracted from each other on BGZF
         size = len(line)
         next_start = handle.tell()
         line = handle.readline()
         while line and not is_marker(line):
             size += len(line)
             next_start = handle.tell()
             line = handle.readline()
         yield _bytes_to_string(key), record_start, size
         record_start = next_start
     assert not line, repr(line)
Example #3
0
 def __iter__(self):
     """Yield (key, start offset, length) for each record in the handle.

     The key is the first ``;``-separated value on the ``AC`` line, which
     must directly follow the line matched by ``self._marker_re``.
     """
     handle = self._handle
     handle.seek(0)
     is_marker = self._marker_re.match
     semicolon = _as_bytes(";")
     # Step over any header lines that precede the first record
     record_start = handle.tell()
     line = handle.readline()
     while line and not is_marker(line):
         record_start = handle.tell()
         line = handle.readline()
     # Now positioned on the first marker line, or at the end of the file
     while is_marker(line):
         size = len(line)
         # The first word after ID cannot be trusted as the record key,
         # so the accession on the following AC line is used instead.
         ac_line = handle.readline()
         size += len(ac_line)
         assert ac_line.startswith(_as_bytes("AC "))
         key = ac_line[3:].strip().split(semicolon)[0].strip()
         next_start = handle.tell()
         line = handle.readline()
         while line and not is_marker(line):
             size += len(line)
             next_start = handle.tell()
             line = handle.readline()
         yield _bytes_to_string(key), record_start, size
         record_start = next_start
     assert not line, repr(line)
Example #4
0
 def __iter__(self):
     """Yield (key, start offset, length) for each record in the handle.

     A record starts at a line matched by ``self._marker_re`` and runs up to
     the next such line or the end of the file.
     """
     skip = len(self._marker)
     is_marker = self._marker_re.match
     handle = self._handle
     handle.seek(0)
     #Step over any header lines that precede the first record
     record_start = handle.tell()
     line = handle.readline()
     while line and not is_marker(line):
         record_start = handle.tell()
         line = handle.readline()
     #Now positioned on the first marker line, or at the end of the file
     while is_marker(line):
         #The key is the first word following the marker. This is
         #generally fine... but not for GenBank, EMBL, Swiss
         key = line[skip:].strip().split(None, 1)[0]
         #Sum line lengths explicitly, since file offsets cannot be
         #subtracted from each other on BGZF
         size = len(line)
         next_start = handle.tell()
         line = handle.readline()
         while line and not is_marker(line):
             size += len(line)
             next_start = handle.tell()
             line = handle.readline()
         yield _bytes_to_string(key), record_start, size
         record_start = next_start
     assert not line, repr(line)
Example #5
0
 def __iter__(self):
     """Yield (key, start offset, length) for each record in the handle.

     The key is the first ``;``-separated value on the ``AC`` line, which
     must directly follow the line matched by ``self._marker_re``.
     """
     handle = self._handle
     handle.seek(0)
     is_marker = self._marker_re.match
     semicolon = _as_bytes(";")
     # Step over any header lines that precede the first record
     record_start = handle.tell()
     line = handle.readline()
     while line and not is_marker(line):
         record_start = handle.tell()
         line = handle.readline()
     # Now positioned on the first marker line, or at the end of the file
     while is_marker(line):
         size = len(line)
         # The first word after ID cannot be trusted as the record key,
         # so the accession on the following AC line is used instead.
         ac_line = handle.readline()
         size += len(ac_line)
         assert ac_line.startswith(_as_bytes("AC "))
         key = ac_line[3:].strip().split(semicolon)[0].strip()
         next_start = handle.tell()
         line = handle.readline()
         while line and not is_marker(line):
             size += len(line)
             next_start = handle.tell()
             line = handle.readline()
         yield _bytes_to_string(key), record_start, size
         record_start = next_start
     assert not line, repr(line)
Example #6
0
 def get_qresult_id(self, pos):
     """Return the query ID found on the line starting at offset ``pos``.

     The line must begin with ``self._query_mark``; the ID is the first
     group captured by ``_RE_CIGAR``.
     """
     self._handle.seek(pos)
     line = self._handle.readline()
     # the offset is expected to point at a line carrying the query mark
     assert line.startswith(self._query_mark), line
     match = re.search(_RE_CIGAR, _bytes_to_string(line))
     return match.group(1)
Example #7
0
 def get_qresult_id(self, pos):
     """Return the query ID found on the line starting at offset ``pos``.

     The line must begin with ``self._query_mark``; the ID is the first
     group captured by ``_RE_CIGAR``.
     """
     source = self._handle
     source.seek(pos)
     marked_line = source.readline()
     # the offset is expected to point at a line carrying the query mark
     assert marked_line.startswith(self._query_mark), marked_line
     found = re.search(_RE_CIGAR, _bytes_to_string(marked_line))
     return found.group(1)
Example #8
0
    def __iter__(self):
        """Iterate over the XML handle; yields key, start offset, and length.

        One item is yielded per block running from the line containing
        ``self.qstart_mark`` up to and including the line containing
        ``self.qend_mark``. The key is the ``Iteration_query-ID`` value; when
        that value starts with ``Query_`` the first space-separated word of
        ``Iteration_query-def`` is used instead. If the ID/def pair cannot be
        found, the values in ``self._fallback`` are used.
        """
        qstart_mark = self.qstart_mark
        qend_mark = self.qend_mark
        blast_id_mark = _as_bytes('Query_')
        handle = self._handle
        handle.seek(0)
        # every fragment is a raw string so that ``\s`` reaches the regex
        # engine without being an invalid string escape
        re_desc = re.compile(
            _as_bytes(r'<Iteration_query-ID>(.*?)'
                      r'</Iteration_query-ID>\s+?<Iteration_query-def>'
                      r'(.*?)</Iteration_query-def>'))
        re_desc_end = re.compile(_as_bytes(r'</Iteration_query-def>'))

        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if not line:
                break
            if qstart_mark not in line:
                continue
            # The following requirements are to make supporting BGZF compressed
            # BLAST XML files simpler (avoids complex offset manipulations):
            assert line.count(qstart_mark) == 1, "XML without line breaks?"
            assert line.lstrip().startswith(qstart_mark), line
            if qend_mark in line:
                # Should cope with <Iteration>...</Iteration> on one long line
                block = line
            else:
                # Load the rest of this block up to and including </Iteration>
                block = [line]
                while line and qend_mark not in line:
                    line = handle.readline()
                    assert qstart_mark not in line, line
                    block.append(line)
                assert line.rstrip().endswith(qend_mark), line
                block = _empty_bytes_string.join(block)
            assert block.count(
                qstart_mark) == 1, "XML without line breaks? %r" % block
            assert block.count(
                qend_mark) == 1, "XML without line breaks? %r" % block
            # Now we have a full <Iteration>...</Iteration> block, find the ID
            regx = re_desc.search(block)
            if regx is None:
                # no ID/def pair in this block: use the fallback values
                assert re.search(re_desc_end, block)
                qstart_desc = _as_bytes(self._fallback['description'])
                qstart_id = _as_bytes(self._fallback['id'])
            else:
                qstart_desc = regx.group(2)
                qstart_id = regx.group(1)
            if qstart_id.startswith(blast_id_mark):
                qstart_id = qstart_desc.split(_as_bytes(' '), 1)[0]
            yield _bytes_to_string(qstart_id), start_offset, len(block)
Example #9
0
    def __iter__(self):
        """Iterate over the file handle; yields key, start offset, and length.

        Leading lines starting with ``#`` are skipped. Consecutive rows that
        share the value in column ``self._query_id_idx`` (after splitting on
        spaces and dropping empty fields) are reported as one item.
        """
        handle = self._handle
        handle.seek(0)
        key_col = self._query_id_idx
        current_key = None
        comment_mark = _as_bytes('#')
        separator = _as_bytes(' ')

        # pretend a header line was already seen so the loop runs at least once
        line = comment_mark
        while line.startswith(comment_mark):
            block_start = handle.tell()
            line = handle.readline()

        # walk the rows, emitting a block whenever the key changes
        while line:
            after_line = handle.tell()

            fields = [field for field in line.strip().split(separator) if field]
            row_key = fields[key_col]
            if current_key is None:
                current_key = row_key
            elif row_key != current_key:
                # this row opens the next block, so the finished block ends
                # where this row begins
                row_start = after_line - len(line)
                yield (_bytes_to_string(current_key), block_start,
                       row_start - block_start)
                current_key = row_key
                block_start = row_start

            line = handle.readline()
            if not line:
                # end of file closes the block that is still open
                yield (_bytes_to_string(current_key), block_start,
                       after_line - block_start)
Example #10
0
    def __iter__(self):
        """Iterate over the file handle; yields key, start offset, and length.

        Leading lines starting with ``#`` are skipped. Consecutive rows that
        share the value in column ``self._query_id_idx`` (after splitting on
        spaces and dropping empty fields) are reported as one item.
        """
        handle = self._handle
        handle.seek(0)
        id_column = self._query_id_idx
        active_key = None
        hash_mark = _as_bytes('#')
        space = _as_bytes(' ')

        # seed with a mock header line so the skip loop reads the first line
        line = hash_mark
        while line.startswith(hash_mark):
            group_start = handle.tell()
            line = handle.readline()

        # walk the rows, emitting a group whenever the key changes
        while line:
            row_end = handle.tell()

            columns = [col for col in line.strip().split(space) if col]
            this_key = columns[id_column]
            if active_key is None:
                active_key = this_key
            elif this_key != active_key:
                # the finished group stops where the current row starts
                boundary = row_end - len(line)
                yield (_bytes_to_string(active_key), group_start,
                       boundary - group_start)
                active_key = this_key
                group_start = boundary

            line = handle.readline()
            if not line:
                # end of file closes the group that is still open
                yield (_bytes_to_string(active_key), group_start,
                       row_end - group_start)
Example #11
0
 def __iter__(self):
     """Yield (record id, start offset, length) for each FASTQ record.

     The id is the first word after the leading ``@``. The length is the
     sum of the lengths of every line in the record, including the blank
     quality line of a zero-length record.

     Raises ValueError if a record does not start with ``@``, if the file
     ends inside the sequence section, or if the sequence and quality
     lengths disagree.
     """
     handle = self._handle
     handle.seek(0)
     start_offset = handle.tell()
     line = handle.readline()
     if not line:
         # Empty file!
         return
     at_char = _as_bytes("@")
     plus_char = _as_bytes("+")
     if line[0:1] != at_char:
         raise ValueError("Problem with FASTQ @ line:\n%r" % line)
     while line:
         # Here line is the "@" title line of a record
         record_id = line[1:].rstrip().split(None, 1)[0]
         # Find the seq line(s), stopping on the "+" line
         seq_len = 0
         length = len(line)
         while line:
             line = handle.readline()
             length += len(line)
             if line.startswith(plus_char):
                 break
             seq_len += len(line.strip())
         if not line:
             raise ValueError("Premature end of file in seq section")
         # Here line is the "+" line; find the qual line(s)
         qual_len = 0
         while line:
             if seq_len == qual_len:
                 if seq_len == 0:
                     # Special case, quality line should be just "\n"
                     line = handle.readline()
                     if line.strip():
                         raise ValueError(
                             "Expected blank quality line, not %r" % line)
                     # The blank quality line is part of the record too
                     length += len(line)
                 # Should be end of record...
                 end_offset = handle.tell()
                 line = handle.readline()
                 if line and line[0:1] != at_char:
                     raise ValueError("Problem with line %r" % line)
                 break
             else:
                 line = handle.readline()
                 qual_len += len(line.strip())
                 length += len(line)
         if seq_len != qual_len:
             raise ValueError("Problem with quality section")
         yield _bytes_to_string(record_id), start_offset, length
         start_offset = end_offset
Example #12
0
    def __iter__(self):
        """Iterate over the file handle; yields key, start offset, and length.

        The work is delegated to ``self._qresult_index_commented`` when
        ``self._kwargs['comments']`` is true, else to ``self._qresult_index``;
        the key of each item is passed through ``_bytes_to_string``.
        """
        self._handle.seek(0)

        if self._kwargs['comments']:
            indexer = self._qresult_index_commented
        else:
            indexer = self._qresult_index

        for raw_key, start, size in indexer():
            yield _bytes_to_string(raw_key), start, size
Example #13
0
    def __iter__(self):
        """Iterate over the file handle; yields key, start offset, and length.

        The work is delegated to ``self._qresult_index_commented`` when
        ``self._kwargs['comments']`` is true, else to ``self._qresult_index``;
        the key of each item is passed through ``_bytes_to_string``.
        """
        self._handle.seek(0)

        # pick the indexing routine that matches the 'comments' setting
        with_comments = self._kwargs['comments']
        walk = (self._qresult_index_commented if with_comments
                else self._qresult_index)

        for entry in walk():
            raw_key, position, span = entry
            yield _bytes_to_string(raw_key), position, span
Example #14
0
    def __iter__(self):
        """Iterates over the file handle; yields key, start offset, and length.

        Header lines (those whose stripped content does not match
        ``_RE_ROW_CHECK_IDX``) are skipped. Consecutive rows sharing the value
        in tab-separated column 9 (empty fields dropped) are reported as one
        item. Nothing is yielded when no matching row is found.
        """
        handle = self._handle
        handle.seek(0)
        # denotes column location for query identifier
        query_id_idx = 9
        qresult_key = None
        tab_char = _as_bytes('\t')

        start_offset = handle.tell()
        line = handle.readline()
        # read through header
        # this assumes that the result row match the regex
        while not re.search(_RE_ROW_CHECK_IDX, line.strip()):
            start_offset = handle.tell()
            line = handle.readline()
            if not line:
                # end the generator with a plain return; raising
                # StopIteration here turns into RuntimeError under PEP 479
                return

        # and index the qresults
        while True:
            # offset just after the row currently held in ``line``
            end_offset = handle.tell()

            cols = [x for x in line.strip().split(tab_char) if x]
            if qresult_key is None:
                qresult_key = cols[query_id_idx]
            else:
                curr_key = cols[query_id_idx]

                if curr_key != qresult_key:
                    # the current row belongs to the next query, so the
                    # finished block ends where this row starts
                    adj_end = end_offset - len(line)
                    yield (_bytes_to_string(qresult_key), start_offset,
                           adj_end - start_offset)
                    qresult_key = curr_key
                    start_offset = adj_end

            line = handle.readline()
            if not line:
                yield (_bytes_to_string(qresult_key), start_offset,
                       end_offset - start_offset)
                break
Example #15
0
    def __iter__(self):
        """Iterates over the file handle; yields key, start offset, and length.

        Header lines (those whose stripped content does not match
        ``_RE_ROW_CHECK_IDX``) are skipped. Consecutive rows sharing the value
        in tab-separated column 9 (empty fields dropped) are reported as one
        item. Nothing is yielded when no matching row is found.
        """
        handle = self._handle
        handle.seek(0)
        # denotes column location for query identifier
        query_id_idx = 9
        qresult_key = None
        tab_char = _as_bytes('\t')

        start_offset = handle.tell()
        line = handle.readline()
        # read through header
        # this assumes that the result row match the regex
        while not re.search(_RE_ROW_CHECK_IDX, line.strip()):
            start_offset = handle.tell()
            line = handle.readline()
            if not line:
                # end the generator with a plain return; raising
                # StopIteration here turns into RuntimeError under PEP 479
                return

        # and index the qresults
        while True:
            # offset just after the row currently held in ``line``
            end_offset = handle.tell()

            cols = [x for x in line.strip().split(tab_char) if x]
            if qresult_key is None:
                qresult_key = cols[query_id_idx]
            else:
                curr_key = cols[query_id_idx]

                if curr_key != qresult_key:
                    # the current row belongs to the next query, so the
                    # finished block ends where this row starts
                    adj_end = end_offset - len(line)
                    yield (_bytes_to_string(qresult_key), start_offset,
                           adj_end - start_offset)
                    qresult_key = curr_key
                    start_offset = adj_end

            line = handle.readline()
            if not line:
                yield (_bytes_to_string(qresult_key), start_offset,
                       end_offset - start_offset)
                break
Example #16
0
def _parse_tag_data(elem_code, elem_num, raw_data):
    """Unpack the raw content of one tag into a Python value.

    elem_code - element type code, used as the key into _BYTEFMT
    elem_num - number of elements packed in raw_data
    raw_data - packed bytes; the length must equal the size of the
               big-endian struct format built from the two values above

    Returns None when elem_code is not present in _BYTEFMT.
    """
    if elem_code not in _BYTEFMT:
        return None

    # a single element gets no repeat-count prefix in the format string
    count = "" if elem_num == 1 else str(elem_num)
    fmt = ">" + count + _BYTEFMT[elem_code]

    assert len(raw_data) == struct.calcsize(fmt)
    values = struct.unpack(fmt, raw_data)

    # codes 10 and 11 always consume the unpacked tuple as a whole
    if elem_code == 10:
        return str(datetime.date(*values))
    if elem_code == 11:
        return str(datetime.time(*values[:3]))

    # every other code: a one-item tuple is reduced to the item itself
    if len(values) == 1:
        values = values[0]

    if elem_code == 2:
        return _bytes_to_string(values)
    if elem_code == 13:
        return bool(values)
    if elem_code == 18:
        # the first byte is dropped
        return _bytes_to_string(values[1:])
    if elem_code == 19:
        # the last byte is dropped
        return _bytes_to_string(values[:-1])
    return values
Example #17
0
def _parse_tag_data(elem_code, elem_num, raw_data):
    """Unpack the raw content of one tag into a Python value.

    elem_code - element type code, used as the key into _BYTEFMT
    elem_num - number of elements packed in raw_data
    raw_data - packed bytes; the length must equal the size of the
               big-endian struct format built from the two values above

    Returns None when elem_code is not present in _BYTEFMT.
    """
    if elem_code not in _BYTEFMT:
        return None

    # a single element gets no repeat-count prefix in the format string
    repeat = '' if elem_num == 1 else str(elem_num)
    fmt = '>' + repeat + _BYTEFMT[elem_code]

    assert len(raw_data) == struct.calcsize(fmt)
    unpacked = struct.unpack(fmt, raw_data)

    # codes 10 and 11 always consume the unpacked tuple as a whole
    if elem_code == 10:
        return str(datetime.date(*unpacked))
    if elem_code == 11:
        return str(datetime.time(*unpacked[:3]))

    # every other code: a one-item tuple is reduced to the item itself
    if len(unpacked) == 1:
        unpacked = unpacked[0]

    if elem_code == 2:
        return _bytes_to_string(unpacked)
    if elem_code == 13:
        return bool(unpacked)
    if elem_code == 18:
        # the first byte is dropped
        return _bytes_to_string(unpacked[1:])
    if elem_code == 19:
        # the last byte is dropped
        return _bytes_to_string(unpacked[:-1])
    return unpacked
Example #18
0
    def __iter__(self):
        """Iterate over the XML handle; yields key, start offset, and length.

        One item is yielded per block running from the line containing
        ``self.qstart_mark`` up to and including the line containing
        ``self.qend_mark``. The key is the ``Iteration_query-ID`` value; when
        that value starts with ``Query_`` the first space-separated word of
        ``Iteration_query-def`` is used instead. If the ID/def pair cannot be
        found, the values in ``self._fallback`` are used.
        """
        qstart_mark = self.qstart_mark
        qend_mark = self.qend_mark
        blast_id_mark = _as_bytes('Query_')
        handle = self._handle
        handle.seek(0)
        # every fragment is a raw string so that ``\s`` reaches the regex
        # engine without being an invalid string escape
        re_desc = re.compile(_as_bytes(r'<Iteration_query-ID>(.*?)'
                r'</Iteration_query-ID>\s+?<Iteration_query-def>'
                r'(.*?)</Iteration_query-def>'))
        re_desc_end = re.compile(_as_bytes(r'</Iteration_query-def>'))

        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if not line:
                break
            if qstart_mark not in line:
                continue
            # The following requirements are to make supporting BGZF compressed
            # BLAST XML files simpler (avoids complex offset manipulations):
            assert line.count(qstart_mark) == 1, "XML without line breaks?"
            assert line.lstrip().startswith(qstart_mark), line
            if qend_mark in line:
                # Should cope with <Iteration>...</Iteration> on one long line
                block = line
            else:
                # Load the rest of this block up to and including </Iteration>
                block = [line]
                while line and qend_mark not in line:
                    line = handle.readline()
                    assert qstart_mark not in line, line
                    block.append(line)
                assert line.rstrip().endswith(qend_mark), line
                block = _empty_bytes_string.join(block)
            assert block.count(qstart_mark) == 1, "XML without line breaks? %r" % block
            assert block.count(qend_mark) == 1, "XML without line breaks? %r" % block
            # Now we have a full <Iteration>...</Iteration> block, find the ID
            regx = re_desc.search(block)
            if regx is None:
                # no ID/def pair in this block: use the fallback values
                assert re.search(re_desc_end, block)
                qstart_desc = _as_bytes(self._fallback['description'])
                qstart_id = _as_bytes(self._fallback['id'])
            else:
                qstart_desc = regx.group(2)
                qstart_id = regx.group(1)
            if qstart_id.startswith(blast_id_mark):
                qstart_id = qstart_desc.split(_as_bytes(' '), 1)[0]
            yield _bytes_to_string(qstart_id), start_offset, len(block)
Example #19
0
 def __iter__(self):
     """Yield (record id, start offset, length) for each FASTQ record.

     The id is the first word after the leading ``@``. The length is the
     sum of the lengths of every line in the record, including the blank
     quality line of a zero-length record.

     Raises ValueError if a record does not start with ``@``, if the file
     ends inside the sequence section, or if the sequence and quality
     lengths disagree.
     """
     handle = self._handle
     handle.seek(0)
     start_offset = handle.tell()
     line = handle.readline()
     if not line:
         # Empty file!
         return
     at_char = _as_bytes("@")
     plus_char = _as_bytes("+")
     if line[0:1] != at_char:
         raise ValueError("Problem with FASTQ @ line:\n%r" % line)
     while line:
         # Here line is the "@" title line of a record
         record_id = line[1:].rstrip().split(None, 1)[0]
         # Find the seq line(s), stopping on the "+" line
         seq_len = 0
         length = len(line)
         while line:
             line = handle.readline()
             length += len(line)
             if line.startswith(plus_char):
                 break
             seq_len += len(line.strip())
         if not line:
             raise ValueError("Premature end of file in seq section")
         # Here line is the "+" line; find the qual line(s)
         qual_len = 0
         while line:
             if seq_len == qual_len:
                 if seq_len == 0:
                     # Special case, quality line should be just "\n"
                     line = handle.readline()
                     if line.strip():
                         raise ValueError("Expected blank quality line, not %r" % line)
                     # The blank quality line is part of the record too
                     length += len(line)
                 # Should be end of record...
                 end_offset = handle.tell()
                 line = handle.readline()
                 if line and line[0:1] != at_char:
                     raise ValueError("Problem with line %r" % line)
                 break
             else:
                 line = handle.readline()
                 qual_len += len(line.strip())
                 length += len(line)
         if seq_len != qual_len:
             raise ValueError("Problem with quality section")
         yield _bytes_to_string(record_id), start_offset, length
         start_offset = end_offset
Example #20
0
def _abi_parse_header(header, handle):
    """Generator that returns directory contents.

    header - unpacked header tuple; items 4, 5 and 7 are read as the
             directory entry size, the number of entries and the offset
             of the first entry
    handle - seekable file object the directory and tag data are read from

    Yields (tag name, tag number, parsed tag data) for every directory
    entry whose name + number key is in _EXTRACT or _SPCTAGS.
    """
    # header structure (after ABIF marker):
    # file version, tag name, tag number,
    # element type code, element size, number of elements
    # data size, data offset, handle (not file handle)
    head_elem_size = header[4]
    head_elem_num = header[5]
    head_offset = header[7]

    # loop invariants: entry size in bytes and the set of wanted keys
    dir_size = struct.calcsize(_DIRFMT)
    wanted = frozenset(_EXTRACT).union(_SPCTAGS)

    for index in range(head_elem_num):
        start = head_offset + index * head_elem_size
        # add directory offset to tuple
        # to handle directories with data size <= 4 bytes
        handle.seek(start)
        dir_entry = struct.unpack(_DIRFMT, handle.read(dir_size)) + (start,)
        tag_name = _bytes_to_string(dir_entry[0])
        tag_number = dir_entry[1]
        # only parse desired dirs
        if tag_name + str(tag_number) not in wanted:
            continue
        elem_code = dir_entry[2]
        elem_num = dir_entry[4]
        data_size = dir_entry[5]
        data_offset = dir_entry[6]
        tag_offset = dir_entry[8]
        # if data size <= 4 bytes, data is stored inside tag
        # so offset needs to be changed
        if data_size <= 4:
            data_offset = tag_offset + 20
        handle.seek(data_offset)
        data = handle.read(data_size)
        yield tag_name, tag_number, \
            _parse_tag_data(elem_code, elem_num, data)
Example #21
0
 def get(self, offset):
     """Return the first record parsed from the entry stored at ``offset``.

     The raw entry from ``self.get_raw`` is wrapped in an XML header and a
     ``<uniprot>`` root element before it is given to the iterator.
     """
     # TODO - Can we handle this directly in the parser?
     # This is a hack - the <entry>...</entry> text from get_raw is wrapped
     # with the apparently required XML header and footer.
     entry_xml = _bytes_to_string(self.get_raw(offset))
     wrapper = """<?xml version='1.0' encoding='UTF-8'?>
     <uniprot xmlns="http://uniprot.org/uniprot"
     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
     xsi:schemaLocation="http://uniprot.org/uniprot
     http://www.uniprot.org/support/docs/uniprot.xsd">
     %s
     </uniprot>
     """
     document = wrapper % entry_xml
     # TODO - For consistency, this function should not accept a string:
     records = SeqIO.UniprotIO.UniprotIterator(document)
     return next(records)
Example #22
0
def _abi_parse_header(header, handle):
    """Generate the parsed contents of the wanted directory entries.

    ``header`` is indexed for the directory element size (index 4), the
    number of directory elements (index 5) and the offset of the first
    element (index 7).  ``handle`` must support ``seek`` and ``read``.

    Each directory entry is unpacked with ``_DIRFMT``.  Entries whose
    name+number key appears in ``_EXTRACT`` or ``_SPCTAGS`` are yielded as
    ``(tag_name, tag_number, parsed_data)`` tuples; all others are skipped.
    """
    # header structure (after ABIF marker):
    # file version, tag name, tag number,
    # element type code, element size, number of elements
    # data size, data offset, handle (not file handle)
    entry_size = header[4]
    entry_count = header[5]
    first_entry = header[7]

    for position in range(entry_count):
        entry_start = first_entry + position * entry_size
        handle.seek(entry_start)
        packed = handle.read(struct.calcsize(_DIRFMT))
        # keep the directory offset next to the unpacked fields; it is
        # needed for entries whose data is stored inline (size <= 4 bytes)
        fields = struct.unpack(_DIRFMT, packed) + (entry_start,)
        tag_name = _bytes_to_string(fields[0])
        tag_number = fields[1]
        # only parse desired dirs
        if tag_name + str(tag_number) not in (list(_EXTRACT) + _SPCTAGS):
            continue
        elem_code = fields[2]
        elem_num = fields[4]
        data_size = fields[5]
        # if data size <= 4 bytes, data is stored inside the tag itself,
        # 20 bytes past the start of the directory entry
        data_offset = fields[8] + 20 if data_size <= 4 else fields[6]
        handle.seek(data_offset)
        payload = handle.read(data_size)
        yield tag_name, tag_number, _parse_tag_data(elem_code, elem_num, payload)
Example #23
0
    def get_qresult_id(self, pos):
        """Returns the query ID from the nearest "Query:" line.

        Reading starts at ``pos`` and continues line by line (each line
        stripped of surrounding whitespace) until one starts with
        ``Query:``.  That line is decoded and passed to
        ``_parse_hit_or_query_line``; the first item of the returned pair
        is the result.

        Raises StopIteration if a line that is empty after stripping (a
        blank line or end of file) is met first.
        NOTE(review): under PEP 479 a StopIteration escaping into a
        generator frame becomes RuntimeError - confirm how callers use this.
        """
        fh = self._handle
        fh.seek(pos)
        query_prefix = _as_bytes('Query:')

        stripped = fh.readline().strip()
        while not stripped.startswith(query_prefix):
            if not stripped:
                raise StopIteration
            stripped = fh.readline().strip()

        query_id, _description = _parse_hit_or_query_line(
            _bytes_to_string(stripped))
        return query_id
Example #24
0
    def get_qresult_id(self, pos):
        """Returns the query ID from the nearest "Query:" line.

        Seeks the handle to ``pos`` and scans forward over stripped lines
        for the first one beginning with ``Query:``.  The ID is the first
        element of what ``_parse_hit_or_query_line`` returns for that line.

        Raises StopIteration when a line that strips to nothing (blank
        line or end of file) is read before a ``Query:`` line.
        NOTE(review): if this propagates out of a generator, Python 3.7+
        turns it into RuntimeError (PEP 479) - confirm against callers.
        """
        source = self._handle
        source.seek(pos)
        marker = _as_bytes('Query:')

        candidate = source.readline().strip()
        while not candidate.startswith(marker):
            if not candidate:
                raise StopIteration
            candidate = source.readline().strip()

        parsed_id, _parsed_desc = _parse_hit_or_query_line(
            _bytes_to_string(candidate))
        return parsed_id
Example #25
0
 def get(self, offset):
     """Parse and return the record stored at ``offset``.

     ``get_raw`` supplies the bare ``<entry>...</entry>`` bytes, which are
     decoded and embedded in a UniProt XML header and footer.  The
     resulting document is given to ``SeqIO.UniprotIO.UniprotIterator``
     and its first item is returned.
     """
     # TODO - Can we handle this directly in the parser?
     # This is a hack - the parser apparently requires the XML header and
     # footer, so they are added around the single entry here.
     entry_text = _bytes_to_string(self.get_raw(offset))
     document = """<?xml version='1.0' encoding='UTF-8'?>
     <uniprot xmlns="http://uniprot.org/uniprot"
     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
     xsi:schemaLocation="http://uniprot.org/uniprot
     http://www.uniprot.org/support/docs/uniprot.xsd">
     %s
     </uniprot>
     """ % entry_text
     # TODO - For consistency, this function should not accept a string:
     records = SeqIO.UniprotIO.UniprotIterator(document)
     return next(records)
Example #26
0
 def __iter__(self):
     """Iterate over the file, yielding (key, start offset, length).

     One tuple is produced per entry.  The key is the text of the first
     ``<accession>`` element found inside the entry, the offset is where
     the entry's opening line begins, and the length runs from there up
     to and including the closing ``</entry>`` tag.

     Raises ValueError if an entry is not closed before the next entry
     starts (or the file ends), or if it contains no accession.
     """
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     start_acc_marker = _as_bytes("<accession>")
     end_acc_marker = _as_bytes("</accession>")
     end_entry_marker = _as_bytes("</entry>")
     less_than = _as_bytes("<")
     #Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     #Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         length = len(line)
         #We expect the next line to be <accession>xxx</accession>
         #(possibly with leading spaces)
         #but allow it to be later on within the <entry>
         key = None
         while True:
             line = handle.readline()
             if key is None and start_acc_marker in line:
                 assert end_acc_marker in line, line
                 #11 is the length of the opening accession tag
                 key = line[line.find(start_acc_marker) + 11:].split(
                     less_than, 1)[0]
                 length += len(line)
             elif end_entry_marker in line:
                 #The record stops right after the closing tag (8 bytes
                 #long), so only that part of the final line is counted.
                 #Previously nothing from this line was added, leaving
                 #the reported length short of the closing tag.
                 tail = line.find(end_entry_marker) + 8
                 length += tail
                 end_offset = handle.tell() - len(line) + tail
                 break
             elif marker_re.match(line) or not line:
                 #Start of next record or end of file
                 raise ValueError("Didn't find end of record")
             else:
                 length += len(line)
         if not key:
             raise ValueError(
                 "Did not find <accession> line in bytes %i to %i" %
                 (start_offset, end_offset))
         yield _bytes_to_string(key), start_offset, length
         #Find start of next record
         while not marker_re.match(line) and line:
             start_offset = handle.tell()
             line = handle.readline()
     assert not line, repr(line)
Example #27
0
 def __iter__(self):
     """Iterate over the file, yielding (key, start offset, length).

     One tuple is produced per record.  The key is taken from the ID line
     (first field, plus the version when the second field starts with
     ``SV``) and is replaced by the second word of any later line that
     starts with ``SV ``.

     Raises ValueError if the ID line has neither 6 nor 3 semicolons
     after its first two characters.
     """
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     semi_char = _as_bytes(";")
     dot_char = _as_bytes(".")
     sv_marker = _as_bytes("SV ")
     #Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     #Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         #We cannot assume the record.id is the first word after ID,
         #normally the SV line is used.
         length = len(line)
         semi_count = line[2:].count(semi_char)
         if semi_count == 6:
             #Looks like the semi colon separated style introduced in 2006
             parts = line[3:].rstrip().split(semi_char)
             if parts[1].strip().startswith(sv_marker):
                 #The SV bit gives the version
                 key = parts[0].strip() + dot_char + \
                     parts[1].strip().split()[1]
             else:
                 key = parts[0].strip()
         elif semi_count == 3:
             #Looks like the pre 2006 style, take first word only
             key = line[3:].strip().split(None, 1)[0]
         else:
             #The line was read as bytes; decode it first, otherwise the
             #concatenation itself fails with TypeError on Python 3 and
             #the intended ValueError is never raised.
             raise ValueError('Did not recognise the ID line layout:\n' +
                              _bytes_to_string(line))
         while True:
             end_offset = handle.tell()
             line = handle.readline()
             if marker_re.match(line) or not line:
                 end_offset = handle.tell() - len(line)
                 yield _bytes_to_string(key), start_offset, length
                 start_offset = end_offset
                 break
             elif line.startswith(sv_marker):
                 key = line.rstrip().split()[1]
             length += len(line)
     assert not line, repr(line)
Example #28
0
 def __iter__(self):
     """Iterate over the file, yielding (key, start offset, length).

     Each non-blank line is one record.  Its key is the text before the
     first tab, its offset is where the line starts and its length is the
     length of the whole line.  Lines containing only whitespace are
     skipped.
     """
     handle = self._handle
     handle.seek(0)
     tab_char = _as_bytes("\t")
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if not line:
             break  # End of file
         if not line.strip():
             #Ignore blank lines.  Splitting on a tab never raises
             #ValueError, so this must be tested explicitly; a handler
             #around the split would never run and blank lines would be
             #reported as records.
             continue
         key = line.split(tab_char)[0]
         yield _bytes_to_string(key), start_offset, len(line)
Example #29
0
 def __iter__(self):
     """Iterate over the file, yielding (key, start offset, length).

     One tuple is produced per entry.  The key is the text of the first
     ``<accession>`` element found inside the entry, the offset is where
     the entry's opening line begins, and the length runs from there up
     to and including the closing ``</entry>`` tag.

     Raises ValueError if an entry is not closed before the next entry
     starts (or the file ends), or if it contains no accession.
     """
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     start_acc_marker = _as_bytes("<accession>")
     end_acc_marker = _as_bytes("</accession>")
     end_entry_marker = _as_bytes("</entry>")
     less_than = _as_bytes("<")
     # Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     # Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         length = len(line)
         # We expect the next line to be <accession>xxx</accession>
         # (possibly with leading spaces)
         # but allow it to be later on within the <entry>
         key = None
         while True:
             line = handle.readline()
             if key is None and start_acc_marker in line:
                 assert end_acc_marker in line, line
                 # 11 is the length of the opening accession tag
                 key = line[line.find(start_acc_marker) + 11 :].split(less_than, 1)[0]
                 length += len(line)
             elif end_entry_marker in line:
                 # The record stops right after the closing tag (8 bytes
                 # long), so only that part of the final line is counted.
                 # Previously nothing from this line was added, leaving
                 # the reported length short of the closing tag.
                 tail = line.find(end_entry_marker) + 8
                 length += tail
                 end_offset = handle.tell() - len(line) + tail
                 break
             elif marker_re.match(line) or not line:
                 # Start of next record or end of file
                 raise ValueError("Didn't find end of record")
             else:
                 length += len(line)
         if not key:
             raise ValueError("Did not find <accession> line in bytes %i to %i" % (start_offset, end_offset))
         yield _bytes_to_string(key), start_offset, length
         # Find start of next record
         while not marker_re.match(line) and line:
             start_offset = handle.tell()
             line = handle.readline()
     assert not line, repr(line)
Example #30
0
 def __iter__(self):
     """Iterate over the file, yielding (key, start offset, length).

     One tuple is produced per record.  The key is taken from the ID line
     (first field, plus the version when the second field starts with
     ``SV``) and is replaced by the second word of any later line that
     starts with ``SV ``.

     Raises ValueError if the ID line has neither 6 nor 3 semicolons
     after its first two characters.
     """
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     semi_char = _as_bytes(";")
     dot_char = _as_bytes(".")
     sv_marker = _as_bytes("SV ")
     # Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     # Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         # We cannot assume the record.id is the first word after ID,
         # normally the SV line is used.
         length = len(line)
         semi_count = line[2:].count(semi_char)
         if semi_count == 6:
             # Looks like the semi colon separated style introduced in 2006
             parts = line[3:].rstrip().split(semi_char)
             if parts[1].strip().startswith(sv_marker):
                 # The SV bit gives the version
                 key = parts[0].strip() + dot_char + parts[1].strip().split()[1]
             else:
                 key = parts[0].strip()
         elif semi_count == 3:
             # Looks like the pre 2006 style, take first word only
             key = line[3:].strip().split(None, 1)[0]
         else:
             # The line was read as bytes; decode it first, otherwise the
             # concatenation itself fails with TypeError on Python 3 and
             # the intended ValueError is never raised.
             raise ValueError("Did not recognise the ID line layout:\n" + _bytes_to_string(line))
         while True:
             end_offset = handle.tell()
             line = handle.readline()
             if marker_re.match(line) or not line:
                 end_offset = handle.tell() - len(line)
                 yield _bytes_to_string(key), start_offset, length
                 start_offset = end_offset
                 break
             elif line.startswith(sv_marker):
                 key = line.rstrip().split()[1]
             length += len(line)
     assert not line, repr(line)
Example #31
0
 def __iter__(self):
     """Iterate over the file, yielding (key, start offset, length).

     Each non-blank line is one record.  Its key is the text before the
     first tab, its offset is where the line starts and its length is the
     length of the whole line.  Lines containing only whitespace are
     skipped.
     """
     handle = self._handle
     handle.seek(0)
     tab_char = _as_bytes("\t")
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if not line:
             break  # End of file
         if not line.strip():
             # Ignore blank lines.  Splitting on a tab never raises
             # ValueError, so this must be tested explicitly; a handler
             # around the split would never run and blank lines would be
             # reported as records.
             continue
         key = line.split(tab_char)[0]
         yield _bytes_to_string(key), start_offset, len(line)
Example #32
0
    def __iter__(self):
        """Iterate over the file, yielding (query ID, start offset, 0).

        Lines are fetched with ``read_forward``.  A line starting with
        ``self.qresult_start`` supplies the key (group 1 of the
        ``_QRE_ID_LEN_PTN`` match, stripped) and marks the start offset.
        A line starting with ``self.qresult_end`` triggers the yield.
        The third tuple element is always 0.
        """
        fh = self._handle
        fh.seek(0)
        block_start = fh.tell()
        id_pattern = re.compile(_as_bytes(_QRE_ID_LEN_PTN))

        while True:
            current = read_forward(fh)
            after_line = fh.tell()

            if current.startswith(self.qresult_start):
                raw_key = id_pattern.search(current).group(1).strip()
                # the query result begins where this start-mark line
                # begins, i.e. one line length before the current position
                block_start = after_line - len(current)
                continue
            if current.startswith(self.qresult_end):
                yield _bytes_to_string(raw_key), block_start, 0
                block_start = after_line
                continue
            if not current:
                # nothing more to read
                break
Example #33
0
    def __iter__(self):
        """Iterate over the file, yielding (query ID, start offset, 0).

        Each line comes from ``read_forward``.  When a line begins with
        ``self.qresult_start`` its ID (group 1 of ``_QRE_ID_LEN_PTN``,
        stripped) and starting offset are remembered.  When a line begins
        with ``self.qresult_end`` the remembered values are yielded.
        The length slot of the tuple is always 0.
        """
        source = self._handle
        source.seek(0)
        result_start = source.tell()
        key_regex = re.compile(_as_bytes(_QRE_ID_LEN_PTN))

        while True:
            text = read_forward(source)
            position = source.tell()

            if text.startswith(self.qresult_start):
                pending_key = key_regex.search(text).group(1).strip()
                # offset of the line carrying the start mark
                result_start = position - len(text)
                continue
            if text.startswith(self.qresult_end):
                yield _bytes_to_string(pending_key), result_start, 0
                result_start = position
                continue
            if not text:
                # empty read: stop iterating
                break
Example #34
0
 def __iter__(self):
     """Iterate over the file, yielding (key, start offset, length).

     One tuple is produced per record.  The key is the second word of the
     ``ACCESSION`` line, replaced by the second word of the ``VERSION``
     line when that word has exactly one dot followed by digits.

     Raises ValueError if a record ends without a key having been found.
     """
     fh = self._handle
     fh.seek(0)
     is_record_start = self._marker_re.match
     dot = _as_bytes(".")
     acc_prefix = _as_bytes("ACCESSION ")
     ver_prefix = _as_bytes("VERSION ")
     #Skip any header before first record
     record_start = fh.tell()
     current = fh.readline()
     while current and not is_record_start(current):
         record_start = fh.tell()
         current = fh.readline()
     #Should now be at the start of a record, or end of the file
     while is_record_start(current):
         #We cannot assume the record.id is the first word after LOCUS,
         #normally the first entry on the VERSION or ACCESSION line is used.
         record_key = None
         record_len = len(current)
         while True:
             next_start = fh.tell()
             current = fh.readline()
             if not current or is_record_start(current):
                 if not record_key:
                     raise ValueError(
                         "Did not find ACCESSION/VERSION lines")
                 yield _bytes_to_string(record_key), record_start, record_len
                 record_start = next_start
                 break
             if current.startswith(acc_prefix):
                 record_key = current.rstrip().split()[1]
             elif current.startswith(ver_prefix):
                 candidate = current.rstrip().split()[1]
                 pieces = candidate.split(dot)
                 if len(pieces) == 2 and pieces[1].isdigit():
                     #This should mimic the GenBank parser...
                     record_key = candidate
             record_len += len(current)
     assert not current, repr(current)
Example #35
0
 def __iter__(self):
     """Iterate over the file, yielding (key, offset, length).

     A record begins on a line matching ``self._marker_re``.  Its key is
     the first word of the first following line that neither starts with
     ``;`` nor is blank.  The length covers the marker line and the
     skipped lines before the key line.

     Raises ValueError if the file ends before a key line is found.
     """
     fh = self._handle
     fh.seek(0)
     is_marker = self._marker_re.match
     comment_prefix = _as_bytes(";")
     while True:
         record_offset = fh.tell()
         first = fh.readline()
         if not is_marker(first):
             if not first:
                 # End of file
                 break
             continue
         span = len(first)
         # Now look for the first line which doesn't start ";"
         current = fh.readline()
         while current[0:1] == comment_prefix or not current.strip():
             if not current:
                 raise ValueError("Premature end of file?")
             span += len(current)
             current = fh.readline()
         yield _bytes_to_string(current.split()[0]), record_offset, span
Example #36
0
 def __iter__(self):
     """Iterate over the file, yielding (key, offset, length).

     A line matching ``self._marker_re`` opens a record.  The key is the
     first word of the next line that is not blank and does not begin
     with ``;``.  The length counts the marker line plus every line
     skipped before that key line.

     Raises ValueError if the end of the file is reached while still
     looking for the key line.
     """
     source = self._handle
     source.seek(0)
     starts_record = self._marker_re.match
     semicolon = _as_bytes(";")
     while True:
         begin = source.tell()
         head_line = source.readline()
         if not starts_record(head_line):
             if not head_line:
                 #End of file
                 break
             continue
         consumed = len(head_line)
         #Now look for the first line which doesn't start ";"
         probe = source.readline()
         while probe[0:1] == semicolon or not probe.strip():
             if not probe:
                 raise ValueError("Premature end of file?")
             consumed += len(probe)
             probe = source.readline()
         yield _bytes_to_string(probe.split()[0]), begin, consumed
Example #37
0
 def __iter__(self):
     """Iterate over the file, yielding (key, start offset, length).

     One tuple is produced per record.  The key is the second word of the
     ``ACCESSION`` line, replaced by the second word of the ``VERSION``
     line when that word has exactly one dot followed by digits.

     Raises ValueError if a record ends without a key having been found.
     """
     source = self._handle
     source.seek(0)
     starts_record = self._marker_re.match
     period = _as_bytes(".")
     accession_tag = _as_bytes("ACCESSION ")
     version_tag = _as_bytes("VERSION ")
     # Skip any header before first record
     begin = source.tell()
     text = source.readline()
     while text and not starts_record(text):
         begin = source.tell()
         text = source.readline()
     # Should now be at the start of a record, or end of the file
     while starts_record(text):
         # We cannot assume the record.id is the first word after LOCUS,
         # normally the first entry on the VERSION or ACCESSION line is used.
         found_key = None
         consumed = len(text)
         while True:
             following = source.tell()
             text = source.readline()
             if not text or starts_record(text):
                 if not found_key:
                     raise ValueError("Did not find ACCESSION/VERSION lines")
                 yield _bytes_to_string(found_key), begin, consumed
                 begin = following
                 break
             if text.startswith(accession_tag):
                 found_key = text.rstrip().split()[1]
             elif text.startswith(version_tag):
                 versioned = text.rstrip().split()[1]
                 halves = versioned.split(period)
                 if len(halves) == 2 and halves[1].isdigit():
                     # This should mimic the GenBank parser...
                     found_key = versioned
             consumed += len(text)
     assert not text, repr(text)
Example #38
0
    def __iter__(self):
        """Iterate over the file, yielding (query ID, start offset, length).

        A query starts on a line that contains ``>>>`` without beginning
        with it; its ID is group 1 of ``_RE_ID_DESC_SEQLEN_IDX`` applied to
        that line.  A result is yielded when the peeked next line starts a
        new query or when the end of the file is reached.  Nothing is
        yielded if the file holds no query line at all.
        """
        handle = self._handle
        handle.seek(0)
        start_offset = handle.tell()
        qresult_key = None
        query_mark = _as_bytes('>>>')

        while True:
            line = handle.readline()
            peekline = handle.peekline()
            end_offset = handle.tell()

            if not line.startswith(query_mark) and query_mark in line:
                regx = re.search(_RE_ID_DESC_SEQLEN_IDX, line)
                qresult_key = _bytes_to_string(regx.group(1))
                start_offset = end_offset - len(line)

            if qresult_key is None:
                # No query seen yet.  Stop at end of file; otherwise an
                # empty file (or one without any query line) would keep
                # reading empty strings forever.
                if not line:
                    break
                continue

            # yield whenever we encounter a new query or at the end of the file
            next_is_query = (not peekline.startswith(query_mark)
                             and query_mark in peekline)
            if next_is_query or not line:
                yield qresult_key, start_offset, end_offset - start_offset
                if not line:
                    break
                start_offset = end_offset
Example #39
0
    def __iter__(self):
        """Iterate over the file, yielding (query ID, start offset, length).

        A query starts on a line that contains ``>>>`` without beginning
        with it; its ID is group 1 of ``_RE_ID_DESC_SEQLEN_IDX`` applied to
        that line.  A result is yielded when the peeked next line starts a
        new query or when the end of the file is reached.  Nothing is
        yielded if the file holds no query line at all.
        """
        handle = self._handle
        handle.seek(0)
        start_offset = handle.tell()
        qresult_key = None
        query_mark = _as_bytes('>>>')

        while True:
            line = handle.readline()
            peekline = handle.peekline()
            end_offset = handle.tell()

            if not line.startswith(query_mark) and query_mark in line:
                regx = re.search(_RE_ID_DESC_SEQLEN_IDX, line)
                qresult_key = _bytes_to_string(regx.group(1))
                start_offset = end_offset - len(line)

            if qresult_key is None:
                # No query seen yet.  Stop at end of file; otherwise an
                # empty file (or one without any query line) would keep
                # reading empty strings forever.
                if not line:
                    break
                continue

            # yield whenever we encounter a new query or at the end of the file
            next_is_query = (not peekline.startswith(query_mark)
                             and query_mark in peekline)
            if next_is_query or not line:
                yield qresult_key, start_offset, end_offset - start_offset
                if not line:
                    break
                start_offset = end_offset
Example #40
0
 def get(self, offset):
     """Returns SeqRecord."""
     #Should be overridden for binary file formats etc:
     raw_record = self.get_raw(offset)
     text_handle = StringIO(_bytes_to_string(raw_record))
     return self._parse(text_handle)
Example #41
0
 def get(self, offset):
     """Return what ``self._parse`` produces for the record at ``offset``.

     The raw bytes from ``get_raw`` are decoded and wrapped in a
     ``StringIO`` so the parser can read them like a file.
     """
     raw_record = self.get_raw(offset)
     text_handle = StringIO(_bytes_to_string(raw_record))
     return self._parse(text_handle)
Example #42
0
 def get(self, offset):
     """Return what ``self._parse`` produces for the record at ``offset``.

     ``get_raw`` supplies the record's bytes; they are decoded to text and
     given to the parser through an in-memory ``StringIO`` handle.
     """
     decoded = _bytes_to_string(self.get_raw(offset))
     buffer = StringIO(decoded)
     return self._parse(buffer)
Example #43
0
 def get(self, offset):
     """Returns SeqRecord."""
     # Should be overridden for binary file formats etc:
     decoded = _bytes_to_string(self.get_raw(offset))
     buffer = StringIO(decoded)
     return self._parse(buffer)