Esempio n. 1
0
    def __iter__(self):
        """Iterate over the handle; yields query key, start offset, and 0.

        A result is yielded each time a line starting with
        ``self.qresult_end`` is seen.  The third element of every yielded
        tuple is always 0.  When the first line read starts with
        ``hmmsearch``, one further result is yielded when the end of the
        file is reached.
        """
        fh = self._handle
        fh.seek(0)
        record_start = fh.tell()
        id_pattern = re.compile(_as_bytes(r'Query\s*(?:sequence|HMM)?:\s*(.*)'))

        # the very first line tells us whether this is hmmsearch output
        current = read_forward(fh)
        from_hmmsearch = bool(current.startswith(_as_bytes('hmmsearch')))

        while True:
            # position just past the line currently being examined
            position = fh.tell()

            if current.startswith(self.qresult_start):
                qresult_key = id_pattern.search(current).group(1).strip()
                # the query result begins at the start of this very line
                record_start = position - len(current)
            elif current.startswith(self.qresult_end):
                yield _bytes_to_string(qresult_key), record_start, 0
                record_start = position
            elif not current:
                # HACK: since hmmsearch can only have one query result
                if from_hmmsearch:
                    yield _bytes_to_string(qresult_key), record_start, 0
                break

            current = read_forward(fh)
Esempio n. 2
0
 def __iter__(self):
     """Yield (key, start offset, length) for every record in the handle.

     Records begin on lines matched by ``self._marker_re``.  The key is
     taken from the line directly after the marker line, which must start
     with ``AC ``: it is the text before the first semicolon.
     """
     fh = self._handle
     fh.seek(0)
     starts_record = self._marker_re.match
     semicolon = _as_bytes(";")
     # Skip any header before the first record
     while True:
         record_start = fh.tell()
         current = fh.readline()
         if not current or starts_record(current):
             break
     # Now at the start of a record, or at the end of the file
     while starts_record(current):
         # We cannot assume the record.id is the first word after ID,
         # normally the following AC line is used.
         ac_line = fh.readline()
         size = len(current) + len(ac_line)
         assert ac_line.startswith(_as_bytes("AC "))
         key = ac_line[3:].strip().split(semicolon)[0].strip()
         while True:
             next_start = fh.tell()
             current = fh.readline()
             if current and not starts_record(current):
                 size += len(current)
                 continue
             # Next record or end of file: report the finished record
             yield _bytes_to_string(key), record_start, size
             record_start = next_start
             break
     assert not current, repr(current)
Esempio n. 3
0
 def __iter__(self):
     """Yield an (id, start offset, length) tuple for each record."""
     prefix_len = len(self._marker)
     starts_record = self._marker_re.match
     fh = self._handle
     fh.seek(0)
     # Skip any header before the first record
     while True:
         record_start = fh.tell()
         current = fh.readline()
         if not current or starts_record(current):
             break
     # Now at the start of a record, or at the end of the file
     while starts_record(current):
         # The record id is taken to be the first word after the marker.
         # This is generally fine... but not for GenBank, EMBL, Swiss
         record_id = current[prefix_len:].strip().split(None, 1)[0]
         # Sum line lengths explicitly as can't do file offset difference
         # on BGZF
         size = len(current)
         while True:
             next_start = fh.tell()
             current = fh.readline()
             if current and not starts_record(current):
                 size += len(current)
                 continue
             # Next record or end of file: report the finished record
             yield _bytes_to_string(record_id), record_start, size
             record_start = next_start
             break
     assert not current, repr(current)
Esempio n. 4
0
 def __iter__(self):
     """Yield (key, start offset, length) for every record in the handle.

     Records begin on lines matched by ``self._marker_re``.  The key is
     first derived from that marker line:

     - six semicolons after the line code: the first field, plus ``.`` and
       the version number when the second field starts with ``SV ``;
     - three semicolons: the first word, minus any trailing semicolon.

     A later ``AC `` line replaces the key unless it was set from version
     information, and a later ``SV `` line always replaces it.

     Raises:
         ValueError: if the marker line has neither layout.
     """
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     semi_char = _as_bytes(";")
     dot_char = _as_bytes(".")
     sv_marker = _as_bytes("SV ")
     ac_marker = _as_bytes("AC ")
     # Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     # Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         # We cannot assume the record.id is the first word after ID,
         # normally the SV line is used.
         setbysv = False  # resets sv as false
         length = len(line)
         semi_count = line[2:].count(semi_char)
         if semi_count == 6:
             # Looks like the semi colon separated style introduced in 2006
             parts = line[3:].rstrip().split(semi_char)
             if parts[1].strip().startswith(sv_marker):
                 # The SV bit gives the version
                 key = parts[0].strip() + dot_char + \
                     parts[1].strip().split()[1]
                 setbysv = True
             else:
                 key = parts[0].strip()
         elif semi_count == 3:
             # Looks like the pre 2006 style, take first word only
             key = line[3:].strip().split(None, 1)[0]
             if key.endswith(semi_char):
                 key = key[:-1]
         else:
             # The line is bytes, so format it with %r; concatenating it
             # to a str would raise TypeError instead of this ValueError.
             raise ValueError(
                 'Did not recognise the ID line layout:\n%r' % (line,))
         while True:
             # Offset of the line about to be read.  Taken before the read
             # because offset arithmetic is not valid on BGZF handles.
             end_offset = handle.tell()
             line = handle.readline()
             if marker_re.match(line) or not line:
                 yield _bytes_to_string(key), start_offset, length
                 start_offset = end_offset
                 break
             elif line.startswith(ac_marker) and not setbysv:
                 key = line.rstrip().split()[1]
                 if key.endswith(semi_char):
                     key = key[:-1]
             elif line.startswith(sv_marker):
                 key = line.rstrip().split()[1]
                 setbysv = True
             length += len(line)
     assert not line, repr(line)
Esempio n. 5
0
    def __iter__(self):
        """Iterate over the file handle; yield key, start offset, and length.

        Leading lines starting with ``#`` are skipped.  Consecutive rows
        sharing the same value in column ``self._query_id_idx`` (columns
        are separated by runs of spaces) form one result.
        """
        fh = self._handle
        fh.seek(0)
        key_col = self._query_id_idx
        active_key = None
        comment_prefix = _as_bytes('#')
        separator = _as_bytes(' ')
        # seed with a mock comment line so the header loop runs at least once
        current = comment_prefix

        # skip over the header
        while current.startswith(comment_prefix):
            block_start = fh.tell()
            current = fh.readline()

        # index the query results
        while True:
            # position just past the row currently being examined
            after_current = fh.tell()

            if not current:
                break

            fields = [tok for tok in current.strip().split(separator) if tok]
            row_key = fields[key_col]
            if active_key is None:
                active_key = row_key
            elif row_key != active_key:
                # the previous result ends where this row begins
                boundary = after_current - len(current)
                yield _bytes_to_string(active_key), block_start, \
                        boundary - block_start
                active_key = row_key
                block_start = boundary

            current = fh.readline()
            if not current:
                # end of file: flush the last result
                yield _bytes_to_string(active_key), block_start, \
                        after_current - block_start
                break
0
 def __iter__(self):
     """Yield (id, start offset, length) for every FASTQ record.

     The id is the first word after the leading ``@``.  Sequence lines are
     read up to the ``+`` line, then quality lines are read until their
     combined stripped length equals that of the sequence.

     Raises:
         ValueError: if a record does not start with ``@``, the file ends
             inside the sequence section, the quality line of an empty
             sequence is not blank, or the sequence and quality lengths
             differ.
     """
     handle = self._handle
     handle.seek(0)
     start_offset = handle.tell()
     line = handle.readline()
     if not line:
         # Empty file!
         return
     at_char = _as_bytes("@")
     plus_char = _as_bytes("+")
     if line[0:1] != at_char:
         raise ValueError("Problem with FASTQ @ line:\n%r" % line)
     while line:
         # assert line[0]=="@"
         # This record seems OK (so far)
         record_id = line[1:].rstrip().split(None, 1)[0]
         # Find the seq line(s)
         seq_len = 0
         length = len(line)
         while line:
             line = handle.readline()
             length += len(line)
             if line.startswith(plus_char):
                 break
             seq_len += len(line.strip())
         if not line:
             raise ValueError("Premature end of file in seq section")
         # assert line[0]=="+"
         # Find the qual line(s)
         qual_len = 0
         while line:
             if seq_len == qual_len:
                 if seq_len == 0:
                     # Special case, quality line should be just "\n"
                     line = handle.readline()
                     if line.strip():
                         raise ValueError(
                             "Expected blank quality line, not %r" % line)
                     # The blank quality line is part of this record, so
                     # it must count towards the reported length.
                     length += len(line)
                 # Should be end of record...
                 end_offset = handle.tell()
                 line = handle.readline()
                 if line and line[0:1] != at_char:
                     raise ValueError("Problem with line %r" % line)
                 break
             else:
                 line = handle.readline()
                 qual_len += len(line.strip())
                 length += len(line)
         if seq_len != qual_len:
             raise ValueError("Problem with quality section")
         yield _bytes_to_string(record_id), start_offset, length
         start_offset = end_offset
Esempio n. 7
0
    def __iter__(self):
        """Iterate over the file handle; yield key, start offset, and length.

        Lines are skipped until one matches ``_RE_ROW_CHECK_IDX``.  From
        there on, consecutive rows sharing the same value in tab-separated
        column 9 form one result.  Nothing is yielded if the handle is
        exhausted before a matching row is found.
        """
        handle = self._handle
        handle.seek(0)
        # denotes column location for query identifier
        query_id_idx = 9
        qresult_key = None
        tab_char = _as_bytes('\t')

        start_offset = handle.tell()
        line = handle.readline()
        # read through header
        # this assumes that the result row match the regex
        while not re.search(_RE_ROW_CHECK_IDX, line.strip()):
            start_offset = handle.tell()
            line = handle.readline()
            if not line:
                # A plain return ends the generator; raising StopIteration
                # here is turned into RuntimeError by PEP 479.
                return

        # and index the qresults
        while True:
            # position just past the row currently being examined
            end_offset = handle.tell()

            cols = [x for x in line.strip().split(tab_char) if x]
            if qresult_key is None:
                qresult_key = cols[query_id_idx]
            else:
                curr_key = cols[query_id_idx]

                if curr_key != qresult_key:
                    # The previous result ends where this row begins, so
                    # this row must not count towards its length.
                    adj_end = end_offset - len(line)
                    yield _bytes_to_string(qresult_key), start_offset, \
                            adj_end - start_offset
                    qresult_key = curr_key
                    start_offset = adj_end

            line = handle.readline()
            if not line:
                # end of file: flush the last result
                yield _bytes_to_string(qresult_key), start_offset, \
                        end_offset - start_offset
                break
Esempio n. 8
0
 def get(self, offset):
     """Return the first record parsed from the raw entry at ``offset``.

     The raw entry text from ``self.get_raw`` is wrapped in an XML header
     and ``<uniprot>`` root element and passed to
     ``SeqIO.UniprotIO.UniprotIterator``.
     """
     # TODO - Can we handle this directly in the parser?
     # This is a hack - use get_raw for <entry>...</entry> and wrap it with
     # the apparently required XML header and footer.
     entry_text = _bytes_to_string(self.get_raw(offset))
     data = """<?xml version='1.0' encoding='UTF-8'?>
     <uniprot xmlns="http://uniprot.org/uniprot"
     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
     xsi:schemaLocation="http://uniprot.org/uniprot
     http://www.uniprot.org/support/docs/uniprot.xsd">
     %s
     </uniprot>
     """ % entry_text
     # TODO - For consistency, this function should not accept a string:
     records = SeqIO.UniprotIO.UniprotIterator(data)
     return next(records)
Esempio n. 9
0
 def __iter__(self):
     """Yield (key, start offset, length) for every ``<entry>`` record.

     Records begin on lines matched by ``self._marker_re`` and end at the
     first ``</entry>`` tag.  The key is the content of the first
     ``<accession>`` element in the record.  The length runs from the
     start of the marker line to the end of the ``</entry>`` tag.

     Raises:
         ValueError: if a new record or the end of the file is reached
             before ``</entry>``, or if no ``<accession>`` is found.
     """
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     start_acc_marker = _as_bytes("<accession>")
     end_acc_marker = _as_bytes("</accession>")
     end_entry_marker = _as_bytes("</entry>")
     less_than = _as_bytes("<")
     # Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     # Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         length = len(line)
         # We expect the next line to be <accession>xxx</accession>
         # (possibly with leading spaces)
         # but allow it to be later on within the <entry>
         key = None
         while True:
             line = handle.readline()
             if key is None and start_acc_marker in line:
                 assert end_acc_marker in line, line
                 key_start = line.find(start_acc_marker) \
                     + len(start_acc_marker)
                 key = line[key_start:].split(less_than, 1)[0]
                 length += len(line)
             elif end_entry_marker in line:
                 # Bytes of this line up to and including </entry>.  They
                 # belong to the record, so add them to its length.
                 tail = line.find(end_entry_marker) + len(end_entry_marker)
                 length += tail
                 end_offset = handle.tell() - len(line) + tail
                 break
             elif marker_re.match(line) or not line:
                 # Start of next record or end of file
                 raise ValueError("Didn't find end of record")
             else:
                 length += len(line)
         if not key:
             raise ValueError(
                 "Did not find <accession> line in bytes %i to %i" %
                 (start_offset, end_offset))
         yield _bytes_to_string(key), start_offset, length
         # Find start of next record
         while not marker_re.match(line) and line:
             start_offset = handle.tell()
             line = handle.readline()
     assert not line, repr(line)
Esempio n. 10
0
 def __iter__(self):
     """Yield (key, start offset, length) for every non-blank line.

     The key is the text before the first tab character, or the whole
     line if it contains no tab.  Lines holding only whitespace are
     skipped.
     """
     handle = self._handle
     handle.seek(0)
     tab_char = _as_bytes("\t")
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if not line:
             break  # End of file
         if not line.strip():
             # Ignore blank lines.  This has to be an explicit test:
             # split() always returns at least one item, so indexing its
             # first element can never raise to signal a blank line.
             continue
         key = line.split(tab_char)[0]
         yield _bytes_to_string(key), start_offset, len(line)
Esempio n. 11
0
    def __iter__(self):
        """Iterate over the handle; yields query key, start offset, and 0.

        A result is yielded each time a line starting with
        ``self.qresult_end`` is seen.  The third element of every yielded
        tuple is always 0.
        """
        fh = self._handle
        fh.seek(0)
        record_start = fh.tell()
        id_pattern = re.compile(_as_bytes(_QRE_ID_LEN_PTN))

        while True:
            current = read_forward(fh)
            # position just past the line that was read
            position = fh.tell()

            if current.startswith(self.qresult_start):
                qresult_key = id_pattern.search(current).group(1).strip()
                # the query result begins at the start of this very line
                record_start = position - len(current)
            elif current.startswith(self.qresult_end):
                yield _bytes_to_string(qresult_key), record_start, 0
                record_start = position
            elif not current:
                break
Esempio n. 12
0
 def __iter__(self):
     """Yield (key, start offset, length) for every record in the handle.

     Records begin on lines matched by ``self._marker_re``.  The key is
     the second word of an ``ACCESSION `` line.  The second word of a
     ``VERSION `` line replaces it when that word contains exactly one
     dot followed by digits only.

     Raises:
         ValueError: if a record ends without a key having been found.
     """
     fh = self._handle
     fh.seek(0)
     starts_record = self._marker_re.match
     dot = _as_bytes(".")
     accession_prefix = _as_bytes("ACCESSION ")
     version_prefix = _as_bytes("VERSION ")
     # Skip any header before the first record
     while True:
         record_start = fh.tell()
         current = fh.readline()
         if not current or starts_record(current):
             break
     # Now at the start of a record, or at the end of the file
     while starts_record(current):
         # We cannot assume the record.id is the first word after LOCUS,
         # normally the first entry on the VERSION or ACCESSION line is used.
         key = None
         size = len(current)
         while True:
             next_start = fh.tell()
             current = fh.readline()
             if not current or starts_record(current):
                 if not key:
                     raise ValueError(
                         "Did not find ACCESSION/VERSION lines")
                 yield _bytes_to_string(key), record_start, size
                 record_start = next_start
                 break
             if current.startswith(accession_prefix):
                 key = current.rstrip().split()[1]
             elif current.startswith(version_prefix):
                 candidate = current.rstrip().split()[1]
                 pieces = candidate.split(dot)
                 if len(pieces) == 2 and pieces[1].isdigit():
                     # This should mimic the GenBank parser...
                     key = candidate
             size += len(current)
     assert not current, repr(current)
Esempio n. 13
0
 def __iter__(self):
     """Yield (key, offset, length) for every record in the handle.

     A record begins on a line matched by ``self._marker_re``.  Its key is
     the first word of the first following line that is non-blank and
     does not start with ``;``.  The length covers the lines before that
     key line.

     Raises:
         ValueError: if the file ends before a key line is found.
     """
     fh = self._handle
     fh.seek(0)
     starts_record = self._marker_re.match
     comment_prefix = _as_bytes(";")
     while True:
         record_start = fh.tell()
         current = fh.readline()
         if starts_record(current):
             size = len(current)
             # Now look for the first line which doesn't start ";"
             while True:
                 current = fh.readline()
                 if current.strip() and \
                         not current.startswith(comment_prefix):
                     first_word = current.split()[0]
                     yield _bytes_to_string(first_word), record_start, size
                     break
                 if not current:
                     raise ValueError("Premature end of file?")
                 size += len(current)
         elif not current:
             # End of file
             break
Esempio n. 14
0
    def __iter__(self):
        """Iterate over the file handle; yield key, start offset, and length.

        A query line is one that contains ``>>>`` but does not start with
        it; its key is group 1 of ``_RE_ID_DESC_SEQLEN_IDX``.  A result is
        yielded when the next line (from ``handle.peekline()``) is a query
        line, or when the end of the file is reached.  Nothing is yielded
        for a file without any query line.
        """
        handle = self._handle
        handle.seek(0)
        start_offset = handle.tell()
        qresult_key = None
        query_mark = _as_bytes('>>>')

        while True:
            line = handle.readline()
            peekline = handle.peekline()
            end_offset = handle.tell()

            if not line.startswith(query_mark) and query_mark in line:
                regx = re.search(_RE_ID_DESC_SEQLEN_IDX, line)
                qresult_key = _bytes_to_string(regx.group(1))
                start_offset = end_offset - len(line)

            if qresult_key is None:
                if not line:
                    # End of file without any query line.  Stop here, as
                    # no other branch would ever end the loop.
                    break
                continue

            # yield whenever we encounter a new query or at the end of the file
            if (not peekline.startswith(query_mark)
                    and query_mark in peekline) or not line:
                yield qresult_key, start_offset, end_offset - start_offset
                if not line:
                    break
                start_offset = end_offset
Esempio n. 15
0
 def get(self, offset):
     """Return ``self._parse`` applied to the raw record text at ``offset``."""
     record_text = _bytes_to_string(self.get_raw(offset))
     return self._parse(StringIO(record_text))
Esempio n. 16
0
 def get(self, offset):
     """Return ``self._parse`` applied to the raw record text at ``offset``.

     The original docstring describes the result as a SeqRecord.
     """
     # Should be overridden for binary file formats etc:
     record_text = _bytes_to_string(self.get_raw(offset))
     text_handle = StringIO(record_text)
     return self._parse(text_handle)