def get_raw(self, offset):
    """Return the raw record from the file as a bytes string.

    The result is built from three parts: the file header (every line read
    from the start of the file until the next line is a query line), the
    query result that starts at ``offset``, and a mock ``>>><<<`` end marker.

    :param offset: byte offset at which the query result starts
    :type offset: int
    :returns: header + query result + mock end marker
    :rtype: bytes
    """
    handle = self._handle
    query_mark = _as_bytes('>>>')
    chunks = []

    def is_query_line(candidate):
        # a query line has '>>>' somewhere in it, but not at its very start
        return query_mark in candidate and \
            not candidate.startswith(query_mark)

    # read header first
    handle.seek(0)
    while True:
        line = handle.readline()
        peekline = handle.peekline()
        chunks.append(line)
        # also stop at end of file, otherwise a file without any query
        # line would keep reading empty lines forever
        if is_query_line(peekline) or not line:
            break

    # and read the qresult raw string
    handle.seek(offset)
    while True:
        # preserve whitespace, don't use read_forward
        line = handle.readline()
        peekline = handle.peekline()
        chunks.append(line)
        # break when we've reached qresult end
        if is_query_line(peekline) or not line:
            break

    # append mock end marker, since it's not always present in the file
    chunks.append(_as_bytes('>>><<<\n'))
    return _as_bytes('').join(chunks)
def __iter__(self):
    """Iterate over the file; yield ``(key, start_offset, length)`` tuples.

    The key is the text captured after the ``Query ...:`` label, the start
    offset is the position of that line, and the length is always given
    as 0.
    """
    handle = self._handle
    handle.seek(0)
    start_offset = handle.tell()
    id_pattern = re.compile(_as_bytes(r'Query\s*(?:sequence|HMM)?:\s*(.*)'))

    line = read_forward(handle)
    # flag for hmmsearch output, taken from the first line read
    is_hmmsearch = line.startswith(_as_bytes('hmmsearch'))

    while True:
        end_offset = handle.tell()

        if line.startswith(self.qresult_start):
            qresult_key = id_pattern.search(line).group(1).strip()
            # the record begins on this very line, so step back over it
            start_offset = end_offset - len(line)
        elif line.startswith(self.qresult_end):
            yield _bytes_to_string(qresult_key), start_offset, 0
            start_offset = end_offset
        elif not line:
            # HACK: since hmmsearch can only have one query result, it is
            # emitted at end of file rather than at an end mark
            if is_hmmsearch:
                yield _bytes_to_string(qresult_key), start_offset, 0
            break

        line = read_forward(handle)
def __iter__(self):
    """Iterate over the records; yield ``(key, start_offset, length)``.

    A record starts at a line matching ``self._marker_re``. Its key is the
    first ``;``-separated field of the line that follows, which must start
    with ``AC ``.
    """
    handle = self._handle
    handle.seek(0)
    is_marker = self._marker_re.match
    semicolon = _as_bytes(";")

    # Skip any header before first record
    start_offset = handle.tell()
    line = handle.readline()
    while line and not is_marker(line):
        start_offset = handle.tell()
        line = handle.readline()

    # Should now be at the start of a record, or end of the file
    while is_marker(line):
        length = len(line)
        # We cannot assume the record.id is the first word after ID,
        # normally the following AC line is used.
        line = handle.readline()
        length += len(line)
        assert line.startswith(_as_bytes("AC "))
        key = line[3:].strip().split(semicolon)[0].strip()

        # Consume the rest of the record, remembering where each line began
        end_offset = handle.tell()
        line = handle.readline()
        while line and not is_marker(line):
            length += len(line)
            end_offset = handle.tell()
            line = handle.readline()

        yield _bytes_to_string(key), start_offset, length
        start_offset = end_offset
    assert not line, repr(line)
class Hmmer3TextIndexer(_BaseHmmerTextIndexer):
    """Indexer class for HMMER plain text output."""

    _parser = Hmmer3TextParser
    # line prefixes that open and close one query result
    qresult_start = _as_bytes('Query: ')
    qresult_end = _as_bytes('//')

    def __iter__(self):
        """Iterate over the file; yield ``(key, start_offset, length)``.

        The length is always given as 0.
        """
        handle = self._handle
        handle.seek(0)
        start_offset = handle.tell()
        id_pattern = re.compile(_as_bytes(_QRE_ID_LEN_PTN))
        begin_mark = self.qresult_start
        end_mark = self.qresult_end

        while True:
            line = read_forward(handle)
            end_offset = handle.tell()

            if line.startswith(begin_mark):
                qresult_key = id_pattern.search(line).group(1).strip()
                # the record begins on this very line, so step back over it
                start_offset = end_offset - len(line)
            elif line.startswith(end_mark):
                yield _bytes_to_string(qresult_key), start_offset, 0
                start_offset = end_offset
            elif not line:
                # end of file
                break
def __iter__(self):
    """Iterate over the records; yield ``(key, start_offset, length)``.

    A record starts at a line matching ``self._marker_re`` (the ID line).
    The key is taken from the ID line, and may then be replaced by the value
    on a later ``AC`` line (only while no ``SV`` value has been seen) or on
    an ``SV`` line.

    :raises ValueError: if the ID line has neither 6 nor 3 semicolons
    """
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    semi_char = _as_bytes(";")
    dot_char = _as_bytes(".")
    sv_marker = _as_bytes("SV ")
    ac_marker = _as_bytes("AC ")
    # Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        # We cannot assume the record.id is the first word after ID,
        # normally the SV line is used.
        setbysv = False  # resets sv as false
        length = len(line)
        if line[2:].count(semi_char) == 6:
            # Looks like the semi colon separated style introduced in 2006
            parts = line[3:].rstrip().split(semi_char)
            if parts[1].strip().startswith(sv_marker):
                # The SV bit gives the version
                key = parts[0].strip() + dot_char + \
                    parts[1].strip().split()[1]
                setbysv = True
            else:
                key = parts[0].strip()
        elif line[2:].count(semi_char) == 3:
            # Looks like the pre 2006 style, take first word only
            key = line[3:].strip().split(None, 1)[0]
            if key.endswith(semi_char):
                key = key[:-1]
        else:
            # line is bytes: format it with %r, because concatenating it
            # to a str raises TypeError on Python 3 and hides this error
            raise ValueError(
                'Did not recognise the ID line layout:\n%r' % line)
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                end_offset = handle.tell() - len(line)
                yield _bytes_to_string(key), start_offset, length
                start_offset = end_offset
                break
            elif line.startswith(ac_marker) and not setbysv:
                key = line.rstrip().split()[1]
                if key.endswith(semi_char):
                    key = key[:-1]
            elif line.startswith(sv_marker):
                key = line.rstrip().split()[1]
                setbysv = True
            length += len(line)
    assert not line, repr(line)
def __iter__(self):
    """Iterate over the FASTQ records; yield ``(key, start_offset, length)``.

    The key is the first word after the leading ``@``. The sequence and
    quality sections may span several lines; the quality section ends once
    it holds as many non-whitespace characters as the sequence.

    :raises ValueError: if a record does not start with ``@``, the file ends
        inside the sequence section, a zero-length record is not followed by
        a blank quality line, or the sequence and quality lengths differ
    """
    handle = self._handle
    handle.seek(0)
    start_offset = handle.tell()
    line = handle.readline()
    if not line:
        # Empty file!
        return
    at_char = _as_bytes("@")
    plus_char = _as_bytes("+")
    if line[0:1] != at_char:
        raise ValueError("Problem with FASTQ @ line:\n%r" % line)
    while line:
        # This record seems OK (so far)
        record_id = line[1:].rstrip().split(None, 1)[0]
        # Find the seq line(s); the "+" line is counted in length too
        seq_len = 0
        length = len(line)
        while line:
            line = handle.readline()
            length += len(line)
            if line.startswith(plus_char):
                break
            seq_len += len(line.strip())
        if not line:
            raise ValueError("Premature end of file in seq section")
        # Find the qual line(s)
        qual_len = 0
        while line:
            if seq_len == qual_len:
                if seq_len == 0:
                    # Special case, quality line should be just "\n"
                    line = handle.readline()
                    if line.strip():
                        raise ValueError(
                            "Expected blank quality line, not %r" % line)
                    # The blank line belongs to this record, so count it
                    length += len(line)
                # Should be end of record...
                end_offset = handle.tell()
                line = handle.readline()
                if line and line[0:1] != at_char:
                    raise ValueError("Problem with line %r" % line)
                break
            else:
                line = handle.readline()
                qual_len += len(line.strip())
                length += len(line)
        if seq_len != qual_len:
            raise ValueError("Problem with quality section")
        yield _bytes_to_string(record_id), start_offset, length
        start_offset = end_offset
def get_raw(self, offset):
    """Return the raw record from the file as a bytes string.

    Reads from ``offset``: first every leading line that starts with ``;``,
    then every following line up to (not including) the next line that
    starts with ``;``, or the end of the file.

    :param offset: byte offset at which the record starts
    :type offset: int
    :returns: the concatenated lines of the record
    :rtype: bytes
    """
    handle = self._handle
    handle.seek(offset)
    semi_char = _as_bytes(";")
    lines = []
    line = handle.readline()
    # leading ";" lines of this record
    while line.startswith(semi_char):
        lines.append(line)
        line = handle.readline()
    # remaining lines, until a ";" line or end of file
    while line and not line.startswith(semi_char):
        lines.append(line)
        line = handle.readline()
    return _as_bytes("").join(lines)
def get_raw(self, offset):
    """Return the raw record from the file as a bytes string.

    The result is the header followed by the query result found at
    ``offset``. The header is ``self._preamble`` when that is non-empty;
    otherwise it is every line from the start of the file up to (not
    including) the first line that starts with ``self.qresult_start``.

    :param offset: byte offset at which the query result starts
    :type offset: int
    :returns: header + query result, through its end-mark line
    :rtype: bytes
    """
    handle = self._handle
    chunks = []

    # read header first
    if self._preamble:
        chunks.append(self._preamble)
    else:
        handle.seek(0)
        while True:
            line = handle.readline()
            # also stop at end of file, otherwise a file without any
            # query start line would keep reading empty lines forever
            if not line or line.startswith(self.qresult_start):
                break
            chunks.append(line)

    # and read the qresult raw string
    handle.seek(offset)
    while True:
        # preserve whitespace, don't use read_forward
        line = handle.readline()
        chunks.append(line)
        # break when we've reached qresult end
        if line.startswith(self.qresult_end) or not line:
            break

    return _as_bytes('').join(chunks)
def __iter__(self):
    """Iterate over the records; yield ``(key, start_offset, length)``.

    A record starts at a line matching ``self._marker_re`` and ends right
    after the first ``</entry>`` that follows. The key is the text of the
    first ``<accession>`` element inside the record.

    :raises ValueError: if a record has no ``</entry>`` before the next
        record or the end of the file, or has no ``<accession>`` line
    """
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    start_acc_marker = _as_bytes("<accession>")
    end_acc_marker = _as_bytes("</accession>")
    end_entry_marker = _as_bytes("</entry>")
    less_than = _as_bytes("<")
    # Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        length = len(line)
        # We expect the next line to be <accession>xxx</accession>
        # (possibly with leading spaces)
        # but allow it to be later on within the <entry>
        key = None
        while True:
            line = handle.readline()
            if key is None and start_acc_marker in line:
                assert end_acc_marker in line, line
                value_start = line.find(start_acc_marker) \
                    + len(start_acc_marker)
                key = line[value_start:].split(less_than, 1)[0]
                length += len(line)
            elif end_entry_marker in line:
                # The record stops right after the closing tag, which may
                # be followed by more text on the same line
                used = line.find(end_entry_marker) + len(end_entry_marker)
                # count the closing tag, so that length agrees with
                # end_offset - start_offset
                length += used
                end_offset = handle.tell() - len(line) + used
                break
            elif marker_re.match(line) or not line:
                # Start of next record or end of file
                raise ValueError("Didn't find end of record")
            else:
                length += len(line)
        if not key:
            raise ValueError(
                "Did not find <accession> line in bytes %i to %i"
                % (start_offset, end_offset))
        yield _bytes_to_string(key), start_offset, length
        # Find start of next record
        while not marker_re.match(line) and line:
            start_offset = handle.tell()
            line = handle.readline()
    assert not line, repr(line)
def get_raw(self, offset):
    """Return the raw record from the file as a bytes string.

    Reads one FASTQ record starting at ``offset``: the ``@`` title line, the
    sequence line(s), the ``+`` line, and quality line(s) until they hold as
    many non-whitespace characters as the sequence.

    :raises ValueError: on the same malformed input as ``__iter__``
    """
    # TODO - Refactor this and the __init__ method to reduce code duplication?
    handle = self._handle
    handle.seek(offset)
    line = handle.readline()
    pieces = [line]
    at_char = _as_bytes("@")
    plus_char = _as_bytes("+")
    if line[0:1] != at_char:
        raise ValueError("Problem with FASTQ @ line:\n%r" % line)

    # Sequence section: runs until the "+" line, which is kept as well
    seq_len = 0
    while line:
        line = handle.readline()
        pieces.append(line)
        if line.startswith(plus_char):
            break
        seq_len += len(line.strip())
    if not line:
        raise ValueError("Premature end of file in seq section")
    assert line[0:1] == plus_char

    # Quality section
    qual_len = 0
    while line:
        if seq_len != qual_len:
            line = handle.readline()
            pieces.append(line)
            qual_len += len(line.strip())
            continue
        if seq_len == 0:
            # Special case, quality line should be just "\n"
            line = handle.readline()
            if line.strip():
                raise ValueError(
                    "Expected blank quality line, not %r" % line)
            pieces.append(line)
        # Should be end of record: peek at what follows without keeping it
        line = handle.readline()
        if line and line[0:1] != at_char:
            raise ValueError("Problem with line %r" % line)
        break
    if seq_len != qual_len:
        raise ValueError("Problem with quality section")
    return _as_bytes("").join(pieces)
def get_raw(self, offset):
    """Return the raw record from the file as a bytes string.

    Reads from ``offset`` up to and including the first ``</entry>``; any
    text after that tag on the same line is dropped.

    :raises ValueError: if the next record or the end of the file is reached
        before ``</entry>`` is found
    """
    handle = self._handle
    is_marker = self._marker_re.match
    closing_tag = _as_bytes("</entry>")
    handle.seek(offset)
    # the first line is the record's own start line, taken as-is
    pieces = [handle.readline()]
    while True:
        line = handle.readline()
        tag_at = line.find(closing_tag)
        if tag_at != -1:
            pieces.append(line[:tag_at + len(closing_tag)])
            break
        if not line or is_marker(line):
            # End of file, or start of next record
            raise ValueError("Didn't find end of record")
        pieces.append(line)
    return _as_bytes("").join(pieces)
def __iter__(self):
    """Iterates over the file handle; yields key, start offset, and length.

    Leading lines that start with ``#`` are skipped. After that, consecutive
    rows sharing the same value in column ``self._query_id_idx`` (columns
    are separated by runs of spaces) form one query result.
    """
    handle = self._handle
    handle.seek(0)
    key_column = self._query_id_idx
    qresult_key = None
    header_mark = _as_bytes('#')
    split_mark = _as_bytes(' ')

    # read through header; the loop always runs at least once, so both
    # start_offset and line are set when it ends
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if not line.startswith(header_mark):
            break

    # nothing after the header: no query results to index
    if not line:
        return

    # and index the qresults
    while True:
        end_offset = handle.tell()
        row_key = [x for x in line.strip().split(split_mark) if x][key_column]

        if qresult_key is None:
            qresult_key = row_key
        elif row_key != qresult_key:
            # this row opens the next query result, so the previous one
            # stops where this row begins
            adj_end = end_offset - len(line)
            yield _bytes_to_string(qresult_key), start_offset, \
                adj_end - start_offset
            qresult_key = row_key
            start_offset = adj_end

        line = handle.readline()
        if not line:
            yield _bytes_to_string(qresult_key), start_offset, \
                end_offset - start_offset
            break
class Hmmer2TextIndexer(_BaseHmmerTextIndexer):
    """Indexer for hmmer2-text format."""

    _parser = Hmmer2TextParser
    qresult_start = _as_bytes('Query')
    # qresults_ends for hmmpfam and hmmsearch
    # need to anticipate both since hmmsearch have different query end mark
    qresult_end = _as_bytes('//')

    def __iter__(self):
        """Iterate over the file; yield ``(key, start_offset, length)``.

        The length is always given as 0.
        """
        handle = self._handle
        handle.seek(0)
        start_offset = handle.tell()
        id_pattern = re.compile(
            _as_bytes(r'Query\s*(?:sequence|HMM)?:\s*(.*)'))

        line = read_forward(handle)
        # flag for hmmsearch output, taken from the first line read
        is_hmmsearch = line.startswith(_as_bytes('hmmsearch'))

        while True:
            end_offset = handle.tell()

            if line.startswith(self.qresult_start):
                qresult_key = id_pattern.search(line).group(1).strip()
                # the record begins on this very line, so step back over it
                start_offset = end_offset - len(line)
            elif line.startswith(self.qresult_end):
                yield _bytes_to_string(qresult_key), start_offset, 0
                start_offset = end_offset
            elif not line:
                # HACK: since hmmsearch can only have one query result, it
                # is emitted at end of file rather than at an end mark
                if is_hmmsearch:
                    yield _bytes_to_string(qresult_key), start_offset, 0
                break

            line = read_forward(handle)
def __iter__(self):
    """Iterate over the records; yield ``(key, start_offset, length)``.

    A record starts at a line matching ``self._marker_re``. The key is the
    second word of the ``ACCESSION`` line, replaced by the second word of
    the ``VERSION`` line when that has the form ``name.digits``.

    :raises ValueError: if a record yields no key
    """
    handle = self._handle
    handle.seek(0)
    is_marker = self._marker_re.match
    dot_char = _as_bytes(".")
    accession_marker = _as_bytes("ACCESSION ")
    version_marker = _as_bytes("VERSION ")

    # Skip any header before first record
    start_offset = handle.tell()
    line = handle.readline()
    while line and not is_marker(line):
        start_offset = handle.tell()
        line = handle.readline()

    # Should now be at the start of a record, or end of the file
    while is_marker(line):
        # We cannot assume the record.id is the first word after LOCUS,
        # normally the first entry on the VERSION or ACCESSION line is used.
        key = None
        length = len(line)
        end_offset = handle.tell()
        line = handle.readline()
        while line and not is_marker(line):
            if line.startswith(accession_marker):
                key = line.rstrip().split()[1]
            elif line.startswith(version_marker):
                version_id = line.rstrip().split()[1]
                # This should mimic the GenBank parser...
                if version_id.count(dot_char) == 1 and \
                        version_id.split(dot_char)[1].isdigit():
                    key = version_id
            length += len(line)
            end_offset = handle.tell()
            line = handle.readline()
        if not key:
            raise ValueError("Did not find ACCESSION/VERSION lines")
        yield _bytes_to_string(key), start_offset, length
        start_offset = end_offset
    assert not line, repr(line)
def get_raw(self, offset):
    """Return the raw record from the file as a bytes string.

    Reads the line at ``offset`` and every following line, stopping before
    the next line that matches ``self._marker_re`` or at the end of the file.
    """
    # For non-trivial file formats this must be over-ridden in the subclass
    handle = self._handle
    is_marker = self._marker_re.match
    handle.seek(offset)
    # the first line is the record's own marker line, so it is never tested
    collected = [handle.readline()]
    line = handle.readline()
    while line and not is_marker(line):
        collected.append(line)
        line = handle.readline()
    return _as_bytes("").join(collected)
def get_raw(self, offset):
    """Returns the raw bytes string of a QueryResult object from the given offset.

    Rows are collected from ``offset`` for as long as the value in column
    ``self._query_id_idx`` (columns are separated by runs of spaces) stays
    the same as in the first row.
    """
    handle = self._handle
    handle.seek(offset)
    key_column = self._query_id_idx
    split_mark = _as_bytes(' ')
    first_key = None
    rows = []

    while True:
        line = handle.readline()
        if not line:
            # end of file
            break
        row_key = [x for x in line.strip().split(split_mark) if x][key_column]
        if first_key is None:
            first_key = row_key
        elif row_key != first_key:
            # this row belongs to the next query result
            break
        rows.append(line)

    return _as_bytes('').join(rows)
def write(self, data):
    """Add ``data`` to the buffer and write out every complete block.

    The data is converted with ``_as_bytes`` and appended to
    ``self._buffer``. While the buffer holds at least 65536 bytes, its first
    65536 bytes are passed to ``self._write_block`` and removed from it.
    """
    # TODO - Check bytes vs unicode
    block_size = 65536  # 2**16
    self._buffer += _as_bytes(data)
    # Nothing happens here while the buffer is still below one full block
    while len(self._buffer) >= block_size:
        self._write_block(self._buffer[:block_size])
        self._buffer = self._buffer[block_size:]
def __init__(self, filename, format, alphabet):
    """Set up the base class, then the record start marker for ``format``.

    Stores the marker text as ``self._marker`` and a bytes regular
    expression anchored at the start of a line as ``self._marker_re``.

    :raises KeyError: if ``format`` is not one of the names listed below
    """
    SeqFileRandomAccess.__init__(self, filename, format, alphabet)
    # text found at the start of the first line of a record, per format
    markers_by_format = {
        "ace": "CO ",
        "embl": "ID ",
        "fasta": ">",
        "genbank": "LOCUS ",
        "gb": "LOCUS ",
        "imgt": "ID ",
        "phd": "BEGIN_SEQUENCE",
        "pir": ">..;",
        "qual": ">",
        "swiss": "ID ",
        "uniprot-xml": "<entry ",
    }
    self._marker = markers_by_format[format]
    self._marker_re = re.compile(_as_bytes("^%s" % self._marker))
def __iter__(self):
    """Iterate over the lines; yield ``(key, start_offset, length)``.

    Each non-blank line is one record. Its key is the text before the first
    tab character, and its length is the length of the whole line.
    """
    handle = self._handle
    handle.seek(0)
    tab_char = _as_bytes("\t")
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if not line:
            break  # End of file
        if not line.strip():
            # Ignore blank lines. This has to be tested explicitly:
            # split(...)[0] never raises, so the ValueError handler that
            # used to do this could never run.
            continue
        key = line.split(tab_char)[0]
        yield _bytes_to_string(key), start_offset, len(line)
def __iter__(self):
    """Iterate over the file; yield ``(key, start_offset, length)`` tuples.

    The key is the first group captured by ``_QRE_ID_LEN_PTN`` on the query
    start line; the length is always given as 0.
    """
    handle = self._handle
    handle.seek(0)
    start_offset = handle.tell()
    id_pattern = re.compile(_as_bytes(_QRE_ID_LEN_PTN))
    begin_mark = self.qresult_start
    end_mark = self.qresult_end

    while True:
        line = read_forward(handle)
        end_offset = handle.tell()

        if line.startswith(begin_mark):
            qresult_key = id_pattern.search(line).group(1).strip()
            # the record begins on this very line, so step back over it
            start_offset = end_offset - len(line)
        elif line.startswith(end_mark):
            yield _bytes_to_string(qresult_key), start_offset, 0
            start_offset = end_offset
        elif not line:
            # end of file
            break
def __iter__(self):
    """Iterates over the file handle; yields key, start offset, and length.

    Lines are skipped until one matches ``_RE_ROW_CHECK_IDX``. After that,
    consecutive rows sharing the same value in the tenth tab-separated
    column form one query result.
    """
    handle = self._handle
    handle.seek(0)
    # denotes column location for query identifier
    query_id_idx = 9
    qresult_key = None
    tab_char = _as_bytes('\t')

    start_offset = handle.tell()
    line = handle.readline()
    # read through header
    # this assumes that the result row match the regex
    while not _RE_ROW_CHECK_IDX.search(line.strip()):
        start_offset = handle.tell()
        line = handle.readline()
        if not line:
            # No result rows at all. A plain return ends the generator;
            # raising StopIteration here turns into RuntimeError (PEP 479)
            return

    # and index the qresults
    while True:
        end_offset = handle.tell()

        cols = [x for x in line.strip().split(tab_char) if x]
        if qresult_key is None:
            qresult_key = cols[query_id_idx]
        else:
            curr_key = cols[query_id_idx]

            if curr_key != qresult_key:
                # this row opens the next query result, so the previous one
                # stops where this row begins and must not include it
                adj_end = end_offset - len(line)
                yield _bytes_to_string(qresult_key), start_offset, \
                    adj_end - start_offset
                qresult_key = curr_key
                start_offset = adj_end

        line = handle.readline()
        if not line:
            yield _bytes_to_string(qresult_key), start_offset, \
                end_offset - start_offset
            break
def __iter__(self):
    """Iterate over the records; yield ``(key, offset, length)`` tuples.

    A record starts at a line matching ``self._marker_re``. Its key is the
    first word of the first following line that is not blank and does not
    start with ``;``. The length covers the lines read before that key line.

    :raises ValueError: if the file ends before a key line is found
    """
    handle = self._handle
    handle.seek(0)
    is_marker = self._marker_re.match
    semi_char = _as_bytes(";")
    while True:
        offset = handle.tell()
        line = handle.readline()
        length = len(line)
        if is_marker(line):
            # Now look for the first line which doesn't start ";"
            while True:
                line = handle.readline()
                is_key_line = line[0:1] != semi_char and line.strip()
                if is_key_line:
                    key = line.split()[0]
                    yield _bytes_to_string(key), offset, length
                    break
                if not line:
                    raise ValueError("Premature end of file?")
                length += len(line)
        elif not line:
            # End of file
            break
def __iter__(self):
    """Iterate over the file; yield ``(key, start_offset, length)`` tuples.

    A query line is one that contains ``>>>`` but does not start with it.
    The key is the first group captured by ``_RE_ID_DESC_SEQLEN_IDX`` on
    that line. A query result is yielded when the next line is another
    query line, or at the end of the file.
    """
    handle = self._handle
    handle.seek(0)
    start_offset = handle.tell()
    qresult_key = None
    query_mark = _as_bytes('>>>')

    def is_query_line(candidate):
        # a query line has '>>>' somewhere in it, but not at its very start
        return query_mark in candidate and \
            not candidate.startswith(query_mark)

    while True:
        line = handle.readline()
        peekline = handle.peekline()
        end_offset = handle.tell()

        if is_query_line(line):
            regx = re.search(_RE_ID_DESC_SEQLEN_IDX, line)
            qresult_key = _bytes_to_string(regx.group(1))
            start_offset = end_offset - len(line)
        # yield whenever we encounter a new query or at the end of the file
        if qresult_key is not None and \
                (is_query_line(peekline) or not line):
            yield qresult_key, start_offset, end_offset - start_offset
            start_offset = end_offset
        # Leave at end of file even when no query line was ever seen;
        # with the break nested under the yield, such a file looped forever
        if not line:
            break
from anarci.Bio._py3k import _as_bytes, _bytes_to_string from anarci.Bio._py3k import zip from anarci.Bio.Alphabet import generic_dna from anarci.Bio.SearchIO._index import SearchIndexer from anarci.Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment __all__ = ['BlatPslParser', 'BlatPslIndexer', 'BlatPslWriter'] # precompile regex patterns _PTR_ROW_CHECK = r'^\d+\s+\d+\s+\d+\s+\d+' _RE_ROW_CHECK = re.compile(_PTR_ROW_CHECK) _RE_ROW_CHECK_IDX = re.compile(_as_bytes(_PTR_ROW_CHECK)) def _list_from_csv(csv_string, caster=None): """Transforms the given comma-separated string into a list. :param csv_string: comma-separated input string :type csv_string: string :param caster: function used to cast each item in the input string to its intended type :type caster: callable, accepts string, returns object """ if caster is None: return [x for x in csv_string.split(',') if x] else:
def __init__(self, *args, **kwargs):
    """Initialise the parent class and set an empty ``_preamble``.

    All positional and keyword arguments are passed on unchanged.
    """
    parent = super(_BaseHmmerTextIndexer, self)
    parent.__init__(*args, **kwargs)
    # starts out as an empty bytes string
    self._preamble = _as_bytes('')
def __init__(self, filename, format, alphabet):
    """Set up the base class and the record start pattern.

    ``self._marker_re`` matches a line that starts with ``;``.
    """
    SeqFileRandomAccess.__init__(self, filename, format, alphabet)
    comment_start = _as_bytes("^;")
    self._marker_re = re.compile(comment_start)
from anarci.Bio._py3k import _as_bytes, _bytes_to_string from anarci.Bio.Alphabet import generic_dna, generic_protein from anarci.Bio.File import UndoHandle from anarci.Bio.SearchIO._index import SearchIndexer from anarci.Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment __all__ = ['FastaM10Parser', 'FastaM10Indexer'] # precompile regex patterns # regex for program name _RE_FLAVS = re.compile(r't?fast[afmsxy]|pr[sf][sx]|lalign|[gs]?[glso]search') # regex for sequence ID and length ~ deals with both \n and \r\n _PTR_ID_DESC_SEQLEN = r'>>>(.+?)\s+(.*?) *- (\d+) (?:aa|nt)\s*$' _RE_ID_DESC_SEQLEN = re.compile(_PTR_ID_DESC_SEQLEN) _RE_ID_DESC_SEQLEN_IDX = re.compile(_as_bytes(_PTR_ID_DESC_SEQLEN)) # regex for qresult, hit, or hsp attribute value _RE_ATTR = re.compile(r'^; [a-z]+(_[ \w-]+):\s+(.*)$') # regex for capturing excess start and end sequences in alignments _RE_START_EXC = re.compile(r'^-*') _RE_END_EXC = re.compile(r'-*$') # attribute name mappings _HSP_ATTR_MAP = { '_initn': ('initn_score', int), '_init1': ('init1_score', int), '_opt': ('opt_score', int), '_s-w opt': ('opt_score', int), '_z-score': ('z_score', float), '_bits': ('bitscore', float), '_expect': ('evalue', float),