def __iter__(self):
    """Iterate over the handle; yields (query result key, start offset, 0)."""
    handle = self._handle
    handle.seek(0)
    start_offset = handle.tell()
    # Query ID follows "Query:", "Query sequence:" or "Query HMM:"
    regex_id = re.compile(_as_bytes(r'Query\s*(?:sequence|HMM)?:\s*(.*)'))
    # determine flag for hmmsearch
    is_hmmsearch = False
    line = read_forward(handle)
    if line.startswith(_as_bytes('hmmsearch')):
        is_hmmsearch = True
    while True:
        end_offset = handle.tell()
        if line.startswith(self.qresult_start):
            regx = re.search(regex_id, line)
            qresult_key = regx.group(1).strip()
            # qresult start offset is the offset of this line
            # (starts with the start mark)
            start_offset = end_offset - len(line)
        elif line.startswith(self.qresult_end):
            # the reported offset length is always 0 here
            yield _bytes_to_string(qresult_key), start_offset, 0
            start_offset = end_offset
        elif not line:
            # HACK: since hmmsearch can only have one query result
            if is_hmmsearch:
                yield _bytes_to_string(qresult_key), start_offset, 0
            break
        line = read_forward(handle)
def __iter__(self):
    """Iterate over Hmmer2TextIndexer; yields query results' key, offsets, 0."""
    handle = self._handle
    handle.seek(0)
    start_offset = handle.tell()
    # Query ID follows "Query:", "Query sequence:" or "Query HMM:"
    regex_id = re.compile(_as_bytes(r'Query\s*(?:sequence|HMM)?:\s*(.*)'))
    # determine flag for hmmsearch
    is_hmmsearch = False
    line = read_forward(handle)
    if line.startswith(_as_bytes('hmmsearch')):
        is_hmmsearch = True
    while True:
        end_offset = handle.tell()
        if line.startswith(self.qresult_start):
            regx = re.search(regex_id, line)
            qresult_key = regx.group(1).strip()
            # qresult start offset is the offset of this line
            # (starts with the start mark)
            start_offset = end_offset - len(line)
        elif line.startswith(self.qresult_end):
            # the reported offset length is always 0 here
            yield _bytes_to_string(qresult_key), start_offset, 0
            start_offset = end_offset
        elif not line:
            # HACK: since hmmsearch can only have one query result
            if is_hmmsearch:
                yield _bytes_to_string(qresult_key), start_offset, 0
            break
        line = read_forward(handle)
def get_raw_check(self, filename, format, alphabet): handle = open(filename, "rb") raw_file = handle.read() handle.close() #Also checking the key_function here id_list = [rec.id.lower() for rec in \ SeqIO.parse(filename, format, alphabet)] rec_dict = SeqIO.index(filename, format, alphabet, key_function = lambda x : x.lower()) self.assertEqual(set(id_list), set(rec_dict.keys())) self.assertEqual(len(id_list), len(rec_dict)) for key in id_list: self.assertTrue(key in rec_dict) self.assertEqual(key, rec_dict[key].id.lower()) self.assertEqual(key, rec_dict.get(key).id.lower()) raw = rec_dict.get_raw(key) self.assertTrue(raw.strip()) self.assertTrue(raw in raw_file) rec1 = rec_dict[key] #Following isn't very elegant, but it lets me test the #__getitem__ SFF code is working. if format in SeqIO._BinaryFormats: handle = BytesIO(raw) else: handle = StringIO(_bytes_to_string(raw)) if format == "sff": rec2 = SeqIO.SffIO._sff_read_seq_record(handle, rec_dict._proxy._flows_per_read, rec_dict._proxy._flow_chars, rec_dict._proxy._key_sequence, rec_dict._proxy._alphabet, trim=False) elif format == "sff-trim": rec2 = SeqIO.SffIO._sff_read_seq_record(handle, rec_dict._proxy._flows_per_read, rec_dict._proxy._flow_chars, rec_dict._proxy._key_sequence, rec_dict._proxy._alphabet, trim=True) elif format == "uniprot-xml": self.assertTrue(raw.startswith(_as_bytes("<entry "))) self.assertTrue(raw.endswith(_as_bytes("</entry>"))) #Currently the __getitem__ method uses this #trick too, but we hope to fix that later raw = """<?xml version='1.0' encoding='UTF-8'?> <uniprot xmlns="http://uniprot.org/uniprot" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd"> %s </uniprot> """ % _bytes_to_string(raw) handle = StringIO(raw) rec2 = SeqIO.read(handle, format, alphabet) else: rec2 = SeqIO.read(handle, format, alphabet) self.assertEqual(True, compare_record(rec1, rec2)) rec_dict._proxy._handle.close() 
#TODO - Better solution del rec_dict
def __iter__(self):
    """Yield (key, start offset, length) tuples; key from ACCESSION/VERSION lines."""
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    dot_char = _as_bytes(".")
    accession_marker = _as_bytes("ACCESSION ")
    version_marker = _as_bytes("VERSION ")
    #Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    #Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        #We cannot assume the record.id is the first word after LOCUS,
        #normally the first entry on the VERSION or ACCESSION line is used.
        key = None
        while True:
            line = handle.readline()
            if marker_re.match(line) or not line:
                if not key:
                    raise ValueError("Did not find ACCESSION/VERSION lines")
                end_offset = handle.tell() - len(line)
                yield _bytes_to_string(key), start_offset, end_offset - start_offset
                start_offset = end_offset
                break
            elif line.startswith(accession_marker):
                key = line.rstrip().split()[1]
            elif line.startswith(version_marker):
                version_id = line.rstrip().split()[1]
                # Only use VERSION if it looks like accession.version
                if version_id.count(dot_char)==1 and version_id.split(dot_char)[1].isdigit():
                    #This should mimic the GenBank parser...
                    key = version_id
    assert not line, repr(line)
def __iter__(self):
    """Yield (key, offset, length) tuples; records are ';'-comment blocks plus sequence."""
    handle = self._handle
    handle.seek(0)
    # Skip any header
    offset = 0
    line = ""
    while True:
        offset += len(line)
        line = handle.readline()
        if not line:
            break  # Premature end of file, or just empty?
        if not line.startswith(b";;"):
            break
    while line:
        length = 0
        # lengths are tracked explicitly so offsets work on compressed handles
        assert offset + len(line) == handle.tell()
        if not line.startswith(b";"):
            raise ValueError("Records should start with ';' and not:\n%r" % line)
        while line.startswith(b";"):
            length += len(line)
            line = handle.readline()
        # first non-comment line is the record key
        key = line.rstrip()
        # Now look for the first line which starts ";"
        while line and not line.startswith(b";"):
            length += len(line)
            line = handle.readline()
        yield _bytes_to_string(key), offset, length
        offset += length
        assert offset + len(line) == handle.tell()
def __iter__(self):
    """Return (id, offset, length) tuples."""
    marker_offset = len(self._marker)
    marker_re = self._marker_re
    handle = self._handle
    handle.seek(0)
    # Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        # Here we can assume the record.id is the first word after the
        # marker. This is generally fine... but not for GenBank, EMBL, Swiss
        id = line[marker_offset:].strip().split(None, 1)[0]
        length = len(line)
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                yield _bytes_to_string(id), start_offset, length
                start_offset = end_offset
                break
            else:
                # Track this explicitly as can't do file offset difference on BGZF
                length += len(line)
    assert not line, repr(line)
def __iter__(self):
    """Iterate over the sequence records in the file."""
    handle = self._handle
    handle.seek(0)
    # Skip any header
    offset = 0
    line = ""
    while True:
        offset += len(line)
        line = handle.readline()
        if not line:
            break  # Premature end of file, or just empty?
        if not line.startswith(b";;"):
            break
    while line:
        length = 0
        # lengths are tracked explicitly so offsets work on compressed handles
        assert offset + len(line) == handle.tell()
        if not line.startswith(b";"):
            raise ValueError("Records should start with ';' and not:\n%r" % line)
        while line.startswith(b";"):
            length += len(line)
            line = handle.readline()
        # first non-comment line is the record key
        key = line.rstrip()
        # Now look for the first line which starts ";"
        while line and not line.startswith(b";"):
            length += len(line)
            line = handle.readline()
        yield _bytes_to_string(key), offset, length
        offset += length
        assert offset + len(line) == handle.tell()
def __iter__(self):
    """Return (id, offset, length) tuples."""
    marker_offset = len(self._marker)
    marker_re = self._marker_re
    handle = self._handle
    handle.seek(0)
    #Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    #Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        #Here we can assume the record.id is the first word after the
        #marker. This is generally fine... but not for GenBank, EMBL, Swiss
        id = line[marker_offset:].strip().split(None, 1)[0]
        length = len(line)
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                yield _bytes_to_string(id), start_offset, length
                start_offset = end_offset
                break
            else:
                #Track this explicitly as can't do file offset difference on BGZF
                length += len(line)
    assert not line, repr(line)
def __iter__(self):
    """Yield (key, start offset, length) tuples; key taken from the AC line."""
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    semi_char = _as_bytes(";")
    #Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    #Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        length = len(line)
        #We cannot assume the record.id is the first word after ID,
        #normally the following AC line is used.
        line = handle.readline()
        length += len(line)
        assert line.startswith(_as_bytes("AC "))
        key = line[3:].strip().split(semi_char)[0].strip()
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                yield _bytes_to_string(key), start_offset, length
                start_offset = end_offset
                break
            # track the length explicitly rather than using offset differences
            length += len(line)
    assert not line, repr(line)
def __iter__(self):
    """Return (id, offset, length) tuples."""
    marker_offset = len(self._marker)
    marker_re = self._marker_re
    handle = self._handle
    handle.seek(0)
    #Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    #Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        #Here we can assume the record.id is the first word after the
        #marker. This is generally fine... but not for GenBank, EMBL, Swiss
        id = line[marker_offset:].strip().split(None, 1)[0]
        while True:
            line = handle.readline()
            if marker_re.match(line) or not line:
                # record length computed as offset difference
                end_offset = handle.tell() - len(line)
                yield _bytes_to_string(
                    id), start_offset, end_offset - start_offset
                start_offset = end_offset
                break
    assert not line, repr(line)
def __iter__(self):
    """Yield (key, start offset, length) tuples; key taken from the AC line."""
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    semi_char = _as_bytes(";")
    #Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    #Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        #We cannot assume the record.id is the first word after ID,
        #normally the following AC line is used.
        line = handle.readline()
        assert line.startswith(_as_bytes("AC "))
        key = line[3:].strip().split(semi_char)[0].strip()
        while True:
            line = handle.readline()
            if marker_re.match(line) or not line:
                # record length computed as offset difference
                end_offset = handle.tell() - len(line)
                yield _bytes_to_string(
                    key), start_offset, end_offset - start_offset
                start_offset = end_offset
                break
    assert not line, repr(line)
def __iter__(self):
    """Iterate over FastaM10Indexer; yields query results' keys, start offsets, offset lengths."""
    handle = self._handle
    handle.seek(0)
    start_offset = handle.tell()
    qresult_key = None
    query_mark = b">>>"
    while True:
        line = handle.readline()
        # peekline() lets us see the next line without consuming it
        peekline = handle.peekline()
        end_offset = handle.tell()
        # a query line contains the marker but does not start with it
        if not line.startswith(query_mark) and query_mark in line:
            regx = re.search(_RE_ID_DESC_SEQLEN_IDX, line)
            qresult_key = _bytes_to_string(regx.group(1))
            start_offset = end_offset - len(line)
        # yield whenever we encounter a new query or at the end of the file
        if qresult_key is not None:
            if (not peekline.startswith(query_mark) and query_mark in peekline) or not line:
                yield qresult_key, start_offset, end_offset - start_offset
                if not line:
                    break
                start_offset = end_offset
def __iter__(self):
    """Iterate over the sequence records in the file."""
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    # Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        length = len(line)
        # We cannot assume the record.id is the first word after ID,
        # normally the following AC line is used.
        line = handle.readline()
        length += len(line)
        assert line.startswith(b"AC ")
        key = line[3:].strip().split(b";")[0].strip()
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                yield _bytes_to_string(key), start_offset, length
                start_offset = end_offset
                break
            # track the length explicitly rather than using offset differences
            length += len(line)
    assert not line, repr(line)
def __iter__(self):
    """Iterate over the file handle; yields key, start offset, and length."""
    handle = self._handle
    handle.seek(0)
    query_id_idx = self._query_id_idx
    qresult_key = None
    header_mark = _as_bytes("#")
    split_mark = _as_bytes(" ")
    # set line with initial mock value, to emulate header
    line = header_mark
    # read through header
    while line.startswith(header_mark):
        start_offset = handle.tell()
        line = handle.readline()
    # and index the qresults
    while True:
        end_offset = handle.tell()
        if not line:
            break
        # drop empty columns produced by runs of spaces
        cols = [x for x in line.strip().split(split_mark) if x]
        if qresult_key is None:
            qresult_key = cols[query_id_idx]
        else:
            curr_key = cols[query_id_idx]
            if curr_key != qresult_key:
                # yield the previous query's block, ending before this line
                adj_end = end_offset - len(line)
                yield (
                    _bytes_to_string(qresult_key),
                    start_offset,
                    adj_end - start_offset,
                )
                qresult_key = curr_key
                start_offset = adj_end
        line = handle.readline()
        if not line:
            yield (
                _bytes_to_string(qresult_key),
                start_offset,
                end_offset - start_offset,
            )
            break
def __iter__(self):
    """Iterate over BlastXmlIndexer yields qstart_id, start_offset, block's length.

    Scans the XML for <Iteration>...</Iteration> blocks and yields the
    query ID (or the query definition's first word for auto-assigned
    "Query_N" IDs), the block's start offset, and its byte length.
    """
    # NOTE: unused locals block_size and counter from the original
    # version have been removed; behaviour is otherwise unchanged.
    qstart_mark = self.qstart_mark
    qend_mark = self.qend_mark
    blast_id_mark = _as_bytes("Query_")
    handle = self._handle
    handle.seek(0)
    re_desc = re.compile(
        _as_bytes(
            r"<Iteration_query-ID>(.*?)"
            r"</Iteration_query-ID>\s+?"
            "<Iteration_query-def>"
            "(.*?)</Iteration_query-def>"
        )
    )
    re_desc_end = re.compile(_as_bytes(r"</Iteration_query-def>"))
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if not line:
            break
        if qstart_mark not in line:
            continue
        # The following requirements are to make supporting BGZF compressed
        # BLAST XML files simpler (avoids complex offset manipulations):
        assert line.count(qstart_mark) == 1, "XML without line breaks?"
        assert line.lstrip().startswith(qstart_mark), line
        if qend_mark in line:
            # Should cope with <Iteration>...</Iteration> on one long line
            block = line
        else:
            # Load the rest of this block up to and including </Iteration>
            block = [line]
            while line and qend_mark not in line:
                line = handle.readline()
                assert qstart_mark not in line, line
                block.append(line)
            assert line.rstrip().endswith(qend_mark), line
            block = _empty_bytes_string.join(block)
        assert block.count(qstart_mark) == 1, "XML without line breaks? %r" % block
        assert block.count(qend_mark) == 1, "XML without line breaks? %r" % block
        # Now we have a full <Iteration>...</Iteration> block, find the ID
        regx = re.search(re_desc, block)
        try:
            qstart_desc = regx.group(2)
            qstart_id = regx.group(1)
        except AttributeError:
            # use the fallback values
            assert re.search(re_desc_end, block)
            qstart_desc = _as_bytes(self._fallback["description"])
            qstart_id = _as_bytes(self._fallback["id"])
        if qstart_id.startswith(blast_id_mark):
            # BLAST assigned its own ID; use the first word of the
            # description instead
            qstart_id = qstart_desc.split(_as_bytes(" "), 1)[0]
        yield _bytes_to_string(qstart_id), start_offset, len(block)
def get_qresult_id(self, pos):
    """Return the query ID of the nearest cigar line."""
    handle = self._handle
    handle.seek(pos)
    # get line, check if it's a cigar line, and get query ID
    line = handle.readline()
    assert line.startswith(self._query_mark), line
    id = re.search(_RE_CIGAR, _bytes_to_string(line))
    return id.group(1)
def __iter__(self):
    """Iterate over the sequence records in the file."""
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    # NOTE(review): semi_char is assigned but never used in this version
    semi_char = b";"
    sv_marker = b"SV "
    ac_marker = b"AC "
    # Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        # We cannot assume the record.id is the first word after ID,
        # normally the SV line is used.
        setbysv = False  # resets sv as false
        length = len(line)
        if line[2:].count(b";") in [5, 6]:
            # Looks like the semi colon separated style introduced in 2006
            # Or style from IPD-IMGT/HLA after their v3.16.0 release
            parts = line[3:].rstrip().split(b";")
            if parts[1].strip().startswith(sv_marker):
                # The SV bit gives the version
                key = parts[0].strip() + b"." + \
                    parts[1].strip().split()[1]
                setbysv = True
            else:
                key = parts[0].strip()
        elif line[2:].count(b";") in [2, 3]:
            # Looks like the pre 2006 style, take first word only
            # Or, with two colons, the KIPO patent variantion
            key = line[3:].strip().split(None, 1)[0]
            if key.endswith(b";"):
                key = key[:-1]
        else:
            raise ValueError('Did not recognise the ID line layout:\n%r' % line)
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                end_offset = handle.tell() - len(line)
                yield _bytes_to_string(key), start_offset, length
                start_offset = end_offset
                break
            elif line.startswith(ac_marker) and not setbysv:
                key = line.rstrip().split()[1]
                if key.endswith(b";"):
                    key = key[:-1]
            elif line.startswith(sv_marker):
                key = line.rstrip().split()[1]
                setbysv = True
            length += len(line)
    assert not line, repr(line)
def get_qresult_id(self, pos):
    """Return the query ID of the nearest vulgar line."""
    handle = self._handle
    handle.seek(pos)
    # get line, check if it's a vulgar line, and get query ID
    line = handle.readline()
    assert line.startswith(self._query_mark), line
    id = re.search(_RE_VULGAR, _bytes_to_string(line))
    return id.group(1)
def __iter__(self):
    """Yield (key, start offset, length) tuples for each record."""
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    # NOTE(review): semi_char is assigned but never used in this version
    semi_char = b";"
    sv_marker = b"SV "
    ac_marker = b"AC "
    # Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        # We cannot assume the record.id is the first word after ID,
        # normally the SV line is used.
        setbysv = False  # resets sv as false
        length = len(line)
        if line[2:].count(b";") in [5, 6]:
            # Looks like the semi colon separated style introduced in 2006
            # Or style from IPD-IMGT/HLA after their v3.16.0 release
            parts = line[3:].rstrip().split(b";")
            if parts[1].strip().startswith(sv_marker):
                # The SV bit gives the version
                key = parts[0].strip() + b"." + \
                    parts[1].strip().split()[1]
                setbysv = True
            else:
                key = parts[0].strip()
        elif line[2:].count(b";") in [2, 3]:
            # Looks like the pre 2006 style, take first word only
            # Or, with two colons, the KIPO patent variantion
            key = line[3:].strip().split(None, 1)[0]
            if key.endswith(b";"):
                key = key[:-1]
        else:
            raise ValueError(
                'Did not recognise the ID line layout:\n%r' % line)
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                end_offset = handle.tell() - len(line)
                yield _bytes_to_string(key), start_offset, length
                start_offset = end_offset
                break
            elif line.startswith(ac_marker) and not setbysv:
                key = line.rstrip().split()[1]
                if key.endswith(b";"):
                    key = key[:-1]
            elif line.startswith(sv_marker):
                key = line.rstrip().split()[1]
                setbysv = True
            length += len(line)
    assert not line, repr(line)
def gzip_open(filename, format):
    """Open a gzipped file; return a binary handle, or a text handle for text formats."""
    # At time of writing, under Python 3.2.2 seems gzip.open(filename, mode)
    # insists on giving byte strings (i.e. binary mode)
    # See http://bugs.python.org/issue13989
    if sys.version_info[0] < 3 or format in SeqIO._BinaryFormats:
        return gzip.open(filename)
    # decompress fully and decode so callers get a text-mode handle
    handle = gzip.open(filename)
    data = handle.read()  # bytes!
    handle.close()
    return StringIO(_bytes_to_string(data))
def __iter__(self):
    """Iterate over the sequence records in the file.

    Yields (key, start offset, length) tuples, preferring the VERSION or
    ACCESSION entry as the key and falling back on the LOCUS entry.
    """
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    accession_marker = b"ACCESSION "
    version_marker = b"VERSION "
    # Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        # We cannot assume the record.id is the first word after LOCUS,
        # normally the first entry on the VERSION or ACCESSION line is used.
        # However if both missing, GenBank parser falls back on LOCUS entry.
        try:
            key = line[5:].split(None, 1)[0]
        except IndexError:
            # BUGFIX: split(...)[0] on an empty LOCUS line raises
            # IndexError, not ValueError (which was caught before and
            # could never fire) - matching the handlers below.
            # No content in LOCUS line
            key = None
        length = len(line)
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                if not key:
                    raise ValueError(
                        "Did not find usable ACCESSION/VERSION/LOCUS lines"
                    )
                yield _bytes_to_string(key), start_offset, length
                start_offset = end_offset
                break
            elif line.startswith(accession_marker):
                try:
                    key = line.rstrip().split()[1]
                except IndexError:
                    # No content in ACCESSION line
                    pass
            elif line.startswith(version_marker):
                try:
                    version_id = line.rstrip().split()[1]
                    if version_id.count(b".") == 1 and version_id.split(
                            b".")[1].isdigit():
                        # This should mimic the GenBank parser...
                        key = version_id
                except IndexError:
                    # No content in VERSION line
                    pass
            length += len(line)
    assert not line, repr(line)
def __iter__(self):
    """Yield (key, start offset, length) tuples for each record.

    The key is taken from the ID line (2006 semi-colon style or the
    older style), and may be overridden by a later AC or SV line.
    """
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    semi_char = _as_bytes(";")
    dot_char = _as_bytes(".")
    sv_marker = _as_bytes("SV ")
    ac_marker = _as_bytes("AC ")
    # Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        # We cannot assume the record.id is the first word after ID,
        # normally the SV line is used.
        setbysv = False  # resets sv as false
        length = len(line)
        if line[2:].count(semi_char) == 6:
            # Looks like the semi colon separated style introduced in 2006
            parts = line[3:].rstrip().split(semi_char)
            if parts[1].strip().startswith(sv_marker):
                # The SV bit gives the version
                key = parts[0].strip() + dot_char + \
                    parts[1].strip().split()[1]
                setbysv = True
            else:
                key = parts[0].strip()
        elif line[2:].count(semi_char) == 3:
            # Looks like the pre 2006 style, take first word only
            key = line[3:].strip().split(None, 1)[0]
            if key.endswith(semi_char):
                key = key[:-1]
        else:
            # BUGFIX: use %r formatting - line is bytes, and the previous
            # str + bytes concatenation raised TypeError on Python 3,
            # masking the real error.
            raise ValueError(
                'Did not recognise the ID line layout:\n%r' % line)
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                end_offset = handle.tell() - len(line)
                yield _bytes_to_string(key), start_offset, length
                start_offset = end_offset
                break
            elif line.startswith(ac_marker) and not setbysv:
                key = line.rstrip().split()[1]
                if key.endswith(semi_char):
                    key = key[:-1]
            elif line.startswith(sv_marker):
                key = line.rstrip().split()[1]
                setbysv = True
            length += len(line)
    assert not line, repr(line)
def _parse_tag_data(elem_code, elem_num, raw_data):
    """Returns single data value.

    Arguments:
     - elem_code - What kind of data
     - elem_num - How many data points
     - raw_data - abi file object from which the tags would be unpacked

    Returns None for element codes not present in _BYTEFMT.
    """
    if elem_code in _BYTEFMT:
        # because '>1s' unpack differently from '>s'
        if elem_num == 1:
            num = ''
        else:
            num = str(elem_num)
        fmt = '>' + num + _BYTEFMT[elem_code]
        assert len(raw_data) == struct.calcsize(fmt)
        data = struct.unpack(fmt, raw_data)
        # no need to use tuple if len(data) == 1
        # also if data is date / time
        if elem_code not in [10, 11] and len(data) == 1:
            data = data[0]
        # account for different data types
        if elem_code == 2:
            return _bytes_to_string(data)
        elif elem_code == 10:
            # date: (year, month, day)
            return str(datetime.date(*data))
        elif elem_code == 11:
            # time: use first three fields (hour, minute, second)
            return str(datetime.time(*data[:3]))
        elif elem_code == 13:
            return bool(data)
        elif elem_code == 18:
            # pString: first byte is the length prefix, skip it
            return _bytes_to_string(data[1:])
        elif elem_code == 19:
            # cString: strip the trailing null byte
            return _bytes_to_string(data[:-1])
        else:
            return data
    else:
        return None
def __iter__(self):
    """Iterate over BlastXmlIndexer yields qstart_id, start_offset, block's length."""
    qstart_mark = self.qstart_mark
    qend_mark = self.qend_mark
    blast_id_mark = _as_bytes('Query_')
    # NOTE(review): block_size and counter are never used in this version
    block_size = self.block_size
    handle = self._handle
    handle.seek(0)
    re_desc = re.compile(_as_bytes(r'<Iteration_query-ID>(.*?)'
                                   r'</Iteration_query-ID>\s+?'
                                   '<Iteration_query-def>'
                                   '(.*?)</Iteration_query-def>'))
    re_desc_end = re.compile(_as_bytes(r'</Iteration_query-def>'))
    counter = 0
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if not line:
            break
        if qstart_mark not in line:
            continue
        # The following requirements are to make supporting BGZF compressed
        # BLAST XML files simpler (avoids complex offset manipulations):
        assert line.count(qstart_mark) == 1, "XML without line breaks?"
        assert line.lstrip().startswith(qstart_mark), line
        if qend_mark in line:
            # Should cope with <Iteration>...</Iteration> on one long line
            block = line
        else:
            # Load the rest of this block up to and including </Iteration>
            block = [line]
            while line and qend_mark not in line:
                line = handle.readline()
                assert qstart_mark not in line, line
                block.append(line)
            assert line.rstrip().endswith(qend_mark), line
            block = _empty_bytes_string.join(block)
        assert block.count(qstart_mark) == 1, "XML without line breaks? %r" % block
        assert block.count(qend_mark) == 1, "XML without line breaks? %r" % block
        # Now we have a full <Iteration>...</Iteration> block, find the ID
        regx = re.search(re_desc, block)
        try:
            qstart_desc = regx.group(2)
            qstart_id = regx.group(1)
        except AttributeError:
            # use the fallback values
            assert re.search(re_desc_end, block)
            qstart_desc = _as_bytes(self._fallback['description'])
            qstart_id = _as_bytes(self._fallback['id'])
        if qstart_id.startswith(blast_id_mark):
            # BLAST assigned its own ID; use first word of description
            qstart_id = qstart_desc.split(_as_bytes(' '), 1)[0]
        yield _bytes_to_string(qstart_id), start_offset, len(block)
        counter += 1
def __iter__(self):
    """Iterate over the file handle; yields key, start offset, and length."""
    handle = self._handle
    handle.seek(0)
    query_id_idx = self._query_id_idx
    qresult_key = None
    header_mark = _as_bytes('#')
    split_mark = _as_bytes(' ')
    # set line with initial mock value, to emulate header
    line = header_mark
    # read through header
    while line.startswith(header_mark):
        start_offset = handle.tell()
        line = handle.readline()
    # and index the qresults
    while True:
        end_offset = handle.tell()
        if not line:
            break
        # drop empty columns produced by runs of spaces
        cols = [x for x in line.strip().split(split_mark) if x]
        if qresult_key is None:
            qresult_key = cols[query_id_idx]
        else:
            curr_key = cols[query_id_idx]
            if curr_key != qresult_key:
                # yield the previous query's block, ending before this line
                adj_end = end_offset - len(line)
                yield (_bytes_to_string(qresult_key), start_offset,
                       adj_end - start_offset)
                qresult_key = curr_key
                start_offset = adj_end
        line = handle.readline()
        if not line:
            yield (_bytes_to_string(qresult_key), start_offset,
                   end_offset - start_offset)
            break
def __iter__(self):
    """Iterate over the sequence records in the file."""
    handle = self._handle
    handle.seek(0)
    id = None
    start_offset = handle.tell()
    line = handle.readline()
    if not line:
        # Empty file!
        return
    if line[0:1] != b"@":
        raise ValueError("Problem with FASTQ @ line:\n%r" % line)
    while line:
        # assert line[0]=="@"
        # This record seems OK (so far)
        id = line[1:].rstrip().split(None, 1)[0]
        # Find the seq line(s)
        seq_len = 0
        length = len(line)
        while line:
            line = handle.readline()
            length += len(line)
            if line.startswith(b"+"):
                break
            seq_len += len(line.strip())
        if not line:
            raise ValueError("Premature end of file in seq section")
        # assert line[0]=="+"
        # Find the qual line(s)
        qual_len = 0
        while line:
            if seq_len == qual_len:
                if seq_len == 0:
                    # Special case, quality line should be just "\n"
                    line = handle.readline()
                    if line.strip():
                        raise ValueError(
                            "Expected blank quality line, not %r" % line)
                    length += len(line)  # Need to include the blank line
                # Should be end of record...
                end_offset = handle.tell()
                line = handle.readline()
                if line and line[0:1] != b"@":
                    raise ValueError("Problem with line %r" % line)
                break
            else:
                line = handle.readline()
                qual_len += len(line.strip())
                length += len(line)
        if seq_len != qual_len:
            raise ValueError("Problem with quality section")
        yield _bytes_to_string(id), start_offset, length
        start_offset = end_offset
def __iter__(self):
    """Yield (key, start offset, length) tuples for each query result."""
    handle = self._handle
    handle.seek(0)
    # Dispatch to the commented or plain index routine based on kwargs.
    if self._kwargs['comments']:
        iterfunc = self._qresult_index_commented
    else:
        iterfunc = self._qresult_index
    for key, offset, length in iterfunc():
        yield _bytes_to_string(key), offset, length
def __iter__(self):
    """Yield each query result's key, start offset, and length."""
    handle = self._handle
    handle.seek(0)
    # Choose the index routine according to the 'comments' flag.
    indexer = (self._qresult_index_commented if self._kwargs['comments']
               else self._qresult_index)
    for key, offset, length in indexer():
        yield _bytes_to_string(key), offset, length
def __iter__(self):
    """Iterates over the file handle; yields key, start offset, and length."""
    handle = self._handle
    handle.seek(0)
    # denotes column location for query identifier
    query_id_idx = 9
    qresult_key = None
    tab_char = b"\t"
    start_offset = handle.tell()
    line = handle.readline()
    # read through header
    # this assumes that the result row match the regex
    while not re.search(_RE_ROW_CHECK_IDX, line.strip()):
        start_offset = handle.tell()
        line = handle.readline()
        if not line:
            return
    # and index the qresults
    while True:
        end_offset = handle.tell()
        # drop empty columns
        cols = [x for x in line.strip().split(tab_char) if x]
        if qresult_key is None:
            qresult_key = cols[query_id_idx]
        else:
            curr_key = cols[query_id_idx]
            if curr_key != qresult_key:
                yield _bytes_to_string(qresult_key), start_offset, \
                    end_offset - start_offset
                qresult_key = curr_key
                start_offset = end_offset - len(line)
        line = handle.readline()
        if not line:
            yield _bytes_to_string(qresult_key), start_offset, \
                end_offset - start_offset
            break
def __iter__(self):
    """Iterates over the file handle; yields key, start offset, and length."""
    handle = self._handle
    handle.seek(0)
    query_id_idx = self._query_id_idx
    qresult_key = None
    header_mark = _as_bytes('#')
    split_mark = _as_bytes(' ')
    # set line with initial mock value, to emulate header
    line = header_mark
    # read through header
    while line.startswith(header_mark):
        start_offset = handle.tell()
        line = handle.readline()
    # and index the qresults
    while True:
        end_offset = handle.tell()
        if not line:
            break
        cols = line.strip().split(split_mark)
        # NOTE(review): list(filter(None, cols)) is computed on each branch
        if qresult_key is None:
            qresult_key = list(filter(None, cols))[query_id_idx]
        else:
            curr_key = list(filter(None, cols))[query_id_idx]
            if curr_key != qresult_key:
                yield _bytes_to_string(qresult_key), start_offset, \
                    end_offset - start_offset
                qresult_key = curr_key
                start_offset = end_offset - len(line)
        line = handle.readline()
        if not line:
            yield _bytes_to_string(qresult_key), start_offset, \
                end_offset - start_offset
            break
def __iter__(self):
    """Iterates over the file handle; yields key, start offset, and length."""
    handle = self._handle
    handle.seek(0)
    # denotes column location for query identifier
    query_id_idx = 9
    qresult_key = None
    tab_char = _as_bytes('\t')
    start_offset = handle.tell()
    line = handle.readline()
    # read through header
    # this assumes that the result row match the regex
    while not re.search(_RE_ROW_CHECK_IDX, line.strip()):
        start_offset = handle.tell()
        line = handle.readline()
        if not line:
            # BUGFIX: 'raise StopIteration' inside a generator becomes
            # RuntimeError under PEP 479 (Python 3.7+); 'return' is the
            # correct way to end the generator.
            return
    # and index the qresults
    while True:
        end_offset = handle.tell()
        cols = line.strip().split(tab_char)
        if qresult_key is None:
            qresult_key = list(filter(None, cols))[query_id_idx]
        else:
            curr_key = list(filter(None, cols))[query_id_idx]
            if curr_key != qresult_key:
                yield _bytes_to_string(qresult_key), start_offset, \
                    end_offset - start_offset
                qresult_key = curr_key
                start_offset = end_offset - len(line)
        line = handle.readline()
        if not line:
            yield _bytes_to_string(qresult_key), start_offset, \
                end_offset - start_offset
            break
def __iter__(self):
    """Yield (id, start offset, length) tuples for each FASTQ record."""
    handle = self._handle
    handle.seek(0)
    id = None
    start_offset = handle.tell()
    line = handle.readline()
    if not line:
        # Empty file!
        return
    at_char = _as_bytes("@")
    plus_char = _as_bytes("+")
    if line[0:1] != at_char:
        raise ValueError("Problem with FASTQ @ line:\n%r" % line)
    while line:
        # assert line[0]=="@"
        # This record seems OK (so far)
        id = line[1:].rstrip().split(None, 1)[0]
        # Find the seq line(s)
        seq_len = 0
        length = len(line)
        while line:
            line = handle.readline()
            length += len(line)
            if line.startswith(plus_char):
                break
            seq_len += len(line.strip())
        if not line:
            raise ValueError("Premature end of file in seq section")
        # assert line[0]=="+"
        # Find the qual line(s)
        qual_len = 0
        while line:
            if seq_len == qual_len:
                if seq_len == 0:
                    # Special case, quality line should be just "\n"
                    line = handle.readline()
                    if line.strip():
                        raise ValueError("Expected blank quality line, not %r" % line)
                    # BUGFIX: include the blank quality line in the record
                    # length, otherwise zero-length records are indexed one
                    # line short.
                    length += len(line)
                # Should be end of record...
                end_offset = handle.tell()
                line = handle.readline()
                if line and line[0:1] != at_char:
                    raise ValueError("Problem with line %r" % line)
                break
            else:
                line = handle.readline()
                qual_len += len(line.strip())
                length += len(line)
        if seq_len != qual_len:
            raise ValueError("Problem with quality section")
        yield _bytes_to_string(id), start_offset, length
        start_offset = end_offset
def _abi_parse_header(header, handle):
    """Generator over the ABIF directory; yields (tag name, tag number, data).

    header -- the unpacked ABIF header tuple; fields used here are the
              element size (index 4), the number of elements (index 5)
              and the directory offset (index 7).
    handle -- the trace file handle, opened in binary mode.
    """
    # header structure (after ABIF marker):
    # file version, tag name, tag number,
    # element type code, element size, number of elements
    # data size, data offset, handle (not file handle)
    head_elem_size = header[4]
    head_elem_num = header[5]
    head_offset = header[7]
    index = 0

    while index < head_elem_num:
        start = head_offset + index * head_elem_size
        # add directory offset to tuple
        # to handle directories with data size <= 4 bytes
        handle.seek(start)
        dir_entry = struct.unpack(_DIRFMT, handle.read(
            struct.calcsize(_DIRFMT))) + (start, )
        index += 1
        # Dead code removed: a 'key' string built from the tag name and
        # number was computed here but never used.
        tag_name = _bytes_to_string(dir_entry[0])
        tag_number = dir_entry[1]
        elem_code = dir_entry[2]
        elem_num = dir_entry[4]
        data_size = dir_entry[5]
        data_offset = dir_entry[6]
        tag_offset = dir_entry[8]
        # if data size <= 4 bytes, data is stored inside tag
        # so offset needs to be changed
        if data_size <= 4:
            data_offset = tag_offset + 20
        handle.seek(data_offset)
        data = handle.read(data_size)
        yield tag_name, tag_number, \
            _parse_tag_data(elem_code, elem_num, data)
def _abi_parse_header(header, handle):
    """Generator over the ABIF directory; yields (tag name, tag number, data).

    header -- the unpacked ABIF header tuple; fields used here are the
              element size (index 4), the number of elements (index 5)
              and the directory offset (index 7).
    handle -- the trace file handle, opened in binary mode.
    """
    # header structure (after ABIF marker):
    # file version, tag name, tag number,
    # element type code, element size, number of elements
    # data size, data offset, handle (not file handle)
    head_elem_size = header[4]
    head_elem_num = header[5]
    head_offset = header[7]
    index = 0

    while index < head_elem_num:
        start = head_offset + index * head_elem_size
        # add directory offset to tuple
        # to handle directories with data size <= 4 bytes
        handle.seek(start)
        dir_entry = struct.unpack(_DIRFMT,
                                  handle.read(struct.calcsize(_DIRFMT))) + (start,)
        index += 1
        # (removed an unused 'key' local that was built here but never read)
        tag_name = _bytes_to_string(dir_entry[0])
        tag_number = dir_entry[1]
        elem_code = dir_entry[2]
        elem_num = dir_entry[4]
        data_size = dir_entry[5]
        data_offset = dir_entry[6]
        tag_offset = dir_entry[8]
        # if data size <= 4 bytes, data is stored inside tag
        # so offset needs to be changed
        if data_size <= 4:
            data_offset = tag_offset + 20
        handle.seek(data_offset)
        data = handle.read(data_size)
        yield tag_name, tag_number, \
            _parse_tag_data(elem_code, elem_num, data)
def __iter__(self):
    """Iterate over GenBank records; yields (key, start offset, length).

    The key mimics the GenBank parser's choice of identifier: the VERSION
    value when it looks like ``accession.number``, otherwise the first
    ACCESSION value, otherwise the LOCUS name.
    """
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    accession_marker = b"ACCESSION "
    version_marker = b"VERSION "
    # Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        # We cannot assume the record.id is the first word after LOCUS,
        # normally the first entry on the VERSION or ACCESSION line is used.
        # However if both missing, GenBank parser falls back on LOCUS entry.
        try:
            # line[5:] skips the "LOCUS" keyword itself
            key = line[5:].split(None, 1)[0]
        except ValueError:
            # Warning?
            # No content in LOCUS line
            key = None
        length = len(line)
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                # End of this record (next record marker or EOF)
                if not key:
                    raise ValueError("Did not find usable ACCESSION/VERSION/LOCUS lines")
                yield _bytes_to_string(key), start_offset, length
                start_offset = end_offset
                break
            elif line.startswith(accession_marker):
                try:
                    key = line.rstrip().split()[1]
                except IndexError:
                    # No content in ACCESSION line
                    pass
            elif line.startswith(version_marker):
                try:
                    version_id = line.rstrip().split()[1]
                    if version_id.count(b".") == 1 and version_id.split(b".")[1].isdigit():
                        # This should mimic the GenBank parser...
                        key = version_id
                except IndexError:
                    # No content in VERSION line
                    pass
            length += len(line)
    assert not line, repr(line)
def get(self, offset):
    """Parse and return the record whose raw <entry> XML starts at *offset*."""
    # TODO - Can we handle this directly in the parser?
    # This is a hack - use get_raw for <entry>...</entry> and wrap it with
    # the apparently required XML header and footer.
    entry_xml = _bytes_to_string(self.get_raw(offset))
    data = """<?xml version='1.0' encoding='UTF-8'?>
<uniprot xmlns="http://uniprot.org/uniprot" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd">
%s
</uniprot>
""" % entry_xml
    # TODO - For consistency, this function should not accept a string:
    records = SeqIO.UniprotIO.UniprotIterator(data)
    return next(records)
def get(self, offset):
    """Parse and return the record whose raw <entry> XML starts at *offset*."""
    # TODO - Can we handle this directly in the parser?
    # This is a hack - use get_raw for <entry>...</entry> and wrap it with
    # the apparently required XML header and footer.
    data = """<?xml version='1.0' encoding='UTF-8'?>
<uniprot xmlns="http://uniprot.org/uniprot" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd">
%s
</uniprot>
""" % _bytes_to_string(self.get_raw(offset))
    # TODO - For consistency, this function should not accept a string:
    # BUGFIX: use the builtin next() instead of the Python-2-only .next()
    # method, so this works on Python 3 too.
    return next(SeqIO.UniprotIO.UniprotIterator(data))
def __iter__(self):
    """Iterate over the sequence records in the file.

    Yields (accession key, start offset, length) for each <entry> element;
    the span covers the record from its marker line through "</entry>".
    """
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    start_acc_marker = b"<accession>"
    end_acc_marker = b"</accession>"
    end_entry_marker = b"</entry>"
    # Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        length = len(line)
        # We expect the next line to be <accession>xxx</accession>
        # (possibly with leading spaces)
        # but allow it to be later on within the <entry>
        key = None
        while True:
            line = handle.readline()
            if key is None and start_acc_marker in line:
                assert end_acc_marker in line, line
                # 11 == len(b"<accession>"); keep text up to the closing tag
                key = line[line.find(start_acc_marker) + 11:].split(
                    b"<", 1)[0]
                length += len(line)
            elif end_entry_marker in line:
                # 8 == len(b"</entry>"); record ends just after the close tag
                length += line.find(end_entry_marker) + 8
                end_offset = handle.tell() - len(line) \
                    + line.find(end_entry_marker) + 8
                assert start_offset + length == end_offset
                break
            elif marker_re.match(line) or not line:
                # Start of next record or end of file
                raise ValueError("Didn't find end of record")
            else:
                length += len(line)
        if not key:
            raise ValueError(
                "Did not find <accession> line in bytes %i to %i"
                % (start_offset, start_offset + length))
        yield _bytes_to_string(key), start_offset, length
        # Find start of next record
        while not marker_re.match(line) and line:
            start_offset = handle.tell()
            line = handle.readline()
    assert not line, repr(line)
def _get_string_tag(opt_bytes_value, default=None): """Return the string value of the given an optional raw bytes tag value. If the bytes value is None, return the given default value. """ if opt_bytes_value is None: return default try: return _bytes_to_string(opt_bytes_value) except UnicodeDecodeError: # If we are in this 'except' block, a `.decode` call must have been # attempted, and so we must be on Python 3, which means opt_bytes_value # is a byte string. return opt_bytes_value.decode(encoding=sys.getdefaultencoding())
def get_qresult_id(self, pos):
    """Return the query ID taken from the nearest "Query:" line after *pos*."""
    handle = self._handle
    handle.seek(pos)
    marker = _as_bytes('Query:')
    # scan forward line by line until the marker is found
    line = handle.readline().strip()
    while not line.startswith(marker):
        if not line:
            # blank line / end of file: preserve historical behaviour
            raise StopIteration
        line = handle.readline().strip()
    qid, desc = _parse_hit_or_query_line(_bytes_to_string(line))
    return qid
def __iter__(self):
    """Iterate over FASTQ records; yields (id, start offset, length) tuples."""
    handle = self._handle
    handle.seek(0)
    # renamed from 'id' to avoid shadowing the builtin
    record_id = None
    start_offset = handle.tell()
    line = handle.readline()
    if not line:
        #Empty file!
        return
    at_char = _as_bytes("@")
    plus_char = _as_bytes("+")
    if line[0:1] != at_char:
        raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
    while line:
        #assert line[0]=="@"
        #This record seems OK (so far)
        record_id = line[1:].rstrip().split(None, 1)[0]
        #Find the seq line(s); seq_len counts sequence characters only,
        #length counts raw bytes of the whole record
        seq_len = 0
        length = len(line)
        while line:
            line = handle.readline()
            length += len(line)
            if line.startswith(plus_char):
                break
            seq_len += len(line.strip())
        if not line:
            raise ValueError("Premature end of file in seq section")
        #assert line[0]=="+"
        #Find the qual line(s)
        qual_len = 0
        while line:
            if seq_len == qual_len:
                #Should be end of record...
                end_offset = handle.tell()
                line = handle.readline()
                if line and line[0:1] != at_char:
                    #BUGFIX: the exception was constructed but never raised
                    raise ValueError("Problem with line %s" % repr(line))
                break
            else:
                line = handle.readline()
                qual_len += len(line.strip())
                length += len(line)
        if seq_len != qual_len:
            raise ValueError("Problem with quality section")
        yield _bytes_to_string(record_id), start_offset, length
        start_offset = end_offset
def __iter__(self):
    """Iterate over UniProt XML entries; yields (accession, offset, length).

    The span of each record runs from its marker line through "</entry>".
    """
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    start_acc_marker = b"<accession>"
    end_acc_marker = b"</accession>"
    end_entry_marker = b"</entry>"
    # Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        length = len(line)
        # We expect the next line to be <accession>xxx</accession>
        # (possibly with leading spaces)
        # but allow it to be later on within the <entry>
        key = None
        while True:
            line = handle.readline()
            if key is None and start_acc_marker in line:
                assert end_acc_marker in line, line
                # 11 == len(b"<accession>"); keep text up to the closing tag
                key = line[line.find(
                    start_acc_marker) + 11:].split(b"<", 1)[0]
                length += len(line)
            elif end_entry_marker in line:
                # 8 == len(b"</entry>"); record ends just after the close tag
                length += line.find(end_entry_marker) + 8
                end_offset = handle.tell() - len(line) \
                    + line.find(end_entry_marker) + 8
                assert start_offset + length == end_offset
                break
            elif marker_re.match(line) or not line:
                # Start of next record or end of file
                raise ValueError("Didn't find end of record")
            else:
                length += len(line)
        if not key:
            raise ValueError("Did not find <accession> line in bytes %i to %i"
                             % (start_offset, start_offset + length))
        yield _bytes_to_string(key), start_offset, length
        # Find start of next record
        while not marker_re.match(line) and line:
            start_offset = handle.tell()
            line = handle.readline()
    assert not line, repr(line)
def __iter__(self):
    """Iterate over the lines; yields (key, line start offset, line length)."""
    handle = self._handle
    handle.seek(0)
    tab_char = _as_bytes("\t")
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if not line:
            break  # End of file
        try:
            key = line.split(tab_char)[0]
        # BUGFIX: 'except ValueError, err' is Python-2-only syntax and a
        # SyntaxError on Python 3; 'as' works on both.
        except ValueError as err:
            if not line.strip():
                # Ignore blank lines
                continue
            else:
                raise err
        else:
            yield _bytes_to_string(key), start_offset, len(line)
def __iter__(self):
    """Iterate over FASTQ records; yields (id, start offset, length) tuples."""
    handle = self._handle
    handle.seek(0)
    # renamed from 'id' to avoid shadowing the builtin
    record_id = None
    start_offset = handle.tell()
    line = handle.readline()
    if not line:
        # Empty file!
        return
    at_char = _as_bytes("@")
    plus_char = _as_bytes("+")
    if line[0:1] != at_char:
        raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
    while line:
        # assert line[0]=="@"
        # This record seems OK (so far)
        record_id = line[1:].rstrip().split(None, 1)[0]
        # Find the seq line(s); seq_len counts sequence characters only
        seq_len = 0
        while line:
            line = handle.readline()
            if line.startswith(plus_char):
                break
            seq_len += len(line.strip())
        if not line:
            raise ValueError("Premature end of file in seq section")
        # assert line[0]=="+"
        # Find the qual line(s)
        qual_len = 0
        while line:
            if seq_len == qual_len:
                # Should be end of record...
                line = handle.readline()
                if line and line[0:1] != at_char:
                    # BUGFIX: the exception was constructed but never raised
                    raise ValueError("Problem with line %s" % repr(line))
                break
            else:
                line = handle.readline()
                qual_len += len(line.strip())
        if seq_len != qual_len:
            raise ValueError("Problem with quality section")
        # record ends where the next record's "@" line (or EOF) begins
        end_offset = handle.tell() - len(line)
        yield _bytes_to_string(record_id), start_offset, end_offset - start_offset
        start_offset = end_offset
def __iter__(self):
    """Iterate over UniProt XML entries; yields (accession, offset, length)."""
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    start_acc_marker = _as_bytes("<accession>")
    end_acc_marker = _as_bytes("</accession>")
    end_entry_marker = _as_bytes("</entry>")
    #Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    #Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        #We expect the next line to be <accession>xxx</accession>
        #(possibly with leading spaces)
        #but allow it to be later on within the <entry>
        key = None
        # (removed an unused 'done = False' local)
        while True:
            line = handle.readline()
            if key is None and start_acc_marker in line:
                assert end_acc_marker in line, line
                # 11 == len(b"<accession>"); keep text up to the closing tag
                key = line[line.find(start_acc_marker) + 11:].split(
                    _as_bytes("<"))[0]
            elif end_entry_marker in line:
                # 8 == len(b"</entry>"); record ends just after the close tag
                end_offset = handle.tell() - len(line) \
                    + line.find(end_entry_marker) + 8
                break
            elif marker_re.match(line) or not line:
                #Start of next record or end of file
                raise ValueError("Didn't find end of record")
        if not key:
            raise ValueError("Did not find <accession> line in bytes %i to %i" \
                             % (start_offset, end_offset))
        yield _bytes_to_string(
            key), start_offset, end_offset - start_offset
        #Find start of next record
        while not marker_re.match(line) and line:
            start_offset = handle.tell()
            line = handle.readline()
    assert not line, repr(line)
def __iter__(self):
    """Iterate over the sequence records in the file.

    Yields (key, line start offset, line length) for every non-blank line;
    the key is the text before the first tab.
    """
    handle = self._handle
    handle.seek(0)
    tab_char = b"\t"
    while True:
        offset = handle.tell()
        line = handle.readline()
        if not line:
            # reached end of file
            break
        try:
            key = line.split(tab_char)[0]
        except ValueError as err:
            if line.strip():
                raise err
            # blank line - skip it
            continue
        else:
            yield _bytes_to_string(key), offset, len(line)
def __iter__(self):
    """Yield (query result key, start offset, 0) for each query result."""
    handle = self._handle
    handle.seek(0)
    qstart = handle.tell()
    while True:
        line = _read_forward(handle)
        pos = handle.tell()
        if line.startswith(self.qresult_start):
            match = re.search(_QRE_ID_LEN_IDX, line)
            qresult_key = match.group(1).strip()
            # the start-mark line itself belongs to the query result,
            # so rewind its length to get the true start offset
            qstart = pos - len(line)
        elif line.startswith(self.qresult_end):
            yield _bytes_to_string(qresult_key), qstart, 0
            qstart = pos
        elif not line:
            break
def __iter__(self):
    """Yield (record key, record offset, 0) for each record in the file."""
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    semi_char = _as_bytes(";")
    while True:
        offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line):
            # The key is the first word on the first subsequent line that
            # is neither blank nor a ";" comment line.
            while True:
                line = handle.readline()
                if line[0:1] != semi_char and line.strip():
                    yield _bytes_to_string(line.split()[0]), offset, 0
                    break
                if not line:
                    raise ValueError("Premature end of file?")
        elif not line:
            # end of file
            break
def __iter__(self):
    """Iterate over Hmmer3TextIndexer; yields query results' key, offsets, 0."""
    handle = self._handle
    handle.seek(0)
    qresult_offset = handle.tell()
    id_regex = re.compile(_as_bytes(_QRE_ID_LEN_PTN))
    while True:
        line = read_forward(handle)
        pos = handle.tell()
        if line.startswith(self.qresult_start):
            match = re.search(id_regex, line)
            qresult_key = match.group(1).strip()
            # the start-mark line itself belongs to the query result,
            # so rewind its length to get the true start offset
            qresult_offset = pos - len(line)
        elif line.startswith(self.qresult_end):
            yield _bytes_to_string(qresult_key), qresult_offset, 0
            qresult_offset = pos
        elif not line:
            break
def __iter__(self):
    """Iterate over UniProt XML entries; yields (accession, offset, length)."""
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    start_acc_marker = _as_bytes("<accession>")
    end_acc_marker = _as_bytes("</accession>")
    end_entry_marker = _as_bytes("</entry>")
    #Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    #Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        #We expect the next line to be <accession>xxx</accession>
        #but allow it to be later on within the <entry>
        key = None
        # (removed an unused 'done = False' local)
        while True:
            line = handle.readline()
            if key is None and line.startswith(start_acc_marker):
                assert end_acc_marker in line, line
                # 11 == len(b"<accession>"); keep text up to the closing tag
                key = line[11:].split(_as_bytes("<"))[0]
            elif end_entry_marker in line:
                # 8 == len(b"</entry>"); record ends just after the close tag
                end_offset = handle.tell() - len(line) \
                    + line.find(end_entry_marker) + 8
                break
            elif marker_re.match(line) or not line:
                #Start of next record or end of file
                raise ValueError("Didn't find end of record")
        if not key:
            raise ValueError("Did not find <accession> line")
        yield _bytes_to_string(key), start_offset, end_offset - start_offset
        #Find start of next record
        while not marker_re.match(line) and line:
            start_offset = handle.tell()
            line = handle.readline()
    assert not line, repr(line)
def __iter__(self):
    """Iterate over query results; yields (key, start offset, length).

    Relies on the handle's peekline() method (presumably an UndoHandle-style
    wrapper - TODO confirm) to spot the start of the next query, i.e. a
    non-">>>"-prefixed line containing the ">>>" mark, without consuming it.
    """
    handle = self._handle
    handle.seek(0)
    start_offset = handle.tell()
    qresult_key = None
    query_mark = b">>>"
    while True:
        line = handle.readline()
        peekline = handle.peekline()
        end_offset = handle.tell()
        if not line.startswith(query_mark) and query_mark in line:
            # this line announces a new query result; parse its id
            regx = re.search(_RE_ID_DESC_SEQLEN_IDX, line)
            qresult_key = _bytes_to_string(regx.group(1))
            start_offset = end_offset - len(line)
        # yield whenever we encounter a new query or at the end of the file
        if qresult_key is not None:
            if (not peekline.startswith(query_mark)
                    and query_mark in peekline) or not line:
                yield qresult_key, start_offset, end_offset - start_offset
                if not line:
                    break
                start_offset = end_offset
def get_raw_check(self, filename, format, alphabet, comp):
    """Check get_raw on index and index_db against re-parsing the raw bytes.

    filename -- file to index; comp -- True when it is gzip compressed.
    Also exercises the key_function argument (lower-casing keys).
    """
    # Also checking the key_function here
    if comp:
        h = gzip.open(filename, "rb")
        raw_file = h.read()
        h.close()
        h = gzip_open(filename, format)
        id_list = [rec.id.lower() for rec in SeqIO.parse(h, format, alphabet)]
        h.close()
    else:
        h = open(filename, "rb")
        raw_file = h.read()
        h.close()
        id_list = [rec.id.lower() for rec in SeqIO.parse(filename, format, alphabet)]
    if format in ["sff"]:
        # SFF indexing can emit parser warnings; silence them here
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', BiopythonParserWarning)
            rec_dict = SeqIO.index(filename, format, alphabet,
                                   key_function=lambda x: x.lower())
            rec_dict_db = SeqIO.index_db(":memory:", filename, format, alphabet,
                                         key_function=lambda x: x.lower())
    else:
        rec_dict = SeqIO.index(filename, format, alphabet,
                               key_function=lambda x: x.lower())
        rec_dict_db = SeqIO.index_db(":memory:", filename, format, alphabet,
                                     key_function=lambda x: x.lower())
    self.assertEqual(set(id_list), set(rec_dict))
    self.assertEqual(set(id_list), set(rec_dict_db))
    self.assertEqual(len(id_list), len(rec_dict))
    for key in id_list:
        self.assertIn(key, rec_dict)
        self.assertEqual(key, rec_dict[key].id.lower())
        self.assertEqual(key, rec_dict.get(key).id.lower())
        raw = rec_dict.get_raw(key)
        self.assertTrue(isinstance(raw, bytes),
                        "Didn't get bytes from %s get_raw" % format)
        self.assertTrue(raw.strip())
        self.assertIn(raw, raw_file)
        raw_db = rec_dict_db.get_raw(key)
        # Via index using format-specific get_raw which scans the file,
        # Via index_db in general using raw length found when indexing.
        self.assertEqual(raw, raw_db,
                         "index and index_db .get_raw() different for %s" % format)
        rec1 = rec_dict[key]
        # Following isn't very elegant, but it lets me test the
        # __getitem__ SFF code is working.
        if format in SeqIO._BinaryFormats:
            handle = BytesIO(raw)
        else:
            handle = StringIO(_bytes_to_string(raw))
        if format == "sff":
            rec2 = SeqIO.SffIO._sff_read_seq_record(handle,
                                                    rec_dict._proxy._flows_per_read,
                                                    rec_dict._proxy._flow_chars,
                                                    rec_dict._proxy._key_sequence,
                                                    rec_dict._proxy._alphabet,
                                                    trim=False)
        elif format == "sff-trim":
            rec2 = SeqIO.SffIO._sff_read_seq_record(handle,
                                                    rec_dict._proxy._flows_per_read,
                                                    rec_dict._proxy._flow_chars,
                                                    rec_dict._proxy._key_sequence,
                                                    rec_dict._proxy._alphabet,
                                                    trim=True)
        elif format == "uniprot-xml":
            self.assertTrue(raw.startswith(b"<entry "))
            self.assertTrue(raw.endswith(b"</entry>"))
            # Currently the __getitem__ method uses this
            # trick too, but we hope to fix that later
            raw = """<?xml version='1.0' encoding='UTF-8'?>
<uniprot xmlns="http://uniprot.org/uniprot" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd">
%s
</uniprot>
""" % _bytes_to_string(raw)
            handle = StringIO(raw)
            rec2 = SeqIO.read(handle, format, alphabet)
        else:
            rec2 = SeqIO.read(handle, format, alphabet)
        self.assertEqual(True, compare_record(rec1, rec2))
    rec_dict.close()
    del rec_dict