def _qresult_index(self):
    """Indexer for noncommented BLAST tabular files (PRIVATE)."""
    handle = self._handle
    handle.seek(0)
    start_offset = 0
    qresult_key = None
    key_idx = self._key_idx
    tab_char = _as_bytes('\t')

    while True:
        # get end offset here since we only know a qresult ends after
        # encountering the next one
        end_offset = handle.tell()
        line = handle.readline()

        if qresult_key is None:
            qresult_key = line.split(tab_char)[key_idx]
        else:
            try:
                curr_key = line.split(tab_char)[key_idx]
            except IndexError:
                curr_key = _as_bytes('')

            if curr_key != qresult_key:
                yield qresult_key, start_offset, end_offset - start_offset
                qresult_key = curr_key
                start_offset = end_offset

        # break if we've reached EOF
        if not line:
            break
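# Usage sketch (an addition, not part of the original module): each triple
# yielded above locates one query's rows in the file, so a raw record can be
# recovered with a plain seek/read. The path and the (start, length) pair are
# assumed values for illustration.
def _demo_fetch_raw(path, start, length):
    # seek to the record's first byte and read exactly `length` bytes
    with open(path, "rb") as handle:
        handle.seek(start)
        return handle.read(length)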
def _get_raw_qresult(self, offset):
    """Return the raw bytes string of a single QueryResult from a noncommented file (PRIVATE)."""
    handle = self._handle
    handle.seek(offset)
    qresult_raw = _as_bytes('')
    tab_char = _as_bytes('\t')
    key_idx = self._key_idx
    qresult_key = None

    while True:
        line = handle.readline()
        # get the key of the first line (the qresult key)
        if qresult_key is None:
            qresult_key = line.split(tab_char)[key_idx]
        else:
            try:
                curr_key = line.split(tab_char)[key_idx]
            except IndexError:
                curr_key = _as_bytes('')
            # only break when the qresult is finished (key is different)
            if curr_key != qresult_key:
                break
        # append to the raw string as long as the qresult is the same
        qresult_raw += line

    return qresult_raw
def __iter__(self):
    handle = self._handle
    handle.seek(0)
    start_offset = handle.tell()
    regex_id = re.compile(_as_bytes(r'Query\s*(?:sequence|HMM)?:\s*(.*)'))

    # determine flag for hmmsearch
    is_hmmsearch = False
    line = read_forward(handle)
    if line.startswith(_as_bytes('hmmsearch')):
        is_hmmsearch = True

    while True:
        end_offset = handle.tell()

        if line.startswith(self.qresult_start):
            regx = re.search(regex_id, line)
            qresult_key = regx.group(1).strip()
            # qresult start offset is the offset of this line
            # (starts with the start mark)
            start_offset = end_offset - len(line)
        elif line.startswith(self.qresult_end):
            yield _bytes_to_string(qresult_key), start_offset, 0
            start_offset = end_offset
        elif not line:
            # HACK: since hmmsearch can only have one query result
            if is_hmmsearch:
                yield _bytes_to_string(qresult_key), start_offset, 0
            break

        line = read_forward(handle)
def __iter__(self):
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    dot_char = _as_bytes(".")
    accession_marker = _as_bytes("ACCESSION ")
    version_marker = _as_bytes("VERSION ")
    # Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        # We cannot assume the record.id is the first word after LOCUS,
        # normally the first entry on the VERSION or ACCESSION line is used.
        key = None
        while True:
            line = handle.readline()
            if marker_re.match(line) or not line:
                if not key:
                    raise ValueError("Did not find ACCESSION/VERSION lines")
                end_offset = handle.tell() - len(line)
                yield _bytes_to_string(key), start_offset, end_offset - start_offset
                start_offset = end_offset
                break
            elif line.startswith(accession_marker):
                key = line.rstrip().split()[1]
            elif line.startswith(version_marker):
                version_id = line.rstrip().split()[1]
                if version_id.count(dot_char) == 1 \
                        and version_id.split(dot_char)[1].isdigit():
                    # This should mimic the GenBank parser...
                    key = version_id
    assert not line, repr(line)
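# Illustrative sketch (an addition): the VERSION-line rule above only accepts
# "accession.version" when there is exactly one dot and a numeric suffix,
# mimicking the GenBank parser. A minimal bytes-level reproduction of that
# check; the sample VERSION line is an assumed value for demonstration.
def _demo_genbank_key(version_line=b"VERSION     U49845.1  GI:1293613\n"):
    version_id = version_line.rstrip().split()[1]
    if version_id.count(b".") == 1 and version_id.split(b".")[1].isdigit():
        return version_id  # e.g. b"U49845.1"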
def __iter__(self):
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    semi_char = _as_bytes(";")
    # Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        length = len(line)
        # We cannot assume the record.id is the first word after ID,
        # normally the following AC line is used.
        line = handle.readline()
        length += len(line)
        assert line.startswith(_as_bytes("AC "))
        key = line[3:].strip().split(semi_char)[0].strip()
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                yield _bytes_to_string(key), start_offset, length
                start_offset = end_offset
                break
            length += len(line)
    assert not line, repr(line)
def get_raw(self, offset):
    """Return the raw record from the file as a bytes string."""
    handle = self._handle
    qresult_raw = _as_bytes('')
    query_mark = _as_bytes('>>>')

    # read header first
    handle.seek(0)
    while True:
        line = handle.readline()
        peekline = handle.peekline()
        qresult_raw += line
        if not peekline.startswith(query_mark) and query_mark in peekline:
            break

    # and read the qresult raw string
    handle.seek(offset)
    while True:
        # preserve whitespace, don't use read_forward
        line = handle.readline()
        peekline = handle.peekline()
        qresult_raw += line

        # break when we've reached qresult end
        if (not peekline.startswith(query_mark) and query_mark in peekline) or \
                not line:
            break

    # append mock end marker to qresult_raw, since it's not always present
    return qresult_raw + _as_bytes('>>><<<\n')
def _qresult_index_commented(self):
    """Indexer for commented BLAST tabular files (PRIVATE)."""
    handle = self._handle
    handle.seek(0)
    start_offset = 0
    # mark of a new query
    query_mark = None
    # mark of the query's ID
    qid_mark = _as_bytes('# Query: ')
    # mark of the last line
    end_mark = _as_bytes('# BLAST processed')

    while True:
        end_offset = handle.tell()
        line = handle.readline()

        if query_mark is None:
            query_mark = line
            start_offset = end_offset
        elif line.startswith(qid_mark):
            qresult_key = line[len(qid_mark):].split()[0]
        elif line == query_mark or line.startswith(end_mark):
            yield qresult_key, start_offset, end_offset - start_offset
            start_offset = end_offset
        elif not line:
            break
def get_raw_check(self, filename, format, alphabet):
    handle = open(filename, "rb")
    raw_file = handle.read()
    handle.close()
    # Also checking the key_function here
    id_list = [rec.id.lower() for rec in
               SeqIO.parse(filename, format, alphabet)]
    rec_dict = SeqIO.index(filename, format, alphabet,
                           key_function=lambda x: x.lower())
    self.assertEqual(set(id_list), set(rec_dict.keys()))
    self.assertEqual(len(id_list), len(rec_dict))
    for key in id_list:
        self.assertTrue(key in rec_dict)
        self.assertEqual(key, rec_dict[key].id.lower())
        self.assertEqual(key, rec_dict.get(key).id.lower())
        raw = rec_dict.get_raw(key)
        self.assertTrue(raw.strip())
        self.assertTrue(raw in raw_file)
        rec1 = rec_dict[key]
        # Following isn't very elegant, but it lets me test the
        # __getitem__ SFF code is working.
        if format in SeqIO._BinaryFormats:
            handle = BytesIO(raw)
        else:
            handle = StringIO(_bytes_to_string(raw))
        if format == "sff":
            rec2 = SeqIO.SffIO._sff_read_seq_record(handle,
                        rec_dict._proxy._flows_per_read,
                        rec_dict._proxy._flow_chars,
                        rec_dict._proxy._key_sequence,
                        rec_dict._proxy._alphabet,
                        trim=False)
        elif format == "sff-trim":
            rec2 = SeqIO.SffIO._sff_read_seq_record(handle,
                        rec_dict._proxy._flows_per_read,
                        rec_dict._proxy._flow_chars,
                        rec_dict._proxy._key_sequence,
                        rec_dict._proxy._alphabet,
                        trim=True)
        elif format == "uniprot-xml":
            self.assertTrue(raw.startswith(_as_bytes("<entry ")))
            self.assertTrue(raw.endswith(_as_bytes("</entry>")))
            # Currently the __getitem__ method uses this
            # trick too, but we hope to fix that later
            raw = """<?xml version='1.0' encoding='UTF-8'?>
<uniprot xmlns="http://uniprot.org/uniprot"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://uniprot.org/uniprot
http://www.uniprot.org/support/docs/uniprot.xsd">
%s
</uniprot>
""" % _bytes_to_string(raw)
            handle = StringIO(raw)
            rec2 = SeqIO.read(handle, format, alphabet)
        else:
            rec2 = SeqIO.read(handle, format, alphabet)
        self.assertEqual(True, compare_record(rec1, rec2))
    rec_dict._proxy._handle.close()  # TODO - Better solution
    del rec_dict
def __iter__(self):
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    semi_char = _as_bytes(";")
    dot_char = _as_bytes(".")
    sv_marker = _as_bytes("SV ")
    ac_marker = _as_bytes("AC ")
    # Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        # We cannot assume the record.id is the first word after ID,
        # normally the SV line is used.
        setbysv = False  # resets sv as false
        length = len(line)
        if line[2:].count(semi_char) == 6:
            # Looks like the semi colon separated style introduced in 2006
            parts = line[3:].rstrip().split(semi_char)
            if parts[1].strip().startswith(sv_marker):
                # The SV bit gives the version
                key = parts[0].strip() + dot_char + \
                    parts[1].strip().split()[1]
                setbysv = True
            else:
                key = parts[0].strip()
        elif line[2:].count(semi_char) == 3:
            # Looks like the pre 2006 style, take first word only
            key = line[3:].strip().split(None, 1)[0]
            if key.endswith(semi_char):
                key = key[:-1]
        else:
            raise ValueError(
                'Did not recognise the ID line layout:\n%r' % line)
        while True:
            end_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                end_offset = handle.tell() - len(line)
                yield _bytes_to_string(key), start_offset, length
                start_offset = end_offset
                break
            elif line.startswith(ac_marker) and not setbysv:
                key = line.rstrip().split()[1]
                if key.endswith(semi_char):
                    key = key[:-1]
            elif line.startswith(sv_marker):
                key = line.rstrip().split()[1]
                setbysv = True
            length += len(line)
    assert not line, repr(line)
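# Illustrative sketch (an addition): the branch above distinguishes the
# post-2006 EMBL ID line (six semicolons, version given by the "SV" field)
# from the older style (three semicolons, accession as first word). The
# sample line in the comment below is an assumed value for demonstration.
def _demo_embl_key(line):
    if line[2:].count(b";") == 6:
        parts = line[3:].rstrip().split(b";")
        if parts[1].strip().startswith(b"SV "):
            return parts[0].strip() + b"." + parts[1].strip().split()[1]
        return parts[0].strip()
    elif line[2:].count(b";") == 3:
        key = line[3:].strip().split(None, 1)[0]
        return key[:-1] if key.endswith(b";") else key

# e.g. _demo_embl_key(b"ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.\n")
# would return b"X56734.1"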
def crc32(seq):
    """Returns the crc32 checksum for a sequence (string or Seq object)."""
    # NOTE - On Python 2 returns a signed int, on Python 3 it is unsigned
    # Docs suggest should use crc32(x) & 0xffffffff for consistency.
    # TODO - Should we return crc32(x) & 0xffffffff here?
    try:
        # Assume it's a Seq object
        return _crc32(_as_bytes(str(seq)))
    except AttributeError:
        # Assume it's a string/unicode
        return _crc32(_as_bytes(seq))
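# Sketch (an addition): the signed-vs-unsigned note above can be demonstrated
# with zlib directly; masking with 0xffffffff yields the same unsigned 32-bit
# value on both Python 2 and Python 3.
from zlib import crc32 as _crc32_demo

def _demo_unsigned_crc32(data=b"ACGTACGTACGT"):
    # mask to force the unsigned 32-bit interpretation everywhere
    return _crc32_demo(data) & 0xffffffff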
def __iter__(self):
    handle = self._handle
    handle.seek(0)
    id = None
    start_offset = handle.tell()
    line = handle.readline()
    if not line:
        # Empty file!
        return
    at_char = _as_bytes("@")
    plus_char = _as_bytes("+")
    if line[0:1] != at_char:
        raise ValueError("Problem with FASTQ @ line:\n%r" % line)
    while line:
        # assert line[0]=="@"
        # This record seems OK (so far)
        id = line[1:].rstrip().split(None, 1)[0]
        # Find the seq line(s)
        seq_len = 0
        length = len(line)
        while line:
            line = handle.readline()
            length += len(line)
            if line.startswith(plus_char):
                break
            seq_len += len(line.strip())
        if not line:
            raise ValueError("Premature end of file in seq section")
        # assert line[0]=="+"
        # Find the qual line(s)
        qual_len = 0
        while line:
            if seq_len == qual_len:
                if seq_len == 0:
                    # Special case, quality line should be just "\n"
                    line = handle.readline()
                    if line.strip():
                        raise ValueError("Expected blank quality line, not %r" % line)
                    length += len(line)
                # Should be end of record...
                end_offset = handle.tell()
                line = handle.readline()
                if line and line[0:1] != at_char:
                    raise ValueError("Problem with line %r" % line)
                break
            else:
                line = handle.readline()
                qual_len += len(line.strip())
                length += len(line)
        if seq_len != qual_len:
            raise ValueError("Problem with quality section")
        yield _bytes_to_string(id), start_offset, length
        start_offset = end_offset
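# Usage sketch (an addition, assuming a file "example.fastq" exists): this
# iterator backs Bio.SeqIO.index, which exposes the offsets it computes
# through a dict-like object.
def _demo_fastq_index(path="example.fastq"):
    from Bio import SeqIO
    fastq_index = SeqIO.index(path, "fastq")
    first_id = next(iter(fastq_index))
    record = fastq_index[first_id]       # parsed lazily from its offset
    raw = fastq_index.get_raw(first_id)  # raw bytes, as indexed above
    return record, raw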
def get_raw(self, offset):
    handle = self._handle
    handle.seek(offset)
    marker_re = self._marker_re
    lines = []
    line = handle.readline()
    semi_char = _as_bytes(";")
    while line.startswith(semi_char):
        lines.append(line)
        line = handle.readline()
    while line and not line.startswith(semi_char):
        lines.append(line)
        line = handle.readline()
    return _as_bytes("").join(lines)
def get_raw(self, offset):
    """Return the raw record from the file as a bytes string."""
    handle = self._handle
    handle.seek(offset)
    marker_re = self._marker_re
    lines = []
    line = handle.readline()
    semi_char = _as_bytes(";")
    while line.startswith(semi_char):
        lines.append(line)
        line = handle.readline()
    while line and not line.startswith(semi_char):
        lines.append(line)
        line = handle.readline()
    return _as_bytes("").join(lines)
def run_pv(self, out_file='probs.dat', data_dir='.',
           version=1, smooth=0.04):
    """Executes pv.

    out_file - Name of output file.
    data_dir - Where the data is found.
    """
    self._generate_intfile(data_dir)

    if version == 1:
        pv_name = "pv"
    else:
        pv_name = "pv2"
    proc = subprocess.Popen([self._get_path(pv_name)], cwd=data_dir,
                            shell=(sys.platform != "win32"),
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            universal_newlines=True)
    proc.communicate(_as_bytes('data_fst_outfile %s out.dat\n%s\n'
                               % (out_file, smooth)))
    pvf = open(data_dir + os.sep + out_file, 'r')
    result = map(lambda x: tuple(map(lambda y: my_float(y),
                                     x.rstrip().split(' '))),
                 pvf.readlines())
    pvf.close()
    return result
def check_by_line(self, old_file, new_file, old_gzip=False):
    for mode in ["r", "rb"]:
        if old_gzip:
            h = gzip.open(old_file, mode)
        else:
            h = open(old_file, mode)
        old = h.read()
        # Seems gzip can return bytes even if mode="r",
        # perhaps a bug in Python 3.2?
        if "b" in mode:
            old = _as_bytes(old)
        else:
            old = _as_string(old)
        h.close()

        for cache in [1, 10]:
            h = bgzf.BgzfReader(new_file, mode, max_cache=cache)
            if "b" in mode:
                new = _empty_bytes_string.join(line for line in h)
            else:
                new = "".join(line for line in h)
            h.close()

            self.assertEqual(len(old), len(new))
            self.assertEqual(old[:10], new[:10],
                             "%r vs %r, mode %r" % (old[:10], new[:10], mode))
            self.assertEqual(old, new)
def _open(url, post=None):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to TogoWS, will raise an IOError if it encounters an error.

    In the absence of clear guidelines, this function enforces a limit of
    "up to three queries per second" to avoid abusing the TogoWS servers.
    """
    delay = 0.333333333  # one third of a second
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current
    # print url
    try:
        if post:
            handle = urllib2.urlopen(url, _as_bytes(urllib.urlencode(post)))
        else:
            handle = urllib2.urlopen(url)
    except urllib2.HTTPError, exception:
        raise exception
def _open(url, post=None):
    """Build the URL and open a handle to it (PRIVATE).

    Open a handle to TogoWS, will raise an IOError if it encounters an error.

    In the absence of clear guidelines, this function enforces a limit of
    "up to three queries per second" to avoid abusing the TogoWS servers.
    """
    delay = 0.333333333  # one third of a second
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current
    # print(url)
    if post:
        handle = _urlopen(url, _as_bytes(post))
    else:
        handle = _urlopen(url)

    # We now trust TogoWS to have set an HTTP error code, that
    # suffices for my current unit tests. Previously we would
    # examine the start of the data returned back.
    return _binary_to_string_handle(handle)
def check_by_char(self, old_file, new_file, old_gzip=False):
    for mode in ["r", "rb"]:
        if old_gzip:
            h = gzip.open(old_file, mode)
        else:
            h = open(old_file, mode)
        old = h.read()
        # Seems gzip can return bytes even if mode="r",
        # perhaps a bug in Python 3.2?
        if "b" in mode:
            old = _as_bytes(old)
        else:
            old = _as_string(old)
        h.close()

        for cache in [1, 10]:
            h = bgzf.BgzfReader(new_file, mode, max_cache=cache)
            temp = []
            while True:
                char = h.read(1)
                if not char:
                    break
                temp.append(char)
            if "b" in mode:
                new = _empty_bytes_string.join(temp)
            else:
                new = "".join(temp)
            del temp
            h.close()

            self.assertEqual(len(old), len(new))
            # If bytes vs unicode mismatch, give a short error message:
            self.assertEqual(old[:10], new[:10],
                             "%r vs %r, mode %r" % (old[:10], new[:10], mode))
            self.assertEqual(old, new)
def seguid(seq):
    """Returns the SEGUID (string) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    returns the SEGUID string (A SEquence Globally Unique IDentifier).
    seq type = str.

    For more information about SEGUID, see:
    http://bioinformatics.anl.gov/seguid/
    DOI: 10.1002/pmic.200600032
    """
    import hashlib
    import base64
    m = hashlib.sha1()
    try:
        # Assume it's a Seq object
        seq = str(seq)
    except AttributeError:
        # Assume it's a string
        pass
    m.update(_as_bytes(seq.upper()))
    try:
        # For Python 3+
        return base64.encodebytes(m.digest()).decode().replace("\n", "").rstrip("=")
    except AttributeError:
        pass
    # For all other Pythons
    return base64.b64encode(m.digest()).rstrip("=")
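# Usage sketch (an addition): SEGUID is case-insensitive because of the
# seq.upper() call above, so mixed-case input gives the same checksum.
def _demo_seguid_case_insensitive(seq="ACGTACGTACGT"):
    return seguid(seq) == seguid(seq.lower())  # True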
def get_raw(self, offset):
    handle = self._handle
    qresult_raw = _as_bytes('')

    # read header first
    if not self._preamble:
        handle.seek(0)
        while True:
            line = handle.readline()
            if line.startswith(self.qresult_start):
                break
            qresult_raw += line
    else:
        qresult_raw += self._preamble

    # and read the qresult raw string
    handle.seek(offset)
    while True:
        # preserve whitespace, don't use read_forward
        line = handle.readline()
        qresult_raw += line

        # break when we've reached qresult end
        if line.startswith(self.qresult_end) or not line:
            break

    return qresult_raw
def check_raw(self, filename, id, raw, **kwargs):
    """Index filename using keyword arguments, check get_raw(id)==raw."""
    idx = SearchIO.index(filename, self.fmt, **kwargs)
    raw = _as_bytes(raw)
    # Anticipate cases where the raw string and/or file uses different
    # newline characters ~ we set everything to \n.
    new = idx.get_raw(id)
    self.assertTrue(isinstance(new, bytes),
                    "Didn't get bytes from %s get_raw" % self.fmt)
    self.assertEqual(raw.replace(b'\r\n', b'\n'),
                     new.replace(b'\r\n', b'\n'))
    idx.close()

    # Now again, but using SQLite backend
    if sqlite3:
        idx = SearchIO.index_db(":memory:", filename, self.fmt, **kwargs)
        new = idx.get_raw(id)
        self.assertTrue(isinstance(new, bytes),
                        "Didn't get bytes from %s get_raw" % self.fmt)
        self.assertEqual(raw.replace(b'\r\n', b'\n'),
                         new.replace(b'\r\n', b'\n'))
        idx.close()

    if os.path.isfile(filename + ".bgz"):
        # Do the tests again with the BGZF compressed file
        print("[BONUS %s.bgz]" % filename)
        self.check_raw(filename + ".bgz", id, raw, **kwargs)
def _open(cgi, params={}, post=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it.  Does some
    simple error checking, and will raise an IOError if it encounters one.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.
    """
    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current
    # Remove None values from the parameters
    # (take a copy of the items, since we delete from the dict as we go)
    for key, value in list(params.items()):
        if value is None:
            del params[key]
    # Tell Entrez that we are using Biopython (or whatever the user has
    # specified explicitly in the parameters or by changing the default)
    if "tool" not in params:
        params["tool"] = tool
    # Tell Entrez who we are
    if "email" not in params:
        if email is not None:
            params["email"] = email
        else:
            warnings.warn("""
Email address is not specified.

To make use of NCBI's E-utilities, NCBI strongly recommends you to specify
your email address with each request. From June 1, 2010, this will be
mandatory. As an example, if your email address is [email protected],
you can specify it as follows:
   from Bio import Entrez
   Entrez.email = '*****@*****.**'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)
    # Open a handle to Entrez.
    options = _urlencode(params, doseq=True)
    # print cgi + "?" + options
    try:
        if post:
            # HTTP POST
            handle = _urlopen(cgi, data=_as_bytes(options))
        else:
            # HTTP GET
            cgi += "?" + options
            handle = _urlopen(cgi)
    except _HTTPError as exception:
        raise exception

    return _binary_to_string_handle(handle)
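# Sketch (an addition): the throttle at the top of _open can be read as a
# standalone helper. It sleeps just enough to keep at least `delay` seconds
# between successive calls, tracking state on the function object exactly as
# _open does; the helper name is hypothetical.
import time

def _demo_throttle(delay=0.333333334):
    current = time.time()
    wait = _demo_throttle.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _demo_throttle.previous = current + wait
    else:
        _demo_throttle.previous = current

_demo_throttle.previous = 0.0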
def __iter__(self):
    handle = self._handle
    handle.seek(0)
    marker_re = self._marker_re
    start_acc_marker = _as_bytes("<accession>")
    end_acc_marker = _as_bytes("</accession>")
    end_entry_marker = _as_bytes("</entry>")
    less_than = _as_bytes("<")
    # Skip any header before first record
    while True:
        start_offset = handle.tell()
        line = handle.readline()
        if marker_re.match(line) or not line:
            break
    # Should now be at the start of a record, or end of the file
    while marker_re.match(line):
        length = len(line)
        # We expect the next line to be <accession>xxx</accession>
        # (possibly with leading spaces)
        # but allow it to be later on within the <entry>
        key = None
        done = False
        while True:
            line = handle.readline()
            if key is None and start_acc_marker in line:
                assert end_acc_marker in line, line
                key = line[line.find(start_acc_marker) + 11:].split(less_than, 1)[0]
                length += len(line)
            elif end_entry_marker in line:
                end_offset = handle.tell() - len(line) \
                    + line.find(end_entry_marker) + 8
                break
            elif marker_re.match(line) or not line:
                # Start of next record or end of file
                raise ValueError("Didn't find end of record")
            else:
                length += len(line)
        if not key:
            raise ValueError("Did not find <accession> line in bytes %i to %i"
                             % (start_offset, end_offset))
        yield _bytes_to_string(key), start_offset, length
        # Find start of next record
        while not marker_re.match(line) and line:
            start_offset = handle.tell()
            line = handle.readline()
    assert not line, repr(line)
def get_raw(self, offset):
    """Similar to the get method, but returns the record as a raw string."""
    handle = self._handle
    marker_re = self._marker_re
    end_entry_marker = _as_bytes("</entry>")
    handle.seek(offset)
    data = [handle.readline()]
    while True:
        line = handle.readline()
        i = line.find(end_entry_marker)
        if i != -1:
            data.append(line[:i + 8])
            break
        if marker_re.match(line) or not line:
            # End of file, or start of next record
            raise ValueError("Didn't find end of record")
        data.append(line)
    return _as_bytes("").join(data)
def get_raw(self, offset):
    """Return the raw record from the file as a bytes string."""
    # TODO - Refactor this and the __init__ method to reduce code duplication?
    handle = self._handle
    handle.seek(offset)
    line = handle.readline()
    data = line
    at_char = _as_bytes("@")
    plus_char = _as_bytes("+")
    if line[0:1] != at_char:
        raise ValueError("Problem with FASTQ @ line:\n%r" % line)
    # Find the seq line(s)
    seq_len = 0
    while line:
        line = handle.readline()
        data += line
        if line.startswith(plus_char):
            break
        seq_len += len(line.strip())
    if not line:
        raise ValueError("Premature end of file in seq section")
    assert line[0:1] == plus_char
    # Find the qual line(s)
    qual_len = 0
    while line:
        if seq_len == qual_len:
            if seq_len == 0:
                # Special case, quality line should be just "\n"
                line = handle.readline()
                if line.strip():
                    raise ValueError("Expected blank quality line, not %r" % line)
                data += line
            # Should be end of record...
            line = handle.readline()
            if line and line[0:1] != at_char:
                raise ValueError("Problem with line %r" % line)
            break
        else:
            line = handle.readline()
            data += line
            qual_len += len(line.strip())
    if seq_len != qual_len:
        raise ValueError("Problem with quality section")
    return data
def __iter__(self):
    handle = self._handle
    handle.seek(0)
    id = None
    start_offset = handle.tell()
    line = handle.readline()
    if not line:
        # Empty file!
        return
    at_char = _as_bytes("@")
    plus_char = _as_bytes("+")
    if line[0:1] != at_char:
        raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
    while line:
        # assert line[0]=="@"
        # This record seems OK (so far)
        id = line[1:].rstrip().split(None, 1)[0]
        # Find the seq line(s)
        seq_len = 0
        while line:
            line = handle.readline()
            if line.startswith(plus_char):
                break
            seq_len += len(line.strip())
        if not line:
            raise ValueError("Premature end of file in seq section")
        # assert line[0]=="+"
        # Find the qual line(s)
        qual_len = 0
        while line:
            if seq_len == qual_len:
                # Should be end of record...
                line = handle.readline()
                if line and line[0:1] != at_char:
                    raise ValueError("Problem with line %s" % repr(line))
                break
            else:
                line = handle.readline()
                qual_len += len(line.strip())
        if seq_len != qual_len:
            raise ValueError("Problem with quality section")
        end_offset = handle.tell() - len(line)
        yield _bytes_to_string(id), start_offset, end_offset - start_offset
        start_offset = end_offset
def crc32(seq):
    """Returns the crc32 checksum for a sequence (string or Seq object).

    Note that the case is important:

    >>> crc32("ACGTACGTACGT")
    20049947
    >>> crc32("acgtACGTacgt")
    1688586483

    """
    # NOTE - On Python 2 returns a signed int, on Python 3 it is unsigned
    # Docs suggest should use crc32(x) & 0xffffffff for consistency.
    # TODO - Should we return crc32(x) & 0xffffffff here?
    try:
        # Assume it's a Seq object
        return _crc32(_as_bytes(str(seq)))
    except AttributeError:
        # Assume it's a string/unicode
        return _crc32(_as_bytes(seq))
def __iter__(self):
    """Iterate over the file handle; yields key, start offset, and length."""
    handle = self._handle
    handle.seek(0)
    query_id_idx = self._query_id_idx
    qresult_key = None
    header_mark = _as_bytes('#')
    split_mark = _as_bytes(' ')
    # set line with initial mock value, to emulate header
    line = header_mark

    # read through header
    while line.startswith(header_mark):
        start_offset = handle.tell()
        line = handle.readline()

    # and index the qresults
    while True:
        end_offset = handle.tell()

        if not line:
            break
        cols = [x for x in line.strip().split(split_mark) if x]
        if qresult_key is None:
            qresult_key = cols[query_id_idx]
        else:
            curr_key = cols[query_id_idx]

            if curr_key != qresult_key:
                adj_end = end_offset - len(line)
                yield (_bytes_to_string(qresult_key), start_offset,
                       adj_end - start_offset)
                qresult_key = curr_key
                start_offset = adj_end

        line = handle.readline()
        if not line:
            yield (_bytes_to_string(qresult_key), start_offset,
                   end_offset - start_offset)
            break
def isReadable(handle):
    """
    Fast check, if this file is readable by this reader.

    Check if the file magic bytes equal '.scf' as specified in the
    file format specification:
    http://staden.sourceforge.net/manual/formats_unix_2.html

    @param handle The file handle.
    @return True if this (probably) is an scf file.
    """
    handle.seek(0)
    return handle.read(4) == _as_bytes('.scf')
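# Usage sketch (an addition, assuming a file "trace.scf" exists):
def _demo_is_scf(path="trace.scf"):
    # isReadable() only peeks at the four magic bytes, so any handle
    # opened in binary mode will do
    with open(path, "rb") as handle:
        return isReadable(handle)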
def get_raw(self, offset):
    """Similar to the get method, but returns the record as a raw string."""
    # TODO - Refactor this and the __init__ method to reduce code duplication?
    handle = self._handle
    handle.seek(offset)
    line = handle.readline()
    data = line
    at_char = _as_bytes("@")
    plus_char = _as_bytes("+")
    if line[0:1] != at_char:
        raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
    identifier = line[1:].rstrip().split(None, 1)[0]
    # Find the seq line(s)
    seq_len = 0
    while line:
        line = handle.readline()
        data += line
        if line.startswith(plus_char):
            break
        seq_len += len(line.strip())
    if not line:
        raise ValueError("Premature end of file in seq section")
    assert line[0:1] == plus_char
    # Find the qual line(s)
    qual_len = 0
    while line:
        if seq_len == qual_len:
            # Should be end of record...
            pos = handle.tell()
            line = handle.readline()
            if line and line[0:1] != at_char:
                raise ValueError("Problem with line %s" % repr(line))
            break
        else:
            line = handle.readline()
            data += line
            qual_len += len(line.strip())
    if seq_len != qual_len:
        raise ValueError("Problem with quality section")
    return data
import re

from Bio._py3k import _as_bytes, _bytes_to_string
from Bio._py3k import zip

from Bio.Alphabet import generic_dna
from Bio.SearchIO._index import SearchIndexer
from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment

__all__ = ('BlatPslParser', 'BlatPslIndexer', 'BlatPslWriter')

# precompile regex patterns
_PTR_ROW_CHECK = r'^\d+\s+\d+\s+\d+\s+\d+'
_RE_ROW_CHECK = re.compile(_PTR_ROW_CHECK)
_RE_ROW_CHECK_IDX = re.compile(_as_bytes(_PTR_ROW_CHECK))


def _list_from_csv(csv_string, caster=None):
    """Transform the given comma-separated string into a list (PRIVATE).

    :param csv_string: comma-separated input string
    :type csv_string: string
    :param caster: function used to cast each item in the input string
                   to its intended type
    :type caster: callable, accepts string, returns object

    """
    if caster is None:
        return [x for x in csv_string.split(',') if x]
    else:
        return [caster(x) for x in csv_string.split(',') if x]
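# Sketch (an addition): the row-check pattern is compiled twice because the
# parser sees text lines while the indexer reads the file in binary mode; on
# Python 3 a str pattern cannot be matched against bytes. The sample lines
# are assumed values.
def _demo_row_check(line_text='386 0 0 0', line_bytes=b'386 0 0 0'):
    assert _RE_ROW_CHECK.match(line_text)       # parser side: str lines
    assert _RE_ROW_CHECK_IDX.match(line_bytes)  # indexer side: bytes lines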
def qblast(program, database, sequence,
           auto_format=None, composition_based_statistics=None,
           db_genetic_code=None, endpoints=None, entrez_query='(none)',
           expect=10.0, filter=None, gapcosts=None, genetic_code=None,
           hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None,
           matrix_name=None, nucl_penalty=None, nucl_reward=None,
           other_advanced=None, perc_ident=None, phi_pattern=None,
           query_file=None, query_believe_defline=None, query_from=None,
           query_to=None, searchsp_eff=None, service=None, threshold=None,
           ungapped_alignment=None, word_size=None,
           alignments=500, alignment_view=None, descriptions=500,
           entrez_links_new_window=None, expect_low=None, expect_high=None,
           format_entrez_query=None, format_object=None, format_type='XML',
           ncbi_gi=None, results_file=None, show_overview=None,
           megablast=None,
           ):
    """Do a BLAST search using the QBLAST server at NCBI.

    Supports all parameters of the qblast API for Put and Get.
    Some useful parameters:
    program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
    database       Which database to search against (e.g. "nr").
    sequence       The sequence to search.
    ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
    descriptions   Number of descriptions to show.  Def 500.
    alignments     Number of alignments to show.  Def 500.
    expect         An expect value cutoff.  Def 10.0.
    matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
    filter         "none" turns off filtering.  Default no filtering
    format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
    entrez_query   Entrez query to limit Blast search
    hitlist_size   Number of hits to return. Default 50
    megablast      TRUE/FALSE whether to use Mega BLAST algorithm (blastn only)
    service        plain, psi, phi, rpsblast, megablast (lower case)

    This function does no checking of the validity of the parameters
    and passes the values to the server as is.  More help is available at:
    http://www.ncbi.nlm.nih.gov/BLAST/blast_overview.html
    """
    import urllib, urllib2
    import time

    assert program in ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = [
        ('AUTO_FORMAT', auto_format),
        ('COMPOSITION_BASED_STATISTICS', composition_based_statistics),
        ('DATABASE', database),
        ('DB_GENETIC_CODE', db_genetic_code),
        ('ENDPOINTS', endpoints),
        ('ENTREZ_QUERY', entrez_query),
        ('EXPECT', expect),
        ('FILTER', filter),
        ('GAPCOSTS', gapcosts),
        ('GENETIC_CODE', genetic_code),
        ('HITLIST_SIZE', hitlist_size),
        ('I_THRESH', i_thresh),
        ('LAYOUT', layout),
        ('LCASE_MASK', lcase_mask),
        ('MEGABLAST', megablast),
        ('MATRIX_NAME', matrix_name),
        ('NUCL_PENALTY', nucl_penalty),
        ('NUCL_REWARD', nucl_reward),
        ('OTHER_ADVANCED', other_advanced),
        ('PERC_IDENT', perc_ident),
        ('PHI_PATTERN', phi_pattern),
        ('PROGRAM', program),
        # ('PSSM', pssm), - It is possible to use PSI-BLAST via this API?
        ('QUERY', sequence),
        ('QUERY_FILE', query_file),
        ('QUERY_BELIEVE_DEFLINE', query_believe_defline),
        ('QUERY_FROM', query_from),
        ('QUERY_TO', query_to),
        # ('RESULTS_FILE', ...), - Can we use this parameter?
        ('SEARCHSP_EFF', searchsp_eff),
        ('SERVICE', service),
        ('THRESHOLD', threshold),
        ('UNGAPPED_ALIGNMENT', ungapped_alignment),
        ('WORD_SIZE', word_size),
        ('CMD', 'Put'),
        ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(urllib.urlencode(query))

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    request = urllib2.Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi",
                              message,
                              {"User-Agent": "BiopythonClient"})
    handle = urllib2.urlopen(request)

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    rid, rtoe = _parse_qblast_ref_page(handle)
    parameters = [
        ('ALIGNMENTS', alignments),
        ('ALIGNMENT_VIEW', alignment_view),
        ('DESCRIPTIONS', descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window),
        ('EXPECT_LOW', expect_low),
        ('EXPECT_HIGH', expect_high),
        ('FORMAT_ENTREZ_QUERY', format_entrez_query),
        ('FORMAT_OBJECT', format_object),
        ('FORMAT_TYPE', format_type),
        ('NCBI_GI', ncbi_gi),
        ('RID', rid),
        ('RESULTS_FILE', results_file),
        ('SERVICE', service),
        ('SHOW_OVERVIEW', show_overview),
        ('CMD', 'Get'),
        ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(urllib.urlencode(query))

    # Poll NCBI until the results are ready. Use a 3 second wait
    delay = 3.0
    previous = time.time()
    while True:
        current = time.time()
        wait = previous + delay - current
        if wait > 0:
            time.sleep(wait)
            previous = current + wait
        else:
            previous = current

        request = urllib2.Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi",
                                  message,
                                  {"User-Agent": "BiopythonClient"})
        handle = urllib2.urlopen(request)
        results = _as_string(handle.read())

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results == "\n\n":
            continue
        # XML results don't have the Status tag when finished
        if results.find("Status=") < 0:
            break
        i = results.index("Status=")
        j = results.index("\n", i)
        status = results[i + len("Status="):j].strip()
        if status.upper() == "READY":
            break

    return StringIO(results)
try:
    if (3, 0) <= sys.version_info[:2] <= (3, 1):
        # Workaround for bug in python 3.0 and 3.1,
        # see http://bugs.python.org/issue9257
        from xml.etree import ElementTree as ElementTree
    else:
        from xml.etree import cElementTree as ElementTree
except ImportError:
    from xml.etree import ElementTree as ElementTree

from Bio.Alphabet import generic_dna, generic_protein
from Bio.SearchIO._index import SearchIndexer
from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment
from Bio._py3k import _as_bytes, _bytes_to_string, unicode

_empty_bytes_string = _as_bytes("")

__all__ = ('BlastXmlParser', 'BlastXmlIndexer', 'BlastXmlWriter')

# element - optional qresult attribute name mapping
_ELEM_QRESULT_OPT = {
    'Statistics_db-num': ('stat_db_num', int),
    'Statistics_db-len': ('stat_db_len', int),
    'Statistics_eff-space': ('stat_eff_space', float),
    'Statistics_hsp-len': ('stat_hsp_len', int),
    'Statistics_kappa': ('stat_kappa', float),
    'Statistics_lambda': ('stat_lambda', float),
    'Statistics_entropy': ('stat_entropy', float),
}
# element - hit attribute name mapping
_ELEM_HIT = {
class BlastXmlIndexer(SearchIndexer):
    """Indexer class for BLAST XML output."""

    _parser = BlastXmlParser
    qstart_mark = _as_bytes('<Iteration>')
    qend_mark = _as_bytes('</Iteration>')
    block_size = 16384

    def __init__(self, filename, **kwargs):
        """Initialize the class."""
        SearchIndexer.__init__(self, filename)
        # TODO: better way to do this?
        iter_obj = self._parser(self._handle, **kwargs)
        self._meta, self._fallback = iter_obj._meta, iter_obj._fallback

    def __iter__(self):
        """Iterate over BlastXmlIndexer; yields qstart_id, start_offset, block's length."""
        qstart_mark = self.qstart_mark
        qend_mark = self.qend_mark
        blast_id_mark = _as_bytes('Query_')
        block_size = self.block_size
        handle = self._handle
        handle.seek(0)
        re_desc = re.compile(
            _as_bytes(r'<Iteration_query-ID>(.*?)'
                      r'</Iteration_query-ID>\s+?'
                      r'<Iteration_query-def>'
                      r'(.*?)</Iteration_query-def>'))
        re_desc_end = re.compile(_as_bytes(r'</Iteration_query-def>'))
        counter = 0

        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if not line:
                break
            if qstart_mark not in line:
                continue
            # The following requirements are to make supporting BGZF compressed
            # BLAST XML files simpler (avoids complex offset manipulations):
            assert line.count(qstart_mark) == 1, "XML without line breaks?"
            assert line.lstrip().startswith(qstart_mark), line
            if qend_mark in line:
                # Should cope with <Iteration>...</Iteration> on one long line
                block = line
            else:
                # Load the rest of this block up to and including </Iteration>
                block = [line]
                while line and qend_mark not in line:
                    line = handle.readline()
                    assert qstart_mark not in line, line
                    block.append(line)
                assert line.rstrip().endswith(qend_mark), line
                block = _empty_bytes_string.join(block)
            assert block.count(qstart_mark) == 1, \
                "XML without line breaks? %r" % block
            assert block.count(qend_mark) == 1, \
                "XML without line breaks? %r" % block

            # Now we have a full <Iteration>...</Iteration> block, find the ID
            regx = re.search(re_desc, block)
            try:
                qstart_desc = regx.group(2)
                qstart_id = regx.group(1)
            except AttributeError:
                # use the fallback values
                assert re.search(re_desc_end, block)
                qstart_desc = _as_bytes(self._fallback['description'])
                qstart_id = _as_bytes(self._fallback['id'])
            if qstart_id.startswith(blast_id_mark):
                qstart_id = qstart_desc.split(_as_bytes(' '), 1)[0]
            yield _bytes_to_string(qstart_id), start_offset, len(block)
            counter += 1

    def _parse(self, handle):
        """Overwrite SearchIndexer parse (PRIVATE).

        As we need to set the meta and fallback dictionaries to the parser.
        """
        generator = self._parser(handle, **self._kwargs)
        generator._meta = self._meta
        generator._fallback = self._fallback
        return next(iter(generator))

    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string."""
        qend_mark = self.qend_mark
        handle = self._handle
        handle.seek(offset)

        qresult_raw = handle.readline()
        assert qresult_raw.lstrip().startswith(self.qstart_mark)
        while qend_mark not in qresult_raw:
            qresult_raw += handle.readline()
        assert qresult_raw.rstrip().endswith(qend_mark)
        assert qresult_raw.count(qend_mark) == 1
        # Note this will include any leading and trailing whitespace, in
        # general expecting "    <Iteration>\n...\n    </Iteration>\n"
        return qresult_raw
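# Usage sketch (an addition, assuming "example.xml" is a BLAST XML file):
# the indexer above is normally used indirectly, through Bio.SearchIO.index.
def _demo_blast_xml_index(path="example.xml"):
    from Bio import SearchIO
    idx = SearchIO.index(path, "blast-xml")
    key = next(iter(idx))
    return idx.get_raw(key)  # the <Iteration>...</Iteration> block as bytes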
def AbiIterator(handle, alphabet=None, trim=False):
    """Iterator for the Abi file format."""
    # raise exception if alphabet is not dna
    if alphabet is not None:
        if isinstance(Alphabet._get_base_alphabet(alphabet),
                      Alphabet.ProteinAlphabet):
            raise ValueError(
                "Invalid alphabet, ABI files do not hold proteins.")
        if isinstance(Alphabet._get_base_alphabet(alphabet),
                      Alphabet.RNAAlphabet):
            raise ValueError("Invalid alphabet, ABI files do not hold RNA.")

    # raise exception if handle mode is not 'rb'
    if hasattr(handle, 'mode'):
        if set('rb') != set(handle.mode.lower()):
            raise ValueError("ABI files have to be opened in 'rb' mode.")

    # check if input file is a valid Abi file
    handle.seek(0)
    marker = handle.read(4)
    if not marker:
        # handle empty file gracefully
        raise StopIteration
    if marker != _as_bytes('ABIF'):
        raise IOError('File should start ABIF, not %r' % marker)

    # dirty hack for handling time information
    times = {'RUND1': '', 'RUND2': '', 'RUNT1': '', 'RUNT2': ''}

    # initialize annotations
    annot = dict(zip(_EXTRACT.values(), [None] * len(_EXTRACT)))

    # parse header and extract data from directories
    header = struct.unpack(_HEADFMT,
                           handle.read(struct.calcsize(_HEADFMT)))

    for tag_name, tag_number, tag_data in _abi_parse_header(header, handle):
        # stop iteration if all desired tags have been extracted
        # 4 tags from _EXTRACT + 2 time tags from _SPCTAGS - 3,
        # and seq, qual, id
        # todo

        key = tag_name + str(tag_number)

        # PBAS2 is base-called sequence
        if key == 'PBAS2':
            seq = tag_data
            ambigs = 'KYWMRS'
            if alphabet is None:
                if set(seq).intersection(ambigs):
                    alphabet = ambiguous_dna
                else:
                    alphabet = unambiguous_dna
        # PCON2 is quality values of base-called sequence
        elif key == 'PCON2':
            qual = [ord(val) for val in tag_data]
        # SMPL1 is sample id entered before sequencing run
        elif key == 'SMPL1':
            sample_id = tag_data
        elif key in times:
            times[key] = tag_data
        else:
            # extract sequence annotation as defined in _EXTRACT
            if key in _EXTRACT:
                annot[_EXTRACT[key]] = tag_data

    # set time annotations
    annot['run_start'] = '%s %s' % (times['RUND1'], times['RUNT1'])
    annot['run_finish'] = '%s %s' % (times['RUND2'], times['RUNT2'])

    # use the file name as SeqRecord.name if available
    try:
        file_name = basename(handle.name).replace('.ab1', '')
    except AttributeError:
        file_name = ""

    record = SeqRecord(Seq(seq, alphabet),
                       id=sample_id, name=file_name,
                       description='',
                       annotations=annot,
                       letter_annotations={'phred_quality': qual})

    if not trim:
        yield record
    else:
        yield _abi_trim(record)
__all__ = ['Hmmer3TextParser', 'Hmmer3TextIndexer']

# precompile regex patterns for faster processing
# regex for program name capture
_RE_PROGRAM = re.compile(r'^# (\w*hmm\w+) :: .*$')
# regex for version string capture
_RE_VERSION = re.compile(r'# \w+ ([\w+\.]+) .*; http.*$')
# regex for option string capture
_RE_OPT = re.compile(r'^# (.+):\s+(.+)$')
# regex for parsing query id and length, for parsing and indexing
_QRE_ID_LEN_PTN = r'^Query:\s*(.*)\s+\[\w=(\d+)\]'
_QRE_ID_LEN = re.compile(_QRE_ID_LEN_PTN)
_QRE_ID_LEN_IDX = re.compile(_as_bytes(_QRE_ID_LEN_PTN))
# regex for hsp validation
_HRE_VALIDATE = re.compile(r'score:\s(-?\d+\.?\d+)\sbits.*value:\s(.*)')
# regexes for parsing hsp alignment blocks
_HRE_ANNOT_LINE = re.compile(r'^(\s+)(.+)\s(\w+)')
_HRE_ID_LINE = re.compile(r'^(\s+\S+\s+[0-9-]+ )(.+?)(\s+[0-9-]+)')


def _read_forward(handle):
    """Reads through whitespace, returns the first non-whitespace line."""
    while True:
        line = handle.readline()
        # if line has characters and stripping does not remove them,
        # return the line
        if line and line.strip():
            return line
        # if we hit EOF, return the empty line to avoid looping forever
        elif not line:
            return line
from Bio.SearchIO._index import SearchIndexer
from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment

__all__ = ['FastaM10Parser', 'FastaM10Indexer']

__docformat__ = "restructuredtext en"

# precompile regex patterns
# regex for program name
_RE_FLAVS = re.compile(r't?fast[afmsxy]|pr[sf][sx]|lalign|[gs]?[glso]search')
# regex for sequence ID and length ~ deals with both \n and \r\n
_PTR_ID_DESC_SEQLEN = r'>>>(.+?)\s+(.*?) *- (\d+) (?:aa|nt)\s*$'
_RE_ID_DESC_SEQLEN = re.compile(_PTR_ID_DESC_SEQLEN)
_RE_ID_DESC_SEQLEN_IDX = re.compile(_as_bytes(_PTR_ID_DESC_SEQLEN))
# regex for qresult, hit, or hsp attribute value
_RE_ATTR = re.compile(r'^; [a-z]+(_[ \w-]+):\s+(.*)$')
# regex for capturing excess start and end sequences in alignments
_RE_START_EXC = re.compile(r'^-*')
_RE_END_EXC = re.compile(r'-*$')

# attribute name mappings
_HSP_ATTR_MAP = {
    '_initn': ('initn_score', int),
    '_init1': ('init1_score', int),
    '_opt': ('opt_score', int),
    '_s-w opt': ('opt_score', int),
    '_z-score': ('z_score', float),
    '_bits': ('bitscore', float),
    '_expect': ('evalue', float),