예제 #1
0
    def _qresult_index(self):
        """Indexer for noncommented BLAST tabular files (PRIVATE).

        Yields ``(qresult_key, start_offset, length)`` tuples, one per
        query result.  A query result only ends when a row with a
        different key (or EOF) is read, so the end offset is captured
        one line ahead of the yield.
        """
        handle = self._handle
        handle.seek(0)
        start_offset = 0
        qresult_key = None
        key_idx = self._key_idx
        tab_char = _as_bytes('\t')

        while True:
            # get end offset here since we only know a qresult ends after
            # encountering the next one
            end_offset = handle.tell()
            line = handle.readline()

            if qresult_key is None:
                qresult_key = line.split(tab_char)[key_idx]
            else:
                try:
                    curr_key = line.split(tab_char)[key_idx]
                except IndexError:
                    # short/empty line (e.g. at EOF) has no key column
                    curr_key = _as_bytes('')

                if curr_key != qresult_key:
                    yield qresult_key, start_offset, end_offset - start_offset
                    qresult_key = curr_key
                    start_offset = end_offset

            # break if we've reached EOF
            if not line:
                break
예제 #2
0
    def _get_raw_qresult(self, offset):
        """Return the raw bytes string of a single QueryResult from a noncommented file (PRIVATE)."""
        handle = self._handle
        handle.seek(offset)
        qresult_raw = _as_bytes('')
        tab_char = _as_bytes('\t')
        key_idx = self._key_idx
        qresult_key = None

        while True:
            line = handle.readline()
            # guard against EOF: without this, an offset at (or past) the
            # end of the file would loop forever reading empty strings,
            # since b'' then matches the (empty) first key indefinitely
            if not line:
                break
            # get the key if the first line (qresult key)
            if qresult_key is None:
                qresult_key = line.split(tab_char)[key_idx]
            else:
                try:
                    curr_key = line.split(tab_char)[key_idx]
                except IndexError:
                    curr_key = _as_bytes('')
                # only break when qresult is finished (key is different)
                if curr_key != qresult_key:
                    break
            # append to the raw string as long as qresult is the same
            qresult_raw += line

        return qresult_raw
예제 #3
0
    def __iter__(self):
        """Iterate over the handle, yielding (key, start_offset, 0) tuples.

        The third element is always 0; only the start offset of each
        query result is tracked for this format.
        """
        handle = self._handle
        handle.seek(0)
        start_offset = handle.tell()
        # matches e.g. "Query: name", "Query sequence: name" or "Query HMM: name"
        regex_id = re.compile(_as_bytes(r'Query\s*(?:sequence|HMM)?:\s*(.*)'))

        # determine flag for hmmsearch
        is_hmmsearch = False
        line = read_forward(handle)
        if line.startswith(_as_bytes('hmmsearch')):
            is_hmmsearch = True

        while True:
            end_offset = handle.tell()

            if line.startswith(self.qresult_start):
                regx = re.search(regex_id, line)
                qresult_key = regx.group(1).strip()
                # qresult start offset is the offset of this line
                # (starts with the start mark)
                start_offset = end_offset - len(line)
            elif line.startswith(self.qresult_end):
                yield _bytes_to_string(qresult_key), start_offset, 0
                start_offset = end_offset
            elif not line:
                # HACK: since hmmsearch can only have one query result
                if is_hmmsearch:
                    yield _bytes_to_string(qresult_key), start_offset, 0
                break

            line = read_forward(handle)
예제 #4
0
File: _index.py  Project: LyonsLab/coge
 def __iter__(self):
     """Iterate over GenBank records, yielding (id, offset, length) tuples.

     The record id is taken from the VERSION line when it holds an
     "accession.version" value, otherwise from the ACCESSION line.
     """
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     dot_char = _as_bytes(".")
     accession_marker = _as_bytes("ACCESSION ")
     version_marker = _as_bytes("VERSION ")
     #Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     #Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         #We cannot assume the record.id is the first word after LOCUS,
         #normally the first entry on the VERSION or ACCESSION line is used.
         key = None
         while True:
             line = handle.readline()
             if marker_re.match(line) or not line:
                 if not key:
                     raise ValueError("Did not find ACCESSION/VERSION lines")
                 #the current line belongs to the NEXT record, so this
                 #record ends just before it
                 end_offset = handle.tell() - len(line)
                 yield _bytes_to_string(key), start_offset, end_offset - start_offset
                 start_offset = end_offset
                 break
             elif line.startswith(accession_marker):
                 key = line.rstrip().split()[1]
             elif line.startswith(version_marker):
                 version_id = line.rstrip().split()[1]
                 if version_id.count(dot_char)==1 and version_id.split(dot_char)[1].isdigit():
                     #This should mimic the GenBank parser...
                     key = version_id
     assert not line, repr(line)
예제 #5
0
 def __iter__(self):
     """Iterate over records, yielding (id, start offset, length) tuples.

     The key is the first ';'-separated field of the AC line, which is
     expected to immediately follow the record marker (ID) line.
     """
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     semi_char = _as_bytes(";")
     #Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     #Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         length = len(line)
         #We cannot assume the record.id is the first word after ID,
         #normally the following AC line is used.
         line = handle.readline()
         length += len(line)
         assert line.startswith(_as_bytes("AC "))
         #first accession on the AC line, without the trailing ';'
         key = line[3:].strip().split(semi_char)[0].strip()
         while True:
             end_offset = handle.tell()
             line = handle.readline()
             if marker_re.match(line) or not line:
                 yield _bytes_to_string(key), start_offset, length
                 start_offset = end_offset
                 break
             length += len(line)
     assert not line, repr(line)
예제 #6
0
    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string.

        The file header is prepended to the query result located at
        ``offset``, and a mock ``>>><<<`` end marker is appended since
        it is not always present in the file.
        """
        handle = self._handle
        qresult_raw = _as_bytes('')
        query_mark = _as_bytes('>>>')

        # read header first
        handle.seek(0)
        while True:
            line = handle.readline()
            peekline = handle.peekline()
            qresult_raw += line
            # stop on the line just before the first query marker; also
            # guard against EOF so a marker-less file cannot loop forever
            # (the qresult loop below already had this guard)
            if (not peekline.startswith(query_mark) and query_mark in peekline) or \
                    not line:
                break

        # and read the qresult raw string
        handle.seek(offset)
        while True:
            # preserve whitespace, don't use read_forward
            line = handle.readline()
            peekline = handle.peekline()
            qresult_raw += line

            # break when we've reached qresult end
            if (not peekline.startswith(query_mark) and query_mark in peekline) or \
                    not line:
                break

        # append mock end marker to qresult_raw, since it's not always present
        return qresult_raw + _as_bytes('>>><<<\n')
예제 #7
0
    def _qresult_index_commented(self):
        """Indexer for commented BLAST tabular files (PRIVATE).

        Yields (qresult_key, start_offset, length) tuples.  The first
        line of the file is remembered and reused as the marker that
        separates consecutive query results.
        """
        handle = self._handle
        handle.seek(0)
        start_offset = 0
        # mark of a new query
        query_mark = None
        # mark of the query's ID
        qid_mark = _as_bytes('# Query: ')
        # mark of the last line
        end_mark = _as_bytes('# BLAST processed')

        while True:
            end_offset = handle.tell()
            line = handle.readline()

            if query_mark is None:
                # the very first line defines the query separator
                query_mark = line
                start_offset = end_offset
            elif line.startswith(qid_mark):
                qresult_key = line[len(qid_mark):].split()[0]
            elif line == query_mark or line.startswith(end_mark):
                yield qresult_key, start_offset, end_offset - start_offset
                start_offset = end_offset
            elif not line:
                break
 def get_raw_check(self, filename, format, alphabet):
     """Index the file and check get_raw() round-trips for every record."""
     handle = open(filename, "rb")
     raw_file = handle.read()
     handle.close()
     #Also checking the key_function here
     id_list = [rec.id.lower() for rec in \
                SeqIO.parse(filename, format, alphabet)]
     rec_dict = SeqIO.index(filename, format, alphabet,
                            key_function = lambda x : x.lower())
     self.assertEqual(set(id_list), set(rec_dict.keys()))
     self.assertEqual(len(id_list), len(rec_dict))
     for key in id_list:
         self.assertTrue(key in rec_dict)
         self.assertEqual(key, rec_dict[key].id.lower())
         self.assertEqual(key, rec_dict.get(key).id.lower())
         #the raw bytes must appear verbatim in the original file
         raw = rec_dict.get_raw(key)
         self.assertTrue(raw.strip())
         self.assertTrue(raw in raw_file)
         rec1 = rec_dict[key]
         #Following isn't very elegant, but it lets me test the
         #__getitem__ SFF code is working.
         if format in SeqIO._BinaryFormats:
             handle = BytesIO(raw)
         else:
             handle = StringIO(_bytes_to_string(raw))
         if format == "sff":
             rec2 = SeqIO.SffIO._sff_read_seq_record(handle,
                         rec_dict._proxy._flows_per_read,
                         rec_dict._proxy._flow_chars,
                         rec_dict._proxy._key_sequence,
                         rec_dict._proxy._alphabet,
                         trim=False)
         elif format == "sff-trim":
             rec2 = SeqIO.SffIO._sff_read_seq_record(handle,
                         rec_dict._proxy._flows_per_read,
                         rec_dict._proxy._flow_chars,
                         rec_dict._proxy._key_sequence,
                         rec_dict._proxy._alphabet,
                         trim=True)
         elif format == "uniprot-xml":
             self.assertTrue(raw.startswith(_as_bytes("<entry ")))
             self.assertTrue(raw.endswith(_as_bytes("</entry>")))
             #Currently the __getitem__ method uses this
             #trick too, but we hope to fix that later
             raw = """<?xml version='1.0' encoding='UTF-8'?>
             <uniprot xmlns="http://uniprot.org/uniprot"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://uniprot.org/uniprot
             http://www.uniprot.org/support/docs/uniprot.xsd">
             %s
             </uniprot>
             """ % _bytes_to_string(raw)
             handle = StringIO(raw)
             rec2 = SeqIO.read(handle, format, alphabet)
         else:
             rec2 = SeqIO.read(handle, format, alphabet)
         self.assertEqual(True, compare_record(rec1, rec2))
     rec_dict._proxy._handle.close() #TODO - Better solution
     del rec_dict
예제 #9
0
 def __iter__(self):
     """Iterate over EMBL records, yielding (id, start offset, length).

     Handles both the 2006+ semicolon-separated ID line layout (where
     the SV field supplies the version) and the older pre-2006 layout.
     """
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     semi_char = _as_bytes(";")
     dot_char = _as_bytes(".")
     sv_marker = _as_bytes("SV ")
     ac_marker = _as_bytes("AC ")
     # Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     # Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         # We cannot assume the record.id is the first word after ID,
         # normally the SV line is used.
         setbysv = False  # resets sv as false
         length = len(line)
         if line[2:].count(semi_char) == 6:
             # Looks like the semi colon separated style introduced in 2006
             parts = line[3:].rstrip().split(semi_char)
             if parts[1].strip().startswith(sv_marker):
                 # The SV bit gives the version
                 key = parts[0].strip() + dot_char + \
                     parts[1].strip().split()[1]
                 setbysv = True
             else:
                 key = parts[0].strip()
         elif line[2:].count(semi_char) == 3:
             # Looks like the pre 2006 style, take first word only
             key = line[3:].strip().split(None, 1)[0]
             if key.endswith(semi_char):
                 key = key[:-1]
         else:
             raise ValueError(
                 'Did not recognise the ID line layout:\n' + line)
         while True:
             end_offset = handle.tell()
             line = handle.readline()
             if marker_re.match(line) or not line:
                 # recompute so the record ends just before this line
                 end_offset = handle.tell() - len(line)
                 yield _bytes_to_string(key), start_offset, length
                 start_offset = end_offset
                 break
             elif line.startswith(ac_marker) and not setbysv:
                 # AC line overrides the ID-line key unless SV set it
                 key = line.rstrip().split()[1]
                 if key.endswith(semi_char):
                     key = key[:-1]
             elif line.startswith(sv_marker):
                 key = line.rstrip().split()[1]
                 setbysv = True
             length += len(line)
     assert not line, repr(line)
예제 #10
0
def crc32(seq):
    """Returns the crc32 checksum for a sequence (string or Seq object)."""
    # NOTE - On Python 2 returns a signed int, on Python 3 it is unsigned
    # Docs suggest should use crc32(x) & 0xffffffff for consistency.
    # TODO - Should we return crc32(x) & 0xffffffff here?
    try:
        # A Seq object (or anything stringifiable) is stringified first
        text = str(seq)
    except AttributeError:
        # Fall back to treating the input as a plain string/unicode
        text = seq
    return _crc32(_as_bytes(text))
예제 #11
0
 def __iter__(self):
     """Iterate over FASTQ records, yielding (id, start offset, length).

     Quality lines are accumulated until their total stripped length
     matches the sequence length, which supports multi-line quality
     sections and '@' characters appearing inside quality strings.
     """
     handle = self._handle
     handle.seek(0)
     id = None
     start_offset = handle.tell()
     line = handle.readline()
     if not line:
         # Empty file!
         return
     at_char = _as_bytes("@")
     plus_char = _as_bytes("+")
     if line[0:1] != at_char:
         raise ValueError("Problem with FASTQ @ line:\n%r" % line)
     while line:
         # assert line[0]=="@"
         # This record seems OK (so far)
         id = line[1:].rstrip().split(None, 1)[0]
         # Find the seq line(s)
         seq_len = 0
         length = len(line)
         while line:
             line = handle.readline()
             length += len(line)
             if line.startswith(plus_char):
                 break
             seq_len += len(line.strip())
         if not line:
             raise ValueError("Premature end of file in seq section")
         # assert line[0]=="+"
         # Find the qual line(s)
         qual_len = 0
         while line:
             if seq_len == qual_len:
                 if seq_len == 0:
                     # Special case, quality line should be just "\n"
                     line = handle.readline()
                     if line.strip():
                         raise ValueError("Expected blank quality line, not %r" % line)
                     # NOTE(review): this blank quality line is not added
                     # to `length` -- confirm record length for
                     # zero-length reads is intended.
                 # Should be end of record...
                 end_offset = handle.tell()
                 line = handle.readline()
                 if line and line[0:1] != at_char:
                     raise ValueError("Problem with line %r" % line)
                 break
             else:
                 line = handle.readline()
                 qual_len += len(line.strip())
                 length += len(line)
         if seq_len != qual_len:
             raise ValueError("Problem with quality section")
         yield _bytes_to_string(id), start_offset, length
         start_offset = end_offset
예제 #12
0
 def get_raw(self, offset):
     """Return the raw record from the file as a bytes string.

     A record is any leading ';' comment lines followed by the
     non-comment lines up to, but not including, the next ';' line
     (or end of file).
     """
     handle = self._handle
     handle.seek(offset)
     lines = []
     line = handle.readline()
     semi_char = _as_bytes(";")
     # collect the leading comment block
     while line.startswith(semi_char):
         lines.append(line)
         line = handle.readline()
     # then collect data lines until the next record's comments (or EOF)
     while line and not line.startswith(semi_char):
         lines.append(line)
         line = handle.readline()
     return _as_bytes("").join(lines)
예제 #13
0
 def get_raw(self, offset):
     """Return the raw record from the file as a bytes string.

     Collects any leading ';' comment lines, then the data lines up to
     (but not including) the next ';' line or end of file.
     """
     handle = self._handle
     handle.seek(offset)
     lines = []
     line = handle.readline()
     semi_char = _as_bytes(";")
     # leading comment block
     while line.startswith(semi_char):
         lines.append(line)
         line = handle.readline()
     # data lines, until the next record's comments or EOF
     while line and not line.startswith(semi_char):
         lines.append(line)
         line = handle.readline()
     return _as_bytes("").join(lines)
예제 #14
0
    def run_pv(self, out_file='probs.dat', data_dir='.',
               version=1, smooth=0.04):
        """Executes pv.

        out_file - Name of output file.
        data_dir - Where the data is found.
        version  - Selects the "pv" (1) or "pv2" executable.
        smooth   - Smoothing value written to pv's stdin.

        Returns the parsed output: tuples of my_float values, one per
        line of the output file (a map object on Python 3).
        """

        self._generate_intfile(data_dir)

        if version == 1:
            pv_name = "pv"
        else:
            pv_name = "pv2"

        # NOTE(review): universal_newlines=True opens the pipes in text
        # mode, yet communicate() is fed bytes via _as_bytes -- on
        # Python 3 this would raise TypeError; presumably Python 2 era
        # code where _as_bytes returns str.  Confirm before porting.
        proc = subprocess.Popen([self._get_path(pv_name)], cwd=data_dir,
                                shell=(sys.platform != "win32"),
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                universal_newlines=True)
        proc.communicate(_as_bytes('data_fst_outfile %s out.dat\n%s\n'
                                   % (out_file, smooth)))
        pvf = open(data_dir + os.sep + out_file, 'r')
        result = map(lambda x: tuple(map(lambda y:
                                         my_float(y), x.rstrip().split(' '))),
                     pvf.readlines())
        pvf.close()
        return result
예제 #15
0
    def check_by_line(self, old_file, new_file, old_gzip=False):
        """Compare BGZF line iteration against the uncompressed original."""
        for mode in ["r", "rb"]:
            # Load the reference data, via gzip when requested
            if old_gzip:
                handle = gzip.open(old_file, mode)
            else:
                handle = open(old_file, mode)
            expected = handle.read()
            #Seems gzip can return bytes even if mode="r",
            #perhaps a bug in Python 3.2?
            if "b" in mode:
                expected = _as_bytes(expected)
            else:
                expected = _as_string(expected)
            handle.close()

            for cache in [1, 10]:
                reader = bgzf.BgzfReader(new_file, mode, max_cache=cache)
                if "b" in mode:
                    got = _empty_bytes_string.join(reader)
                else:
                    got = "".join(reader)
                reader.close()

                self.assertEqual(len(expected), len(got))
                self.assertEqual(expected[:10], got[:10],
                                 "%r vs %r, mode %r" % (expected[:10], got[:10], mode))
                self.assertEqual(expected, got)
예제 #16
0
def _open(url, post=None):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to TogoWS, will raise an IOError if it encounters an error.

    In the absence of clear guidelines, this function enforces a limit of
    "up to three queries per second" to avoid abusing the TogoWS servers.
    """
    delay = 0.333333333 #one third of a second
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current

    #print url
    try:
        if post:
            #POST, with the urlencoded payload as the request body
            handle = urllib2.urlopen(url, _as_bytes(urllib.urlencode(post)))
        else:
            handle = urllib2.urlopen(url)
    except urllib2.HTTPError, exception:
        #NOTE(review): Python 2 only syntax; re-raising the exception
        #unchanged makes this except clause a no-op.
        raise exception
    #NOTE(review): no return statement visible here -- the opened handle
    #is never returned to the caller; this excerpt may be truncated.
예제 #17
0
def _open(url, post=None):
    """Build the URL and open a handle to it (PRIVATE).

    Open a handle to TogoWS, will raise an IOError if it encounters an error.

    In the absence of clear guidelines, this function enforces a limit of
    "up to three queries per second" to avoid abusing the TogoWS servers.
    """
    delay = 0.333333333  # one third of a second
    now = time.time()
    pause = _open.previous + delay - now
    if pause > 0:
        time.sleep(pause)
        _open.previous = now + pause
    else:
        _open.previous = now

    # print(url)
    if post:
        handle = _urlopen(url, _as_bytes(post))
    else:
        handle = _urlopen(url)

    # We now trust TogoWS to have set an HTTP error code, that
    # suffices for my current unit tests. Previously we would
    # examine the start of the data returned back.
    return _binary_to_string_handle(handle)
예제 #18
0
    def check_by_char(self, old_file, new_file, old_gzip=False):
        """Compare BGZF one-character reads against the uncompressed original."""
        for mode in ["r", "rb"]:
            # Load the reference data, via gzip when requested
            opener = gzip.open if old_gzip else open
            handle = opener(old_file, mode)
            old = handle.read()
            #Seems gzip can return bytes even if mode="r",
            #perhaps a bug in Python 3.2?
            old = _as_bytes(old) if "b" in mode else _as_string(old)
            handle.close()

            for cache in [1, 10]:
                handle = bgzf.BgzfReader(new_file, mode, max_cache=cache)
                chunks = []
                while True:
                    char = handle.read(1)
                    if not char:
                        break
                    chunks.append(char)
                if "b" in mode:
                    new = _empty_bytes_string.join(chunks)
                else:
                    new = "".join(chunks)
                del chunks
                handle.close()

                self.assertEqual(len(old), len(new))
                #If bytes vs unicode mismatch, give a short error message:
                self.assertEqual(old[:10], new[:10],
                                 "%r vs %r, mode %r" % (old[:10], new[:10], mode))
                self.assertEqual(old, new)
예제 #19
0
def seguid(seq):
    """Returns the SEGUID (string) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string), returns
    the SEGUID string (A SEquence Globally Unique IDentifier).
    seq type = str.

    For more information about SEGUID, see:
    http://bioinformatics.anl.gov/seguid/
    DOI: 10.1002/pmic.200600032
    """
    import hashlib
    import base64
    try:
        # Assume it's a Seq object
        seq = str(seq)
    except AttributeError:
        # Assume it's a string
        pass
    digest = hashlib.sha1(_as_bytes(seq.upper())).digest()
    try:
        # For Python 3+
        return base64.encodebytes(digest).decode().replace("\n", "").rstrip("=")
    except AttributeError:
        pass
    # For all other Pythons
    return base64.b64encode(digest).rstrip("=")
예제 #20
0
    def get_raw(self, offset):
        """Return the raw bytes of the query result at ``offset``.

        The file header (or cached preamble) is prepended to the
        query result's own lines.
        """
        handle = self._handle
        parts = []

        # read header first
        if not self._preamble:
            handle.seek(0)
            while True:
                line = handle.readline()
                if line.startswith(self.qresult_start):
                    break
                parts.append(line)
        else:
            parts.append(self._preamble)

        # and read the qresult raw string
        handle.seek(offset)
        while True:
            # preserve whitespace, don't use read_forward
            line = handle.readline()
            parts.append(line)
            # break when we've reached qresult end
            if line.startswith(self.qresult_end) or not line:
                break

        return _as_bytes('').join(parts)
예제 #21
0
    def check_raw(self, filename, id, raw, **kwargs):
        """Index filename using keyword arguments, check get_raw(id)==raw.

        Runs the check against both the in-memory index and, when the
        sqlite3 module is available, the SQLite-backed index_db.  If a
        BGZF-compressed copy (filename + ".bgz") exists, the same
        checks are repeated on it.
        """
        idx = SearchIO.index(filename, self.fmt, **kwargs)
        raw = _as_bytes(raw)
        # Anticipate cases where the raw string and/or file uses different
        # newline characters ~ we set everything to \n.
        new = idx.get_raw(id)
        self.assertTrue(isinstance(new, bytes),
                        "Didn't get bytes from %s get_raw" % self.fmt)
        self.assertEqual(raw.replace(b'\r\n', b'\n'),
                         new.replace(b'\r\n', b'\n'))
        idx.close()

        # Now again, but using SQLite backend
        if sqlite3:
            idx = SearchIO.index_db(":memory:", filename, self.fmt, **kwargs)
            new = idx.get_raw(id)
            self.assertTrue(isinstance(new, bytes),
                            "Didn't get bytes from %s get_raw" % self.fmt)
            self.assertEqual(raw.replace(b'\r\n', b'\n'),
                             new.replace(b'\r\n', b'\n'))
            idx.close()

        if os.path.isfile(filename + ".bgz"):
            # Do the tests again with the BGZF compressed file
            print("[BONUS %s.bgz]" % filename)
            self.check_raw(filename + ".bgz", id, raw, **kwargs)
예제 #22
0
def _open(cgi, params=None, post=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it.  Does some
    simple error checking, and will raise an IOError if it encounters one.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.
    """
    # BUG FIX: previously a mutable default argument ({}) was used and
    # mutated below, so "tool"/"email" leaked into later calls.
    if params is None:
        params = {}
    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current
    # Remove None values from the parameters.  Iterate over a list copy:
    # deleting from a dict while iterating over it raises RuntimeError
    # on Python 3.
    for key in list(params):
        if params[key] is None:
            del params[key]
    # Tell Entrez that we are using Biopython (or whatever the user has
    # specified explicitly in the parameters or by changing the default)
    if "tool" not in params:
        params["tool"] = tool
    # Tell Entrez who we are
    if "email" not in params:
        if email is not None:
            params["email"] = email
        else:
            warnings.warn("""
Email address is not specified.

To make use of NCBI's E-utilities, NCBI strongly recommends you to specify
your email address with each request. From June 1, 2010, this will be
mandatory. As an example, if your email address is [email protected], you
can specify it as follows:
   from Bio import Entrez
   Entrez.email = '*****@*****.**'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)
    # Open a handle to Entrez.
    options = _urlencode(params, doseq=True)
    # print cgi + "?" + options
    # (the previous try/except that just re-raised _HTTPError was a no-op)
    if post:
        # HTTP POST
        handle = _urlopen(cgi, data=_as_bytes(options))
    else:
        # HTTP GET
        cgi += "?" + options
        handle = _urlopen(cgi)

    return _binary_to_string_handle(handle)
예제 #23
0
File: _index.py  Project: olgabot/biopython
 def __iter__(self):
     """Iterate over UniProt XML entries, yielding (accession, offset, length).

     The key is taken from the first <accession> element found inside
     each <entry>.
     """
     handle = self._handle
     handle.seek(0)
     marker_re = self._marker_re
     start_acc_marker = _as_bytes("<accession>")
     end_acc_marker = _as_bytes("</accession>")
     end_entry_marker = _as_bytes("</entry>")
     less_than = _as_bytes("<")
     #Skip any header before first record
     while True:
         start_offset = handle.tell()
         line = handle.readline()
         if marker_re.match(line) or not line:
             break
     #Should now be at the start of a record, or end of the file
     while marker_re.match(line):
         length = len(line)
         #We expect the next line to be <accession>xxx</accession>
         #(possibly with leading spaces)
         #but allow it to be later on within the <entry>
         key = None
         done = False  # NOTE(review): never read afterwards
         while True:
             line = handle.readline()
             if key is None and start_acc_marker in line:
                 assert end_acc_marker in line, line
                 #11 == len("<accession>"); keep the text up to the
                 #next '<' (i.e. up to </accession>)
                 key = line[line.find(
                     start_acc_marker) + 11:].split(less_than, 1)[0]
                 length += len(line)
             elif end_entry_marker in line:
                 #8 == len("</entry>"); end offset includes the tag.
                 #NOTE(review): this final line is not added to `length`,
                 #so the yielded length stops at the line before the one
                 #containing </entry> -- confirm intended.
                 end_offset = handle.tell() - len(line) \
                     + line.find(end_entry_marker) + 8
                 break
             elif marker_re.match(line) or not line:
                 #Start of next record or end of file
                 raise ValueError("Didn't find end of record")
             else:
                 length += len(line)
         if not key:
             raise ValueError("Did not find <accession> line in bytes %i to %i"
                              % (start_offset, end_offset))
         yield _bytes_to_string(key), start_offset, length
         #Find start of next record
         while not marker_re.match(line) and line:
             start_offset = handle.tell()
             line = handle.readline()
     assert not line, repr(line)
예제 #24
0
 def get_raw(self, offset):
     """Similar to the get method, but returns the record as a raw string.

     Reads from ``offset`` up to and including the next ``</entry>``
     closing tag; raises ValueError if the record is not terminated.
     """
     handle = self._handle
     marker_re = self._marker_re
     end_entry_marker = _as_bytes("</entry>")
     handle.seek(offset)
     data = [handle.readline()]
     while True:
         line = handle.readline()
         i = line.find(end_entry_marker)
         if i != -1:
             # keep everything up to and including the closing tag
             # (len() instead of the previous magic constant 8)
             data.append(line[:i + len(end_entry_marker)])
             break
         if marker_re.match(line) or not line:
             #End of file, or start of next record
             raise ValueError("Didn't find end of record")
         data.append(line)
     return _as_bytes("").join(data)
예제 #25
0
 def get_raw(self, offset):
     """Return the raw record from the file as a bytes string.

     Reads a FASTQ record starting at ``offset``; quality lines are
     accumulated until their total stripped length matches the
     sequence length, which supports multi-line quality sections.
     """
     # TODO - Refactor this and the __init__ method to reduce code duplication?
     handle = self._handle
     handle.seek(offset)
     line = handle.readline()
     data = line
     at_char = _as_bytes("@")
     plus_char = _as_bytes("+")
     if line[0:1] != at_char:
         raise ValueError("Problem with FASTQ @ line:\n%r" % line)
     # Find the seq line(s)
     seq_len = 0
     while line:
         line = handle.readline()
         data += line
         if line.startswith(plus_char):
             break
         seq_len += len(line.strip())
     if not line:
         raise ValueError("Premature end of file in seq section")
     assert line[0:1] == plus_char
     # Find the qual line(s)
     qual_len = 0
     while line:
         if seq_len == qual_len:
             if seq_len == 0:
                 # Special case, quality line should be just "\n"
                 line = handle.readline()
                 if line.strip():
                     raise ValueError("Expected blank quality line, not %r" % line)
                 data += line
             # Should be end of record...
             line = handle.readline()
             if line and line[0:1] != at_char:
                 raise ValueError("Problem with line %r" % line)
             break
         else:
             line = handle.readline()
             data += line
             qual_len += len(line.strip())
     if seq_len != qual_len:
         raise ValueError("Problem with quality section")
     return data
예제 #26
0
File: _index.py  Project: azerxu/biopython
 def __iter__(self):
     """Iterate over FASTQ records, yielding (id, start offset, length).

     Quality lines are accumulated until their total stripped length
     matches the sequence length (supports multi-line quality).
     """
     handle = self._handle
     handle.seek(0)
     id = None
     start_offset = handle.tell()
     line = handle.readline()
     if not line:
         # Empty file!
         return
     at_char = _as_bytes("@")
     plus_char = _as_bytes("+")
     if line[0:1] != at_char:
         raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
     while line:
         # assert line[0]=="@"
         # This record seems OK (so far)
         id = line[1:].rstrip().split(None, 1)[0]
         # Find the seq line(s)
         seq_len = 0
         while line:
             line = handle.readline()
             if line.startswith(plus_char):
                 break
             seq_len += len(line.strip())
         if not line:
             raise ValueError("Premature end of file in seq section")
         # assert line[0]=="+"
         # Find the qual line(s)
         qual_len = 0
         while line:
             if seq_len == qual_len:
                 # Should be end of record...
                 line = handle.readline()
                 if line and line[0:1] != at_char:
                     # BUG FIX: the ValueError was constructed but never
                     # raised, silently accepting malformed records
                     raise ValueError("Problem with line %s" % repr(line))
                 break
             else:
                 line = handle.readline()
                 qual_len += len(line.strip())
         if seq_len != qual_len:
             raise ValueError("Problem with quality section")
         end_offset = handle.tell() - len(line)
         yield _bytes_to_string(id), start_offset, end_offset - start_offset
         start_offset = end_offset
예제 #27
0
def crc32(seq):
    """Returns the crc32 checksum for a sequence (string or Seq object).

    Note that the case is important:

    >>> crc32("ACGTACGTACGT")
    20049947
    >>> crc32("acgtACGTacgt")
    1688586483

    """
    # NOTE - On Python 2 returns a signed int, on Python 3 it is unsigned
    # Docs suggest should use crc32(x) & 0xffffffff for consistency.
    # TODO - Should we return crc32(x) & 0xffffffff here?
    try:
        # Seq objects (and plain strings) stringify cleanly
        text = str(seq)
    except AttributeError:
        # Fall back to using the object directly (string/unicode)
        return _crc32(_as_bytes(seq))
    return _crc32(_as_bytes(text))
예제 #28
0
    def __iter__(self):
        """Iterate over the file handle; yields key, start offset, and length.

        Yields (query id string, start offset, length in bytes) for each
        query result section in the file.  Header lines (starting with
        '#') at the top of the file are skipped and belong to no record.
        """
        handle = self._handle
        handle.seek(0)
        # column index of the query ID within each data row
        query_id_idx = self._query_id_idx
        qresult_key = None
        header_mark = _as_bytes('#')
        # NOTE(review): rows are split on a single space byte with empty
        # strings filtered out, i.e. space-padded columns are assumed —
        # confirm tab-separated input is not expected here.
        split_mark = _as_bytes(' ')
        # set line with initial mock value, to emulate header
        line = header_mark

        # read through header; after this loop start_offset holds the
        # offset of the first data line (tell() taken before reading it)
        while line.startswith(header_mark):
            start_offset = handle.tell()
            line = handle.readline()

        # and index the qresults
        while True:
            # position just after the current line was read, i.e. the
            # end offset of the line being processed this iteration
            end_offset = handle.tell()

            if not line:
                break

            cols = [x for x in line.strip().split(split_mark) if x]
            if qresult_key is None:
                qresult_key = cols[query_id_idx]
            else:
                curr_key = cols[query_id_idx]

                if curr_key != qresult_key:
                    # key changed: the previous qresult ends just before
                    # the current line, so subtract this line's length
                    adj_end = end_offset - len(line)
                    yield (_bytes_to_string(qresult_key), start_offset,
                           adj_end - start_offset)
                    qresult_key = curr_key
                    start_offset = adj_end

            line = handle.readline()
            if not line:
                # EOF reached: emit the final qresult, which runs through
                # the end of the last data line (end_offset)
                yield (_bytes_to_string(qresult_key), start_offset,
                       end_offset - start_offset)
                break
예제 #29
0
def isReadable(handle):
    """
        Fast check, if this file is readable by this reader. Check if the
        file magic bytes equal '.scf' as specified in the file format
        specification:
        http://staden.sourceforge.net/manual/formats_unix_2.html

        @param handle The file handle, opened in binary mode.
        @return True if this (probably) is an SCF file.
    """
    # BUG FIX: docstring previously said "abi file", but the magic being
    # checked is the SCF one.  b'.scf' is equivalent to _as_bytes('.scf')
    # on both Python 2 and Python 3.
    handle.seek(0)
    return handle.read(4) == b'.scf'
예제 #30
0
파일: _index.py 프로젝트: azerxu/biopython
 def get_raw(self, offset):
     """Similar to the get method, but returns the record as a raw string.

     Seeks to *offset* (which must be the start of a FASTQ record, i.e.
     an '@' title line) and returns the raw bytes of that single record,
     including the trailing newline.
     """
     # TODO - Refactor this and the __init__ method to reduce code duplication?
     handle = self._handle
     handle.seek(offset)
     line = handle.readline()
     data = line
     at_char = _as_bytes("@")
     plus_char = _as_bytes("+")
     if line[0:1] != at_char:
         raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
     # identifier is not used below, but parsing it validates the title line
     identifier = line[1:].rstrip().split(None, 1)[0]
     # Find the seq line(s)
     seq_len = 0
     while line:
         line = handle.readline()
         data += line
         if line.startswith(plus_char):
             break
         seq_len += len(line.strip())
     if not line:
         raise ValueError("Premature end of file in seq section")
     assert line[0:1] == plus_char
     # Find the qual line(s); quality is complete once its accumulated
     # length matches the sequence length
     qual_len = 0
     while line:
         if seq_len == qual_len:
             # Should be end of record; peek at the next line, which must
             # be EOF or the '@' title of the next record (it is NOT
             # appended to data, as it belongs to the next record).
             line = handle.readline()
             if line and line[0:1] != at_char:
                 # BUG FIX: the exception was previously constructed but
                 # never raised.  Also removed dead local 'pos'.
                 raise ValueError("Problem with line %s" % repr(line))
             break
         else:
             line = handle.readline()
             data += line
             qual_len += len(line.strip())
     if seq_len != qual_len:
         raise ValueError("Problem with quality section")
     return data
예제 #31
0
from Bio._py3k import _as_bytes, _bytes_to_string
from Bio._py3k import zip

from Bio.Alphabet import generic_dna
from Bio.SearchIO._index import SearchIndexer
from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment


__all__ = ('BlatPslParser', 'BlatPslIndexer', 'BlatPslWriter')


# precompile regex patterns
_PTR_ROW_CHECK = r'^\d+\s+\d+\s+\d+\s+\d+'
_RE_ROW_CHECK = re.compile(_PTR_ROW_CHECK)
_RE_ROW_CHECK_IDX = re.compile(_as_bytes(_PTR_ROW_CHECK))


def _list_from_csv(csv_string, caster=None):
    """Transform the given comma-separated string into a list (PRIVATE).

    :param csv_string: comma-separated input string
    :type csv_string: string
    :param caster: function used to cast each item in the input string
                   to its intended type
    :type caster: callable, accepts string, returns object

    """
    if caster is None:
        return [x for x in csv_string.split(',') if x]
    else:
예제 #32
0
def qblast(program, database, sequence,
           auto_format=None,composition_based_statistics=None,
           db_genetic_code=None,endpoints=None,entrez_query='(none)',
           expect=10.0,filter=None,gapcosts=None,genetic_code=None,
           hitlist_size=50,i_thresh=None,layout=None,lcase_mask=None,
           matrix_name=None,nucl_penalty=None,nucl_reward=None,
           other_advanced=None,perc_ident=None,phi_pattern=None,
           query_file=None,query_believe_defline=None,query_from=None,
           query_to=None,searchsp_eff=None,service=None,threshold=None,
           ungapped_alignment=None,word_size=None,
           alignments=500,alignment_view=None,descriptions=500,
           entrez_links_new_window=None,expect_low=None,expect_high=None,
           format_entrez_query=None,format_object=None,format_type='XML',
           ncbi_gi=None,results_file=None,show_overview=None, megablast=None,
           ):
    """Do a BLAST search using the QBLAST server at NCBI.

    Supports all parameters of the qblast API for Put and Get.
    Some useful parameters:
    program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
    database       Which database to search against (e.g. "nr").
    sequence       The sequence to search.
    ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
    descriptions   Number of descriptions to show.  Def 500.
    alignments     Number of alignments to show.  Def 500.
    expect         An expect value cutoff.  Def 10.0.
    matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
    filter         "none" turns off filtering.  Default no filtering
    format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
    entrez_query   Entrez query to limit Blast search
    hitlist_size   Number of hits to return. Default 50
    megablast      TRUE/FALSE whether to use Mega BLAST algorithm (blastn only)
    service        plain, psi, phi, rpsblast, megablast (lower case)

    This function does no checking of the validity of the parameters
    and passes the values to the server as is.  More help is available at:
    http://www.ncbi.nlm.nih.gov/BLAST/blast_overview.html

    Returns a StringIO handle containing the results in the requested
    format_type (XML by default).
    """
    # NOTE(review): urllib2 exists only on Python 2, so this function is
    # Python-2-only as written.
    import urllib, urllib2
    import time

    assert program in ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = [
        ('AUTO_FORMAT',auto_format),
        ('COMPOSITION_BASED_STATISTICS',composition_based_statistics),
        ('DATABASE',database),
        ('DB_GENETIC_CODE',db_genetic_code),
        ('ENDPOINTS',endpoints),
        ('ENTREZ_QUERY',entrez_query),
        ('EXPECT',expect),
        ('FILTER',filter),
        ('GAPCOSTS',gapcosts),
        ('GENETIC_CODE',genetic_code),
        ('HITLIST_SIZE',hitlist_size),
        ('I_THRESH',i_thresh),
        ('LAYOUT',layout),
        ('LCASE_MASK',lcase_mask),
        ('MEGABLAST',megablast),
        ('MATRIX_NAME',matrix_name),
        ('NUCL_PENALTY',nucl_penalty),
        ('NUCL_REWARD',nucl_reward),
        ('OTHER_ADVANCED',other_advanced),
        ('PERC_IDENT',perc_ident),
        ('PHI_PATTERN',phi_pattern),
        ('PROGRAM',program),
        #('PSSM',pssm), - It is possible to use PSI-BLAST via this API?
        ('QUERY',sequence),
        ('QUERY_FILE',query_file),
        ('QUERY_BELIEVE_DEFLINE',query_believe_defline),
        ('QUERY_FROM',query_from),
        ('QUERY_TO',query_to),
        #('RESULTS_FILE',...), - Can we use this parameter?
        ('SEARCHSP_EFF',searchsp_eff),
        ('SERVICE',service),
        ('THRESHOLD',threshold),
        ('UNGAPPED_ALIGNMENT',ungapped_alignment),
        ('WORD_SIZE',word_size),
        ('CMD', 'Put'),
        ]
    # Drop unset parameters so only explicit values reach the server
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(urllib.urlencode(query))

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    request = urllib2.Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi",
                              message,
                              {"User-Agent":"BiopythonClient"})
    handle = urllib2.urlopen(request)

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    # The Put response page carries the request ID (RID) and the
    # estimated time to completion (RTOE).
    rid, rtoe = _parse_qblast_ref_page(handle)
    parameters = [
        ('ALIGNMENTS',alignments),
        ('ALIGNMENT_VIEW',alignment_view),
        ('DESCRIPTIONS',descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW',entrez_links_new_window),
        ('EXPECT_LOW',expect_low),
        ('EXPECT_HIGH',expect_high),
        ('FORMAT_ENTREZ_QUERY',format_entrez_query),
        ('FORMAT_OBJECT',format_object),
        ('FORMAT_TYPE',format_type),
        ('NCBI_GI',ncbi_gi),
        ('RID',rid),
        ('RESULTS_FILE',results_file),
        ('SERVICE',service),
        ('SHOW_OVERVIEW',show_overview),
        ('CMD', 'Get'),
        ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(urllib.urlencode(query))

    # Poll NCBI until the results are ready.  Use a 3 second wait
    # between requests (sleeping only for whatever remains of the
    # 3 seconds after the previous request completed).
    delay = 3.0
    previous = time.time()
    while True:
        current = time.time()
        wait = previous + delay - current
        if wait > 0:
            time.sleep(wait)
            previous = current + wait
        else:
            previous = current

        request = urllib2.Request("http://blast.ncbi.nlm.nih.gov/Blast.cgi",
                                  message,
                                  {"User-Agent":"BiopythonClient"})
        handle = urllib2.urlopen(request)
        results = _as_string(handle.read())

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results=="\n\n":
            continue
        # XML results don't have the Status tag when finished
        if results.find("Status=") < 0:
            break
        i = results.index("Status=")
        j = results.index("\n", i)
        status = results[i+len("Status="):j].strip()
        if status.upper() == "READY":
            break

    return StringIO(results)
예제 #33
0
    if (3, 0) <= sys.version_info[:2] <= (3, 1):
        # Workaround for bug in python 3.0 and 3.1,
        # see http://bugs.python.org/issue9257
        from xml.etree import ElementTree as ElementTree
    else:
        from xml.etree import cElementTree as ElementTree
except ImportError:
    from xml.etree import ElementTree as ElementTree

from Bio.Alphabet import generic_dna, generic_protein
from Bio.SearchIO._index import SearchIndexer
from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment

from Bio._py3k import _as_bytes, _bytes_to_string, unicode

_empty_bytes_string = _as_bytes("")

__all__ = ('BlastXmlParser', 'BlastXmlIndexer', 'BlastXmlWriter')

# element - optional qresult attribute name mapping
_ELEM_QRESULT_OPT = {
    'Statistics_db-num': ('stat_db_num', int),
    'Statistics_db-len': ('stat_db_len', int),
    'Statistics_eff-space': ('stat_eff_space', float),
    'Statistics_hsp-len': ('stat_hsp_len', int),
    'Statistics_kappa': ('stat_kappa', float),
    'Statistics_lambda': ('stat_lambda', float),
    'Statistics_entropy': ('stat_entropy', float),
}
# element - hit attribute name mapping
_ELEM_HIT = {
예제 #34
0
class BlastXmlIndexer(SearchIndexer):
    """Indexer class for BLAST XML output."""

    _parser = BlastXmlParser
    qstart_mark = _as_bytes('<Iteration>')
    qend_mark = _as_bytes('</Iteration>')
    block_size = 16384

    def __init__(self, filename, **kwargs):
        """Initialize the class."""
        SearchIndexer.__init__(self, filename)
        # TODO: better way to do this?
        # Run the parser once up front to capture the file-level metadata
        # and the fallback id/description used when an <Iteration> block
        # does not carry its own.
        iter_obj = self._parser(self._handle, **kwargs)
        self._meta, self._fallback = iter_obj._meta, iter_obj._fallback

    def __iter__(self):
        """Iterate over BlastXmlIndexer yields qstart_id, start_offset, block's length."""
        # Removed dead locals: 'counter' (incremented but never used) and
        # an unused local binding of self.block_size.
        qstart_mark = self.qstart_mark
        qend_mark = self.qend_mark
        blast_id_mark = _as_bytes('Query_')
        handle = self._handle
        handle.seek(0)
        # Regex capturing the query ID and description of one iteration
        re_desc = re.compile(
            _as_bytes(r'<Iteration_query-ID>(.*?)'
                      r'</Iteration_query-ID>\s+?'
                      '<Iteration_query-def>'
                      '(.*?)</Iteration_query-def>'))
        re_desc_end = re.compile(_as_bytes(r'</Iteration_query-def>'))

        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if not line:
                break
            if qstart_mark not in line:
                continue
            # The following requirements are to make supporting BGZF compressed
            # BLAST XML files simpler (avoids complex offset manipulations):
            assert line.count(qstart_mark) == 1, "XML without line breaks?"
            assert line.lstrip().startswith(qstart_mark), line
            if qend_mark in line:
                # Should cope with <Iteration>...</Iteration> on one long line
                block = line
            else:
                # Load the rest of this block up to and including </Iteration>
                block = [line]
                while line and qend_mark not in line:
                    line = handle.readline()
                    assert qstart_mark not in line, line
                    block.append(line)
                assert line.rstrip().endswith(qend_mark), line
                block = _empty_bytes_string.join(block)
            assert block.count(
                qstart_mark) == 1, "XML without line breaks? %r" % block
            assert block.count(
                qend_mark) == 1, "XML without line breaks? %r" % block
            # Now we have a full <Iteration>...</Iteration> block, find the ID
            regx = re.search(re_desc, block)
            try:
                qstart_desc = regx.group(2)
                qstart_id = regx.group(1)
            except AttributeError:
                # regx is None (no match); use the fallback values
                assert re.search(re_desc_end, block)
                qstart_desc = _as_bytes(self._fallback['description'])
                qstart_id = _as_bytes(self._fallback['id'])
            if qstart_id.startswith(blast_id_mark):
                # NCBI-assigned 'Query_N' ids are not useful as keys;
                # use the first word of the description instead
                qstart_id = qstart_desc.split(_as_bytes(' '), 1)[0]
            yield _bytes_to_string(qstart_id), start_offset, len(block)

    def _parse(self, handle):
        """Overwrite SearchIndexer parse (PRIVATE).

        As we need to set the meta and fallback dictionaries to the parser.
        """
        generator = self._parser(handle, **self._kwargs)
        generator._meta = self._meta
        generator._fallback = self._fallback
        # Each indexed slice holds exactly one query result
        return next(iter(generator))

    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string."""
        qend_mark = self.qend_mark
        handle = self._handle
        handle.seek(offset)

        qresult_raw = handle.readline()
        assert qresult_raw.lstrip().startswith(self.qstart_mark)
        while qend_mark not in qresult_raw:
            qresult_raw += handle.readline()
        assert qresult_raw.rstrip().endswith(qend_mark)
        assert qresult_raw.count(qend_mark) == 1
        # Note this will include any leading and trailing whitespace, in
        # general expecting "    <Iteration>\n...\n    </Iteration>\n"
        return qresult_raw
예제 #35
0
파일: AbiIO.py 프로젝트: gavieira/mitomaker
def AbiIterator(handle, alphabet=None, trim=False):
    """Iterator for the Abi file format.

    Yields a single SeqRecord parsed from the ABI trace in *handle*
    (which must be opened in 'rb' mode).  If *trim* is true the record
    is quality-trimmed before being yielded.
    """
    # raise exception if alphabet is not dna
    if alphabet is not None:
        if isinstance(Alphabet._get_base_alphabet(alphabet),
                      Alphabet.ProteinAlphabet):
            raise ValueError(
                "Invalid alphabet, ABI files do not hold proteins.")
        if isinstance(Alphabet._get_base_alphabet(alphabet),
                      Alphabet.RNAAlphabet):
            raise ValueError("Invalid alphabet, ABI files do not hold RNA.")

    # raise exception if handle mode is not 'rb'
    if hasattr(handle, 'mode'):
        if set('rb') != set(handle.mode.lower()):
            raise ValueError("ABI files has to be opened in 'rb' mode.")

    # check if input file is a valid Abi file
    handle.seek(0)
    marker = handle.read(4)
    if not marker:
        # handle empty file gracefully.
        # BUG FIX: raising StopIteration inside a generator is converted
        # to RuntimeError under PEP 479 (Python 3.7+); 'return' is the
        # correct way to end a generator early.
        return
    if marker != _as_bytes('ABIF'):
        raise IOError('File should start ABIF, not %r' % marker)

    # dirty hack for handling time information
    times = {
        'RUND1': '',
        'RUND2': '',
        'RUNT1': '',
        'RUNT2': '',
    }

    # initialize annotations
    annot = dict(zip(_EXTRACT.values(), [None] * len(_EXTRACT)))

    # parse header and extract data from directories
    header = struct.unpack(_HEADFMT, handle.read(struct.calcsize(_HEADFMT)))

    for tag_name, tag_number, tag_data in _abi_parse_header(header, handle):
        # stop iteration if all desired tags have been extracted
        # 4 tags from _EXTRACT + 2 time tags from _SPCTAGS - 3,
        # and seq, qual, id
        # todo

        key = tag_name + str(tag_number)

        # PBAS2 is base-called sequence
        if key == 'PBAS2':
            seq = tag_data
            ambigs = 'KYWMRS'
            if alphabet is None:
                if set(seq).intersection(ambigs):
                    alphabet = ambiguous_dna
                else:
                    alphabet = unambiguous_dna
        # PCON2 is quality values of base-called sequence
        elif key == 'PCON2':
            qual = [ord(val) for val in tag_data]
        # SMPL1 is sample id entered before sequencing run
        elif key == 'SMPL1':
            sample_id = tag_data
        elif key in times:
            times[key] = tag_data
        else:
            # extract sequence annotation as defined in _EXTRACT
            if key in _EXTRACT:
                annot[_EXTRACT[key]] = tag_data

    # set time annotations
    annot['run_start'] = '%s %s' % (times['RUND1'], times['RUNT1'])
    annot['run_finish'] = '%s %s' % (times['RUND2'], times['RUNT2'])

    # use the file name as SeqRecord.name if available
    try:
        file_name = basename(handle.name).replace('.ab1', '')
    except (AttributeError, TypeError):
        # BUG FIX: narrowed from a bare 'except:' (which also swallowed
        # KeyboardInterrupt/SystemExit).  handle may lack a .name
        # attribute (AttributeError) or .name may be an int file
        # descriptor (TypeError from basename).
        file_name = ""

    record = SeqRecord(Seq(seq, alphabet),
                       id=sample_id,
                       name=file_name,
                       description='',
                       annotations=annot,
                       letter_annotations={'phred_quality': qual})

    if not trim:
        yield record
    else:
        yield _abi_trim(record)
예제 #36
0

__all__ = ['Hmmer3TextParser', 'Hmmer3TextIndexer']


# precompile regex patterns for faster processing
# regex for program name capture
_RE_PROGRAM = re.compile(r'^# (\w*hmm\w+) :: .*$')
# regex for version string capture
_RE_VERSION = re.compile(r'# \w+ ([\w+\.]+) .*; http.*$')
# regex for option string capture
_RE_OPT = re.compile(r'^# (.+):\s+(.+)$')
# regex for parsing query id and length, for parsing and indexing
_QRE_ID_LEN_PTN = r'^Query:\s*(.*)\s+\[\w=(\d+)\]'
_QRE_ID_LEN = re.compile(_QRE_ID_LEN_PTN)
_QRE_ID_LEN_IDX = re.compile(_as_bytes(_QRE_ID_LEN_PTN))
# regex for hsp validation
_HRE_VALIDATE = re.compile(r'score:\s(-?\d+\.?\d+)\sbits.*value:\s(.*)')
# regexes for parsing hsp alignment blocks
_HRE_ANNOT_LINE = re.compile(r'^(\s+)(.+)\s(\w+)')
_HRE_ID_LINE = re.compile(r'^(\s+\S+\s+[0-9-]+ )(.+?)(\s+[0-9-]+)')


def _read_forward(handle):
    """Reads through whitespaces, returns the first non-whitespace line."""
    while True:
        line = handle.readline()
        # if line has characters and stripping does not remove them,
        # return the line
        if line and line.strip():
            return line
예제 #37
0
파일: FastaIO.py 프로젝트: xuyk/biopython
from Bio.SearchIO._index import SearchIndexer
from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment


__all__ = ['FastaM10Parser', 'FastaM10Indexer']

__docformat__ = "restructuredtext en"


# precompile regex patterns
# regex for program name
_RE_FLAVS = re.compile(r't?fast[afmsxy]|pr[sf][sx]|lalign|[gs]?[glso]search')
# regex for sequence ID and length ~ deals with both \n and \r\n
_PTR_ID_DESC_SEQLEN = r'>>>(.+?)\s+(.*?) *- (\d+) (?:aa|nt)\s*$'
_RE_ID_DESC_SEQLEN = re.compile(_PTR_ID_DESC_SEQLEN)
_RE_ID_DESC_SEQLEN_IDX = re.compile(_as_bytes(_PTR_ID_DESC_SEQLEN))
# regex for qresult, hit, or hsp attribute value
_RE_ATTR = re.compile(r'^; [a-z]+(_[ \w-]+):\s+(.*)$')
# regex for capturing excess start and end sequences in alignments
_RE_START_EXC = re.compile(r'^-*')
_RE_END_EXC = re.compile(r'-*$')

# attribute name mappings
_HSP_ATTR_MAP = {
    '_initn': ('initn_score', int),
    '_init1': ('init1_score', int),
    '_opt': ('opt_score', int),
    '_s-w opt': ('opt_score', int),
    '_z-score': ('z_score', float),
    '_bits': ('bitscore', float),
    '_expect': ('evalue', float),