Example #1
0
def charmap():
    """Return a dict that maps a Unicode category, to a tuple of 2-tuples
    covering the codepoint intervals for characters in that category.

    >>> charmap()['Co']
    ((57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    global _charmap
    # Best-effort caching in the face of missing files and/or unwritable
    # filesystems is fairly simple: check if loaded, else try loading,
    # else calculate and try writing the cache.
    if _charmap is None:
        f = charmap_file()
        try:
            with gzip.GzipFile(f, "rb") as i:
                tmp_charmap = dict(json.loads(i))

        except Exception:
            tmp_charmap = {}
            for i in range(0, sys.maxunicode + 1):
                cat = unicodedata.category(hunichr(i))
                rs = tmp_charmap.setdefault(cat, [])
                if rs and rs[-1][-1] == i - 1:
                    rs[-1][-1] += 1
                else:
                    rs.append([i, i])

            try:
                # Write the Unicode table atomically
                tmpdir = storage_directory("tmp")
                mkdir_p(tmpdir)
                fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
                os.close(fd)
                # Explicitly set the mtime to get reproducible output
                with gzip.GzipFile(tmpfile, "wb", mtime=1) as o:
                    result = json.dumps(sorted(tmp_charmap.items()))
                    o.write(result.encode())

                os.renames(tmpfile, f)
            except Exception:
                pass

        # convert between lists and tuples
        _charmap = {
            k: tuple(tuple(pair) for pair in pairs) for k, pairs in tmp_charmap.items()
        }
        # each value is a tuple of 2-tuples (that is, tuples of length 2)
        # and that both elements of that tuple are integers.
        for vs in _charmap.values():
            ints = list(sum(vs, ()))
            assert all([isinstance(x, int) for x in ints])
            assert ints == sorted(ints)
            assert all([len(tup) == 2 for tup in vs])

    assert _charmap is not None
    return _charmap
Example #2
0
 def save(self, key: bytes, value: bytes) -> None:
     # Note: we attempt to create the dir in question now. We
     # already checked for permissions, but there can still be other issues,
     # e.g. the disk is full
     mkdir_p(self._key_path(key))
     path = self._value_path(key, value)
     if not os.path.exists(path):
         suffix = binascii.hexlify(os.urandom(16)).decode("ascii")
         tmpname = path + "." + suffix
         with open(tmpname, "wb") as o:
             o.write(value)
         try:
             os.rename(tmpname, path)
         except OSError:  # pragma: no cover
             os.unlink(tmpname)
         assert not os.path.exists(tmpname)
def charmap():
    """Return a dict that maps a Unicode category, to a tuple of 2-tuples
    covering the codepoint intervals for characters in that category.

    >>> charmap()['Co']
    ((57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    global _charmap
    # Best-effort caching in the face of missing files and/or unwritable
    # filesystems is fairly simple: check if loaded, else try loading,
    # else calculate and try writing the cache.
    if _charmap is None:
        f = charmap_file()
        try:
            with gzip.GzipFile(f, "rb") as i:
                # When the minimum Python 3 version becomes 3.6, this can be
                # simplified to `json.load(i)` without needing to decode first.
                data = i.read().decode()
                tmp_charmap = dict(json.loads(data))

        except Exception:
            # This loop is reduced to using only local variables for performance;
            # indexing and updating containers is a ~3x slowdown.  This doesn't fix
            # https://github.com/HypothesisWorks/hypothesis/issues/2108 but it helps.
            category = unicodedata.category  # Local variable -> ~20% speedup!
            tmp_charmap = {}
            last_cat = category(chr(0))
            last_start = 0
            for i in range(1, sys.maxunicode + 1):
                cat = category(chr(i))
                if cat != last_cat:
                    tmp_charmap.setdefault(last_cat,
                                           []).append([last_start, i - 1])
                    last_cat, last_start = cat, i
            tmp_charmap.setdefault(last_cat,
                                   []).append([last_start, sys.maxunicode])

            try:
                # Write the Unicode table atomically
                tmpdir = storage_directory("tmp")
                mkdir_p(tmpdir)
                fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
                os.close(fd)
                # Explicitly set the mtime to get reproducible output
                with gzip.GzipFile(tmpfile, "wb", mtime=1) as o:
                    result = json.dumps(sorted(tmp_charmap.items()))
                    o.write(result.encode())

                os.renames(tmpfile, f)
            except Exception:
                pass

        # convert between lists and tuples
        _charmap = {
            k: tuple(tuple(pair) for pair in pairs)
            for k, pairs in tmp_charmap.items()
        }
        # each value is a tuple of 2-tuples (that is, tuples of length 2)
        # and that both elements of that tuple are integers.
        for vs in _charmap.values():
            ints = list(sum(vs, ()))
            assert all(isinstance(x, int) for x in ints)
            assert ints == sorted(ints)
            assert all(len(tup) == 2 for tup in vs)

    assert _charmap is not None
    return _charmap