Example #1
        def display_at(self, now, total_num_units, unit_abbrev_display,
                       include_elapsed=False):
            # Guard against division by zero when the total is unknown.
            pct_complete = '%1.2f%%' % (
                (self.units_copied * 100.0 if total_num_units else 1)
                / (total_num_units or 1)
            )

            elapsed_total = now - self.begin_time
            rate_mb = rate_units = 0.0
            if elapsed_total:
                rate_mb = self.total_size / elapsed_total
                rate_units = self.units_copied / elapsed_total

            rate_mb_str = byte_display(rate_mb)
            rate_unit_str = '%1.2f' % rate_units

            result = "%d/%d,%7s, %6s/s %6s %s/s, %s" % (
                self.units_copied, total_num_units, pct_complete,
                rate_mb_str, rate_unit_str, unit_abbrev_display,
                byte_display(self.total_size),
            )
            if include_elapsed:
                result += ' %4.1f minutes' % (elapsed_total / 60.0)
            return result
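Every example here formats sizes with a byte_display helper that is not shown. The sketch below is only an illustration of what such a helper could look like; the unit thresholds and exact formatting are assumptions, not the library's actual implementation.

    def byte_display(size):
        # Hypothetical sketch: render a byte count in a human-friendly unit.
        # The cut-offs and formats here are illustrative assumptions.
        if size >= 1024 * 1024:
            return '%.2fMB' % (size / (1024.0 * 1024.0))
        if size >= 1024:
            return '%dKB' % (size // 1024)
        return '%dB' % size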
Example #2
 def do_minor_log(self, trans, _num_txn_records, txn_byte_size, _num_txn_blobs, _copy_duration):
     logger.debug("Copied %d objects of size %s (%d blobs) (Most recent oid=%s size=%s)",
                  trans.num_records_since_last_log,
                  byte_display(trans.record_size_since_last_log),
                  trans.num_blobs_since_last_log,
                  trans.last_oid,
                  byte_display(txn_byte_size))
     trans.reset_interval()
Example #3
 def loaded(self, byte_count):
     with self._lock:
         self.bytes_loaded_since_last_check += byte_count
         if self.bytes_loaded_since_last_check >= self.bytes_loaded_check_threshold:
             logger.debug(
                 "Loaded %s bytes (>= %s) into %s, may need to check.",
                 byte_display(self.bytes_loaded_since_last_check),
                 byte_display(self.bytes_loaded_check_threshold),
                 self.blob_dir)
             self._check()
Example #4
    def __run_with_lock(self):
        while 1:
            size, files_by_atime = self.__size_blob_dir()
            self.blob_dir_size = size

            if size <= self.target_size:
                logger.info(
                    'Traversed %s to compute size %s (<= %s); quitting.',
                    self.blob_dir, byte_display(self.blob_dir_size),
                    byte_display(self.target_size))
                break

            self.__shrink_blob_dir(size, files_by_atime)
Example #5
    def bulk_update(self,
                    keys_and_values,
                    source='<unknown>',
                    log_count=None,
                    mem_usage_before=None):
        """
        Insert all the ``(key, value)`` pairs found in *keys_and_values*.

        This will permute the most-recently-used status of any existing entries.
        Entries in the *keys_and_values* iterable should be returned from
        least recent to most recent, as the items at the end will be considered to be
        the most recent. (Alternately, you can think of them as needing to be in order
        from lowest priority to highest priority.)

        This will never evict existing entries from the cache.
        """
        now = time.time()
        if mem_usage_before is None:
            mem_usage_before = get_memory_usage()
        mem_usage_before_this = get_memory_usage()
        log_count = log_count or len(keys_and_values)

        data = self._cache

        if data:
            # Loading more data into an existing bucket.
            # Load only the *new* keys, trying to get the newest ones
            # because LRU is going to get messed up anyway.
            #
            # If we were empty, then take what they give us, LRU
            # first, so that as we iterate the last item in the list
            # becomes the MRU item.
            new_entries_newest_first = [
                t for t in keys_and_values if t[0] not in data
            ]
            new_entries_newest_first.reverse()
            keys_and_values = new_entries_newest_first
            del new_entries_newest_first

        stored = data.add_MRUs(keys_and_values, return_count_only=True)

        then = time.time()
        del keys_and_values  # For memory reporting.
        mem_usage_after = get_memory_usage()
        log.info(
            "Examined %d and stored %d items from %s in %s using %s. "
            "(%s local) (%s)", log_count, stored,
            getattr(source, 'name', source), then - now,
            byte_display(mem_usage_after - mem_usage_before),
            byte_display(mem_usage_after - mem_usage_before_this), self)
        return log_count, stored
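As the docstring above notes, entries are passed from least to most recently used. A hypothetical call might look like the following; the cache instance, keys, values, and source name are illustrative only.

    # Illustrative only: the last pair becomes the most-recently-used entry.
    examined, stored = cache.bulk_update(
        [('oldest-key', b'state-1'), ('newer-key', b'state-2'), ('newest-key', b'state-3')],
        source='persistent-cache.sqlite3',
    )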
Example #6
    def _when_done(self, checker, holding_clean_lock):
        """
        Callback to be run from the cleanup thread.

        Cleans up the state of *self*.
        """
        with self._lock:
            # checker is the BlobCacheSizeChecker, but self._checker_thread
            # is the spawned thread.
            # This is the last thing the BlobCacheSizeChecker does, so by
            # definition it cannot be ready yet.
            self._checker_thread = None
            if not holding_clean_lock:
                self._reduced_event.set()
                return

            # In principle, if the checker finished sizing the directory and got
            # a cache size under its target and wanted to return to us,
            # but then some other threads immediately loaded a bunch of blobs,
            # we could go over that size. We prevent this by checking
            # the size again here, while we're holding our lock, and if we're
            # too big, we'll go again. This happens during the test cases.
            # checker can be None if we did this externally.
            dir_size = -1
            duration = 0.0
            if checker is not None:
                dir_size = checker.blob_dir_size
                duration = checker.duration

            logger.info(
                "Finished checking %s (in %.3fs) with size of %s (max: %s; target %s)",
                self.blob_dir, duration,
                byte_display(dir_size),
                byte_display(self.blob_cache_max_size),
                byte_display(self.blob_cache_target_cleanup_size)
            )
            if self._exceeded_counter or dir_size > self.blob_cache_target_cleanup_size:
                logger.debug(
                    "Requesting new check for %s with size of %s (max: %s; target %s)",
                    self.blob_dir,
                    dir_size,
                    self.blob_cache_max_size,
                    byte_display(self.blob_cache_target_cleanup_size)
                )
                self._check()
            else:
                self._reduced_event.set()
Example #7
 def report(client, duration, extra=''):
     gc.collect()
     gc.collect()
     mem_before = client.b_mem_before
     objects_before = client.b_objects_before
     mem_after = get_memory_usage()
     mem_used = mem_after - mem_before
     objects_after = len(gc.get_objects())
     logger.info(
         "%s:%sExecuted in %s; Mem delta: %s;  Object Delta: %s; "
         "Num keys: %d; Weight: %s; "
         "Mem before: %s; Mem after: %s",
         os.getpid(), extra,
         duration, byte_display(mem_used), objects_after - objects_before,
         len(client), byte_display(client.size),
         byte_display(mem_before), byte_display(mem_after),
     )
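report() reads two bookkeeping attributes (b_mem_before, b_objects_before) that are set elsewhere on the client. A plausible setup, purely as an assumption about where those baselines come from:

    # Assumed setup: capture baselines before the benchmark runs so that
    # report() can compute memory and object-count deltas afterwards.
    client.b_mem_before = get_memory_usage()
    client.b_objects_before = len(gc.get_objects())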
Example #8
    def __size_blob_dir(self, is_cache_dir_name=re.compile(r'\d+$').match):
        # Calculate the sizes of the blobs stored in the blob_dir.
        # Return the total size, and a BTree {atime: [full path to blob file]}

        # TODO: nti.zodb.containers has support for mapping
        # time.time() values into integers for use with the (smaller,
        # faster) IOBTree. Use that if we can prove that we can pop
        # the min atime successfully (that is, while the
        # nti.zodb.containers transformation is lossless and
        # reversible, we need to prove that it also maintains order;
        # I'm not sure it does).
        #
        # Other optimizations: Don't use a list until we get more than one
        # file with a matching atime. And/or use tuples and not lists:
        # tuples aren't tracked by the GC like lists are (after they survive one
        # collection, anyway).

        blob_dir = self.blob_dir
        blob_suffix = ZODB.blob.BLOB_SUFFIX
        files_by_atime = OOBTree.BTree()
        size = 0

        # Use os.walk() instead of os.listdir(); on 3.5+ this is much faster
        # thanks to the use of os.scandir(). When we're on Python 3.5+ *only*
        # we could use os.scandir ourselves and maybe save some stat calls?
        for dirpath, dirnames, filenames in os.walk(blob_dir):
            # Walk top-down, only recursing into directories matching the
            # OID components (of which there should be one level)
            dirnames[:] = [d for d in dirnames if is_cache_dir_name(d)]
            # Examine blob files.
            blobfile_paths = [
                os.path.join(dirpath, f) for f in filenames
                if f.endswith(blob_suffix)
            ]

            for file_path in blobfile_paths:
                stat = os.stat(file_path)
                size += stat.st_size
                t = stat.st_atime
                if t not in files_by_atime:
                    files_by_atime[t] = []

                # The ZEO version returns a weird version of the path,
                #
                #     os.path.join(dirname, file_name)
                #
                # which it must later re-combine to get an actual path:
                #
                #     os.path.join(blob_dir, file_name)
                #
                # It's not clear why it doesn't return the full path
                # that it already has. Temporary memory savings,
                # perhaps? If so, is that even a concern anymore?
                files_by_atime[t].append(file_path)

        logger.debug("Blob cache size for %s: %s", self.blob_dir,
                     byte_display(size))
        return size, files_by_atime
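The returned BTree maps access times to lists of blob paths, so its smallest key is the least recently used bucket. A tiny illustration of how a caller can walk it oldest-first; the paths and timestamps are made up:

    from BTrees import OOBTree

    files_by_atime = OOBTree.BTree()
    files_by_atime[1700000000.0] = ['/blobs/123/0x01.blob']   # older access time
    files_by_atime[1700000100.0] = ['/blobs/123/0x02.blob']   # newer access time
    oldest_paths = files_by_atime.pop(files_by_atime.minKey())
    # oldest_paths == ['/blobs/123/0x01.blob']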
Example #9
    def __shrink_blob_dir(self, current_size, files_by_atime):
        size = current_size
        target_size = self.target_size
        remove = self.remove_blob_at_path

        while size > target_size and files_by_atime:
            for file_path in files_by_atime.pop(files_by_atime.minKey()):
                size -= remove(file_path)
                if size <= target_size:
                    break

        logger.debug("Reduced blob cache size for %s: %s", self.blob_dir, byte_display(size))
Example #10
 def populate_empty(loops, bucket_kind):
     client = makeOne(bucket_kind, populate=False)
     mem_before = get_memory_usage()
     begin = perf_counter()
     for _ in range(loops):
         for k, v in ALL_DATA:
             client[k] = v
     duration = perf_counter() - begin
     mem_used = get_memory_usage() - mem_before
     logger.info("Populated in %s; took %s mem; size: %d", duration,
                 byte_display(mem_used), len(client))
     return duration
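The populate benchmarks (this example and Examples 14 and 15) rely on makeOne and ALL_DATA fixtures that aren't shown. The shapes below are guesses made only so the loops are easier to read; the real fixtures almost certainly differ.

    # Hypothetical fixtures: each value is a (state_bytes, tid) pair, matching
    # how Examples 14 and 15 manipulate v[0] and v[1].
    ALL_DATA = [('key-%d' % i, (b'x' * 100, i)) for i in range(1000)]

    def makeOne(bucket_kind, populate=True):
        # bucket_kind is assumed to be a mapping-like cache class.
        client = bucket_kind()
        if populate:
            for k, v in ALL_DATA:
                client[k] = v
        return client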
Example #11
        def display_at(self, now, total_num_txns, include_elapsed=False):
            pct_complete = '%1.2f%%' % (self.txns_copied * 100.0 /
                                        total_num_txns)
            elapsed_total = now - self.begin_time
            if elapsed_total:
                rate_mb = self.total_size / elapsed_total
                rate_tx = self.txns_copied / elapsed_total
            else:
                rate_mb = rate_tx = 0.0
            rate_mb_str = byte_display(rate_mb)
            rate_tx_str = '%1.3f' % rate_tx

            result = "%d/%d,%7s, %6s/s %6s TX/s, %s" % (
                self.txns_copied,
                total_num_txns,
                pct_complete,
                rate_mb_str,
                rate_tx_str,
                byte_display(self.total_size),
            )
            if include_elapsed:
                result += ' %4.1f minutes' % (elapsed_total / 60.0)
            return result
Example #12
    def _bulk_update(self,
                     keys_and_values,
                     source='<unknown>',
                     log_count=None,
                     mem_usage_before=None):
        """
        Insert all the
        ``(oid, (state, actual_tid, frozen, frequency)))`` pairs
        found in *keys_and_values*, rejecting entries once we get too full.

        *keys_and_values* should be ordered from least to most recently used.

        This can only be done in an empty cache.
        """
        now = time.time()
        if mem_usage_before is None:
            mem_usage_before = get_memory_usage()
        mem_usage_before_this = get_memory_usage()
        log_count = log_count or len(keys_and_values)

        data = self._cache
        if data:
            raise ValueError("Only bulk load into an empty cache.")

        stored = data.add_MRUs(keys_and_values, return_count_only=True)

        then = time.time()
        del keys_and_values  # For memory reporting.
        mem_usage_after = get_memory_usage()
        logger.info(
            "Examined %d and stored %d items from %s in %s using %s. "
            "(%s local) (%s)", log_count, stored,
            getattr(source, 'name', source), then - now,
            byte_display(mem_usage_after - mem_usage_before),
            byte_display(mem_usage_after - mem_usage_before_this), self)
        return log_count, stored
Example #13
    def __call__(self):
        with self:
            logger.info("Checking blob cache size for %s. (target: %s)",
                        self.blob_dir, byte_display(self.target_size))

            check_lock = self.__acquire_check_lock()
            try:
                if check_lock is None:
                    logger.info(
                        "Failed to get filesystem clean lock (%s); quitting.",
                        self.blob_dir)
                    return

                self.__run_with_lock()
            finally:
                if check_lock is not None:
                    check_lock.close()
                self._finished_callback(self, check_lock is not None)
Example #14
    def populate_not_equal(loops, bucket_kind):
        # Because we will populate the cache when we make it,
        # capture memory now to be able to include that.
        mem_before = get_memory_usage()
        client = makeOne(bucket_kind)

        begin = perf_counter()
        for _ in range(loops):
            for k, v in ALL_DATA:
                # install a copy that's not quite equal.
                # This should require saving it.
                state = v[0]
                state = state + b'1'
                new_v = (state, v[1])
                client[k] = new_v
        duration = perf_counter() - begin
        mem_used = get_memory_usage() - mem_before
        logger.info("Populated in %s; took %s mem; size: %d", duration,
                    byte_display(mem_used), len(client))
        return duration
Example #15
    def populate_equal(loops, bucket_kind):
        # Because we will populate the cache when we make it,
        # capture memory now to be able to include that.
        mem_before = get_memory_usage()
        client = makeOne(bucket_kind)

        begin = perf_counter()
        for _ in range(loops):
            for k, v in ALL_DATA:
                # install a copy that's equal;
                # this should mean no extra copies.
                state = v[0]
                state = state[:-1] + state[-1:]
                new_v = (state, v[1])
                assert new_v == v
                assert new_v is not v
                client[k] = new_v
        duration = perf_counter() - begin
        mem_used = get_memory_usage() - mem_before
        logger.info("Populated in %s; took %s mem; size: %d", duration,
                    byte_display(mem_used), len(client))
        return duration
Example #16
    def write_to_sqlite(self, connection):
        # pylint:disable=too-many-locals
        mem_before = get_memory_usage()

        cur = connection.cursor()

        db = Database.from_connection(connection)
        begin = time.time()

        # In a large benchmark, store_temp() accounts for 32%
        # of the total time, while move_from_temp accounts for 49%.

        # The batch size depends on how many params a stored proc can
        # have; if we go too big we get OperationalError: too many SQL
        # variables. The default is 999.
        # Note that the multiple-value syntax was added in
        # 3.7.11, 2012-03-20.
        with _timer() as batch_timer:
            cur.execute('BEGIN')
            stored_oid_tid = db.oid_to_tid
            fetch_current = time.time()
            count_written, _ = db.store_temp(
                self._items_to_write(stored_oid_tid))
            cur.execute("COMMIT")

        # Take out (reserved write) locks now; if we don't, we can get
        # 'OperationalError: database is locked' (the SQLITE_BUSY
        # error code; the SQLITE_LOCKED error code is about table
        # locks and has the string 'database table is locked') But
        # beginning immediate lets us stand in line.
        #
        # Note that the SQLITE_BUSY error on COMMIT happens "if
        # another thread or process has a shared lock on the database
        # that prevented the database from being updated." But "When
        # COMMIT fails in this way, the transaction remains active and
        # the COMMIT can be retried later after the reader has had a
        # chance to clear." So we could retry the commit a couple
        # times and give up if we can't get there. It looks like one
        # production database takes about 2.5s to execute this entire function;
        # it seems like most instances that are shutting down get to this place
        # at roughly the same time and stack up waiting:
        #
        # Instance | Save Time | write_to_sqlite time
        #     1    |   2.36s   |   2.35s
        #     2    |   5.31s   |   5.30s
        #     3    |   6.51s   |   6.30s
        #     4    |   7.91s   |   7.90s
        #
        # That either suggests that all waiting happens just to acquire this lock and
        # commit this transaction (and that subsequent operations are inconsequential
        # in terms of time) OR that the exclusive lock isn't truly getting dropped,
        # OR that some function in subsequent processes is taking longer and longer.
        # And indeed, our logging showed that a gc.collect() at the end of this function was
        # taking more and more time:
        #
        # Instance | GC Time  | Reported memory usage after GC
        #    1     |   2.00s  |     34.4MB
        #    2     |   3.50s  |     83.2MB
        #    3     |   4.94s  |    125.9MB
        #    4     |   6.44s  |    202.1MB
        #
        # The exclusive lock time did vary, but not as much; trim time
        # didn't really vary:
        #
        # Instance |  Exclusive Write Time | Batch Insert Time | Fetch Current
        #    1     |   0.09s               |   0.12s           |   0.008s
        #    2     |   1.19s               |   0.44s           |   0.008s
        #    3     |   0.91s               |   0.43s           |   0.026s
        #    4     |   0.92s               |   0.34s           |   0.026s
        with _timer() as exclusive_timer:
            cur.execute('BEGIN IMMEDIATE')
            # During the time it took for us to commit, and the time that we
            # got the lock, it's possible that someone else committed and
            # changed the data in object_state
            # Things might have changed in the database since our
            # snapshot where we put temp objects in, so we can't rely on
            # just assuming that's the truth anymore.
            rows_inserted = db.move_from_temp()
            if self.checkpoints:
                db.update_checkpoints(*self.checkpoints)

            cur.execute('COMMIT')
        # TODO: Maybe use BTrees.family.intersection to get the common keys?
        # Or maybe use a SQLite Python function?
        # 'DELETE from object_state where is_stale(zoid, tid)'
        # What's fastest? The custom function version would require scanning the entire
        # table; unfortunately this sqlite interface doesn't allow defining virtual tables.
        with _timer() as trim_timer:
            # Delete anything we still know is stale, because we
            # saw a new TID for it, but we had nothing to replace it with.
            min_allowed_writeback = OID_TID_MAP_TYPE()
            for k, v in self._min_allowed_writeback.items():
                if stored_oid_tid.get(k, MAX_TID) < v:
                    min_allowed_writeback[k] = v
            db.trim_to_size(self.limit, min_allowed_writeback)
            del min_allowed_writeback
        # Typically we write when we're closing and don't expect to do
        # anything else, so there's no need to keep tracking this info.
        self._min_allowed_writeback = OID_TID_MAP_TYPE()

        # Cleanups
        cur.close()
        del cur
        db.close()  # closes the connection.
        del db
        del stored_oid_tid

        # We're probably shutting down, don't perform a GC; we see
        # that can get quite lengthy.

        end = time.time()
        stats = self.stats()
        mem_after = get_memory_usage()
        logger.info(
            "Wrote %d items to %s in %s "
            "(%s to fetch current, %s to insert batch (%d rows), %s to write, %s to trim). "
            "Used %s memory. (Storage: %s) "
            "Total hits %s; misses %s; ratio %s (stores: %s)", count_written,
            connection, end - begin, fetch_current - begin,
            batch_timer.duration, rows_inserted,
            exclusive_timer.duration, trim_timer.duration,
            byte_display(mem_after - mem_before), self._bucket0, stats['hits'],
            stats['misses'], stats['ratio'], stats['sets'])

        return count_written
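The _timer context manager used above isn't included in the example. A minimal sketch, assuming all it needs to do is expose a duration attribute once the block exits:

    import time
    from contextlib import contextmanager

    @contextmanager
    def _timer():
        # Record the wall-clock duration of the enclosed block on a small
        # result object; duration is only meaningful after the block exits.
        class _Result(object):
            duration = 0.0
        result = _Result()
        begin = time.time()
        try:
            yield result
        finally:
            result.duration = time.time() - begin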
Example #17
 def __transaction_display(self, trans, copy_result):
     num_txn_records, txn_byte_size, num_txn_blobs = copy_result
     return 'transaction %s <%4d records, %3d blobs, %9s>' % (
         readable_tid_repr(trans.tid), num_txn_records, num_txn_blobs,
         byte_display(txn_byte_size))