Example 1
 def populate_empty(loops, bucket_kind):
     client = makeOne(bucket_kind, populate=False)
     mem_before = get_memory_usage()
     begin = perf_counter()
     for _ in range(loops):
         for k, v in ALL_DATA:
             client[k] = v
     duration = perf_counter() - begin
     mem_used = get_memory_usage() - mem_before
     logger.info("Populated in %s; took %s mem; size: %d", duration,
                 byte_display(mem_used), len(client))
     return duration
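The (loops, bucket_kind) signature and the returned duration match the callback protocol of a pyperf-style time benchmark. A minimal sketch of how such a function might be driven, assuming a pyperf runner (an assumption; the actual harness is not shown in these examples):

    # Hypothetical driver, assuming pyperf is the benchmark harness.
    import pyperf

    def run_populate_benchmarks(bucket_kinds):
        runner = pyperf.Runner()
        for bucket_kind in bucket_kinds:
            # bench_time_func passes the loop count as the first argument
            # and expects the elapsed time back, which is exactly what
            # populate_empty returns.
            runner.bench_time_func(
                'populate_empty ' + bucket_kind.__name__,
                populate_empty,
                bucket_kind,
            )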
Example 2
    def bulk_update(self,
                    keys_and_values,
                    source='<unknown>',
                    log_count=None,
                    mem_usage_before=None):
        """
        Insert all the ``(key, value)`` pairs found in *keys_and_values*.

        This will permute the most-recently-used status of any existing entries.
        Entries in the *keys_and_values* iterable should be returned from
        least recent to most recent, as the items at the end will be considered to be
        the most recent. (Alternatively, you can think of them as needing to be in order
        from lowest priority to highest priority.)

        This will never evict existing entries from the cache.
        """
        now = time.time()
        mem_usage_before = (mem_usage_before if mem_usage_before is not None
                            else get_memory_usage())
        mem_usage_before_this = get_memory_usage()
        log_count = log_count or len(keys_and_values)

        data = self._cache

        if data:
            # Loading more data into an existing bucket.
            # Load only the *new* keys, trying to get the newest ones
            # because LRU is going to get messed up anyway.
            #
            # If we were empty, then take what they give us, LRU
            # first, so that as we iterate the last item in the list
            # becomes the MRU item.
            new_entries_newest_first = [
                t for t in keys_and_values if t[0] not in data
            ]
            new_entries_newest_first.reverse()
            keys_and_values = new_entries_newest_first
            del new_entries_newest_first

        stored = data.add_MRUs(keys_and_values, return_count_only=True)

        then = time.time()
        del keys_and_values  # For memory reporting.
        mem_usage_after = get_memory_usage()
        log.info(
            "Examined %d and stored %d items from %s in %s using %s. "
            "(%s local) (%s)", log_count, stored,
            getattr(source, 'name', source), then - now,
            byte_display(mem_usage_after - mem_usage_before),
            byte_display(mem_usage_after - mem_usage_before_this), self)
        return log_count, stored
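A standalone sketch of the ordering contract described in the docstring, using a plain dict and made-up pairs (both hypothetical): when the cache already holds data, only keys that are not present are kept, and they are reversed so the newest entries are loaded first.

    existing = {'a': 1}                        # keys already in the cache
    incoming = [('a', 1), ('b', 2), ('c', 3)]  # ordered LRU -> MRU

    if existing:
        # Keep only the new keys, newest first, since the LRU ordering
        # of the existing entries is going to be disturbed anyway.
        new_entries_newest_first = [t for t in incoming if t[0] not in existing]
        new_entries_newest_first.reverse()
        incoming = new_entries_newest_first

    assert incoming == [('c', 3), ('b', 2)]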
Example 3
    def read_from_stream(self, cache_file):
        # Unlike write_to_stream, using the raw stream
        # is fine for both Py 2 and 3.
        mem_usage_before = get_memory_usage()
        unpick = Unpickler(cache_file)

        # Local optimizations
        load = unpick.load

        version = load()
        if version != self._FILE_VERSION:  # pragma: no cover
            raise ValueError("Incorrect version of cache_file")

        keys_and_values = []
        try:
            while 1:
                k_v = load()
                keys_and_values.append(k_v)
        except EOFError:
            pass

        # Reclaim memory
        del load
        del unpick

        return self.bulk_update(keys_and_values,
                                cache_file,
                                mem_usage_before=mem_usage_before)
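A hypothetical sketch of the writer side of this stream format, inferred from the reader above (one pickled version marker, then one pickled (key, value) pair per entry until EOF); the names, version value, and protocol choice are assumptions.

    import io
    from pickle import Pickler, Unpickler

    _FILE_VERSION = 1

    def write_to_stream(cache_file, keys_and_values):
        pickler = Pickler(cache_file, 2)  # protocol 2 loads on Py 2 and 3
        pickler.dump(_FILE_VERSION)
        for k_v in keys_and_values:
            pickler.dump(k_v)

    # Round-trip check against the read loop shown above.
    buf = io.BytesIO()
    write_to_stream(buf, [(b'key', b'value')])
    buf.seek(0)
    unpick = Unpickler(buf)
    assert unpick.load() == _FILE_VERSION
    assert unpick.load() == (b'key', b'value')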
Example 4
    def create_and_populate_client():
        client = LocalClient(cache_options, cache_pfx)
        # Monkey with the internals so we don't have to
        # populate multiple times.
        #print("Testing", type(bucket._dict))

        # Only need to populate in the workers.
        size_dists = [100] * 800 + [300] * 500 + [1024] * 300 + [2048] * 200 + [4096] * 150

        with open('/dev/urandom', 'rb') as rnd:
            data = [rnd.read(x) for x in size_dists]
        data_iter = itertools.cycle(data)
        keys_and_values = []
        len_values = 0
        j = 0
        for j, datum in enumerate(data_iter):
            if len(datum) > client.limit or len_values + len(datum) > client.limit:
                break
            len_values += len(datum)
            # To ensure the pickle memo cache doesn't just write out "use object X",
            # but distinct copies of the strings, we need to copy them
            keys_and_values.append(
                ((j, j), (datum[:-1] + b'x', j))
            )

            # # We need to get the item so its frequency goes up enough to be written
            # # (this is while we're doing an aging at write time, which may go away).
            # # Using an assert statement causes us to write nothing if -O is used.
            # if bucket[(j, j)] is datum:
            #     raise AssertionError()
        mem_before = get_memory_usage()
        client._bulk_update(keys_and_values, mem_usage_before=mem_before)
        del keys_and_values
        #print("Len", len(bucket), "size", bucket.size, "checkpoints", client.get_checkpoints())
        return client
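A small standalone demonstration (hypothetical data) of the pickle memo issue the comment mentions: a pickler memoizes repeated references to the same object, so a list of identical bytes objects pickles far smaller than a list of equal-but-distinct copies.

    import pickle

    datum = b'x' * 1024
    same = [datum] * 100                              # one object, 100 references
    copies = [datum[:-1] + b'x' for _ in range(100)]  # 100 distinct copies

    assert len(pickle.dumps(copies)) > len(pickle.dumps(same))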
Example 5
    def populate_not_equal(loops, bucket_kind):
        # Because we will populate when we make,
        # capture memory now to be able to include that.
        mem_before = get_memory_usage()
        client = makeOne(bucket_kind)

        begin = perf_counter()
        for _ in range(loops):
            for k, v in ALL_DATA:
                # install a copy that's not quite equal.
                # This should require saving it.
                state = v[0]
                state = state + b'1'
                new_v = (state, v[1])
                client[k] = new_v
        duration = perf_counter() - begin
        mem_used = get_memory_usage() - mem_before
        logger.info("Populated in %s; took %s mem; size: %d", duration,
                    byte_display(mem_used), len(client))
        return duration
Example 6
    def populate_equal(loops, bucket_kind):
        # Because we will populate when we make,
        # capture memory now to be able to include that.
        mem_before = get_memory_usage()
        client = makeOne(bucket_kind)

        begin = perf_counter()
        for _ in range(loops):
            for k, v in ALL_DATA:
                # install a copy that's equal;
                # this should mean no extra copies.
                state = v[0]
                state = state[:-1] + state[-1:]
                new_v = (state, v[1])
                assert new_v == v
                assert new_v is not v
                client[k] = new_v
        duration = perf_counter() - begin
        mem_used = get_memory_usage() - mem_before
        logger.info("Populated in %s; took %s mem; size: %d", duration,
                    byte_display(mem_used), len(client))
        return duration
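The two benchmarks above differ only in how the replacement value is built. A tiny standalone sketch of both constructions, using a hypothetical (state, tid) pair:

    v = (b'some pickled state', 12345)

    # populate_not_equal: a copy that is not equal, which forces a store.
    not_equal = (v[0] + b'1', v[1])
    assert not_equal != v

    # populate_equal: a copy that is equal but not identical, so an
    # equality check can avoid keeping an extra copy.
    equal = (v[0][:-1] + v[0][-1:], v[1])
    assert equal == v
    assert equal is not v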
Example 7
    def _bulk_update(self,
                     keys_and_values,
                     source='<unknown>',
                     log_count=None,
                     mem_usage_before=None):
        """
        Insert all the ``(oid, (state, actual_tid, frozen, frequency))``
        pairs found in *keys_and_values*, rejecting entries once we get too full.

        *keys_and_values* should be ordered from least to most recently used.

        This can only be done in an empty cache.
        """
        now = time.time()
        mem_usage_before = (mem_usage_before if mem_usage_before is not None
                            else get_memory_usage())
        mem_usage_before_this = get_memory_usage()
        log_count = log_count or len(keys_and_values)

        data = self._cache
        if data:
            raise ValueError("Only bulk load into an empty cache.")

        stored = data.add_MRUs(keys_and_values, return_count_only=True)

        then = time.time()
        del keys_and_values  # For memory reporting.
        mem_usage_after = get_memory_usage()
        logger.info(
            "Examined %d and stored %d items from %s in %s using %s. "
            "(%s local) (%s)", log_count, stored,
            getattr(source, 'name', source), then - now,
            byte_display(mem_usage_after - mem_usage_before),
            byte_display(mem_usage_after - mem_usage_before_this), self)
        return log_count, stored
Example 8
    def makeOne(populate=True, data=None):
        mem_before = get_memory_usage()
        gc.collect()
        gc.collect()
        objects_before = len(gc.get_objects())

        client = LocalClient(options, 'pfx')
        client.b_mem_before = mem_before
        client.b_objects_before = objects_before
        if populate:
            client._bulk_update([
                (t[0], (t[1][0], t[1][1], False, 1))
                for t in data or _make_data(random_data, KEY_GROUP_SIZE)
            ])
        return client
Example 9
 def report(client, duration, extra=''):
     gc.collect()
     gc.collect()
     mem_before = client.b_mem_before
     objects_before = client.b_objects_before
     mem_after = get_memory_usage()
     mem_used = mem_after - mem_before
     objects_after = len(gc.get_objects())
     logger.info(
         "%s:%sExecuted in %s; Mem delta: %s;  Object Delta: %s; "
         "Num keys: %d; Weight: %s; "
         "Mem before: %s; Mem after: %s",
         os.getpid(), extra,
         duration, byte_display(mem_used), objects_after - objects_before,
         len(client), byte_display(client.size),
         byte_display(mem_before), byte_display(mem_after),
     )
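Together with makeOne above, the measurement pattern is: collect twice, snapshot the object count and memory, run the workload, collect again, and report the deltas. A compressed standalone sketch of that pattern (the get_memory_usage here is a rough stand-in for the helper used in these examples):

    import gc
    import resource
    from time import perf_counter

    def get_memory_usage():
        # Rough stand-in: ru_maxrss is KB on Linux, bytes on macOS.
        return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

    def measure(work):
        gc.collect()
        gc.collect()
        objects_before = len(gc.get_objects())
        mem_before = get_memory_usage()

        begin = perf_counter()
        result = work()
        duration = perf_counter() - begin

        gc.collect()
        gc.collect()
        object_delta = len(gc.get_objects()) - objects_before
        mem_delta = get_memory_usage() - mem_before
        return result, duration, mem_delta, object_delta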
Example 10
    def read_from_sqlite(self, connection):
        import gc
        gc.collect()  # Free memory, we're about to make big allocations.
        mem_before = get_memory_usage()

        db = Database.from_connection(connection)
        checkpoints = db.checkpoints

        @_log_timed
        def fetch_and_filter_rows():
            # Do this here so that we don't have the result
            # in a local variable and it can be collected
            # before we measure the memory delta.
            #
            # In large benchmarks, this function accounts for 57%
            # of the total time to load data. 26% of the total is
            # fetching rows from sqlite, and 18% of the total is allocating
            # storage for the blob state.
            #
            # We make one call into sqlite and let it handle the iterating.
            # Rows are (oid, frozen, state, actual_tid, frequency).
            # The frozen flag is set when the row was previously frozen;
            # that doesn't matter to us, we always freeze all rows.
            size = 0
            limit = self.limit
            items = []
            rows = db.fetch_rows_by_priority()
            for oid, frozen, state, actual_tid, frequency in rows:
                size += len(state)
                if size > limit:
                    break
                items.append((oid, (state, actual_tid, frozen, frequency)))
            consume(rows)
            # Rows came to us MRU to LRU, but we need to feed them the other way.
            items.reverse()
            return items

        # In the large benchmark, this is 25% of the total time.
        # 18% of the total time is preallocating the entry nodes.
        self._bulk_update(fetch_and_filter_rows(),
                          source=connection,
                          mem_usage_before=mem_before)
        return checkpoints
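A self-contained sketch of the selection pattern inside fetch_and_filter_rows, with fake rows and a made-up byte limit: accumulate rows in priority (MRU-first) order until the state bytes exceed the limit, then reverse so the entries are fed LRU-first.

    limit = 20
    # Fake rows: (oid, frozen, state, actual_tid, frequency), MRU first.
    rows = [
        (1, False, b'x' * 8, 100, 5),
        (2, False, b'x' * 8, 101, 3),
        (3, False, b'x' * 8, 102, 1),  # this one pushes us past the limit
    ]

    size = 0
    items = []
    for oid, frozen, state, actual_tid, frequency in rows:
        size += len(state)
        if size > limit:
            break
        items.append((oid, (state, actual_tid, frozen, frequency)))

    # Rows came MRU -> LRU, but they need to be fed the other way.
    items.reverse()
    assert [oid for oid, _ in items] == [2, 1]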
Example 11
    def write_to_sqlite(self, connection):
        # pylint:disable=too-many-locals
        mem_before = get_memory_usage()

        cur = connection.cursor()

        db = Database.from_connection(connection)
        begin = time.time()

        # In a large benchmark, store_temp() accounts for 32%
        # of the total time, while move_from_temp accounts for 49%.

        # The batch size depends on how many parameters a single SQL
        # statement can have; if we go too big we get
        # OperationalError: too many SQL variables. The default limit
        # is 999.
        # Note that the multiple-value INSERT syntax was added in
        # SQLite 3.7.11 (2012-03-20).
        with _timer() as batch_timer:
            cur.execute('BEGIN')
            stored_oid_tid = db.oid_to_tid
            fetch_current = time.time()
            count_written, _ = db.store_temp(
                self._items_to_write(stored_oid_tid))
            cur.execute("COMMIT")

        # Take out (reserved write) locks now; if we don't, we can get
        # 'OperationalError: database is locked' (the SQLITE_BUSY
        # error code; the SQLITE_LOCKED error code is about table
        # locks and has the string 'database table is locked').
        # Beginning immediate lets us stand in line.
        #
        # Note that the SQLITE_BUSY error on COMMIT happens "if
        # another thread or process has a shared lock on the database
        # that prevented the database from being updated." But "When
        # COMMIT fails in this way, the transaction remains active and
        # the COMMIT can be retried later after the reader has had a
        # chance to clear." So we could retry the commit a couple of
        # times and give up if we can't get there. It looks like one
        # production database takes about 2.5s to execute this entire function;
        # it seems like most instances that are shutting down get to this place
        # at roughly the same time and stack up waiting:
        #
        # Instance | Save Time | write_to_sqlite time
        #     1    |   2.36s   |   2.35s
        #     2    |   5.31s   |   5.30s
        #     3    |   6.51s   |   6.30s
        #     4    |   7.91s   |   7.90s
        #
        # That either suggests that all waiting happens just to acquire this lock and
        # commit this transaction (and that subsequent operations are inconsequential
        # in terms of time) OR that the exclusive lock isn't truly getting dropped,
        # OR that some function in subsequent processes is taking longer and longer.
        # And indeed, our logging showed that a gc.collect() at the end of this function was
        # taking more and more time:
        #
        # Instance | GC Time  | Reported memory usage after GC
        #    1     |   2.00s  |     34.4MB
        #    2     |   3.50s  |     83.2MB
        #    3     |   4.94s  |    125.9MB
        #    4     |   6.44s  |    202.1MB
        #
        # The exclusive lock time did vary, but not as much; trim time
        # didn't really vary:
        #
        # Instance |  Exclusive Write Time | Batch Insert Time | Fetch Current
        #    1     |   0.09s               |   0.12s           |   0.008s
        #    2     |   1.19s               |   0.44s           |   0.008s
        #    3     |   0.91s               |   0.43s           |   0.026s
        #    4     |   0.92s               |   0.34s           |   0.026s
        with _timer() as exclusive_timer:
            cur.execute('BEGIN IMMEDIATE')
            # Between the time we committed the batch above and the time
            # we acquired this lock, it's possible that someone else
            # committed and changed the data in object_state.
            # Things might have changed in the database since the
            # snapshot in which we stored our temp objects, so we can't
            # rely on that still being the truth.
            rows_inserted = db.move_from_temp()
            if self.checkpoints:
                db.update_checkpoints(*self.checkpoints)

            cur.execute('COMMIT')
        # TODO: Maybe use BTrees.family.intersection to get the common keys?
        # Or maybe use a SQLite Python function?
        # 'DELETE from object_state where is_stale(zoid, tid)'
        # What's fastest? The custom function version would require scanning the entire
        # table; unfortunately this sqlite interface doesn't allow defining virtual tables.
        with _timer() as trim_timer:
            # Delete anything we still know is stale, because we
            # saw a new TID for it, but we had nothing to replace it with.
            min_allowed_writeback = OID_TID_MAP_TYPE()
            for k, v in self._min_allowed_writeback.items():
                if stored_oid_tid.get(k, MAX_TID) < v:
                    min_allowed_writeback[k] = v
            db.trim_to_size(self.limit, min_allowed_writeback)
            del min_allowed_writeback
        # Typically we write when we're closing and don't expect to do
        # anything else, so there's no need to keep tracking this info.
        self._min_allowed_writeback = OID_TID_MAP_TYPE()

        # Cleanups
        cur.close()
        del cur
        db.close()  # closes the connection.
        del db
        del stored_oid_tid

        # We're probably shutting down, so don't perform a GC; we've
        # seen that it can get quite lengthy.

        end = time.time()
        stats = self.stats()
        mem_after = get_memory_usage()
        logger.info(
            "Wrote %d items to %s in %s "
            "(%s to fetch current, %s to insert batch (%d rows), %s to write, %s to trim). "
            "Used %s memory. (Storage: %s) "
            "Total hits %s; misses %s; ratio %s (stores: %s)", count_written,
            connection, end - begin, fetch_current - begin,
            batch_timer.duration, rows_inserted,
            exclusive_timer.duration, trim_timer.duration,
            byte_display(mem_after - mem_before), self._bucket0, stats['hits'],
            stats['misses'], stats['ratio'], stats['sets'])

        return count_written
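The comment above notes that a COMMIT failing with SQLITE_BUSY can simply be retried. A hedged sketch of such a retry loop using the standard sqlite3 module (the retry count and delay are made up; the code above does not currently do this):

    import sqlite3
    import time

    def commit_with_retry(connection, attempts=3, delay=0.1):
        # "When COMMIT fails in this way, the transaction remains active
        # and the COMMIT can be retried later after the reader has had a
        # chance to clear."
        for attempt in range(attempts):
            try:
                connection.execute('COMMIT')
                return
            except sqlite3.OperationalError as e:
                if 'database is locked' not in str(e) or attempt == attempts - 1:
                    raise
                time.sleep(delay)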
Example 12
    def read_from_sqlite(self, connection, row_filter=None):
        import gc
        gc.collect()  # Free memory, we're about to make big allocations.
        mem_before = get_memory_usage()

        db = Database.from_connection(connection)
        checkpoints = db.checkpoints
        if checkpoints:
            self.store_checkpoints(*checkpoints)

        self._min_allowed_writeback = db.oid_to_tid

        # XXX: The cache_ring is going to consume all the items we
        # give it, even if it doesn't actually add them to the cache.
        # It also creates a very large preallocated array to hold the
        # `total_count` of items. We don't want to read more
        # rows than necessary, to keep the delta maps small and to
        # avoid the overhead; we could pass the COUNT(zoid) to
        # `bulk_update`, and have our row iterable be a generator that
        # reads a row and breaks when it reaches the limit, but then
        # we have that whole preallocated array hanging around, which
        # is probably much bigger than we need. So we need to give an
        # accurate count, and the easiest way to do that is to materialize
        # the list of rows.
        #
        # In practice, we can assume that the limit hasn't changed between this
        # run and the last run, so the database will already have been trimmed
        # to fit the desired size, so essentially all the rows in the database
        # should go in our cache.
        @_log_timed
        def fetch_and_filter_rows():
            # Do this here so that we don't have the result
            # in a local variable and it can be collected
            # before we measure the memory delta.
            #
            # In large benchmarks, this function accounts for 57%
            # of the total time to load data. 26% of the total is
            # fetching rows from sqlite, and 18% of the total is allocating
            # storage for the blob state.
            #
            # We make one call into sqlite and let it handle the iterating.
            # Items are (oid, key_tid, state, actual_tid). Currently,
            # key_tid == actual_tid
            items = db.list_rows_by_priority()
            # row_filter produces the sequence ((oid, key_tid), (state, actual_tid))
            if row_filter is not None:
                row_iter = row_filter(checkpoints, items)
                items = list(row_iter)
            else:
                items = [(r[:2], r[2:]) for r in items]

            # Look at them from most to least recent,
            # but insert them the other way.
            items.reverse()
            return items

        f = fetch_and_filter_rows.__wrapped__
        f.__name__ = f.__name__ + ':' + (str(row_filter) if row_filter
                                         is not None else '<no filter>')

        # In the large benchmark, this is 25% of the total time.
        # 18% of the total time is preallocating the entry nodes.
        self.__bucket.bulk_update(fetch_and_filter_rows(),
                                  source=connection,
                                  log_count=len(self._min_allowed_writeback),
                                  mem_usage_before=mem_before)
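The fetch_and_filter_rows.__wrapped__ rename above only works because the timing decorator exposes the underlying function. A minimal sketch of what such a decorator could look like (an assumption; the definition of _log_timed is not shown in these examples):

    import functools
    import logging
    import time

    logger = logging.getLogger(__name__)

    def _log_timed(func):
        # functools.wraps sets __wrapped__ on the wrapper, which is what
        # lets callers rename the underlying function for log output.
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            begin = time.time()
            try:
                return func(*args, **kwargs)
            finally:
                logger.debug("%s took %.3fs", func.__name__, time.time() - begin)
        return wrapper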