Example #1
    def _update_batch(cls, tuples, trx=True):
        steemd = SteemClient.instance()
        timer = Timer(total=len(tuples), entity='post', laps=['rps', 'wps'])
        tuples = sorted(tuples, key=lambda x: x[1]) # enforce ASC id's

        for tups in partition_all(1000, tuples):
            timer.batch_start()
            buffer = []

            post_args = [tup[0].split('/') for tup in tups]
            posts = steemd.get_content_batch(post_args)
            post_ids = [tup[1] for tup in tups]
            post_levels = [tup[2] for tup in tups]
            for pid, post, level in zip(post_ids, posts, post_levels):
                if post['author']:
                    buffer.append(cls._sql(pid, post, level=level))
                else:
                    print("WARNING: ignoring deleted post {}".format(pid))
                cls._bump_last_id(pid)

            timer.batch_lap()
            cls._batch_queries(buffer, trx)

            timer.batch_finish(len(posts))
            if len(tuples) >= 1000:
                print(timer.batch_status())
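The batching in this and the later `_update_batch` variants leans on `partition_all` (presumably toolz's helper of that name), and each work item is a `(url, id, level)` tuple with `url` in `author/permlink` form, as the docstrings further down spell out. A small self-contained sketch of just that chunking step, using made-up sample data and placeholder level labels:

from toolz import partition_all

# Hypothetical work list in the `[(url, id, level)*]` shape described below;
# the urls, ids and level labels are illustrative only.
tuples = [
    ('alice/intro-post', 101, 'insert'),
    ('carol/another-post', 102, 'update'),
    ('bob/re-intro-post', 103, 'insert'),
]
tuples = sorted(tuples, key=lambda x: x[1])  # enforce ASC id's, as above

for tups in partition_all(2, tuples):  # the real code batches 1000 at a time
    post_args = [tup[0].split('/') for tup in tups]  # [['alice', 'intro-post'], ...]
    post_ids = [tup[1] for tup in tups]
    print(post_ids, post_args)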
Example #2
    def _cache_accounts(cls, accounts, trx=True):
        timer = Timer(len(accounts), 'account', ['rps', 'wps'])
        for batch in partition_all(1000, accounts):

            timer.batch_start()
            sqls = cls._generate_cache_sqls(batch)
            timer.batch_lap()
            cls._batch_update(sqls, trx)

            timer.batch_finish(len(batch))
            if trx or len(accounts) > 1000:
                print(timer.batch_status())
Example #3
    def _cache_accounts(cls, accounts, trx=True):
        """Fetch all `accounts` and write to db."""
        timer = Timer(len(accounts), 'account', ['rps', 'wps'])
        for batch in partition_all(1000, accounts):

            timer.batch_start()
            sqls = cls._generate_cache_sqls(batch)
            timer.batch_lap()
            DB.batch_queries(sqls, trx)

            timer.batch_finish(len(batch))
            if trx or len(accounts) > 1000:
                log.info(timer.batch_status())
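Each example follows the same Timer rhythm: batch_start() before the fetch, batch_lap() at the read/write boundary, batch_finish(n) after the write, and batch_status() for a progress line. The stand-in below is a minimal sketch assuming only the methods and arguments visible in these snippets, not the project's actual Timer:

from time import perf_counter

class Timer:
    """Hypothetical minimal stand-in for the Timer used in the examples."""

    def __init__(self, total, entity='', laps=None, full_total=None):
        self.total = total        # expected number of items (posts/accounts/blocks)
        self.entity = entity
        self.laps = laps or []    # lap labels, e.g. ['rps', 'wps']; unused in this sketch
        self.full_total = full_total
        self.processed = 0
        self._marks = []
        self._last_batch = 0

    def batch_start(self):
        self._marks = [perf_counter()]

    def batch_lap(self):
        # boundary between the "read" (fetch) and "write" (flush) phases
        self._marks.append(perf_counter())

    def batch_finish(self, count):
        self._marks.append(perf_counter())
        self._last_batch = count
        self.processed += count

    def batch_status(self, prefix=''):
        elapsed = self._marks[-1] - self._marks[0]
        rate = self._last_batch / elapsed if elapsed else 0.0
        return "%s %d/%d %ss done (%.0f %ss/s)" % (
            prefix, self.processed, self.total, self.entity, rate, self.entity)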
Example #4
def sync_from_steemd():
    is_initial_sync = DbState.is_initial_sync()
    steemd = get_adapter()

    lbound = Blocks.head_num() + 1
    ubound = steemd.last_irreversible()
    if ubound <= lbound:
        return

    _abort = False
    try:
        print("[SYNC] start block %d, +%d to sync" % (lbound, ubound-lbound+1))
        timer = Timer(ubound - lbound, entity='block', laps=['rps', 'wps'])
        while lbound < ubound:
            to = min(lbound + 1000, ubound)
            timer.batch_start()
            blocks = steemd.get_blocks_range(lbound, to)
            timer.batch_lap()
            Blocks.process_multi(blocks, is_initial_sync)
            timer.batch_finish(len(blocks))
            date = blocks[-1]['timestamp']
            print(timer.batch_status("[SYNC] Got block %d @ %s" % (to-1, date)))
            lbound = to

    except KeyboardInterrupt:
        traceback.print_exc()
        print("\n\n[SYNC] Aborted.. cleaning up..")
        _abort = True

    if not is_initial_sync:
        # Follows flushing may need to be moved closer to core (i.e. moved
        # into main block transactions). Important to keep in sync since
        # we need to prevent expensive recounts. This will fail if we aborted
        # in the middle of a transaction, meaning data loss. Better than
        # forcing it, however, since in-memory cache will be out of sync
        # with db state.
        Follow.flush(trx=True)

        # This flush is low importance; accounts are swept regularly.
        if not _abort:
            Accounts.flush(trx=True)

        # If this flush fails, all that could potentially be lost here is
        # edits and pre-payout votes. If the post has not been paid out yet,
        # then the worst case is it will be synced upon payout. If the post
        # is already paid out, worst case is to lose an edit.
        CachedPost.flush(trx=True)

    if _abort:
        print("[SYNC] Aborted")
        exit()
Example #5
    def from_steemd(cls, is_initial_sync=False, chunk_size=1000):
        """Fast sync strategy: read/process blocks in batches."""
        steemd = SteemClient.instance()
        lbound = Blocks.head_num() + 1
        ubound = steemd.last_irreversible()
        count = ubound - lbound
        if count < 1:
            return

        _abort = False
        try:
            print("[SYNC] start block %d, +%d to sync" % (lbound, count))
            timer = Timer(count, entity='block', laps=['rps', 'wps'])
            while lbound < ubound:
                timer.batch_start()

                # fetch blocks
                to = min(lbound + chunk_size, ubound)
                blocks = steemd.get_blocks_range(lbound, to)
                lbound = to
                timer.batch_lap()

                # process blocks
                Blocks.process_multi(blocks, is_initial_sync)
                timer.batch_finish(len(blocks))
                date = blocks[-1]['timestamp']
                print(
                    timer.batch_status("[SYNC] Got block %d @ %s" %
                                       (to - 1, date)))

        except KeyboardInterrupt:
            traceback.print_exc()
            print("\n\n[SYNC] Aborted.. cleaning up..")
            _abort = True

        if not is_initial_sync:
            # This flush is low importance; accounts are swept regularly.
            if not _abort:
                Accounts.flush(trx=True)

            # If this flush fails, all that could potentially be lost here is
            # edits and pre-payout votes. If the post has not been paid out yet,
            # then the worst case is it will be synced upon payout. If the post
            # is already paid out, worst case is to lose an edit.
            CachedPost.flush(trx=True)

        if _abort:
            print("[SYNC] Aborted")
            exit()
Example #6
    def _update_batch(cls, tuples, trx=True, full_total=None):
        """Fetch, process, and write a batch of posts.

        Given a set of posts, fetch from steemd and write them to the
        db. The `tuples` arg is in the form of `[(url, id, level)*]`
        representing posts which are to be fetched from steemd and
        updated in cache.

        Regarding _bump_last_id: there's a rare edge case when the last
        hive_post entry has been deleted "in the future" (ie, we haven't
        seen the delete op yet). So even when the post is not found
        (i.e. `not post['author']`), it's important to advance _last_id,
        because this cursor is used to deduce any missing cache entries.
        """

        steemd = SteemClient.instance()
        timer = Timer(total=len(tuples),
                      entity='post',
                      laps=['rps', 'wps'],
                      full_total=full_total)
        tuples = sorted(tuples, key=lambda x: x[1])  # enforce ASC id's

        for tups in partition_all(1000, tuples):
            timer.batch_start()
            buffer = []

            post_args = [tup[0].split('/') for tup in tups]
            posts = steemd.get_content_batch(post_args)
            post_ids = [tup[1] for tup in tups]
            post_levels = [tup[2] for tup in tups]
            for pid, post, level in zip(post_ids, posts, post_levels):
                if post['author']:
                    buffer.extend(cls._sql(pid, post, level=level))
                else:
                    # When a post has been deleted (or otherwise DNE),
                    # steemd simply returns a blank post object w/ all
                    # fields blank. While it's best to not try to cache
                    # already-deleted posts, it can happen during missed
                    # post sweep and while using `trail_blocks` > 0.
                    pass
                cls._bump_last_id(pid)

            timer.batch_lap()
            DB.batch_queries(buffer, trx)

            timer.batch_finish(len(posts))
            if len(tuples) >= 1000:
                log.info(timer.batch_status())
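The docstring above insists that `_bump_last_id` advance even when a post comes back blank, but the helper itself never appears in these snippets. A hypothetical sketch of the cursor it describes, grounded only in that docstring (the class name here is made up):

class CachedPostSketch:
    """Illustrative stand-in; not the project's real cache class."""

    _last_id = 0  # highest post id processed so far, cached or skipped

    @classmethod
    def _bump_last_id(cls, pid):
        # Advance even for blank/deleted posts so that missing-cache-entry
        # detection (ids above _last_id) stays correct.
        if pid > cls._last_id:
            cls._last_id = pid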
Example #7
    def _cache_accounts(cls, accounts, steem, trx=True):
        """Fetch all `accounts` and write to db."""
        timer = Timer(len(accounts), 'account', ['rps', 'wps'])
        for name_batch in partition_all(1000, accounts):
            cached_at = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

            timer.batch_start()
            batch = steem.get_accounts(name_batch)

            timer.batch_lap()
            sqls = [cls._sql(acct, cached_at) for acct in batch]
            DB.batch_queries(sqls, trx)

            timer.batch_finish(len(batch))
            if trx or len(accounts) > 1000:
                log.info(timer.batch_status())
Example #8
    def _update_batch(cls, tuples, trx=True, full_total=None):
        """Fetch, process, and write a batch of posts.

        Given a set of posts, fetch from steemd and write them to the
        db. The `tuples` arg is in the form of `[(url, id, level)*]`
        representing posts which are to be fetched from steemd and
        updated in cache.

        Regarding _bump_last_id: there's a rare edge case when the last
        hive_post entry has been deleted "in the future" (ie, we haven't
        seen the delete op yet). So even when the post is not found
        (i.e. `not post['author']`), it's important to advance _last_id,
        because this cursor is used to deduce any missing cache entries.
        """

        steemd = SteemClient.instance()
        timer = Timer(total=len(tuples), entity='post',
                      laps=['rps', 'wps'], full_total=full_total)
        tuples = sorted(tuples, key=lambda x: x[1]) # enforce ASC id's

        for tups in partition_all(1000, tuples):
            timer.batch_start()
            buffer = []

            post_args = [tup[0].split('/') for tup in tups]
            posts = steemd.get_content_batch(post_args)
            post_ids = [tup[1] for tup in tups]
            post_levels = [tup[2] for tup in tups]
            for pid, post, level in zip(post_ids, posts, post_levels):
                if post['author']:
                    buffer.append(cls._sql(pid, post, level=level))
                else:
                    # expected to happen when sweeping missed posts as
                    # part of initial sync or crash recovery routine,
                    # otherwise indicates potential bug. TODO: assert?
                    if not cls._sweeping_missed:
                        print("WARNING: missing/deleted post %d" % pid)
                cls._bump_last_id(pid)

            timer.batch_lap()
            cls._batch_queries(buffer, trx)

            timer.batch_finish(len(posts))
            if len(tuples) >= 1000:
                print(timer.batch_status())
Example #9
    def from_dpayd(self, is_initial_sync=False, chunk_size=1000):
        """Fast sync strategy: read/process blocks in batches."""
        # pylint: disable=no-self-use
        dpayd = self._dpay
        lbound = Blocks.head_num() + 1
        ubound = self._conf.get('test_max_block') or dpayd.last_irreversible()

        count = ubound - lbound
        if count < 1:
            return

        log.info("[SYNC] start block %d, +%d to sync", lbound, count)
        timer = Timer(count, entity='block', laps=['rps', 'wps'])
        while lbound < ubound:
            timer.batch_start()

            # fetch blocks
            to = min(lbound + chunk_size, ubound)
            blocks = dpayd.get_blocks_range(lbound, to)
            lbound = to
            timer.batch_lap()

            # process blocks
            Blocks.process_multi(blocks, is_initial_sync)
            timer.batch_finish(len(blocks))

            _prefix = ("[SYNC] Got block %d @ %s" %
                       (to - 1, blocks[-1]['timestamp']))
            log.info(timer.batch_status(_prefix))

        if not is_initial_sync:
            # This flush is low importance; accounts are swept regularly.
            Accounts.flush(dpayd, trx=True)

            # If this flush fails, all that could potentially be lost here is
            # edits and pre-payout votes. If the post has not been paid out yet,
            # then the worst case is it will be synced upon payout. If the post
            # is already paid out, worst case is to lose an edit.
            CachedPost.flush(dpayd, trx=True)
Example #10
def _block_consumer(blocks_data_provider, is_initial_sync, lbound, ubound):
    from hive.utils.stats import minmax
    is_debug = log.isEnabledFor(10)
    num = 0
    time_start = OPSM.start()
    rate = {}
    LIMIT_FOR_PROCESSED_BLOCKS = 1000

    rate = minmax(rate, 0, 1.0, 0)
    sync_type_prefix = "[INITIAL SYNC]" if is_initial_sync else "[FAST SYNC]"

    def print_summary():
        stop = OPSM.stop(time_start)
        log.info("=== TOTAL STATS ===")
        wtm = WSM.log_global("Total waiting times")
        ftm = FSM.log_global("Total flush times")
        otm = OPSM.log_global("All operations present in the processed blocks")
        ttm = ftm + otm + wtm
        log.info(
            f"Elapsed time: {stop :.4f}s. Calculated elapsed time: {ttm :.4f}s. Difference: {stop - ttm :.4f}s"
        )
        if rate:
            log.info(
                f"Highest block processing rate: {rate['max'] :.4f} bps. From: {rate['max_from']} To: {rate['max_to']}"
            )
            log.info(
                f"Lowest block processing rate: {rate['min'] :.4f} bps. From: {rate['min_from']} To: {rate['min_to']}"
            )
        log.info("=== TOTAL STATS ===")

    try:
        Blocks.set_end_of_sync_lib(ubound)
        count = ubound - lbound
        timer = Timer(count, entity='block', laps=['rps', 'wps'])

        while lbound < ubound:
            number_of_blocks_to_proceed = min(
                [LIMIT_FOR_PROCESSED_BLOCKS, ubound - lbound])
            time_before_waiting_for_data = perf()

            blocks = blocks_data_provider.get(number_of_blocks_to_proceed)

            if not can_continue_thread():
                break

            assert len(blocks) == number_of_blocks_to_proceed

            to = min(lbound + number_of_blocks_to_proceed, ubound)
            timer.batch_start()

            block_start = perf()
            Blocks.process_multi(blocks, is_initial_sync)
            block_end = perf()

            timer.batch_lap()
            timer.batch_finish(len(blocks))
            time_current = perf()

            prefix = ("%s Got block %d @ %s" %
                      (sync_type_prefix, to - 1, blocks[-1].get_date()))
            log.info(timer.batch_status(prefix))
            log.info("%s Time elapsed: %fs", sync_type_prefix,
                     time_current - time_start)
            log.info("%s Current system time: %s", sync_type_prefix,
                     datetime.now().strftime("%H:%M:%S"))
            log.info(log_memory_usage())
            rate = minmax(rate, len(blocks),
                          time_current - time_before_waiting_for_data, lbound)

            if block_end - block_start > 1.0 or is_debug:
                otm = OPSM.log_current(
                    "Operations present in the processed blocks")
                ftm = FSM.log_current("Flushing times")
                wtm = WSM.log_current("Waiting times")
                log.info(f"Calculated time: {otm+ftm+wtm :.4f} s.")

            OPSM.next_blocks()
            FSM.next_blocks()
            WSM.next_blocks()

            lbound = to
            PC.broadcast(
                BroadcastObject('sync_current_block', lbound, 'blocks'))

            num = num + 1

            if not can_continue_thread():
                break
    except Exception:
        log.exception("Exception caught during processing blocks...")
        set_exception_thrown()
        print_summary()
        raise

    print_summary()
    return num
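Example #10 tracks the fastest and slowest block-processing rates through `minmax` from hive.utils.stats, whose body is not shown here. The sketch below is inferred purely from the call sites above (`minmax(rate, 0, 1.0, 0)` to seed, then `minmax(rate, len(blocks), elapsed, lbound)` per batch) and from the max/min keys read in `print_summary`; the real implementation may differ:

def minmax(rate, blocks, elapsed, from_block):
    """Hypothetical reconstruction: keep the fastest/slowest rates seen so far."""
    if not blocks:
        return rate  # seed call; nothing measured yet
    bps = blocks / elapsed if elapsed else 0.0
    if 'max' not in rate or bps > rate['max']:
        rate.update(max=bps, max_from=from_block, max_to=from_block + blocks)
    if 'min' not in rate or bps < rate['min']:
        rate.update(min=bps, min_from=from_block, min_to=from_block + blocks)
    return rate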
Example #11
    def _update_batch(cls, steem, tuples, trx=True, full_total=None):
        """Fetch, process, and write a batch of posts.

        Given a set of posts, fetch from steemd and write them to the
        db. The `tuples` arg is in the form of `[(url, id, level)*]`
        representing posts which are to be fetched from steemd and
        updated in cache.

        Regarding _bump_last_id: there's a rare edge case when the last
        hive_post entry has been deleted "in the future" (ie, we haven't
        seen the delete op yet). So even when the post is not found
        (i.e. `not post['author']`), it's important to advance _last_id,
        because this cursor is used to deduce any missing cache entries.
        """
        # pylint: disable=too-many-locals

        timer = Timer(total=len(tuples), entity='post',
                      laps=['rps', 'wps'], full_total=full_total)
        tuples = sorted(tuples, key=lambda x: x[1]) # enforce ASC id's

        for tups in partition_all(1000, tuples):
            timer.batch_start()
            buffer = []

            post_args = [tup[0].split('/') for tup in tups]
            posts = steem.get_content_batch(post_args)
            post_ids = [tup[1] for tup in tups]
            post_levels = [tup[2] for tup in tups]

            coremap = cls._get_core_fields(tups)
            for pid, post, level in zip(post_ids, posts, post_levels):
                if post['author']:
                    assert pid in coremap, 'pid not in coremap'
                    if pid in coremap:
                        core = coremap[pid]
                        post['category'] = core['category']
                        post['community_id'] = core['community_id']
                        post['gray'] = core['is_muted']
                        post['hide'] = not core['is_valid']
                    buffer.extend(cls._sql(pid, post, level=level))
                else:
                    # When a post has been deleted (or otherwise DNE),
                    # steemd simply returns a blank post object w/ all
                    # fields blank. While it's best to not try to cache
                    # already-deleted posts, it can happen during missed
                    # post sweep and while using `trail_blocks` > 0.

                    # monitor: post not found which should def. exist; see #173
                    sql = """SELECT id, author, permlink, is_deleted
                               FROM hive_posts WHERE id = :id"""
                    row = DB.query_row(sql, id=pid)
                    if row['is_deleted']:
                        # rare or impossible -- report if detected
                        log.error("found deleted post for %s: %s", level, row)
                    else:
                        # most likely cause of this condition is that the post
                        # has been deleted (e.g. sync trails by 2 blocks, post
                        # was inserted at head-2, deleted at head). another
                        # possible cause is that a node behind a load balancer
                        # is behind; we detected a new post but querying a node
                        # that hasn't seen it yet.
                        log.warning("post not found -- DEFER %s %s", level, row)
                        cls._dirty(level, row['author'], row['permlink'], pid)

                cls._bump_last_id(pid)

            timer.batch_lap()
            DB.batch_queries(buffer, trx)

            timer.batch_finish(len(posts))
            if len(tuples) >= 1000:
                log.info(timer.batch_status())