def _update_batch(cls, tuples, trx=True):
    steemd = SteemClient.instance()
    timer = Timer(total=len(tuples), entity='post', laps=['rps', 'wps'])
    tuples = sorted(tuples, key=lambda x: x[1])  # enforce ASC id's

    for tups in partition_all(1000, tuples):
        timer.batch_start()
        buffer = []

        post_args = [tup[0].split('/') for tup in tups]
        posts = steemd.get_content_batch(post_args)
        post_ids = [tup[1] for tup in tups]
        post_levels = [tup[2] for tup in tups]
        for pid, post, level in zip(post_ids, posts, post_levels):
            if post['author']:
                buffer.append(cls._sql(pid, post, level=level))
            else:
                print("WARNING: ignoring deleted post {}".format(pid))
            cls._bump_last_id(pid)

        timer.batch_lap()
        cls._batch_queries(buffer, trx)

        timer.batch_finish(len(posts))
        if len(tuples) >= 1000:
            print(timer.batch_status())
def _cache_accounts(cls, accounts, trx=True):
    timer = Timer(len(accounts), 'account', ['rps', 'wps'])
    for batch in partition_all(1000, accounts):
        timer.batch_start()
        sqls = cls._generate_cache_sqls(batch)
        timer.batch_lap()
        cls._batch_update(sqls, trx)

        timer.batch_finish(len(batch))
        if trx or len(accounts) > 1000:
            print(timer.batch_status())
def _cache_accounts(cls, accounts, trx=True): """Fetch all `accounts` and write to db.""" timer = Timer(len(accounts), 'account', ['rps', 'wps']) for batch in partition_all(1000, accounts): timer.batch_start() sqls = cls._generate_cache_sqls(batch) timer.batch_lap() DB.batch_queries(sqls, trx) timer.batch_finish(len(batch)) if trx or len(accounts) > 1000: log.info(timer.batch_status())
def sync_from_steemd():
    is_initial_sync = DbState.is_initial_sync()
    steemd = get_adapter()

    lbound = Blocks.head_num() + 1
    ubound = steemd.last_irreversible()
    if ubound <= lbound:
        return

    _abort = False
    try:
        print("[SYNC] start block %d, +%d to sync" % (lbound, ubound - lbound + 1))
        timer = Timer(ubound - lbound, entity='block', laps=['rps', 'wps'])
        while lbound < ubound:
            to = min(lbound + 1000, ubound)
            timer.batch_start()
            blocks = steemd.get_blocks_range(lbound, to)
            timer.batch_lap()
            Blocks.process_multi(blocks, is_initial_sync)
            timer.batch_finish(len(blocks))
            date = blocks[-1]['timestamp']
            print(timer.batch_status("[SYNC] Got block %d @ %s" % (to - 1, date)))
            lbound = to

    except KeyboardInterrupt:
        traceback.print_exc()
        print("\n\n[SYNC] Aborted.. cleaning up..")
        _abort = True

    if not is_initial_sync:
        # Follows flushing may need to be moved closer to core (i.e. moved
        # into main block transactions). Important to keep in sync since
        # we need to prevent expensive recounts. This will fail if we aborted
        # in the middle of a transaction, meaning data loss. Better than
        # forcing it, however, since in-memory cache will be out of sync
        # with db state.
        Follow.flush(trx=True)

        # This flush is low importance; accounts are swept regularly.
        if not _abort:
            Accounts.flush(trx=True)

        # If this flush fails, all that could potentially be lost here is
        # edits and pre-payout votes. If the post has not been paid out yet,
        # then the worst case is it will be synced upon payout. If the post
        # is already paid out, worst case is to lose an edit.
        CachedPost.flush(trx=True)

    if _abort:
        print("[SYNC] Aborted")
        exit()
def from_steemd(cls, is_initial_sync=False, chunk_size=1000):
    """Fast sync strategy: read/process blocks in batches."""
    steemd = SteemClient.instance()
    lbound = Blocks.head_num() + 1
    ubound = steemd.last_irreversible()
    count = ubound - lbound
    if count < 1:
        return

    _abort = False
    try:
        print("[SYNC] start block %d, +%d to sync" % (lbound, count))
        timer = Timer(count, entity='block', laps=['rps', 'wps'])
        while lbound < ubound:
            timer.batch_start()

            # fetch blocks
            to = min(lbound + chunk_size, ubound)
            blocks = steemd.get_blocks_range(lbound, to)
            lbound = to
            timer.batch_lap()

            # process blocks
            Blocks.process_multi(blocks, is_initial_sync)
            timer.batch_finish(len(blocks))

            date = blocks[-1]['timestamp']
            print(timer.batch_status("[SYNC] Got block %d @ %s" % (to - 1, date)))

    except KeyboardInterrupt:
        traceback.print_exc()
        print("\n\n[SYNC] Aborted.. cleaning up..")
        _abort = True

    if not is_initial_sync:
        # This flush is low importance; accounts are swept regularly.
        if not _abort:
            Accounts.flush(trx=True)

        # If this flush fails, all that could potentially be lost here is
        # edits and pre-payout votes. If the post has not been paid out yet,
        # then the worst case is it will be synced upon payout. If the post
        # is already paid out, worst case is to lose an edit.
        CachedPost.flush(trx=True)

    if _abort:
        print("[SYNC] Aborted")
        exit()
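# The sync loops above walk [lbound, ubound) in fixed-size chunks via
# to = min(lbound + chunk_size, ubound) followed by get_blocks_range(lbound, to).
# Judging by count = ubound - lbound and the "Got block %d" % (to - 1) log line,
# the requested range appears to be half-open (blocks lbound .. to - 1); that
# reading is an assumption, and the sketch below only illustrates the window
# arithmetic, not the actual steemd call.
def _chunk_windows(lbound, ubound, chunk_size=1000):
    """Yield the (start, stop) half-open block windows the sync loop requests."""
    while lbound < ubound:
        to = min(lbound + chunk_size, ubound)
        yield lbound, to        # i.e. blocks lbound .. to - 1
        lbound = to

# e.g. list(_chunk_windows(1, 2500)) -> [(1, 1001), (1001, 2001), (2001, 2500)]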
def _update_batch(cls, tuples, trx=True, full_total=None): """Fetch, process, and write a batch of posts. Given a set of posts, fetch from steemd and write them to the db. The `tuples` arg is the form of `[(url, id, level)*]` representing posts which are to be fetched from steemd and updated in cache. Regarding _bump_last_id: there's a rare edge case when the last hive_post entry has been deleted "in the future" (ie, we haven't seen the delete op yet). So even when the post is not found (i.e. `not post['author']`), it's important to advance _last_id, because this cursor is used to deduce any missing cache entries. """ steemd = SteemClient.instance() timer = Timer(total=len(tuples), entity='post', laps=['rps', 'wps'], full_total=full_total) tuples = sorted(tuples, key=lambda x: x[1]) # enforce ASC id's for tups in partition_all(1000, tuples): timer.batch_start() buffer = [] post_args = [tup[0].split('/') for tup in tups] posts = steemd.get_content_batch(post_args) post_ids = [tup[1] for tup in tups] post_levels = [tup[2] for tup in tups] for pid, post, level in zip(post_ids, posts, post_levels): if post['author']: buffer.extend(cls._sql(pid, post, level=level)) else: # When a post has been deleted (or otherwise DNE), # steemd simply returns a blank post object w/ all # fields blank. While it's best to not try to cache # already-deleted posts, it can happen during missed # post sweep and while using `trail_blocks` > 0. pass cls._bump_last_id(pid) timer.batch_lap() DB.batch_queries(buffer, trx) timer.batch_finish(len(posts)) if len(tuples) >= 1000: log.info(timer.batch_status())
def _cache_accounts(cls, accounts, steem, trx=True):
    """Fetch all `accounts` and write to db."""
    timer = Timer(len(accounts), 'account', ['rps', 'wps'])
    for name_batch in partition_all(1000, accounts):
        cached_at = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

        timer.batch_start()
        batch = steem.get_accounts(name_batch)

        timer.batch_lap()
        sqls = [cls._sql(acct, cached_at) for acct in batch]
        DB.batch_queries(sqls, trx)

        timer.batch_finish(len(batch))
        if trx or len(accounts) > 1000:
            log.info(timer.batch_status())
def _update_batch(cls, tuples, trx=True, full_total=None): """Fetch, process, and write a batch of posts. Given a set of posts, fetch from steemd and write them to the db. The `tuples` arg is the form of `[(url, id, level)*]` representing posts which are to be fetched from steemd and updated in cache. Regarding _bump_last_id: there's a rare edge case when the last hive_post entry has been deleted "in the future" (ie, we haven't seen the delete op yet). So even when the post is not found (i.e. `not post['author']`), it's important to advance _last_id, because this cursor is used to deduce any missing cache entries. """ steemd = SteemClient.instance() timer = Timer(total=len(tuples), entity='post', laps=['rps', 'wps'], full_total=full_total) tuples = sorted(tuples, key=lambda x: x[1]) # enforce ASC id's for tups in partition_all(1000, tuples): timer.batch_start() buffer = [] post_args = [tup[0].split('/') for tup in tups] posts = steemd.get_content_batch(post_args) post_ids = [tup[1] for tup in tups] post_levels = [tup[2] for tup in tups] for pid, post, level in zip(post_ids, posts, post_levels): if post['author']: buffer.append(cls._sql(pid, post, level=level)) else: # expected to happen when sweeping missed posts as # part of initial sync or crash recovery routine, # otherwise indicates potential bug. TODO: assert? if not cls._sweeping_missed: print("WARNING: missing/deleted post %d" % pid) cls._bump_last_id(pid) timer.batch_lap() cls._batch_queries(buffer, trx) timer.batch_finish(len(posts)) if len(tuples) >= 1000: print(timer.batch_status())
def from_dpayd(self, is_initial_sync=False, chunk_size=1000):
    """Fast sync strategy: read/process blocks in batches."""
    # pylint: disable=no-self-use
    dpayd = self._dpay
    lbound = Blocks.head_num() + 1
    ubound = self._conf.get('test_max_block') or dpayd.last_irreversible()
    count = ubound - lbound
    if count < 1:
        return

    log.info("[SYNC] start block %d, +%d to sync", lbound, count)
    timer = Timer(count, entity='block', laps=['rps', 'wps'])
    while lbound < ubound:
        timer.batch_start()

        # fetch blocks
        to = min(lbound + chunk_size, ubound)
        blocks = dpayd.get_blocks_range(lbound, to)
        lbound = to
        timer.batch_lap()

        # process blocks
        Blocks.process_multi(blocks, is_initial_sync)
        timer.batch_finish(len(blocks))

        _prefix = ("[SYNC] Got block %d @ %s" % (to - 1, blocks[-1]['timestamp']))
        log.info(timer.batch_status(_prefix))

    if not is_initial_sync:
        # This flush is low importance; accounts are swept regularly.
        Accounts.flush(dpayd, trx=True)

        # If this flush fails, all that could potentially be lost here is
        # edits and pre-payout votes. If the post has not been paid out yet,
        # then the worst case is it will be synced upon payout. If the post
        # is already paid out, worst case is to lose an edit.
        CachedPost.flush(dpayd, trx=True)
def _block_consumer(blocks_data_provider, is_initial_sync, lbound, ubound):
    from hive.utils.stats import minmax
    is_debug = log.isEnabledFor(10)
    num = 0
    time_start = OPSM.start()
    rate = {}
    LIMIT_FOR_PROCESSED_BLOCKS = 1000

    rate = minmax(rate, 0, 1.0, 0)
    sync_type_prefix = "[INITIAL SYNC]" if is_initial_sync else "[FAST SYNC]"

    def print_summary():
        stop = OPSM.stop(time_start)
        log.info("=== TOTAL STATS ===")
        wtm = WSM.log_global("Total waiting times")
        ftm = FSM.log_global("Total flush times")
        otm = OPSM.log_global("All operations present in the processed blocks")
        ttm = ftm + otm + wtm
        log.info(f"Elapsed time: {stop :.4f}s. Calculated elapsed time: {ttm :.4f}s. Difference: {stop - ttm :.4f}s")
        if rate:
            log.info(f"Highest block processing rate: {rate['max'] :.4f} bps. From: {rate['max_from']} To: {rate['max_to']}")
            log.info(f"Lowest block processing rate: {rate['min'] :.4f} bps. From: {rate['min_from']} To: {rate['min_to']}")
        log.info("=== TOTAL STATS ===")

    try:
        Blocks.set_end_of_sync_lib(ubound)
        count = ubound - lbound
        timer = Timer(count, entity='block', laps=['rps', 'wps'])

        while lbound < ubound:
            number_of_blocks_to_proceed = min([LIMIT_FOR_PROCESSED_BLOCKS, ubound - lbound])
            time_before_waiting_for_data = perf()

            blocks = blocks_data_provider.get(number_of_blocks_to_proceed)

            if not can_continue_thread():
                break

            assert len(blocks) == number_of_blocks_to_proceed

            to = min(lbound + number_of_blocks_to_proceed, ubound)
            timer.batch_start()

            block_start = perf()
            Blocks.process_multi(blocks, is_initial_sync)
            block_end = perf()

            timer.batch_lap()
            timer.batch_finish(len(blocks))
            time_current = perf()

            prefix = ("%s Got block %d @ %s" % (sync_type_prefix, to - 1, blocks[-1].get_date()))
            log.info(timer.batch_status(prefix))
            log.info("%s Time elapsed: %fs", sync_type_prefix, time_current - time_start)
            log.info("%s Current system time: %s", sync_type_prefix, datetime.now().strftime("%H:%M:%S"))
            log.info(log_memory_usage())

            rate = minmax(rate, len(blocks), time_current - time_before_waiting_for_data, lbound)

            if block_end - block_start > 1.0 or is_debug:
                otm = OPSM.log_current("Operations present in the processed blocks")
                ftm = FSM.log_current("Flushing times")
                wtm = WSM.log_current("Waiting times")
                log.info(f"Calculated time: {otm+ftm+wtm :.4f} s.")

            OPSM.next_blocks()
            FSM.next_blocks()
            WSM.next_blocks()

            lbound = to
            PC.broadcast(BroadcastObject('sync_current_block', lbound, 'blocks'))

            num = num + 1

            if not can_continue_thread():
                break
    except Exception:
        log.exception("Exception caught during processing blocks...")
        set_exception_thrown()
        print_summary()
        raise

    print_summary()
    return num
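# The real hive.utils.stats.minmax is not shown in this section. The sketch
# below is an assumption reconstructed only from the call sites and summary log
# above: it keeps a running min/max of the blocks-per-second rate together with
# the block range that produced each extreme. The actual helper may differ.
def _sketch_minmax(current, block_count, time_taken, from_block):
    """Illustrative rate tracker (not the real hive.utils.stats.minmax)."""
    bps = block_count / time_taken if time_taken > 0 else 0.0
    to_block = from_block + block_count
    if not current or bps > current['max']:
        current.update(max=bps, max_from=from_block, max_to=to_block)
    if 'min' not in current or bps < current['min']:
        current.update(min=bps, min_from=from_block, min_to=to_block)
    return current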
def _update_batch(cls, steem, tuples, trx=True, full_total=None):
    """Fetch, process, and write a batch of posts.

    Given a set of posts, fetch from steemd and write them to the db.
    The `tuples` arg is in the form `[(url, id, level)*]`, representing
    posts which are to be fetched from steemd and updated in cache.

    Regarding _bump_last_id: there's a rare edge case when the last
    hive_post entry has been deleted "in the future" (i.e., we haven't
    seen the delete op yet). So even when the post is not found (i.e.
    `not post['author']`), it's important to advance _last_id, because
    this cursor is used to deduce any missing cache entries.
    """
    # pylint: disable=too-many-locals
    timer = Timer(total=len(tuples), entity='post',
                  laps=['rps', 'wps'], full_total=full_total)
    tuples = sorted(tuples, key=lambda x: x[1])  # enforce ASC id's

    for tups in partition_all(1000, tuples):
        timer.batch_start()
        buffer = []

        post_args = [tup[0].split('/') for tup in tups]
        posts = steem.get_content_batch(post_args)
        post_ids = [tup[1] for tup in tups]
        post_levels = [tup[2] for tup in tups]
        coremap = cls._get_core_fields(tups)
        for pid, post, level in zip(post_ids, posts, post_levels):
            if post['author']:
                assert pid in coremap, 'pid not in coremap'
                if pid in coremap:
                    core = coremap[pid]
                    post['category'] = core['category']
                    post['community_id'] = core['community_id']
                    post['gray'] = core['is_muted']
                    post['hide'] = not core['is_valid']
                buffer.extend(cls._sql(pid, post, level=level))
            else:
                # When a post has been deleted (or otherwise DNE),
                # steemd simply returns a blank post object w/ all
                # fields blank. While it's best to not try to cache
                # already-deleted posts, it can happen during missed
                # post sweep and while using `trail_blocks` > 0.

                # monitor: post not found which should def. exist; see #173
                sql = """SELECT id, author, permlink, is_deleted
                           FROM hive_posts WHERE id = :id"""
                row = DB.query_row(sql, id=pid)
                if row['is_deleted']:
                    # rare or impossible -- report if detected
                    log.error("found deleted post for %s: %s", level, row)
                else:
                    # most likely cause of this condition is that the post
                    # has been deleted (e.g. sync trails by 2 blocks, post
                    # was inserted at head-2, deleted at head). another
                    # possible cause is that a node behind a load balancer
                    # is behind; we detected a new post but querying a node
                    # that hasn't seen it yet.
                    log.warning("post not found -- DEFER %s %s", level, row)
                    cls._dirty(level, row['author'], row['permlink'], pid)

            cls._bump_last_id(pid)

        timer.batch_lap()
        DB.batch_queries(buffer, trx)

        timer.batch_finish(len(posts))
        if len(tuples) >= 1000:
            log.info(timer.batch_status())