from flask import Response
from prometheus_client import (
    CONTENT_TYPE_LATEST,
    CollectorRegistry,
    generate_latest,
    multiprocess,
)


def prometheus_metrics_exporter():
    # Refresh registered collectors, then render the current state of the
    # multiprocess registry in the Prometheus text exposition format
    PrometheusMetric.populate_collectors()
    registry = CollectorRegistry()
    multiprocess.MultiProcessCollector(registry)
    data = generate_latest(registry)
    headers = {
        "Content-type": CONTENT_TYPE_LATEST,
        "Content-Length": str(len(data)),
    }
    return Response(data, headers=headers, mimetype=CONTENT_TYPE_LATEST)
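# Not from the source: a minimal sketch of how this exporter might be exposed
# over HTTP. The Flask `app` object and the "/prometheus_metrics" path are
# assumptions for illustration only.
from flask import Flask

app = Flask(__name__)
app.add_url_rule("/prometheus_metrics", view_func=prometheus_metrics_exporter)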
def wrap(*args, **kwargs):
    route = ""  # fall back to an empty route if extraction below fails
    try:
        application_key, application_name = extract_app_name_key()
        route_key, route = extract_route_key()
        REDIS.hincrby(route_key, route, 1)
        if application_name:
            REDIS.hincrby(application_key, application_name, 1)
        record_aggregate_metrics()
    except Exception as e:
        # Exceptions have no .message attribute in Python 3; log the exception itself
        logger.error("Error while recording metrics: %s", e)

    metric = PrometheusMetric(
        "flask_route_latency_seconds",
        "Runtimes for flask routes",
        ("route", "code"),
    )
    result = func(*args, **kwargs)
    try:
        if isinstance(result, fResponse):
            code = result.status_code
        elif isinstance(result, wResponse):
            code = result.status
        else:
            code = result[1]
    except Exception as e:
        code = -1
        logger.error(
            "Error extracting response code from type<%s>: %s",
            type(result),
            e,
        )

    # Normalize the route so high-cardinality paths collapse into a small
    # number of label values
    route = route.split("?")[0]
    if "/v1/full/" in route or "/users/intersection/" in route:
        route = "/".join(route.split("/")[:4])
    elif "/v1/users/" in route and ("/followers" in route or "/following" in route):
        route = "/".join(route.split("/")[:3] + ["*"] + route.split("/")[-1:])
    else:
        route = "/".join(route.split("/")[:3])
    metric.save_time({"route": route, "code": code})
    return result
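# `wrap` references `func` as a free variable, so it is presumably the inner
# function of a decorator. A minimal sketch of the assumed enclosing decorator
# (the name `record_metrics` is hypothetical, not from the source):
import functools


def record_metrics(func):
    @functools.wraps(func)
    def wrap(*args, **kwargs):
        ...  # the wrapper body shown above, ending in `return result`
        return func(*args, **kwargs)

    return wrap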
def health_check_prometheus_exporter():
    health_results, is_unhealthy = get_health({})

    PrometheusMetric(
        "health_check_block_difference_current",
        "Difference between the latest block and the latest indexed block",
        metric_type=PrometheusType.GAUGE,
    ).save(health_results["block_difference"])

    PrometheusMetric(
        "health_check_latest_indexed_block_num_current",
        "Latest indexed block number",
        metric_type=PrometheusType.GAUGE,
    ).save(health_results["web"]["blocknumber"])
def celery_tasks_prometheus_exporter():
    tasks = get_celery_tasks()["celery_tasks"]
    metric = PrometheusMetric(
        "celery_running_tasks",
        "The currently running celery tasks",
        labelnames=["task_name"],
        metric_type=PrometheusType.GAUGE,
    )
    for task in tasks:
        try:
            metric.save_time({"task_name": task["task_name"]},
                             start_time=task["started_at"])
        except Exception:  # avoid a bare except so system exits still propagate
            logger.exception(f"Processing failed for task: {task}")
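# Rough sketch (an assumption, not the source implementation) of what
# PrometheusMetric.save_time presumably does, inferred only from the call
# sites in this section: with no start_time it measures from metric creation,
# otherwise from the supplied timestamp.
import time


class PrometheusMetricSketch:
    def __init__(self):
        self.reset_timer()

    def reset_timer(self):
        self.start_time = time.time()

    def save_time(self, labels=None, start_time=None):
        elapsed = time.time() - (start_time if start_time is not None
                                 else self.start_time)
        self.save(elapsed, labels)

    def save(self, value, labels=None):
        ...  # record `value` on the underlying collector with `labels`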
def update_view(session: Session, mat_view_name: str):
    start_time = time.time()
    metric = PrometheusMetric(
        "update_trending_view_runtime_seconds",
        "Runtimes for src.task.index_trending:update_view()",
        ("mat_view_name",),
    )
    session.execute(f"REFRESH MATERIALIZED VIEW {mat_view_name}")
    update_time = time.time() - start_time
    metric.save_time({"mat_view_name": mat_view_name})
    logger.info(
        f"index_trending.py | Finished updating {mat_view_name} in: {update_time} sec",
        extra={
            "job": "index_trending",
            "update_time": update_time,
            "mat_view_name": mat_view_name,
        },
    )
def update_metrics(self):
    # Cache custom task class properties
    # Details regarding custom task context can be found in wiki
    # Custom Task definition can be found in src/app.py
    db = update_metrics.db
    redis = update_metrics.redis

    # Define lock acquired boolean
    have_lock = False
    # Define redis lock object
    update_lock = redis.lock("update_metrics_lock", blocking_timeout=25)
    try:
        # Attempt to acquire lock - do not block if unable to acquire
        have_lock = update_lock.acquire(blocking=False)
        if have_lock:
            logger.info(
                f"index_metrics.py | update_metrics | {self.request.id} | Acquired update_metrics_lock"
            )
            metric = PrometheusMetric(
                "index_metrics_runtime_seconds",
                "Runtimes for src.task.index_metrics:celery.task()",
                ("task_name",),
            )
            sweep_metrics(db, redis)
            refresh_metrics_matviews(db)
            metric.save_time({"task_name": "update_metrics"})
            logger.info(
                f"index_metrics.py | update_metrics | {self.request.id} | Processing complete within session"
            )
        else:
            logger.error(
                f"index_metrics.py | update_metrics | {self.request.id} | Failed to acquire update_metrics_lock"
            )
    except Exception as e:
        logger.error("Fatal error in main loop of update_metrics: %s",
                     e, exc_info=True)
        raise e
    finally:
        if have_lock:
            update_lock.release()
def update_aggregate_table(
    logger,
    session,
    table_name,
    query,
    checkpoint_name,
    current_checkpoint,
):
    metric = PrometheusMetric(
        "update_aggregate_table_latency_seconds",
        "Runtimes for src.task.aggregates:update_aggregate_table()",
        ("table_name", "task_name"),
    )

    # get the name of the caller function
    task_name = f"{currentframe().f_back.f_code.co_name}()"

    # get the last updated id that counted towards the current aggregate track
    prev_checkpoint = get_last_indexed_checkpoint(session, table_name)
    if not current_checkpoint or current_checkpoint == prev_checkpoint:
        logger.info(
            f"{task_name} | Skipping aggregation update because there are no new blocks"
            f" | checkpoint: ({prev_checkpoint}, {current_checkpoint}]")
        return

    # update aggregate track with new tracks that came after the prev_checkpoint
    logger.info(f"{task_name} | Updating {table_name}"
                f" | checkpoint: ({prev_checkpoint}, {current_checkpoint}]")
    session.execute(
        text(query),
        {
            f"prev_{checkpoint_name}": prev_checkpoint,
            f"current_{checkpoint_name}": current_checkpoint,
        },
    )
    metric.save_time({"table_name": table_name, "task_name": task_name})

    # update indexing_checkpoints with the new id
    save_indexed_checkpoint(session, table_name, current_checkpoint)
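# A hypothetical call site, sketched for illustration; the table name, SQL,
# and checkpoint value below are assumptions, not taken from the source.
# Note how `checkpoint_name` ("play_id") expands into the :prev_play_id and
# :current_play_id bind parameters consumed by the query.
UPDATE_AGGREGATE_PLAYS_QUERY = """
    INSERT INTO aggregate_plays (play_item_id, count)
    SELECT play_item_id, count(*)
    FROM plays
    WHERE id > :prev_play_id AND id <= :current_play_id
    GROUP BY play_item_id
    ON CONFLICT (play_item_id)
    DO UPDATE SET count = aggregate_plays.count + EXCLUDED.count
"""

update_aggregate_table(
    logger,
    session,
    "aggregate_plays",
    UPDATE_AGGREGATE_PLAYS_QUERY,
    "play_id",
    current_checkpoint,  # e.g. the max play id seen by the indexer so far
)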
PrometheusMetric.register_collector("health_check_prometheus_exporter",
                                    health_check_prometheus_exporter)


class SolHealthInfo(TypedDict):
    is_unhealthy: bool
    tx_info: Dict
    time_diff_general: int


class PlayHealthInfo(SolHealthInfo):
    oldest_unarchived_play_created_at: str


# Aggregate play health info across Solana and legacy storage
def get_play_health_info(
        redis: Redis, plays_count_max_drift: Optional[int]) -> PlayHealthInfo:
def index_trending(self, db: SessionManager, redis: Redis, timestamp):
    logger.info("index_trending.py | starting indexing")
    update_start = time.time()
    metric = PrometheusMetric(
        "index_trending_runtime_seconds",
        "Runtimes for src.task.index_trending:index_trending()",
    )
    with db.scoped_session() as session:
        genres = get_genres(session)

        # Make sure to cache empty genre
        genres.append(None)  # type: ignore

        trending_track_versions = trending_strategy_factory.get_versions_for_type(
            TrendingType.TRACKS).keys()

        update_view(session, AGGREGATE_INTERVAL_PLAYS)
        update_view(session, TRENDING_PARAMS)
        for version in trending_track_versions:
            strategy = trending_strategy_factory.get_strategy(
                TrendingType.TRACKS, version)
            if strategy.use_mat_view:
                strategy.update_track_score_query(session)

        for version in trending_track_versions:
            strategy = trending_strategy_factory.get_strategy(
                TrendingType.TRACKS, version)
            for genre in genres:
                for time_range in time_ranges:
                    cache_start_time = time.time()
                    if strategy.use_mat_view:
                        res = generate_unpopulated_trending_from_mat_views(
                            session, genre, time_range, strategy)
                    else:
                        res = generate_unpopulated_trending(
                            session, genre, time_range, strategy)
                    key = make_trending_cache_key(time_range, genre, version)
                    set_json_cached_key(redis, key, res)
                    cache_end_time = time.time()
                    total_time = cache_end_time - cache_start_time
                    logger.info(
                        f"index_trending.py | Cached trending ({version.name} version)"
                        f" for {genre}-{time_range} in {total_time} seconds")

        # Cache underground trending
        underground_trending_versions = trending_strategy_factory.get_versions_for_type(
            TrendingType.UNDERGROUND_TRACKS).keys()
        for version in underground_trending_versions:
            strategy = trending_strategy_factory.get_strategy(
                TrendingType.UNDERGROUND_TRACKS, version)
            cache_start_time = time.time()
            res = make_get_unpopulated_tracks(session, redis, strategy)()
            key = make_underground_trending_cache_key(version)
            set_json_cached_key(redis, key, res)
            cache_end_time = time.time()
            total_time = cache_end_time - cache_start_time
            logger.info(
                f"index_trending.py | Cached underground trending ({version.name} version)"
                f" in {total_time} seconds")

    update_end = time.time()
    update_total = update_end - update_start
    metric.save_time()
    logger.info(
        f"index_trending.py | Finished indexing trending in {update_total} seconds",
        extra={"job": "index_trending", "total_time": update_total},
    )

    # Update cache key to track the last time trending finished indexing
    redis.set(trending_tracks_last_completion_redis_key, int(update_end))
    set_last_trending_datetime(redis, timestamp)
def convert_epoch_to_datetime(epoch):
    utc_dt = datetime.utcfromtimestamp(epoch).replace(tzinfo=pytz.utc)
    tz = pytz.timezone("America/New_York")  # keep US east as default timezone
    dt = utc_dt.astimezone(tz)
    return dt


PrometheusMetric.register_collector("celery_tasks_prometheus_exporter",
                                    celery_tasks_prometheus_exporter)
def user_state_update(
    self,
    update_task: DatabaseTask,
    session: Session,
    user_factory_txs,
    block_number,
    block_timestamp,
    block_hash,
    ipfs_metadata,
    blacklisted_cids,
) -> Tuple[int, Set]:
    """Return tuple containing int representing number of User model state
    changes found in transaction and set of processed user IDs."""
    begin_user_state_update = datetime.now()
    metric = PrometheusMetric(
        "user_state_update_runtime_seconds",
        "Runtimes for src.task.users:user_state_update()",
        ("scope",),
    )
    blockhash = update_task.web3.toHex(block_hash)
    num_total_changes = 0
    skipped_tx_count = 0
    user_ids: Set[int] = set()
    if not user_factory_txs:
        return num_total_changes, user_ids

    challenge_bus = update_task.challenge_event_bus

    # This stores the state of the user object along with all the events applied to it
    # before it gets committed to the db
    # Data format is {"user_id": {"user", "events": []}}
    # NOTE - events are stored only for debugging purposes and not used or persisted anywhere
    user_events_lookup: Dict[int, Dict[str, Any]] = {}

    # Array of transactions by user to be applied in parallel
    # Map(user_id=1 <-> [tx1, tx2], user_id=2 <-> [tx1])
    user_transactions_lookup: Dict[int, List[Tuple]] = {}

    # For each user factory transaction, loop through every tx
    # loop through all audius event types within that tx and get all event logs
    # for each event, apply changes to the user in user_events_lookup
    for tx_receipt in user_factory_txs:
        txhash = update_task.web3.toHex(tx_receipt.transactionHash)
        for event_type in user_event_types_arr:
            user_events_tx = get_user_events_tx(update_task, event_type, tx_receipt)
            # if record does not get added, do not count towards num_total_changes
            for entry in user_events_tx:
                user_id = helpers.get_tx_arg(entry, "_userId")
                if user_id not in user_transactions_lookup:
                    user_transactions_lookup[user_id] = []
                # Append to user level list
                user_transactions_lookup[user_id].append(
                    (entry, event_type, tx_receipt, txhash))
                # num_total_changes += processedEntries

    # Process each user in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        process_user_txs_futures = {}
        for user_id in user_transactions_lookup.keys():
            user_txs = user_transactions_lookup[user_id]
            process_user_txs_futures[executor.submit(
                process_user_txs_serial,
                self,
                user_id,
                user_txs,
                session,
                user_events_lookup,
                update_task,
                blacklisted_cids,
                block_number,
                block_timestamp,
                blockhash,
                ipfs_metadata,
                user_ids,
                skipped_tx_count,
            )] = user_id
        for future in concurrent.futures.as_completed(process_user_txs_futures):
            try:
                processed_entries = future.result()
                num_total_changes += processed_entries
            except Exception as exc:
                raise exc

    logger.info(
        f"index.py | users.py | There are {num_total_changes} events processed and {skipped_tx_count} skipped transactions."
    )

    # For each record in user_events_lookup, invalidate the old record and add the new record
    # we do this after all processing has completed so the user record is atomic by block, not tx
    for user_id, value_obj in user_events_lookup.items():
        logger.info(f"index.py | users.py | Adding {value_obj['user']}")
        if value_obj["events"]:
            invalidate_old_user(session, user_id)
            challenge_bus.dispatch(ChallengeEvent.profile_update,
                                   block_number, user_id)
            session.add(value_obj["user"])

    if num_total_changes:
        metric.save_time({"scope": "full"})
        logger.info(
            f"index.py | users.py | user_state_update | finished user_state_update in {datetime.now() - begin_user_state_update} // per event: {(datetime.now() - begin_user_state_update) / num_total_changes} secs"
        )

    return num_total_changes, user_ids
def process_user_txs_serial(
    self,
    user_id,
    user_txs,
    session,
    user_events_lookup,
    update_task,
    blacklisted_cids,
    block_number,
    block_timestamp,
    blockhash,
    ipfs_metadata,
    user_ids,
    skipped_tx_count,
):
    metric = PrometheusMetric(
        "user_state_update_runtime_seconds",
        "Runtimes for src.task.users:user_state_update()",
        ("scope",),
    )
    processed_entries = 0
    for user_tx in user_txs:
        try:
            process_user_txs_start_time = time()
            entry = user_tx[0]
            event_type = user_tx[1]
            tx_receipt = user_tx[2]
            txhash = user_tx[3]

            # look up or populate existing record
            if user_id in user_events_lookup:
                existing_user_record = user_events_lookup[user_id]["user"]
            else:
                existing_user_record = lookup_user_record(
                    update_task,
                    session,
                    entry,
                    block_number,
                    block_timestamp,
                    txhash,
                )

            # parse user event to add metadata to record
            if event_type == user_event_types_lookup["update_multihash"]:
                metadata_multihash = helpers.multihash_digest_to_cid(
                    helpers.get_tx_arg(entry, "_multihashDigest"))
                user_record = (parse_user_event(
                    self,
                    update_task,
                    session,
                    tx_receipt,
                    block_number,
                    entry,
                    event_type,
                    existing_user_record,
                    ipfs_metadata[metadata_multihash],
                    block_timestamp,
                ) if metadata_multihash not in blacklisted_cids else None)
            else:
                user_record = parse_user_event(
                    self,
                    update_task,
                    session,
                    tx_receipt,
                    block_number,
                    entry,
                    event_type,
                    existing_user_record,
                    None,
                    block_timestamp,
                )

            # process user record
            if user_record is not None:
                if user_id not in user_events_lookup:
                    user_events_lookup[user_id] = {
                        "user": user_record,
                        "events": [],
                    }
                else:
                    user_events_lookup[user_id]["user"] = user_record
                user_events_lookup[user_id]["events"].append(event_type)
                user_ids.add(user_id)
                processed_entries += 1

            metric.save_time({"scope": "user_tx"},
                             start_time=process_user_txs_start_time)
        except EntityMissingRequiredFieldError as e:
            logger.warning(f"Skipping tx {txhash} with error {e}")
            skipped_tx_count += 1
            add_node_level_skipped_transaction(session, block_number,
                                               blockhash, txhash)
        except Exception as e:
            logger.error("Error in parse user transaction")
            raise IndexingError("user", block_number, blockhash,
                                txhash, str(e)) from e
    return processed_entries
def track_state_update(
    self,
    update_task: DatabaseTask,
    session: Session,
    track_factory_txs,
    block_number,
    block_timestamp,
    block_hash,
    ipfs_metadata,
    blacklisted_cids,
) -> Tuple[int, Set]:
    """Return tuple containing int representing number of Track model state
    changes found in transaction and set of processed track IDs."""
    begin_track_state_update = datetime.now()
    metric = PrometheusMetric(
        "track_state_update_runtime_seconds",
        "Runtimes for src.task.tracks:track_state_update()",
        ("scope",),
    )
    blockhash = update_task.web3.toHex(block_hash)
    num_total_changes = 0
    skipped_tx_count = 0
    # This stores the track_ids created or updated in the set of transactions
    track_ids: Set[int] = set()

    if not track_factory_txs:
        return num_total_changes, track_ids

    pending_track_routes: List[TrackRoute] = []
    track_events: Dict[int, Dict[str, Any]] = {}
    for tx_receipt in track_factory_txs:
        txhash = update_task.web3.toHex(tx_receipt.transactionHash)
        for event_type in track_event_types_arr:
            track_events_tx = get_track_events_tx(update_task, event_type, tx_receipt)
            processedEntries = 0
            # if record does not get added, do not count towards num_total_changes
            for entry in track_events_tx:
                track_event_start_time = time()
                event_args = entry["args"]
                track_id = (helpers.get_tx_arg(entry, "_trackId")
                            if "_trackId" in event_args
                            else helpers.get_tx_arg(entry, "_id"))
                existing_track_record = None
                track_metadata = None
                try:
                    # look up or populate existing record
                    if track_id in track_events:
                        existing_track_record = track_events[track_id]["track"]
                    else:
                        existing_track_record = lookup_track_record(
                            update_task,
                            session,
                            entry,
                            track_id,
                            block_number,
                            blockhash,
                            txhash,
                        )

                    # parse track event to add metadata to record
                    if event_type in [
                            track_event_types_lookup["new_track"],
                            track_event_types_lookup["update_track"],
                    ]:
                        track_metadata_digest = event_args._multihashDigest.hex()
                        track_metadata_hash_fn = event_args._multihashHashFn
                        buf = multihash.encode(
                            bytes.fromhex(track_metadata_digest),
                            track_metadata_hash_fn)
                        cid = multihash.to_b58_string(buf)
                        # do not process entry if cid is blacklisted
                        if cid in blacklisted_cids:
                            continue
                        track_metadata = ipfs_metadata[cid]

                    parsed_track = parse_track_event(
                        self,
                        session,
                        update_task,
                        entry,
                        event_type,
                        existing_track_record,
                        block_number,
                        block_timestamp,
                        track_metadata,
                        pending_track_routes,
                    )

                    # If track record object is None, it has a blacklisted metadata CID
                    if parsed_track is not None:
                        if track_id not in track_events:
                            track_events[track_id] = {
                                "track": parsed_track,
                                "events": [],
                            }
                        else:
                            track_events[track_id]["track"] = parsed_track
                        track_events[track_id]["events"].append(event_type)
                        track_ids.add(track_id)
                        processedEntries += 1
                except EntityMissingRequiredFieldError as e:
                    logger.warning(f"Skipping tx {txhash} with error {e}")
                    skipped_tx_count += 1
                    add_node_level_skipped_transaction(session, block_number,
                                                       blockhash, txhash)
                except Exception as e:
                    logger.error("Error in parse track transaction")
                    raise IndexingError("track", block_number, blockhash,
                                        txhash, str(e)) from e
                metric.save_time({"scope": "track_event"},
                                 start_time=track_event_start_time)
            num_total_changes += processedEntries

    logger.info(
        f"index.py | tracks.py | [track indexing] There are {num_total_changes} events processed and {skipped_tx_count} skipped transactions."
    )

    for track_id, value_obj in track_events.items():
        if value_obj["events"]:
            logger.info(f"index.py | tracks.py | Adding {value_obj['track']}")
            invalidate_old_track(session, track_id)
            session.add(value_obj["track"])

    if num_total_changes:
        metric.save_time({"scope": "full"})
        logger.info(
            f"index.py | tracks.py | track_state_update | finished track_state_update in {datetime.now() - begin_track_state_update} // per event: {(datetime.now() - begin_track_state_update) / num_total_changes} secs"
        )

    return num_total_changes, track_ids