def update_keepalive(params: inputs.Inputs, keepalive_state: KeepaliveState, cache: Cache):
    ''' Update the keepalive state in cache. Also check if the current backing job owns
        the keepalive. If not, exit
    '''
    try:
        cache_keys = keepalive_state.cache_keys
        exit_if_necessary(keepalive_state, cache)
        keepalive_state.last_keepalive_ms = utils.millitime()
        cache.set(cache_keys.keepalive, pickle.dumps(keepalive_state))
    except Exception as e:
        print("update_keepalive: exception", e, traceback.format_exc())

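# For readers of the helpers above and below: the fields accessed on these two objects
# (their definitions live elsewhere in this module) are, as used in this file:
#   CacheKeys       - .keepalive, .lastaccess, .metadata, .data_prefix (per-timestamp data keys)
#   KeepaliveState  - .cache_keys, .last_keepalive_ms, .in_streaming_phase,
#                     .data_timestamps, .resolution_ms
# This is a summary inferred from usage here, not the authoritative definition.
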
def metadata_consumer_thread_fn(metadata_queue: Queue, keepalive_state: KeepaliveState, cache: Cache):
    ''' Thread that consumes metadata messages and updates the full metadata map in cache '''
    print("metadata_consumer_thread_fn: started")
    try:
        cache_keys = keepalive_state.cache_keys
        metadata = {}
        while True:
            try:
                publish = False
                # Drain everything currently in the queue, merging properties per tsid
                while not metadata_queue.empty():
                    msg = metadata_queue.get()
                    metadata[msg.tsid] = msg.properties
                    publish = True
                # Only write to cache when something actually changed
                if publish:
                    cache.set(cache_keys.metadata, metadata)
            except Exception as e:
                print("metadata_consumer_thread_fn: exception", e, traceback.format_exc())
            time.sleep(1.0)
    finally:
        print("metadata_consumer_thread_fn: ended")

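# Minimal usage sketch (assumed wiring; the real thread startup lives elsewhere in the
# backing job). A producer receiving metadata messages from the analytics job feeds the
# queue, and this consumer runs as a plain thread:
#
#   metadata_queue = Queue()
#   threading.Thread(target=metadata_consumer_thread_fn,
#                    args=(metadata_queue, keepalive_state, cache)).start()
#
# Each queued message is expected to expose .tsid and .properties, as read above.
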
def get_cached_result(params: inputs.Inputs, context: LambdaContext, cache: Cache):
    ''' A backing job is already running, so just query cached data and return the result '''

    def wait_for_backing_job_to_exit_batch_phase(keepalive_state: KeepaliveState,
                                                 cache: Cache, cache_keys: CacheKeys,
                                                 wait_until_ms: int):
        print("wait_for_backing_job_to_exit_batch_phase: started", cache_keys.keepalive)
        # Wait for the backing job to be running and advance to the streaming phase
        while not keepalive_state or not keepalive_state.in_streaming_phase:
            if utils.millitime() > wait_until_ms:
                raise Exception(
                    "wait_for_backing_job_to_exit_batch_phase: timed out")
            print(
                "get_cached_result: waiting for batch phase to end. keepalive_state=",
                keepalive_state)
            time.sleep(1)
            try:
                keepalive_state = pickle.loads(cache.get(cache_keys.keepalive))
            except Exception as e:
                print(
                    "wait_for_backing_job_to_exit_batch_phase: failed to read keepalive from cache",
                    cache_keys.keepalive, e)
        print("wait_for_backing_job_to_exit_batch_phase: backing job is ready",
              keepalive_state)
        return keepalive_state

    print("get_cached_result: started")

    # Update 'lastaccess' timestamp in memcache to indicate the corresponding backing
    # job's data was recently queried
    cache_keys: CacheKeys = CacheKeys(params.cache_key_prefix())
    now_ms = params.invoke_time_ms
    try:
        cache.set(cache_keys.lastaccess, now_ms)
    except Exception as e:
        print("get_cached_result: failed to set lastaccess cache key {}={}, {}".format(
            cache_keys.lastaccess, now_ms, e))

    # Start the backing job if one is not running, or if the backing job's keepalive
    # timestamp is stale
    keepalive_state: KeepaliveState = start_backing_job_if_necessary(
        params, context, cache)

    # Now that the backing job is surely running, wait for it to become 'ready',
    # i.e. go from batch to streaming phase
    keepalive_state = wait_for_backing_job_to_exit_batch_phase(
        keepalive_state, cache, cache_keys, now_ms + defaults.API_TIMEOUT_MS)

    # Compute which cache keys need to be fetched
    if not params.is_streaming():
        tstart = params.absolute_ms(params.start_time_ms)
        tend = params.absolute_ms(params.end_time_ms)
    else:
        tend = now_ms
        tstart = tend - params.duration_ms()
    timestamps = sorted(
        [ts for ts in keepalive_state.data_timestamps if tstart <= ts <= tend])
    data_keys = [cache_keys.data_prefix + str(ts) for ts in timestamps]

    # Retrieve metadata and data from cache, retrying missing keys if necessary
    metadata = cache.get(cache_keys.metadata)
    if len(timestamps):
        print("get_cached_result: fetching {} timestamps {} - {} @ {}ms".format(
            len(timestamps), time.ctime(timestamps[0] / 1000),
            time.ctime(timestamps[-1] / 1000), keepalive_state.resolution_ms))
    data = cache.multiget(data_keys)
    missing_keys = set(data_keys) - set(data.keys())
    if len(missing_keys):
        print("get_cached_result: retrying fetch of {}/{} keys: {}".format(
            len(missing_keys), len(data_keys), sorted(missing_keys)))
        data.update(cache.multiget(list(missing_keys)))

    # Fill in the results struct
    result = {
        "start_time_ms": tstart,
        "end_time_ms": tend,
        "earliest_result_ms": 0,
        "latest_result_ms": 0,
        "resolution_ms": keepalive_state.resolution_ms,
        "metadata": metadata,
        "data": {},
        "missing_timestamps_ms": []
    }

    # First, fill in the retrieved data
    tsids = set()
    missing_timestamps = []
    for timestamp in timestamps:
        k = cache_keys.data_prefix + str(timestamp)
        if k in data.keys():
            for tsid, value in data[k].items():
                if not result["earliest_result_ms"]:
                    result["earliest_result_ms"] = timestamp
                if timestamp > result["latest_result_ms"]:
                    result["latest_result_ms"] = timestamp
                tsids.add(tsid)
                result["data"].setdefault(tsid, [])
                result["data"][tsid].append([timestamp, value])
        else:
            missing_timestamps.append(timestamp)

    # Second, keep metadata for only the relevant mts that actually have data
    remove_metadata_ids = set(metadata.keys()).difference(tsids)
    for tsid in remove_metadata_ids:
        metadata.pop(tsid)

    result["missing_timestamps_ms"] = missing_timestamps
    return result

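# Shape of the dict returned by get_cached_result(), illustrated with hypothetical values:
#
#   {
#       "start_time_ms": 1700000000000,
#       "end_time_ms": 1700000600000,
#       "earliest_result_ms": 1700000010000,
#       "latest_result_ms": 1700000590000,
#       "resolution_ms": 10000,
#       "metadata": {"<tsid>": {... properties ...}},
#       "data": {"<tsid>": [[<timestamp_ms>, <value>], ...]},
#       "missing_timestamps_ms": [<timestamp_ms>, ...]
#   }
#
# The keys and nesting mirror how the function fills `result` above; the numbers are
# examples only.
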
def start_backing_job_if_necessary(params: inputs.Inputs, context: LambdaContext, cache: Cache):
    ''' If no backing job is running for a given signalflow program and duration, start one.
        Returns keepalive_state from cache if an active backing job is found (to prevent a
        duplicate cache read by callers)
    '''

    def start_backing_job_as_lambda(params: inputs.Inputs, tstart, tend, context: LambdaContext):
        # Start a new backing job that runs as a lambda function
        print("start_backing_job_as_lambda: started")
        import boto3
        lambda_client = boto3.client('lambda')
        lambda_client.invoke(FunctionName=context.invoked_function_arn,
                             InvocationType='Event',
                             Payload=json.dumps({
                                 "program": params.program,
                                 "start_time_ms": tstart,
                                 "end_time_ms": tend,
                                 "resolution_hint_ms": params.resolution_hint_ms,
                                 "api_token": params.api_token,
                                 "api_endpoint": params.api_endpoint,
                                 "daemon": True
                             }))

    def start_backing_job_as_process(params: inputs.Inputs, tstart, tend):
        # Start a new backing job that runs as a python process
        print("start_backing_job_as_process: started")
        cmd: str = ("nohup python3 {script} --program=\"{program}\" --token={token}"
                    " --start_time_ms={tstart} --end_time_ms={tend}"
                    " --resolution_hint_ms={res} --endpoint={endpoint}").format(
                        script=__file__,
                        program=params.program,
                        tstart=tstart,
                        tend=tend,
                        res=params.resolution_hint_ms,
                        token=params.api_token,
                        endpoint=params.api_endpoint)
        cmd += " --daemon > /tmp/{}.log 2>&1 &".format(params.cache_key_prefix())
        print("start_backing_job_as_process:", cmd)
        os.system(cmd)

    # begin code for start_backing_job_if_necessary()
    try:
        cache_keys = CacheKeys(params.cache_key_prefix())
        print("start_backing_job_if_necessary: started", cache_keys)
        now_ms = utils.millitime()

        cached_state: KeepaliveState = pickle.loads(cache.get(cache_keys.keepalive))
        keepalive_age_ms = now_ms - cached_state.last_keepalive_ms
        expiry_ms = defaults.KEEPALIVE_EXPIRY_MULTIPLE * defaults.KEEPALIVE_INTERVAL_SEC * 1000
        if keepalive_age_ms < expiry_ms:
            print(
                "start_backing_job_if_necessary: found active backing job already running. keepalive_age_ms =",
                keepalive_age_ms)
            return cached_state

        print("start_backing_job_if_necessary: found expired keepalive_age_ms =",
              keepalive_age_ms)
        cache.set(cache_keys.keepalive, None)
    except Exception as e:
        print("start_backing_job_if_necessary: no keepalive found in cache", e)

    tstart = params.start_time_ms
    tend = params.end_time_ms
    if not params.is_streaming():
        tstart = params.absolute_ms(tstart)
        tend = params.absolute_ms(tend)

    if context.invoked_function_arn:
        # This backing job was invoked as a lambda, so invoke a new lambda
        start_backing_job_as_lambda(params, tstart, tend, context)
    else:
        start_backing_job_as_process(params, tstart, tend)
    return None

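# Worked example of the staleness check above, with hypothetical defaults: if
# KEEPALIVE_INTERVAL_SEC were 5 and KEEPALIVE_EXPIRY_MULTIPLE were 3, then
# expiry_ms = 3 * 5 * 1000 = 15000, so a backing job whose keepalive is older than
# 15 seconds is treated as dead and a replacement is launched (as a lambda when
# context.invoked_function_arn is set, otherwise as a local process).
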
def data_consumer_thread_fn(params: inputs.Inputs, context: LambdaContext, data_queue: Queue,
                            keepalive_state: KeepaliveState, cache: Cache):
    ''' Thread that consumes data messages from the analytics job and writes each one
        individually into cache. Also detects when the job moves from batch to stream phase.
        Unfortunately that requires 'auto-detection', i.e. noticing that no data has arrived
        for close to a second :-(
    '''
    print("data_consumer_thread_fn: started")
    try:
        cache_keys = keepalive_state.cache_keys
        data_to_encache = {}
        last_datamsg_walltime_ms = 0
        while True:
            now_ms = utils.millitime()
            try:
                if params.is_streaming():
                    # Remove trailing data keys that are beyond the scope of the current
                    # 'window' of a streaming job
                    valid_timestamps = [
                        ts for ts in keepalive_state.data_timestamps
                        if ts >= (now_ms - params.job_duration_ms() - keepalive_state.resolution_ms)
                    ]
                    keepalive_state.data_timestamps = set(valid_timestamps)

                msg = data_queue.get(False)
                last_datamsg_walltime_ms = utils.millitime()
                data_to_encache.setdefault(msg.logical_timestamp_ms, {})
                data_to_encache[msg.logical_timestamp_ms].update(msg.data)
            except Exception:
                # No data found in the queue. However there may be pending data from
                # previous messages that still needs caching
                timestamps_encached = set()
                for timestamp, values in data_to_encache.items():
                    try:
                        cache.set(cache_keys.data_prefix + str(timestamp), values)
                        timestamps_encached.add(timestamp)
                    except Exception:
                        # Failed to set data in cache; keep it in data_to_encache and retry later
                        pass
                for timestamp_encached in timestamps_encached:
                    data_to_encache.pop(timestamp_encached)
                    keepalive_state.data_timestamps.add(timestamp_encached)

                if data_to_encache:
                    print(
                        "data_consumer_thread_fn: will retry writing {} data keys to cache {}"
                        .format(len(data_to_encache), list(data_to_encache)))
                elif not keepalive_state.in_streaming_phase:
                    # Now that all data is successfully published, 'auto-detect' whether we
                    # have completed the batch phase and entered the stream phase. If so,
                    # update keepalive_state
                    if last_datamsg_walltime_ms > 0 and (
                            now_ms - last_datamsg_walltime_ms) > defaults.STREAM_PHASE_DETECTION_INTERVAL_MS:
                        keepalive_state.in_streaming_phase = True
                        print(
                            "data_consumer_thread_fn: backing job entered stream phase after {} datapoints. now={}, last={}"
                            .format(len(keepalive_state.data_timestamps), now_ms,
                                    last_datamsg_walltime_ms))
                        # Start healthcheck thread now that data is flowing in
                        threading.Thread(target=healthcheck_thread_fn,
                                         args=(params, context, keepalive_state, cache)).start()
                time.sleep(defaults.STREAM_PHASE_DETECTION_INTERVAL_MS / 1000 / 5)
    except Exception as e:
        print("data_consumer_thread_fn exception", e, traceback.format_exc())
    finally:
        print("data_consumer_thread_fn: ended")

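# Note on the data messages consumed above (shape inferred from usage; the message type
# is defined elsewhere): each message exposes .logical_timestamp_ms (the datapoint's
# logical time) and .data (a dict of tsid -> value). Messages sharing a logical timestamp
# are merged in data_to_encache before being written to cache under
# cache_keys.data_prefix + str(timestamp).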