def __init__(self, pplan_helper, in_stream, out_stream, looper):
  """Initialize the spout instance and load the user's spout implementation.

  :param pplan_helper: physical-plan helper for this instance; must describe a spout
  :param in_stream: incoming communicator (carries ack/fail/control tuples read elsewhere)
  :param out_stream: outgoing communicator for emitted data tuples
  :param looper: event loop that drives this instance's wake-up tasks

  :raises RuntimeError: if ``pplan_helper`` does not describe a spout
  """
  super(SpoutInstance, self).__init__(pplan_helper, in_stream, out_stream, looper)
  # Start paused; the topology is switched to RUNNING by start/activate calls.
  self.topology_state = topology_pb2.TopologyState.Value("PAUSED")

  if not self.pplan_helper.is_spout:
    # Fixed typo in the error message: "physicial" -> "physical".
    raise RuntimeError("No spout in physical plan")

  context = self.pplan_helper.context
  self.spout_metrics = SpoutMetrics(self.pplan_helper)

  # Acking related: ACK tracking is enabled only under ATLEAST_ONCE reliability.
  mode = context.get_cluster_config().get(api_constants.TOPOLOGY_RELIABILITY_MODE,
                                          api_constants.TopologyReliabilityMode.ATMOST_ONCE)
  self.acking_enabled = bool(mode == api_constants.TopologyReliabilityMode.ATLEAST_ONCE)
  self.enable_message_timeouts = \
    context.get_cluster_config().get(api_constants.TOPOLOGY_ENABLE_MESSAGE_TIMEOUTS)
  self._initialized_metrics_and_tasks = False
  Log.info("Enable ACK: %s" % str(self.acking_enabled))
  Log.info("Enable Message Timeouts: %s" % str(self.enable_message_timeouts))

  # map <tuple_info.key -> tuple_info>, ordered by insertion time
  self.in_flight_tuples = collections.OrderedDict()
  # Tuples acked immediately: a tup_id was supplied but acking is disabled.
  self.immediate_acks = collections.deque()
  self.total_tuples_emitted = 0

  # load user's spout class
  spout_impl_class = super(SpoutInstance, self).load_py_instance(is_spout=True)
  self.spout_impl = spout_impl_class(delegate=self)
class SpoutInstance(BaseInstance):
  """The base class for all heron spouts in Python"""

  def __init__(self, pplan_helper, in_stream, out_stream, looper):
    """Initialize the spout instance and load the user's spout implementation.

    :raises RuntimeError: if ``pplan_helper`` does not describe a spout
    """
    super(SpoutInstance, self).__init__(pplan_helper, in_stream, out_stream, looper)
    # Start paused; start_component()/invoke_activate() switch to RUNNING.
    self.topology_state = topology_pb2.TopologyState.Value("PAUSED")

    if not self.pplan_helper.is_spout:
      raise RuntimeError("No spout in physicial plan")

    context = self.pplan_helper.context
    self.spout_metrics = SpoutMetrics(self.pplan_helper)

    # acking related: ACK tracking only happens under ATLEAST_ONCE reliability
    mode = context.get_cluster_config().get(api_constants.TOPOLOGY_RELIABILITY_MODE,
                                            api_constants.TopologyReliabilityMode.ATMOST_ONCE)
    self.acking_enabled = bool(mode == api_constants.TopologyReliabilityMode.ATLEAST_ONCE)
    self.enable_message_timeouts = \
      context.get_cluster_config().get(api_constants.TOPOLOGY_ENABLE_MESSAGE_TIMEOUTS)
    self._initialized_metrics_and_tasks = False
    Log.info("Enable ACK: %s" % str(self.acking_enabled))
    Log.info("Enable Message Timeouts: %s" % str(self.enable_message_timeouts))

    # map <tuple_info.key -> tuple_info>, ordered by insertion time
    self.in_flight_tuples = collections.OrderedDict()
    # tuples to ack right away: a tup_id was supplied but acking is disabled
    self.immediate_acks = collections.deque()
    self.total_tuples_emitted = 0

    # load user's spout class
    spout_impl_class = super(SpoutInstance, self).load_py_instance(is_spout=True)
    self.spout_impl = spout_impl_class(delegate=self)

  def start_component(self, stateful_state):
    """Starts the spout: registers metrics, initializes user code, and schedules the
    spout task on the looper (first call only), then marks the topology RUNNING.

    :param stateful_state: initial state handed to the user's spout when it is a
        StatefulComponent in a stateful topology
    """
    context = self.pplan_helper.context
    # register_metrics/_add_spout_task must run only once even if the component
    # is restarted, hence the _initialized_metrics_and_tasks guard.
    if not self._initialized_metrics_and_tasks:
      self.spout_metrics.register_metrics(context)
    if self.is_stateful and isinstance(self.spout_impl, StatefulComponent):
      self.spout_impl.init_state(stateful_state)
    self.spout_impl.initialize(config=context.get_cluster_config(), context=context)
    if not self._initialized_metrics_and_tasks:
      self._add_spout_task()
      self._initialized_metrics_and_tasks = True
    self.topology_state = topology_pb2.TopologyState.Value("RUNNING")

  def stop_component(self):
    """Closes the user's spout and marks the topology PAUSED."""
    self.spout_impl.close()
    self.topology_state = topology_pb2.TopologyState.Value("PAUSED")

  def invoke_activate(self):
    """Activates the user's spout and marks the topology RUNNING."""
    Log.info("Spout is activated")
    self.spout_impl.activate()
    self.topology_state = topology_pb2.TopologyState.Value("RUNNING")

  def invoke_deactivate(self):
    """Deactivates the user's spout and marks the topology PAUSED."""
    Log.info("Spout is deactivated")
    self.spout_impl.deactivate()
    self.topology_state = topology_pb2.TopologyState.Value("PAUSED")

  def emit(self, tup, tup_id=None, stream=Stream.DEFAULT_STREAM_ID,
           direct_task=None, need_task_ids=False):
    """Emits a new tuple from this Spout

    It is compatible with StreamParse API.

    :type tup: list or tuple
    :param tup: the new output Tuple to send from this spout,
                should contain only serializable data.
    :type tup_id: str or object
    :param tup_id: the ID for the Tuple. Leave this blank for an unreliable emit.
                   (Same as messageId in Java)
    :type stream: str
    :param stream: the ID of the stream this Tuple should be emitted to.
                   Leave empty to emit to the default stream.
    :type direct_task: int
    :param direct_task: the task to send the Tuple to if performing a direct emit.
    :type need_task_ids: bool
    :param need_task_ids: indicate whether or not you would like the task IDs
                          the Tuple was emitted.
    """
    # first check whether this tuple is sane
    self.pplan_helper.check_output_schema(stream, tup)

    # get custom grouping target task ids; get empty list if not custom grouping
    custom_target_task_ids = self.pplan_helper.choose_tasks_for_custom_grouping(stream, tup)

    self.pplan_helper.context.invoke_hook_emit(tup, stream, None)

    data_tuple = tuple_pb2.HeronDataTuple()
    data_tuple.key = 0

    if direct_task is not None:
      if not isinstance(direct_task, int):
        raise TypeError("direct_task argument needs to be an integer, given: %s"
                        % str(type(direct_task)))
      # performing emit-direct
      data_tuple.dest_task_ids.append(direct_task)
    elif custom_target_task_ids is not None:
      # for custom grouping
      for task_id in custom_target_task_ids:
        data_tuple.dest_task_ids.append(task_id)

    if tup_id is not None:
      tuple_info = TupleHelper.make_root_tuple_info(stream, tup_id)
      if self.acking_enabled:
        # this message is rooted: track it until an ack/fail/timeout arrives
        root = data_tuple.roots.add()
        root.taskid = self.pplan_helper.my_task_id
        root.key = tuple_info.key
        self.in_flight_tuples[tuple_info.key] = tuple_info
      else:
        # acking disabled: ack it on the next spout-task iteration
        self.immediate_acks.append(tuple_info)

    tuple_size_in_bytes = 0

    start_time = time.time()

    # Serialize each value and accumulate the total payload size for metrics
    for obj in tup:
      serialized = self.serializer.serialize(obj)
      data_tuple.values.append(serialized)
      tuple_size_in_bytes += len(serialized)
    serialize_latency_ns = (time.time() - start_time) * system_constants.SEC_TO_NS
    self.spout_metrics.serialize_data_tuple(stream, serialize_latency_ns)

    super(SpoutInstance, self).admit_data_tuple(stream_id=stream, data_tuple=data_tuple,
                                                tuple_size_in_bytes=tuple_size_in_bytes)
    self.total_tuples_emitted += 1
    self.spout_metrics.update_emit_count(stream)
    if need_task_ids:
      # NOTE(review): when custom grouping chose tasks AND direct_task is given,
      # this appends to the same list returned by the grouping helper — presumably
      # the two cases are mutually exclusive upstream; confirm with callers.
      sent_task_ids = custom_target_task_ids or []
      if direct_task is not None:
        sent_task_ids.append(direct_task)
      return sent_task_ids

  # pylint: disable=no-self-use
  def process_incoming_tuples(self):
    """Wakes up the looper; actual reading happens inside the spout task."""
    self.looper.wake_up()

  def _read_tuples(self):
    """Drains ack/fail control tuples (and checkpoint requests) from in_stream.

    Spouts never receive data tuples; a data tuple here is a fatal error.
    Stops early once the configured ack batch time budget is exceeded.
    """
    start_cycle_time = time.time()
    ack_batch_time = self.sys_config[system_constants.INSTANCE_ACK_BATCH_TIME_MS] * \
                     system_constants.MS_TO_SEC
    while not self.in_stream.is_empty():
      try:
        tuples = self.in_stream.poll()
      except Queue.Empty:
        break

      if isinstance(tuples, tuple_pb2.HeronTupleSet):
        if tuples.HasField("data"):
          raise RuntimeError("Spout cannot get incoming data tuples from other components")
        elif tuples.HasField("control"):
          for ack_tuple in tuples.control.acks:
            self._handle_ack_tuple(ack_tuple, True)
          for fail_tuple in tuples.control.fails:
            self._handle_ack_tuple(fail_tuple, False)
        else:
          Log.error("Received tuple neither data nor control")
      elif isinstance(tuples, ckptmgr_pb2.InitiateStatefulCheckpoint):
        self.handle_initiate_stateful_checkpoint(tuples, self.spout_impl)
      else:
        Log.error("Received tuple not instance of HeronTupleSet")

      # avoid spending too much time here
      if time.time() - start_cycle_time - ack_batch_time > 0:
        break

  def _produce_tuple(self):
    """Repeatedly invokes the user's next_tuple() until a batch limit is hit.

    The loop stops when: nothing was emitted in an iteration, the emit batch
    time budget is exceeded, the emitted-bytes batch size is exceeded, or
    (with acking) the number of in-flight tuples reaches max spout pending.
    """
    # TOPOLOGY_MAX_SPOUT_PENDING must be provided (if not included, raise KeyError)
    max_spout_pending = \
      self.pplan_helper.context.get_cluster_config().get(api_constants.TOPOLOGY_MAX_SPOUT_PENDING)

    total_tuples_emitted_before = self.total_tuples_emitted
    total_data_emitted_bytes_before = self.get_total_data_emitted_in_bytes()
    emit_batch_time = \
      float(self.sys_config[system_constants.INSTANCE_EMIT_BATCH_TIME_MS]) * \
      system_constants.MS_TO_SEC
    emit_batch_size = int(self.sys_config[system_constants.INSTANCE_EMIT_BATCH_SIZE_BYTES])
    start_cycle_time = time.time()

    while (self.acking_enabled and max_spout_pending > len(self.in_flight_tuples)) or \
        not self.acking_enabled:
      start_time = time.time()
      self.spout_impl.next_tuple()
      next_tuple_latency_ns = (time.time() - start_time) * system_constants.SEC_TO_NS
      self.spout_metrics.next_tuple(next_tuple_latency_ns)

      if (self.total_tuples_emitted == total_tuples_emitted_before) or \
          (time.time() - start_cycle_time - emit_batch_time > 0) or \
          (self.get_total_data_emitted_in_bytes() - total_data_emitted_bytes_before
           > emit_batch_size):
        # no tuples to emit or batch reached
        break

      total_tuples_emitted_before = self.total_tuples_emitted

  def _add_spout_task(self):
    """Registers the recurring spout task on the looper and, if message
    timeouts are enabled, kicks off the periodic timeout scan."""
    Log.info("Adding spout task...")

    def spout_task():
      # don't do anything when topology is paused
      if not self._is_topology_running():
        return

      self._read_tuples()

      if self._should_produce_tuple():
        self._produce_tuple()
        self.output_helper.send_out_tuples()
        self.looper.wake_up()  # so emitted tuples would be added to buffer now
      else:
        # out queue full (or topology not running): record back-pressure
        self.spout_metrics.update_out_queue_full_count()

      if self.acking_enabled:
        self.spout_metrics.update_pending_tuples_count(len(self.in_flight_tuples))
      else:
        self._do_immediate_acks()

      if self._is_continue_to_work():
        self.looper.wake_up()

    self.looper.add_wakeup_task(spout_task)

    # look for the timeout's tuples
    if self.enable_message_timeouts:
      self._look_for_timeouts()

  def _is_topology_running(self):
    """Returns True iff the topology state is RUNNING."""
    return self.topology_state == topology_pb2.TopologyState.Value("RUNNING")

  def _should_produce_tuple(self):
    """Checks whether we could produce tuples, i.e. invoke spout.next_tuple()"""
    return self._is_topology_running() and self.output_helper.is_out_queue_available()

  def _is_continue_to_work(self):
    """Checks whether we still need to do more work

    When the topology state is RUNNING:
    1. if the out_queue is not full and ack is not enabled, we could wake up next time
       to produce more tuples and push to the out_queue
    2. if the out_queue is not full but the acking is enabled, we need to make sure that
       the number of pending tuples is smaller than max_spout_pending
    3. if there are more to read, we will wake up itself next time.
    """
    if not self._is_topology_running():
      return False

    max_spout_pending = \
      self.pplan_helper.context.get_cluster_config().get(api_constants.TOPOLOGY_MAX_SPOUT_PENDING)

    if not self.acking_enabled and self.output_helper.is_out_queue_available():
      return True
    elif self.acking_enabled and self.output_helper.is_out_queue_available() and \
        len(self.in_flight_tuples) < max_spout_pending:
      return True
    elif self.acking_enabled and not self.in_stream.is_empty():
      return True
    else:
      return False

  def _look_for_timeouts(self):
    """Expires in-flight tuples older than the message timeout and fails them.

    Relies on in_flight_tuples being ordered by insertion time, so scanning can
    stop at the first non-expired entry. Re-registers itself on the looper so it
    runs roughly n_bucket times per timeout interval.
    """
    spout_config = self.pplan_helper.context.get_cluster_config()
    timeout_sec = spout_config.get(api_constants.TOPOLOGY_MESSAGE_TIMEOUT_SECS)
    n_bucket = self.sys_config.get(system_constants.INSTANCE_ACKNOWLEDGEMENT_NBUCKETS)
    now = time.time()
    timeout_lst = []
    while self.in_flight_tuples:
      key = next(iter(self.in_flight_tuples))
      tuple_info = self.in_flight_tuples[key]
      if tuple_info.is_expired(now, timeout_sec):
        timeout_lst.append(tuple_info)
        self.in_flight_tuples.pop(key)
      else:
        # in_flight_tuples are ordered by insertion time
        break

    for tuple_info in timeout_lst:
      self.spout_metrics.timeout_tuple(tuple_info.stream_id)
      self._invoke_fail(tuple_info.tuple_id, tuple_info.stream_id,
                        timeout_sec * system_constants.SEC_TO_NS)

    # register this method to timer again
    self.looper.register_timer_task_in_sec(self._look_for_timeouts,
                                           float(timeout_sec) / n_bucket)

  # ACK/FAIL related
  def _handle_ack_tuple(self, tup, is_success):
    """Resolves one incoming ack/fail control tuple against in-flight state.

    :param tup: protobuf control tuple whose roots identify the tracked tuples
    :param is_success: True for an ack, False for a fail
    :raises RuntimeError: if a root targets a different task than this instance
    """
    for rt in tup.roots:
      if rt.taskid != self.pplan_helper.my_task_id:
        raise RuntimeError("Receiving tuple for task: %s in task: %s"
                           % (str(rt.taskid), str(self.pplan_helper.my_task_id)))
      try:
        tuple_info = self.in_flight_tuples.pop(rt.key)
      except KeyError:
        # rt.key is not in in_flight_tuples -> already removed due to time-out
        return

      # pylint: disable=no-member
      if tuple_info.tuple_id is not None:
        latency_ns = (time.time() - tuple_info.insertion_time) * system_constants.SEC_TO_NS
        if is_success:
          self._invoke_ack(tuple_info.tuple_id, tuple_info.stream_id, latency_ns)
        else:
          self._invoke_fail(tuple_info.tuple_id, tuple_info.stream_id, latency_ns)

  def _do_immediate_acks(self):
    """Acks (with zero latency) every tuple queued for immediate acknowledgement."""
    # snapshot the size so acks appended during the user's ack() callback are
    # handled on the next iteration of the spout task
    size = len(self.immediate_acks)
    for _ in range(size):
      tuple_info = self.immediate_acks.pop()
      self._invoke_ack(tuple_info.tuple_id, tuple_info.stream_id, 0)

  def _invoke_ack(self, tuple_id, stream_id, complete_latency_ns):
    """Calls the user's ack(), the ack hook, and records the ack metric."""
    Log.debug("In invoke_ack(): Acking %s from stream: %s" % (str(tuple_id), stream_id))
    self.spout_impl.ack(tuple_id)
    self.pplan_helper.context.invoke_hook_spout_ack(tuple_id, complete_latency_ns)
    self.spout_metrics.acked_tuple(stream_id, complete_latency_ns)

  def _invoke_fail(self, tuple_id, stream_id, fail_latency_ns):
    """Calls the user's fail(), the fail hook, and records the fail metric."""
    Log.debug("In invoke_fail(): Failing %s from stream: %s" % (str(tuple_id), stream_id))
    self.spout_impl.fail(tuple_id)
    self.pplan_helper.context.invoke_hook_spout_fail(tuple_id, fail_latency_ns)
    self.spout_metrics.failed_tuple(stream_id, fail_latency_ns)