def make_data_tuple_from_list(lst, serializer=PythonSerializer()):
    """Build a HeronDataTuple whose values are the serialized items of ``lst``.

    :type lst: list
    :param lst: objects to serialize, in order, into the tuple's values.
    :param serializer: object exposing ``serialize(obj)``; the default instance
                       is created once, when this function is defined.
    :return: a pair ``(data_tuple, tuple_size_in_bytes)`` where the size is the
             sum of the lengths of all serialized values.
    """
    heron_tuple = tuple_pb2.HeronDataTuple()
    heron_tuple.key = 0

    payload_sizes = []
    for item in lst:
        payload = serializer.serialize(item)
        heron_tuple.values.append(payload)
        payload_sizes.append(len(payload))

    return heron_tuple, sum(payload_sizes)
def emit(self, tup, stream=Stream.DEFAULT_STREAM_ID, anchors=None,
         direct_task=None, need_task_ids=False):
    """Emits a new tuple from this Bolt

    It is compatible with StreamParse API.

    :type tup: list or tuple
    :param tup: the new output Tuple to send from this bolt,
                should contain only serializable data.
    :type stream: str
    :param stream: the ID of the stream to emit this Tuple to.
                   Leave empty to emit to the default stream.
    :type anchors: list
    :param anchors: a list of HeronTuples to which the emitted Tuples should be anchored.
                    Entries that are not HeronTuples, or whose ``roots`` is None, are ignored.
    :type direct_task: int
    :param direct_task: the task to send the Tuple to if performing a direct emit.
    :type need_task_ids: bool
    :param need_task_ids: indicate whether or not you would like the task IDs the Tuple was emitted.
    :raises TypeError: if ``direct_task`` is given but is not an integer.
    :return: a new list of task IDs (custom grouping targets followed by
             ``direct_task`` when given) if ``need_task_ids`` is True, otherwise None.
    """
    # first check whether this tuple is sane
    self.pplan_helper.check_output_schema(stream, tup)

    # get custom grouping target task ids; get empty list if not custom grouping
    custom_target_task_ids = self.pplan_helper.choose_tasks_for_custom_grouping(stream, tup)

    self.pplan_helper.context.invoke_hook_emit(tup, stream, None)

    data_tuple = tuple_pb2.HeronDataTuple()
    data_tuple.key = 0

    if direct_task is not None:
        if not isinstance(direct_task, int):
            raise TypeError("direct_task argument needs to be an integer, given: %s"
                            % str(type(direct_task)))
        # performing emit-direct
        data_tuple.dest_task_ids.append(direct_task)
    elif custom_target_task_ids is not None:
        # for custom grouping
        for task_id in custom_target_task_ids:
            data_tuple.dest_task_ids.append(task_id)

    # Set the anchors for a tuple
    if anchors is not None:
        merged_roots = set()
        # Use a dedicated loop variable: reusing the name ``tup`` here would
        # rebind the parameter and make the serialize loop below iterate over
        # the last anchor instead of the tuple being emitted.
        for anchor in anchors:
            if isinstance(anchor, HeronTuple) and anchor.roots is not None:
                merged_roots.update(anchor.roots)
        for rt in merged_roots:
            to_add = data_tuple.roots.add()
            to_add.CopyFrom(rt)

    tuple_size_in_bytes = 0
    start_time = time.time()

    # Serialize
    for obj in tup:
        serialized = self.serializer.serialize(obj)
        data_tuple.values.append(serialized)
        tuple_size_in_bytes += len(serialized)

    serialize_latency_ns = (time.time() - start_time) * constants.SEC_TO_NS
    self.bolt_metrics.serialize_data_tuple(stream, serialize_latency_ns)

    super(BoltInstance, self).admit_data_tuple(stream_id=stream, data_tuple=data_tuple,
                                               tuple_size_in_bytes=tuple_size_in_bytes)

    self.bolt_metrics.update_emit_count(stream)

    if need_task_ids:
        # Copy before appending so the list handed back by the custom grouping
        # helper is never modified through this return value.
        sent_task_ids = list(custom_target_task_ids or [])
        if direct_task is not None:
            sent_task_ids.append(direct_task)
        return sent_task_ids
    return None
def emit(self, tup, tup_id=None, stream=Stream.DEFAULT_STREAM_ID,
         direct_task=None, need_task_ids=False):
    """Emits a new tuple from this Spout

    It is compatible with StreamParse API.

    :type tup: list or tuple
    :param tup: the new output Tuple to send from this spout,
                should contain only serializable data.
    :type tup_id: str or object
    :param tup_id: the ID for the Tuple. Leave this blank for an unreliable emit.
                   (Same as messageId in Java)
    :type stream: str
    :param stream: the ID of the stream this Tuple should be emitted to.
                   Leave empty to emit to the default stream.
    :type direct_task: int
    :param direct_task: the task to send the Tuple to if performing a direct emit.
    :type need_task_ids: bool
    :param need_task_ids: indicate whether or not you would like the task IDs the Tuple was emitted.
    :raises TypeError: if ``direct_task`` is given but is not an integer.
    :return: a new list of task IDs (custom grouping targets followed by
             ``direct_task`` when given) if ``need_task_ids`` is True, otherwise None.
    """
    # first check whether this tuple is sane
    self.pplan_helper.check_output_schema(stream, tup)

    # get custom grouping target task ids; get empty list if not custom grouping
    custom_target_task_ids = self.pplan_helper.choose_tasks_for_custom_grouping(stream, tup)

    self.pplan_helper.context.invoke_hook_emit(tup, stream, None)

    data_tuple = tuple_pb2.HeronDataTuple()
    data_tuple.key = 0

    if direct_task is not None:
        if not isinstance(direct_task, int):
            raise TypeError("direct_task argument needs to be an integer, given: %s"
                            % str(type(direct_task)))
        # performing emit-direct
        data_tuple.dest_task_ids.append(direct_task)
    elif custom_target_task_ids is not None:
        # for custom grouping
        for task_id in custom_target_task_ids:
            data_tuple.dest_task_ids.append(task_id)

    if tup_id is not None:
        tuple_info = TupleHelper.make_root_tuple_info(stream, tup_id)
        if self.acking_enabled:
            # this message is rooted
            root = data_tuple.roots.add()
            root.taskid = self.pplan_helper.my_task_id
            root.key = tuple_info.key
            # remembered by root key while the tuple is in flight
            self.in_flight_tuples[tuple_info.key] = tuple_info
        else:
            # acking is disabled: queue the tuple info on the immediate-acks list
            self.immediate_acks.append(tuple_info)

    tuple_size_in_bytes = 0
    start_time = time.time()

    # Serialize
    for obj in tup:
        serialized = self.serializer.serialize(obj)
        data_tuple.values.append(serialized)
        tuple_size_in_bytes += len(serialized)

    serialize_latency_ns = (time.time() - start_time) * constants.SEC_TO_NS
    self.spout_metrics.serialize_data_tuple(stream, serialize_latency_ns)

    super(SpoutInstance, self).admit_data_tuple(stream_id=stream, data_tuple=data_tuple,
                                                tuple_size_in_bytes=tuple_size_in_bytes)
    self.total_tuples_emitted += 1
    self.spout_metrics.update_emit_count(stream)

    if need_task_ids:
        # Copy before appending so the list handed back by the custom grouping
        # helper is never modified through this return value.
        sent_task_ids = list(custom_target_task_ids or [])
        if direct_task is not None:
            sent_task_ids.append(direct_task)
        return sent_task_ids
    return None
class BaseInstance(object):
    """Common base for Heron bolt and spout instances.

    Provides:
      1. The basic output collector API, forwarding tuples to the Out-Stream
      2. The start/stop lifecycle shared by spouts and bolts

    :ivar pplan_helper: Physical Plan Helper for this component
    :ivar in_stream: In-Stream Heron Communicator
    :ivar output_helper: Outgoing Tuple Helper
    :ivar serializer: Implementation of Heron Serializer
    """

    def make_data_tuple(_):
        """Return a fresh, empty HeronDataTuple (the receiver is ignored)."""
        return tuple_pb2.HeronDataTuple()

    def __init__(self, pplan_helper, in_stream, out_stream, looper):
        self.pplan_helper = pplan_helper
        self.in_stream = in_stream
        self.output_helper = OutgoingTupleHelper(pplan_helper, out_stream)
        self.looper = looper
        self.sys_config = system_config.get_sys_config()

        # getLogger() without a name hands back the root logger
        self.logger = logging.getLogger()

        # Reliability mode defaults to at-most-once when the cluster config
        # does not specify one; only effectively-once makes us stateful.
        topology_context = pplan_helper.context
        reliability_mode = topology_context.get_cluster_config().get(
            api_constants.TOPOLOGY_RELIABILITY_MODE,
            api_constants.TopologyReliabilityMode.ATMOST_ONCE)
        self.is_stateful = bool(
            reliability_mode == api_constants.TopologyReliabilityMode.EFFECTIVELY_ONCE)
        self._stateful_state = None

        self.serializer = SerializerHelper.get_serializer(topology_context)
        self._initialized_global_metrics = False

    def log(self, message, level=None):
        """Log message, optionally providing a logging level

        It is compatible with StreamParse API.

        :type message: str
        :param message: the log message to send
        :type level: str
        :param level: the logging level,
                      one of: trace (=debug), debug, info, warn or error (default: info)
        :raises ValueError: if ``level`` is not one of the supported names
        """
        if level is None:
            severity = logging.INFO
        elif level == "trace" or level == "debug":
            severity = logging.DEBUG
        elif level == "info":
            severity = logging.INFO
        elif level == "warn":
            severity = logging.WARNING
        elif level == "error":
            severity = logging.ERROR
        else:
            raise ValueError("%s is not supported as logging level" % str(level))

        self.logger.log(severity, message)

    def admit_data_tuple(self, stream_id, data_tuple, tuple_size_in_bytes):
        """Hand a data tuple for ``stream_id`` to the outgoing tuple helper."""
        self.output_helper.add_data_tuple(stream_id, data_tuple, tuple_size_in_bytes)

    def admit_control_tuple(self, control_tuple, tuple_size_in_bytes, is_ack):
        """Hand a control tuple to the outgoing tuple helper."""
        self.output_helper.add_control_tuple(control_tuple, tuple_size_in_bytes, is_ack)

    def admit_ckpt_state(self, ckpt_id, ckpt_state):
        """Serialize ``ckpt_state`` and hand it to the outgoing tuple helper."""
        serialized_state = self.serializer.serialize(ckpt_state)
        self.output_helper.add_ckpt_state(ckpt_id, serialized_state)

    def get_total_data_emitted_in_bytes(self):
        """Return the byte counter kept by the outgoing tuple helper."""
        return self.output_helper.total_data_emitted_in_bytes

    def load_py_instance(self, is_spout):
        """Loads user defined component (spout/bolt)

        :param is_spout: True to load the spout of this instance, False for the bolt
        :return: the class obtained from the topology pex
        :raises RuntimeError: if anything fails while locating or importing the class
        """
        try:
            if is_spout:
                component_proto = self.pplan_helper.get_my_spout()
                loading_message = "Loading Spout from: %s"
            else:
                component_proto = self.pplan_helper.get_my_bolt()
                loading_message = "Loading Bolt from: %s"
            py_classpath = component_proto.comp.class_name
            self.logger.info(loading_message, py_classpath)

            pex_path = self.pplan_helper.topology_pex_abs_path
            pex_loader.load_pex(pex_path)
            spbl_class = pex_loader.import_and_get_class(pex_path, py_classpath)
        except Exception as e:
            spbl = "spout" if is_spout else "bolt"
            # keep the full traceback in the log; the re-raised error only carries str(e)
            self.logger.error(traceback.format_exc())
            raise RuntimeError("Error when loading a %s from pex: %s" % (spbl, str(e)))
        return spbl_class

    def handle_initiate_stateful_checkpoint(self, ckptmsg, component):
        """React to an initiate-checkpoint message for ``component``.

        Calls ``pre_save`` on stateful components, then admits the current
        stateful state under the message's checkpoint id.

        :raises RuntimeError: if this instance is not running in stateful mode
        """
        checkpoint_id = ckptmsg.checkpoint_id
        Log.info("Received initiate state checkpoint message for %s" % checkpoint_id)

        if not self.is_stateful:
            raise RuntimeError("Received state checkpoint message but we are not stateful topology")

        if not isinstance(component, StatefulComponent):
            Log.info("Trying to checkponit a non stateful component. Send empty state")
        else:
            component.pre_save(ckptmsg.checkpoint_id)

        self.admit_ckpt_state(ckptmsg.checkpoint_id, self._stateful_state)

    def clear_collector(self):
        """Clear whatever the outgoing tuple helper currently holds."""
        self.output_helper.clear()

    def start(self, stateful_state):
        """Start the component, then run prepare hooks and one-time metrics setup."""
        self._stateful_state = stateful_state
        self.start_component(stateful_state)

        topology_context = self.pplan_helper.context
        topology_context.invoke_hook_prepare()

        # global metrics are initialised only on the first start()
        if not self._initialized_global_metrics:
            export_interval = float(
                self.sys_config[system_constants.HERON_METRICS_EXPORT_INTERVAL_SEC])
            metrics_collector = topology_context.get_metrics_collector()
            global_metrics.init(metrics_collector, export_interval)
            self._initialized_global_metrics = True

        # prepare for custom grouping
        self.pplan_helper.prepare_custom_grouping(topology_context)

    def stop(self):
        """Run cleanup hooks, then stop the component."""
        self.pplan_helper.context.invoke_hook_cleanup()
        self.stop_component()

    ##################################################################
    # The followings are to be implemented by Spout/Bolt independently
    ##################################################################

    @abstractmethod
    def start_component(self, stateful_state):
        """Do the basic setup for Heron Instance"""
        raise NotImplementedError()

    @abstractmethod
    def stop_component(self):
        """Do the basic clean for Heron Instance

        Note that this method is not guaranteed to be invoked
        """
        raise NotImplementedError()

    @abstractmethod
    def process_incoming_tuples(self):
        """Should be called when a tuple was buffered into in_stream"""
        raise NotImplementedError()

    @abstractmethod
    def invoke_activate(self):
        """Activate the instance"""
        raise NotImplementedError()

    @abstractmethod
    def invoke_deactivate(self):
        """Deactivate the instance"""
        raise NotImplementedError()
class BaseInstance(object):
    """The base class for heron bolt/spout instance

    Implements the following functionality:
    1. Basic output collector API and pushing tuples to Out-Stream
    2. Run tasks continually

    :ivar pplan_helper: Physical Plan Helper for this component
    :ivar in_stream: In-Stream Heron Communicator
    :ivar output_helper: Outgoing Tuple Helper
    :ivar serializer: Implementation of Heron Serializer
    """

    def make_data_tuple(_):
        """Return a fresh, empty HeronDataTuple (the receiver is ignored)."""
        return tuple_pb2.HeronDataTuple()

    def __init__(self, pplan_helper, in_stream, out_stream, looper):
        self.pplan_helper = pplan_helper
        self.in_stream = in_stream
        self.output_helper = OutgoingTupleHelper(self.pplan_helper, out_stream)
        self.looper = looper
        self.sys_config = system_config.get_sys_config()

        # will set a root logger here
        self.logger = logging.getLogger()

    def log(self, message, level=None):
        """Log message, optionally providing a logging level

        It is compatible with StreamParse API.

        :type message: str
        :param message: the log message to send
        :type level: str
        :param level: the logging level,
                      one of: trace (=debug), debug, info, warn or error (default: info)
        :raises ValueError: if ``level`` is not one of the supported names
        """
        if level is None:
            _log_level = logging.INFO
        else:
            if level == "trace" or level == "debug":
                _log_level = logging.DEBUG
            elif level == "info":
                _log_level = logging.INFO
            elif level == "warn":
                _log_level = logging.WARNING
            elif level == "error":
                _log_level = logging.ERROR
            else:
                raise ValueError("%s is not supported as logging level" % str(level))

        self.logger.log(_log_level, message)

    def admit_data_tuple(self, stream_id, data_tuple, tuple_size_in_bytes):
        """Hand a data tuple for ``stream_id`` to the outgoing tuple helper."""
        self.output_helper.add_data_tuple(stream_id, data_tuple, tuple_size_in_bytes)

    def admit_control_tuple(self, control_tuple, tuple_size_in_bytes, is_ack):
        """Hand a control tuple to the outgoing tuple helper."""
        self.output_helper.add_control_tuple(control_tuple, tuple_size_in_bytes, is_ack)

    def get_total_data_emitted_in_bytes(self):
        """Return the byte counter kept by the outgoing tuple helper."""
        return self.output_helper.total_data_emitted_in_bytes

    def load_py_instance(self, is_spout):
        """Loads user defined component (spout/bolt)

        :param is_spout: True to load the spout of this instance, False for the bolt
        :return: the class obtained from the topology pex
        :raises RuntimeError: if anything fails while locating or importing the class
        """
        try:
            if is_spout:
                spout_proto = self.pplan_helper.get_my_spout()
                py_classpath = spout_proto.comp.class_name
                self.logger.info("Loading Spout from: %s", py_classpath)
            else:
                bolt_proto = self.pplan_helper.get_my_bolt()
                py_classpath = bolt_proto.comp.class_name
                self.logger.info("Loading Bolt from: %s", py_classpath)

            pex_loader.load_pex(self.pplan_helper.topology_pex_abs_path)
            spbl_class = pex_loader.import_and_get_class(
                self.pplan_helper.topology_pex_abs_path, py_classpath)
        except Exception as e:
            spbl = "spout" if is_spout else "bolt"
            self.logger.error(traceback.format_exc())
            # str(e) rather than e.message: exceptions have no ``message``
            # attribute on Python 3, and reading it here would replace the real
            # failure with an AttributeError.
            raise RuntimeError("Error when loading a %s from pex: %s" % (spbl, str(e)))
        return spbl_class

    ##################################################################
    # The followings are to be implemented by Spout/Bolt independently
    ##################################################################
    # NOTE: no ABC metaclass is set on this class, so the explicit
    # NotImplementedError is what signals a missing override.

    @abstractmethod
    def start(self):
        """Do the basic setup for Heron Instance"""
        raise NotImplementedError()

    @abstractmethod
    def stop(self):
        """Do the basic clean for Heron Instance

        Note that this method is not guaranteed to be invoked
        """
        raise NotImplementedError()

    @abstractmethod
    def process_incoming_tuples(self):
        """Should be called when a tuple was buffered into in_stream"""
        raise NotImplementedError()

    @abstractmethod
    def _read_tuples_and_execute(self):
        """Read tuples from a queue and process the tuples"""
        raise NotImplementedError()

    @abstractmethod
    def invoke_activate(self):
        """Activate the instance"""
        raise NotImplementedError()

    @abstractmethod
    def invoke_deactivate(self):
        """Deactivate the instance"""
        raise NotImplementedError()