Example #1
class SampleBolt(Bolt, StatefulComponent):
    """SampleBolt"""
    # output declarer
    outputs = [Stream(fields=['_output_'], name='output')]
    FRACTION = 'fraction'

    def initState(self, stateful_state):
        # sample does not have any state
        pass

    def preSave(self, checkpoint_id):
        # sample does not have any state
        pass

    def initialize(self, config, context):
        self.logger.debug("SampleBolt's Component-specific config: \n%s" %
                          str(config))
        self.processed = 0
        self.emitted = 0
        if SampleBolt.FRACTION in config:
            self.sample_fraction = config[SampleBolt.FRACTION]
            if not isinstance(self.sample_fraction, float):
                raise RuntimeError("Sample fraction has to be a float")
            if self.sample_fraction > 1.0:
                raise RuntimeError("Sample fraction has to be <= 1.0")
        else:
            raise RuntimeError("SampleBolt needs to be passed filter function")

    def process(self, tup):
        # pass the tuple through with probability sample_fraction
        # (assumes the module imports random)
        if random.random() < self.sample_fraction:
            self.emit(tup.values, stream='output')
            self.emitted += 1
        self.processed += 1
        self.ack(tup)
Example #2
class MapBolt(Bolt, StatefulComponent):
    """MapBolt"""
    # output declarer
    outputs = [Stream(fields=['_output_'], name='output')]
    FUNCTION = 'function'

    def initState(self, stateful_state):
        # mapBolt does not have any state
        pass

    def preSave(self, checkpoint_id):
        # mapBolt does not have any state
        pass

    def initialize(self, config, context):
        self.logger.debug("MapBolt's Component-specific config: \n%s" %
                          str(config))
        self.processed = 0
        self.emitted = 0
        if MapBolt.FUNCTION in config:
            self.map_function = config[MapBolt.FUNCTION]
            if not callable(self.map_function):
                raise RuntimeError("Map function has to be callable")
        else:
            raise RuntimeError("MapBolt needs to be passed map function")

    def process(self, tup):
        retval = self.map_function(tup.values[0])
        self.emit([retval], stream='output')
        self.processed += 1
        self.emitted += 1
        self.ack(tup)
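MapBolt expects its map function under the 'function' config key. A minimal wiring sketch in the Topology style of Example #4 below, assuming heronpy's standard spec() helper on Spout/Bolt (WordSpout and double are hypothetical placeholders):

# hypothetical map function; defined at module level so it can be pickled
def double(x):
    return x * 2

class MapTopology(Topology):
    spout = WordSpout.spec(par=1)  # WordSpout: any single-field word source
    mapped = MapBolt.spec(par=2,
                          inputs={spout: Grouping.SHUFFLE},
                          config={MapBolt.FUNCTION: double})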
Example #3
class FlatMapBolt(Bolt):
    """FlatMapBolt"""
    # output declarer
    outputs = [Stream(fields=['_output_'], name='output')]
    FUNCTION = 'function'

    def initialize(self, config, context):
        self.logger.debug("FlatMapBolt's Component-specific config: \n%s" %
                          str(config))
        self.processed = 0
        self.emitted = 0
        if FlatMapBolt.FUNCTION in config:
            self.flatmap_function = config[FlatMapBolt.FUNCTION]
            if not callable(self.flatmap_function):
                raise RuntimeError("FlatMap function has to be callable")
        else:
            raise RuntimeError(
                "FlatMapBolt needs to be passed flatMap function")

    def process(self, tup):
        retval = self.flatmap_function(tup.values[0])
        # collections.abc.Iterable; the bare collections alias was removed in Python 3.10
        if isinstance(retval, collections.abc.Iterable):
            for value in retval:
                self.emit([value], stream='output')
                self.emitted += 1
        else:
            self.emit([retval], stream='output')
            self.emitted += 1
        self.processed += 1
        self.ack(tup)
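The flatmap function may return either a single value or an iterable; an iterable is unpacked into one emit per element. A hypothetical flatmap function showing the difference from map:

# hypothetical: one input sentence fans out to one emit per word
def split_words(sentence):
    return sentence.split()

# with config={FlatMapBolt.FUNCTION: split_words}, an input tuple carrying
# "hello world" produces two emits on 'output': ["hello"] and ["world"]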
Example #4
class TestSane(Topology):
    config = {
        "topology.wide.config.1": "value",
        "spout.overriden.config": True
    }
    spout = HeronComponentSpec(None,
                               "sp_class",
                               True,
                               3,
                               inputs=None,
                               outputs=[
                                   "word", "count",
                                   Stream(fields=['error_msg'],
                                          name='error_stream')
                               ],
                               config={
                                   "spout.specific.config.1": "value",
                                   "spout.specific.config.2": True,
                                   "spout.specific.config.3": -12.4,
                                   "spout.specific.config.4": [1, 2, 3],
                                   "spout.overriden.config": False
                               })
    bolt = HeronComponentSpec(None,
                              "bl_class",
                              False,
                              4,
                              inputs={
                                  spout: Grouping.SHUFFLE,
                                  spout['error_stream']: Grouping.ALL
                              })
Example #5
class JoinBolt(SlidingWindowBolt):
    """JoinBolt"""
    # output declarer
    outputs = [Stream(fields=['_output_'], name='output')]
    WINDOWDURATION = SlidingWindowBolt.WINDOW_DURATION_SECS
    SLIDEINTERVAL = SlidingWindowBolt.WINDOW_SLIDEINTERVAL_SECS

    @staticmethod
    def _add(key, value, mymap):
        if key in mymap:
            mymap[key].append(value)
        else:
            mymap[key] = [value]

    def processWindow(self, window_config, tuples):
        # our temporary map
        mymap = {}
        for tup in tuples:
            userdata = tup.values[0]
            if not isinstance(userdata,
                              collections.abc.Iterable) or len(userdata) != 2:
                raise RuntimeError("Join tuples must be iterable of length 2")
            self._add(userdata[0], userdata[1], mymap)
        for (key, values) in mymap.items():
            self.emit([(key, values)], stream='output')
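To make the window grouping concrete, _add builds a multimap from key to every value observed in the window. A standalone sketch with made-up values:

mymap = {}
for key, value in [("k1", 1), ("k2", 7), ("k1", 2)]:
    if key in mymap:
        mymap[key].append(value)
    else:
        mymap[key] = [value]
# mymap == {"k1": [1, 2], "k2": [7]}; the bolt then emits
# ("k1", [1, 2]) and ("k2", [7]) on its 'output' stream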
Example #6
class RepartitionBolt(Bolt):
  """RepartitionBolt"""
  # output declarer
  outputs = [Stream(fields=['_output_'], name='output')]

  def initialize(self, config, context):
    self.logger.debug("RepartitionBolt's Component-specific config: \n%s" % str(config))
    self.processed = 0
    self.emitted = 0

  def process(self, tup):
    self.emit(tup.values, stream='output')
    self.processed += 1
    self.emitted += 1
    self.ack(tup)
Example #7
class ReduceByKeyAndWindowBolt(SlidingWindowBolt):
    """ReduceByKeyAndWindowBolt"""
    # output declarer
    outputs = [Stream(fields=['_output_'], name='output')]
    FUNCTION = 'function'
    WINDOWDURATION = SlidingWindowBolt.WINDOW_DURATION_SECS
    SLIDEINTERVAL = SlidingWindowBolt.WINDOW_SLIDEINTERVAL_SECS

    def initialize(self, config, context):
        super(ReduceByKeyAndWindowBolt, self).initialize(config, context)
        if ReduceByKeyAndWindowBolt.FUNCTION not in config:
            raise RuntimeError(
                "FUNCTION not specified in reducebywindow operator")
        self.reduce_function = config[ReduceByKeyAndWindowBolt.FUNCTION]
        if not callable(self.reduce_function):
            raise RuntimeError("Reduce Function has to be callable")

    @staticmethod
    def _add(key, value, mymap):
        if key in mymap:
            mymap[key].append(value)
        else:
            mymap[key] = [value]

    def processWindow(self, window_config, tuples):
        # our temporary map
        mymap = {}
        for tup in tuples:
            userdata = tup.values[0]
            if not isinstance(userdata,
                              collections.abc.Iterable) or len(userdata) != 2:
                raise RuntimeError(
                    "ReduceByWindow tuples must be iterable of length 2")
            self._add(userdata[0], userdata[1], mymap)
        for (key, values) in mymap.items():
            result = values[0]
            for value in values[1:]:
                result = self.reduce_function(result, value)
            self.emit([(key, result)], stream='output')
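Written as a left fold, a reduce function such as operator.add collapses each key's window of values to one result. A standalone sketch with made-up values:

import operator

values = [1, 2, 3]
result = values[0]
for value in values[1:]:
    result = operator.add(result, value)
# result == 6; the bolt would emit (key, 6) for this window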
Example #8
class RepartitionBolt(Bolt, StatefulComponent):
  """RepartitionBolt"""
  # output declarer
  outputs = [Stream(fields=['_output_'], name='output')]

  def initState(self, stateful_state):
    # repartition does not have any state
    pass

  def preSave(self, checkpoint_id):
    # repartition does not have any state
    pass

  def initialize(self, config, context):
    self.logger.debug("RepartitionBolt's Component-specific config: \n%s" % str(config))
    self.processed = 0
    self.emitted = 0

  def process(self, tup):
    self.emit(tup.values, stream='output')
    self.processed += 1
    self.emitted += 1
    self.ack(tup)
Example #9
class MultiStreamSpout(Spout):
    """WordSpout: emits a set of words repeatedly"""
    # output field declarer
    outputs = ['word', Stream(fields=['error_msg'], name='error')]

    def initialize(self, config, context):
        self.logger.info("In initialize() of WordSpout")
        self.words = cycle(["hello", "bye", "good", "bad", "heron", "storm"])

        self.emit_count = 0

        self.logger.info("Component-specific config: \n%s" % str(config))
        self.logger.info("Context: \n%s" % str(context))

    def next_tuple(self):
        word = next(self.words)
        self.emit([word])
        self.emit_count += 1

        if self.emit_count % 100000 == 0:
            self.logger.info("Emitted %s" % str(self.emit_count))
            self.logger.info("Emitting to error stream")
            self.emit(["test error message"], stream='error')
Example #10
class FilterBolt(Bolt):
    """FilterBolt"""
    # output declarer
    outputs = [Stream(fields=['_output_'], name='output')]
    FUNCTION = 'function'

    def initialize(self, config, context):
        self.logger.debug("FilterBolt's Component-specific config: \n%s" %
                          str(config))
        self.processed = 0
        self.emitted = 0
        if FilterBolt.FUNCTION in config:
            self.filter_function = config[FilterBolt.FUNCTION]
            if not callable(self.filter_function):
                raise RuntimeError("Filter function has to be callable")
        else:
            raise RuntimeError("FilterBolt needs to be passed filter function")

    def process(self, tup):
        if self.filter_function(tup.values[0]):
            # pass the matching value through on the single-field output stream
            self.emit([tup.values[0]], stream='output')
            self.emitted += 1
        self.processed += 1
        self.ack(tup)
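As with MapBolt, the predicate goes under the 'function' config key and must be callable (a module-level def rather than a lambda, so it can be pickled). A hypothetical predicate:

# hypothetical predicate: keep only non-empty words
def non_empty(word):
    return len(word) > 0

# config={FilterBolt.FUNCTION: non_empty} drops tuples whose first value
# is empty and passes everything else through unchanged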
Example #11
class IntegrationTestBolt(Bolt):
    """Base bolt for integration test

  Every bolt of integration test topology consists of this instance, each delegating user's bolt.
  """
    outputs = [
        Stream(fields=[integ_const.INTEGRATION_TEST_TERMINAL],
               name=integ_const.INTEGRATION_TEST_CONTROL_STREAM_ID)
    ]

    @classmethod
    def spec(cls,
             name,
             par,
             inputs,
             config,
             user_bolt_classpath,
             user_output_fields=None):
        python_class_path = "%s.%s" % (cls.__module__, cls.__name__)
        config[integ_const.USER_BOLT_CLASSPATH] = user_bolt_classpath
        # avoid modification to cls.outputs
        _outputs = copy.copy(cls.outputs)
        if user_output_fields is not None:
            _outputs.extend(user_output_fields)
        return HeronComponentSpec(name,
                                  python_class_path,
                                  is_spout=False,
                                  par=par,
                                  inputs=inputs,
                                  outputs=_outputs,
                                  config=config)

    def initialize(self, config, context):
        user_bolt_classpath = config.get(integ_const.USER_BOLT_CLASSPATH, None)
        if user_bolt_classpath is None:
            raise RuntimeError("User defined integration bolt was not found")
        user_bolt_cls = self._load_user_bolt(context.get_topology_pex_path(),
                                             user_bolt_classpath)
        self.user_bolt = user_bolt_cls(delegate=self)

        upstream_components = set()
        self.terminal_to_receive = 0
        for streamId in context.get_this_sources().keys():
            # streamId is topology_pb2.StreamId protobuf message
            upstream_components.add(streamId.component_name)
        for comp_name in upstream_components:
            self.terminal_to_receive += len(
                context.get_component_tasks(comp_name))

        self.tuple_received = 0
        self.tuples_processed = 0
        self.current_tuple_processing = None

        Log.info("Terminals to receive: %d" % self.terminal_to_receive)
        self.user_bolt.initialize(config, context)

    @staticmethod
    def _load_user_bolt(pex_file, classpath):
        pex_loader.load_pex(pex_file)
        cls = pex_loader.import_and_get_class(pex_file, classpath)
        return cls

    @property
    def is_done(self):
        return self.terminal_to_receive == 0

    def process(self, tup):
        self.tuple_received += 1
        stream_id = tup.stream

        Log.info("Received a tuple: %s from %s" % (tup, stream_id))
        if stream_id == integ_const.INTEGRATION_TEST_CONTROL_STREAM_ID:
            self.terminal_to_receive -= 1
            if self.is_done:
                if isinstance(self.user_bolt, BatchBolt):
                    Log.info("Invoke bolt to do finish batch")
                    self.user_bolt.finish_batch()

                Log.info("Populating the terminals to downstream")
                super(IntegrationTestBolt, self).emit(
                    [integ_const.INTEGRATION_TEST_TERMINAL],
                    stream=integ_const.INTEGRATION_TEST_CONTROL_STREAM_ID)
        else:
            self.current_tuple_processing = tup
            self.user_bolt.process(tup)
            self.ack(tup)

    def emit(self,
             tup,
             stream=Stream.DEFAULT_STREAM_ID,
             anchors=None,
             direct_task=None,
             need_task_ids=False):
        if tup is None:
            super(IntegrationTestBolt,
                  self).emit(list(self.current_tuple_processing),
                             stream=stream,
                             anchors=anchors,
                             direct_task=direct_task,
                             need_task_ids=need_task_ids)
        else:
            super(IntegrationTestBolt, self).emit(tup, stream, anchors,
                                                  direct_task, need_task_ids)

    def ack(self, tup):
        Log.info("Trying to do an ack. tuples processed: %d, received: %d" %
                 (self.tuples_processed, self.tuple_received))
        if self.tuples_processed < self.tuple_received:
            super(IntegrationTestBolt, self).ack(tup)
            self.tuples_processed += 1

    def fail(self, tup):
        Log.info("Trying to do a fail. tuples processed: %d, received: %d" %
                 (self.tuples_processed, self.tuple_received))
        if self.tuples_processed < self.tuple_received:
            super(IntegrationTestBolt, self).fail(tup)
            self.tuples_processed += 1
Example #12
class PulsarSpout(Spout):
    """PulsarSpout: reads from a pulsar topic"""

    # pylint: disable=too-many-instance-attributes
    # pylint: disable=no-self-use

    outputs = [Stream(fields=['_output_'], name='output')]

    def default_deserializer(self, msg):
        return [str(msg)]

    # TopologyBuilder uses these constants to set
    # cluster/topicname
    serviceUrl = "PULSAR_SERVICE_URL"
    topicName = "PULSAR_TOPIC"
    receiveTimeoutMs = "PULSAR_RECEIVE_TIMEOUT_MS"
    deserializer = "PULSAR_MESSAGE_DESERIALIZER"

    def initialize(self, config, context):
        """Implements Pulsar Spout's initialize method"""
        self.logger.info("Initializing PulsarSpout with the following")
        self.logger.info("Component-specific config: \n%s" % str(config))
        self.logger.info("Context: \n%s" % str(context))

        self.emit_count = 0
        self.ack_count = 0
        self.fail_count = 0

        if PulsarSpout.serviceUrl not in config or PulsarSpout.topicName not in config:
            self.logger.fatal("Need to specify both serviceUrl and topicName")
        self.pulsar_cluster = str(config[PulsarSpout.serviceUrl])
        self.topic = str(config[PulsarSpout.topicName])
        mode = config[api_constants.TOPOLOGY_RELIABILITY_MODE]
        if mode == api_constants.TopologyReliabilityMode.ATLEAST_ONCE:
            self.acking_timeout = 1000 * int(
                config[api_constants.TOPOLOGY_MESSAGE_TIMEOUT_SECS])
        else:
            self.acking_timeout = 30000
        if PulsarSpout.receiveTimeoutMs in config:
            self.receive_timeout_ms = config[PulsarSpout.receiveTimeoutMs]
        else:
            self.receive_timeout_ms = 10
        if PulsarSpout.deserializer in config:
            self.deserializer = config[PulsarSpout.deserializer]
            if not callable(self.deserializer):
                self.logger.fatal(
                    "Pulsar Message Deserializer needs to be callable")
        else:
            self.deserializer = self.default_deserializer

        # First generate the config
        self.logConfFileName = GenerateLogConfig(context)
        self.logger.info("Generated LogConf at %s" % self.logConfFileName)

        # We currently use the high level consumer api
        # For supporting exactly once, we will need to switch
        # to using lower level Reader api, when it becomes
        # available in python
        self.client = pulsar.Client(self.pulsar_cluster,
                                    log_conf_file_path=self.logConfFileName)
        self.logger.info("Setup Client with cluster %s" % self.pulsar_cluster)
        try:
            self.consumer = self.client.subscribe(
                self.topic,
                context.get_topology_name(),
                consumer_type=pulsar.ConsumerType.Failover,
                unacked_messages_timeout_ms=self.acking_timeout)
        except Exception as e:
            self.logger.fatal("Pulsar client subscription failed: %s" % str(e))

        self.logger.info("Subscribed to topic %s" % self.topic)

    def next_tuple(self):
        try:
            msg = self.consumer.receive(timeout_millis=self.receive_timeout_ms)
        except Exception as e:
            self.logger.debug("Exception during recieve: %s" % str(e))
            return

        try:
            self.emit(self.deserializer(msg.data()), tup_id=msg.message_id())
            self.emit_count += 1
        except Exception as e:
            self.logger.info("Exception during emit: %s" % str(e))

    def ack(self, tup_id):
        self.ack_count += 1
        self.consumer.acknowledge(tup_id)

    def fail(self, tup_id):
        self.fail_count += 1
        self.logger.debug("Failed tuple %s" % str(tup_id))
Example #13
class IntegrationTestSpout(Spout):
    """Base spout for integration test

  Every spout of integration test topology consists of this instance, each delegating user's spout.
  """
    outputs = [
        Stream(fields=[integ_const.INTEGRATION_TEST_TERMINAL],
               name=integ_const.INTEGRATION_TEST_CONTROL_STREAM_ID)
    ]

    @classmethod
    def spec(cls,
             name,
             par,
             config,
             user_spout_classpath,
             user_output_fields=None):
        python_class_path = "%s.%s" % (cls.__module__, cls.__name__)

        config[integ_const.USER_SPOUT_CLASSPATH] = user_spout_classpath
        # avoid modification to cls.outputs
        _outputs = copy.copy(cls.outputs)
        if user_output_fields is not None:
            _outputs.extend(user_output_fields)
        return HeronComponentSpec(name,
                                  python_class_path,
                                  is_spout=True,
                                  par=par,
                                  inputs=None,
                                  outputs=_outputs,
                                  config=config)

    def initialize(self, config, context):
        user_spout_classpath = config.get(integ_const.USER_SPOUT_CLASSPATH,
                                          None)
        if user_spout_classpath is None:
            raise RuntimeError(
                "User defined integration test spout was not found")
        user_spout_cls = self._load_user_spout(context.get_topology_pex_path(),
                                               user_spout_classpath)
        self.user_spout = user_spout_cls(delegate=self)

        self.max_executions = config.get(integ_const.USER_MAX_EXECUTIONS,
                                         integ_const.MAX_EXECUTIONS)
        assert isinstance(self.max_executions, int) and self.max_executions > 0
        Log.info("Max executions: %d" % self.max_executions)
        self.tuples_to_complete = 0

        self.user_spout.initialize(config, context)

    @staticmethod
    def _load_user_spout(pex_file, classpath):
        pex_loader.load_pex(pex_file)
        cls = pex_loader.import_and_get_class(pex_file, classpath)
        return cls

    @property
    def is_done(self):
        return self.max_executions == 0

    def next_tuple(self):
        if self.is_done:
            return

        self.max_executions -= 1
        Log.info("max executions: %d" % self.max_executions)

        self.user_spout.next_tuple()

        if self.is_done:
            self._emit_terminal_if_needed()
            Log.info("This topology is finished.")

    def ack(self, tup_id):
        Log.info("Received an ack with tuple id: %s" % str(tup_id))
        self.tuples_to_complete -= 1
        if tup_id != integ_const.INTEGRATION_TEST_MOCK_MESSAGE_ID:
            self.user_spout.ack(tup_id)
        self._emit_terminal_if_needed()

    def fail(self, tup_id):
        Log.info("Received a fail message with tuple id: %s" % str(tup_id))
        self.tuples_to_complete -= 1
        if tup_id != integ_const.INTEGRATION_TEST_MOCK_MESSAGE_ID:
            self.user_spout.fail(tup_id)
        self._emit_terminal_if_needed()

    def emit(self,
             tup,
             tup_id=None,
             stream=Stream.DEFAULT_STREAM_ID,
             direct_task=None,
             need_task_ids=None):
        """Emits from this integration test spout

    Overriden method which will be called when user's spout calls emit()
    """
        # if is_control True -> control stream should not count
        self.tuples_to_complete += 1

        if tup_id is None:
            Log.info("Add tup_id for tuple: %s" % str(tup))
            _tup_id = integ_const.INTEGRATION_TEST_MOCK_MESSAGE_ID
        else:
            _tup_id = tup_id

        super(IntegrationTestSpout, self).emit(tup, _tup_id, stream,
                                               direct_task, need_task_ids)

    def _emit_terminal_if_needed(self):
        Log.info("is_done: %s, tuples_to_complete: %s" %
                 (self.is_done, self.tuples_to_complete))
        if self.is_done and self.tuples_to_complete == 0:
            Log.info("Emitting terminals to downstream")
            super(IntegrationTestSpout, self).emit(
                [integ_const.INTEGRATION_TEST_TERMINAL],
                stream=integ_const.INTEGRATION_TEST_CONTROL_STREAM_ID)
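A test harness would create this wrapper through the spec() classmethod defined above; a hypothetical call:

spec = IntegrationTestSpout.spec(name="test_spout",
                                 par=1,
                                 config={},
                                 user_spout_classpath="my_module.MyUserSpout")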