def test_to_klio_message_raises(klio_config, logger, monkeypatch):
    """Check that bytes which are not a KlioMessage raise ``DecodeError``.

    The payload is handed to ``serializer.to_klio_message`` with the
    ``klio_config`` and ``logger`` fixtures exactly as provided; the call is
    expected to raise and to log one error.
    """
    not_a_kmsg = b"Not a klio message"

    with pytest.raises(gproto_message.DecodeError):
        serializer.to_klio_message(not_a_kmsg, klio_config, logger)

    # Only the number of error logs is verified. Matching on the text of the
    # logged message would make this test brittle.
    assert logger.error.call_count == 1
def update_kmsg_metadata(self, raw_kmsg):
    """Rewrite the intended recipients of a message for bottom-up execution.

    The upstream job and the current job are both appended to the limited
    recipients of the message, and the current job is recorded in
    ``trigger_children_of``.

    Args:
        raw_kmsg (bytes): KlioMessage in the ``bytes`` form accepted by
            ``serializer.to_klio_message``.
    Returns:
        bytes: the updated KlioMessage as returned by
            ``serializer.from_klio_message``.
    """
    # `serializer.to_klio_message` is called directly (rather than relying on
    # @handle_klio) because the whole KlioMessage object is needed here, not
    # only its data.
    message = serializer.to_klio_message(
        raw_kmsg, kconfig=self._klio.config, logger=self._klio.logger
    )

    # Listing the upstream job as a recipient keeps it from skipping the
    # message.
    parent_job = self._generate_upstream_job_object()
    limited = message.metadata.intended_recipients.limited
    limited.recipients.extend([parent_job])

    # The current job is also a recipient, and is set as
    # `trigger_children_of` so that top-down execution resumes once this job
    # has finished.
    this_job = self._generate_current_job_object()
    limited.recipients.extend([this_job])
    limited.trigger_children_of.CopyFrom(this_job)

    return serializer.from_klio_message(message)
def print_debug(self, raw_message):
    """Log the parsed form of ``raw_message`` and hand the input back.

    The message is parsed with ``serializer.to_klio_message`` and logged at
    ``self.log_level``, prefixed by ``self.prefix``.

    Args:
        raw_message: item passed to ``serializer.to_klio_message``.
    Returns:
        The same ``raw_message`` object that was passed in.
    """
    parsed = serializer.to_klio_message(
        raw_message, self._klio.config, self._klio.logger
    )
    line = "{}{}".format(self.prefix, parsed)
    self._klio.logger.log(self.log_level, line)
    return raw_message
def test_to_klio_message(klio_message, klio_message_str, klio_config, logger):
    """Check that ``klio_message_str`` parses to a message equal to ``klio_message``."""
    parsed = serializer.to_klio_message(klio_message_str, klio_config, logger)

    assert klio_message == parsed
    # A successful parse must not log any error.
    logger.error.assert_not_called()
def process(self, raw_message):
    """Tag ``raw_message`` as either PROCESS or DROP.

    The message is parsed only so that ``self._should_process`` can inspect
    it; the original ``raw_message`` is what gets emitted.

    Args:
        raw_message: item passed to ``serializer.to_klio_message``.
    Yields:
        pvalue.TaggedOutput: ``raw_message`` tagged with the value of
            ``TaggedStates.PROCESS`` or ``TaggedStates.DROP``.
    """
    parsed = serializer.to_klio_message(raw_message)

    if self._should_process(parsed):
        state = _helpers.TaggedStates.PROCESS
    else:
        state = _helpers.TaggedStates.DROP

    yield pvalue.TaggedOutput(state.value, raw_message)
def process(self, raw_message):
    """Tag a message as PROCESS (re-serialized) or DROP (unchanged input).

    Args:
        raw_message: item passed to ``serializer.to_klio_message``.
    Yields:
        pvalue.TaggedOutput: for the DROP state, the original ``raw_message``;
            for the PROCESS state, a fresh serialization of the parsed
            message.
    """
    parsed = serializer.to_klio_message(raw_message)

    if not self._should_process(parsed):
        yield pvalue.TaggedOutput(
            _helpers.TaggedStates.DROP.value, raw_message
        )
        return

    # The parsed message could have been updated, so it is serialized again
    # rather than forwarding the bytes that came in.
    refreshed = parsed.SerializeToString()
    yield pvalue.TaggedOutput(_helpers.TaggedStates.PROCESS.value, refreshed)
def process(self, klio_message):
    """Emit the serialized message under a "v2" or "v1" tag.

    Args:
        klio_message: either a ``klio_pb2.KlioMessage`` or an item that
            ``serializer.to_klio_message`` can parse into one.
    Yields:
        pvalue.TaggedOutput: the serialized message, tagged "v2" when its
            version equals ``klio_pb2.Version.V2`` and "v1" otherwise.
    """
    # In batch, the read transform produces a KlioMessage. However, in
    # streaming, it's still bytes. And for some reason this isn't
    # pickleable when it's in its own transform.
    # TODO: maybe create a read/write klio pub/sub transform to do
    #       this for us.
    if isinstance(klio_message, klio_pb2.KlioMessage):
        parsed = klio_message
    else:
        parsed = serializer.to_klio_message(klio_message)

    tag = "v2" if parsed.version == klio_pb2.Version.V2 else "v1"
    yield pvalue.TaggedOutput(tag, parsed.SerializeToString())
def wrapper(self, incoming_item, *args, **kwargs):
    """Parse ``incoming_item`` and delegate to ``meth`` with the KlioMessage.

    Everything yielded by ``meth`` is re-yielded. If any exception is raised,
    it is logged through ``self._klio.logger`` and the generator ends without
    yielding anything further.

    Args:
        incoming_item: item passed to ``serializer.to_klio_message``.
        *args: forwarded to ``meth``.
        **kwargs: forwarded to ``meth``.
    """
    # NOTE(review): the `try` covers the call to `meth` as well as the
    # parsing, so exceptions raised by `meth` are reported with the same
    # "serializing" wording below.
    try:
        parsed = serializer.to_klio_message(
            incoming_item, self._klio.config, self._klio.logger
        )
        yield from meth(self, parsed, *args, **kwargs)
    except Exception as exc:
        drop_notice = (
            "Dropping KlioMessage - exception occurred when serializing "
            "'%s' to a KlioMessage.\nError: %s" % (incoming_item, exc)
        )
        self._klio.logger.error(drop_notice, exc_info=True)
def test_to_klio_message_allow_non_kmsg(klio_config, logger, monkeypatch):
    """Check non-KlioMessage bytes are wrapped when the job config allows it.

    With ``allow_non_klio_messages`` set to ``True``, the raw bytes are
    expected to come back as the ``data.element`` of a V2 KlioMessage, with
    no error logged.
    """
    monkeypatch.setattr(
        klio_config.job_config, "allow_non_klio_messages", True
    )
    raw = b"Not a klio message"

    wrapped = klio_pb2.KlioMessage()
    wrapped.version = klio_pb2.Version.V2
    wrapped.data.element = raw

    parsed = serializer.to_klio_message(raw, klio_config, logger)

    assert wrapped == parsed
    logger.error.assert_not_called()
def __serialize_klio_message_generator(self, meth, incoming_item, *args, **kwargs):
    """Parse ``incoming_item``, run ``meth`` on its data, and yield results.

    Args:
        self: instance that ``meth`` is bound to; its ``_klio`` attribute
            supplies the config and logger.
        meth: callable invoked as ``meth(self, kmsg.data, *args, **kwargs)``.
        incoming_item: item passed to ``serializer.to_klio_message``.
        *args: forwarded to ``meth``.
        **kwargs: forwarded to ``meth``.
    Yields:
        Whatever ``__from_klio_message_generator`` yields for each payload,
        or a single ``pvalue.TaggedOutput("drop", incoming_item)`` when
        parsing or the call to ``meth`` raises.
    """
    try:
        kmsg = serializer.to_klio_message(
            incoming_item, self._klio.config, self._klio.logger
        )
    except Exception as parse_err:
        self._klio.logger.error(
            _ERROR_MSG_KMSG_FROM_BYTES.format(incoming_item, parse_err),
            exc_info=True,
        )
        # Values yielded on the success path are untagged, so they are what
        # the caller consumes by default; a tagged value such as this dropped
        # one is simply ignored, which is fine for dropped items. A caller
        # that does want them can still reach them via pcoll.drop.
        yield pvalue.TaggedOutput("drop", incoming_item)
        # Return explicitly so that a further `next` from Beam cannot run on
        # to a later `yield`.
        return

    try:
        payload = meth(self, kmsg.data, *args, **kwargs)
    except Exception as user_err:
        func_path = ".".join((self.__class__.__name__, meth.__name__))
        log_msg, exc_info = __get_user_error_message(
            user_err, func_path, kmsg
        )
        self._klio.logger.error(log_msg, exc_info=exc_info)
        # Same reasoning as for the parse failure: the tagged "drop" output
        # is ignored by default but available via pcoll.drop. The original
        # item is emitted; kmsg is not serialized back to bytes because
        # something has already gone wrong.
        yield pvalue.TaggedOutput("drop", incoming_item)
        # Return explicitly so that a further `next` from Beam cannot run on
        # to a later `yield`.
        return

    # A generator result is consumed item by item; any other result is
    # handled as a single payload.
    if isinstance(payload, types.GeneratorType):
        pending = payload
    else:
        pending = (payload,)

    for item in pending:
        yield from __from_klio_message_generator(
            self, kmsg, item, incoming_item
        )
def process(self, raw_message):
    """Append this job to the message's audit log and debug-log the path.

    Args:
        raw_message: item passed to ``serializer.to_klio_message``.
    Yields:
        The parsed message, including the new audit log entry, serialized
        with ``SerializeToString``.
    """
    message = serializer.to_klio_message(
        raw_message, self._klio.config, self._klio.logger
    )

    new_entry = self._create_audit_item()
    message.metadata.job_audit_log.extend([new_entry])

    # One "<gcp_project>::<job_name>" hop per audit entry, in log order.
    hops = [
        "{}::{}".format(
            str(entry.klio_job.gcp_project), str(entry.klio_job.job_name)
        )
        for entry in message.metadata.job_audit_log
    ]
    path = "{} (current job)".format(" -> ".join(hops))

    header = "KlioMessage full audit log"
    summary = "{} - Entity ID: {} - Path: {}".format(
        header, message.data.entity_id, path
    )
    self._klio.logger.debug(summary)

    yield message.SerializeToString()
def __serialize_klio_message(metrics, ctx, func, incoming_item, *args, **kwargs): metrics.received.inc() # manipulate `ctx` to handle both methods and functions depending on # what we're wrapping. Functions just have `ctx` object, but methods # have `self._klio` as its context, and we also need access to `self` # in order to call the method _self = ctx if not isinstance(ctx, core.KlioContext): ctx = _self._klio with metrics.timer: try: kmsg = serializer.to_klio_message(incoming_item, ctx.config, ctx.logger) except Exception as err: ctx.logger.error( _ERROR_MSG_KMSG_FROM_BYTES.format(incoming_item, err), exc_info=True, ) metrics.error.inc() __ack_pubsub_if_direct_gke(incoming_item, ctx) # Since the returned value in the `try` clause is not tagged, that # one will be used by default by whatever executed this function, # and anything that has a tagged output value (like this dropped # one) will just be ignored, which is fine for dropped values. # But if the caller function wanted to, they could access this via # pcoll.drop. return pvalue.TaggedOutput("drop", incoming_item) try: ret = func(_self, kmsg.data, *args, **kwargs) if isinstance(ret, types.GeneratorType): raise TypeError("can't pickle generator object: '{}'".format( func.__name__)) except TypeError: metrics.error.inc() # If we get here, we threw a type error because we found a generator # and those can't be pickled. But there's no need to do any special # error handling - this will contain enough info for the user so # we just re-raise raise except Exception as err: log_msg, exc_info = __get_user_error_message( err, func.__name__, kmsg) ctx.logger.error(log_msg, exc_info=exc_info) metrics.error.inc() __ack_pubsub_if_direct_gke(kmsg, ctx) # Since the returned value in the `try` clause is not tagged, that # one will be used by default by whatever executed this function, # and anything that has a tagged output value (like this dropped # one) will just be ignored, which is fine for dropped values. 
# But if the caller function wanted to, they could access this via # pcoll.drop. # We won't try to serialize kmsg to bytes since something already # went wrong. return pvalue.TaggedOutput("drop", incoming_item) try: to_ret = serializer.from_klio_message(kmsg, ret) metrics.success.inc() return to_ret except Exception as err: ctx.logger.error(_ERROR_MSG_KMSG_TO_BYTES.format(kmsg, err), exc_info=True) metrics.error.inc() __ack_pubsub_if_direct_gke(kmsg, ctx) # Since the returned value in the `try` clause is not tagged, that # one will be used by default by whatever executed this function, # and anything that has a tagged output value (like this dropped # one) will just be ignored, which is fine for dropped values. # But if the caller function wanted to, they could access this via # pcoll.drop. # We won't try to serialize kmsg to bytes since something already # went wrong. return pvalue.TaggedOutput("drop", incoming_item)
def __serialize_klio_message_generator(metrics, self, meth, incoming_item, *args, **kwargs):
    """Parse ``incoming_item``, run ``meth`` on its data, and yield results.

    Args:
        metrics: object exposing ``received`` and ``error`` counters (each
            with ``inc()``) and a ``timer`` context manager; it is also
            passed on to ``__from_klio_message_generator``.
        self: instance that ``meth`` is bound to; its ``_klio`` attribute
            supplies the config and logger.
        meth: callable invoked as ``meth(self, kmsg.data, *args, **kwargs)``.
        incoming_item: item passed to ``serializer.to_klio_message``.
        *args: forwarded to ``meth``.
        **kwargs: forwarded to ``meth``.
    Yields:
        Whatever ``__from_klio_message_generator`` yields for each payload,
        or ``pvalue.TaggedOutput("drop", incoming_item)`` when parsing, the
        call to ``meth``, or iteration over a generator payload raises.
    """
    metrics.received.inc()

    # NOTE(review): the extent of this `with` block is reconstructed as
    # covering the rest of the function - confirm against the original
    # formatting.
    with metrics.timer:
        # Step 1: incoming item -> KlioMessage.
        try:
            kmsg = serializer.to_klio_message(incoming_item, self._klio.config, self._klio.logger)
        except Exception as err:
            self._klio.logger.error(
                _ERROR_MSG_KMSG_FROM_BYTES.format(incoming_item, err),
                exc_info=True,
            )
            metrics.error.inc()
            # No KlioMessage exists yet, so the raw item is what gets passed
            # to the ack helper.
            __ack_pubsub_if_direct_gke(incoming_item, self._klio)
            # Since the yielded value in the `try` clause is not tagged, that
            # one will be used by default by whatever executed this function,
            # and anything that has a tagged output value (like this dropped
            # one) will just be ignored, which is fine for dropped values.
            # But if the caller function wanted to, they could access this via
            # pcoll.drop.
            yield pvalue.TaggedOutput("drop", incoming_item)
            # explicitly return so that Beam doesn't call `next` and
            # executes the next `yield`
            return

        # Step 2: call the wrapped method on the message data.
        try:
            payload = meth(self, kmsg.data, *args, **kwargs)
        except Exception as err:
            func_path = self.__class__.__name__ + "." + meth.__name__
            log_msg, exc_info = __get_user_error_message(err, func_path, kmsg)
            self._klio.logger.error(log_msg, exc_info=exc_info)
            metrics.error.inc()
            __ack_pubsub_if_direct_gke(kmsg, self._klio)
            # Since the yielded value in the `try` clause is not tagged, that
            # one will be used by default by whatever executed this function,
            # and anything that has a tagged output value (like this dropped
            # one) will just be ignored, which is fine for dropped values.
            # But if the caller function wanted to, they could access this via
            # pcoll.drop.
            # We won't try to serialize kmsg to bytes since something already
            # went wrong.
            yield pvalue.TaggedOutput("drop", incoming_item)
            # explicitly return so that Beam doesn't call `next` and
            # executes the next `yield`
            return
        else:
            # Step 3: emit the payload(s). A generator result is consumed
            # item by item; anything else is a single payload.
            if isinstance(payload, types.GeneratorType):
                try:
                    for pl in payload:
                        yield from __from_klio_message_generator(
                            metrics, self, kmsg, pl, incoming_item)
                # This exception block executes if advancing `payload` or
                # emitting one of its items raises.
                except Exception as err:
                    func_path = self.__class__.__name__ + "." + meth.__name__
                    log_msg, exc_info = __get_user_error_message(
                        err, func_path, kmsg)
                    self._klio.logger.error(log_msg, exc_info=exc_info)
                    metrics.error.inc()
                    __ack_pubsub_if_direct_gke(kmsg, self._klio)
                    # This will catch an exception present in the generator
                    # containing items yielded by a function/method
                    # decorated by @handle_klio.
                    # Following items in the generator will be ignored
                    # since an exception has already been detected.
                    # We won't try to serialize kmsg to bytes since
                    # something already went wrong.
                    yield pvalue.TaggedOutput("drop", incoming_item)
                    # explicitly return so that Beam doesn't call `next` and
                    # executes the next `yield`
                    return
            else:
                yield from __from_klio_message_generator(
                    metrics, self, kmsg, payload, incoming_item)