def _filter_intended_recipients(self, in_pcol, label_pfx=None):
    pfx = ""
    if label_pfx is not None:
        pfx = "[{}] ".format(label_pfx)

    def lbl(label):
        return "{}{}".format(pfx, label)

    # TODO: this "tagging by version then processing each version
    # differently" should only be temporary and removed once v2
    # migration is done
    version_lbl = lbl("Tag Message Versions")
    msg_version = in_pcol | version_lbl >> helpers._KlioTagMessageVersion()

    # tag each v1 message as 'process' or 'drop' depending on whether
    # this job should actually be handling the received message.
    v1_proc_lbl = lbl("Should Process v1 Message")
    v1_to_process = (
        msg_version.v1 | v1_proc_lbl >> helpers._KlioV1CheckRecipients()
    )
    v2_proc_lbl = lbl("Should Process v2 Message")
    v2_to_process = (
        msg_version.v2 | v2_proc_lbl >> helpers.KlioCheckRecipients()
    )

    flatten_ign_lbl = lbl("Flatten to Drop Messages to Ignore")
    to_drop_flatten = (v1_to_process.drop, v2_to_process.drop)
    to_drop = to_drop_flatten | flatten_ign_lbl >> beam.Flatten()

    ignore_lbl = lbl("Drop Messages to Ignore")
    _ = to_drop | ignore_lbl >> helpers.KlioDrop()

    flatten_proc_lbl = lbl("Flatten to Process Intended Messages")
    to_process_flatten = (v1_to_process.process, v2_to_process.process)
    to_process = to_process_flatten | flatten_proc_lbl >> beam.Flatten()
    return to_process
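The `_KlioTagMessageVersion`, `_KlioV1CheckRecipients`, and `KlioCheckRecipients` helpers each emit Beam tagged outputs, which is what makes the attribute access above (`msg_version.v1`, `v1_to_process.drop`, etc.) possible. A minimal standalone sketch of that pattern, with a placeholder predicate standing in for klio's actual recipient logic:

import apache_beam as beam
from apache_beam import pvalue


class _TagByRecipient(beam.DoFn):
    """Hypothetical stand-in for helpers.KlioCheckRecipients."""

    def process(self, element):
        # Route each element to a named output; this substring check is
        # a placeholder for klio's actual recipient logic.
        if b"this-job" in element:
            yield pvalue.TaggedOutput("process", element)
        else:
            yield pvalue.TaggedOutput("drop", element)


with beam.Pipeline() as p:
    results = (
        p
        | beam.Create([b"for this-job", b"for another-job"])
        | beam.ParDo(_TagByRecipient()).with_outputs("process", "drop")
    )
    # results.process and results.drop are separate PCollections, which
    # is what the `.process` / `.drop` attribute access above relies on.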
def test_klio_drop(mock_config, caplog):
    kmsg = klio_pb2.KlioMessage()

    with test_pipeline.TestPipeline() as p:
        p | beam.Create([kmsg.SerializeToString()]) | helpers.KlioDrop()

    # beam produces 50+ log messages so let's just iterate and find what
    # we're looking for *shrug*
    for rec in caplog.records:
        if "Dropping KlioMessage" in rec.message:
            assert True
            break
    else:
        assert False, "Expected log message not found"

    actual_counters = p.result.metrics().query()["counters"]
    assert 3 == len(actual_counters)

    received_ctr = actual_counters[0]
    drop_ctr = actual_counters[1]
    success_ctr = actual_counters[2]

    assert 1 == received_ctr.committed
    assert "KlioDrop.process" == received_ctr.key.metric.namespace
    assert "kmsg-received" == received_ctr.key.metric.name
    assert 1 == drop_ctr.committed
    assert "KlioDrop" == drop_ctr.key.metric.namespace
    assert "kmsg-drop" == drop_ctr.key.metric.name
    assert 1 == success_ctr.committed
    assert "KlioDrop.process" == success_ctr.key.metric.namespace
    assert "kmsg-success" == success_ctr.key.metric.name
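Indexing `actual_counters` by position assumes the metrics query returns counters in a stable order. A less order-dependent assertion can narrow the query by metric name with Beam's `MetricsFilter`; a sketch, reusing the same `p` as in the test above:

from apache_beam.metrics.metric import MetricsFilter

drop_filter = MetricsFilter().with_name("kmsg-drop")
drop_counters = p.result.metrics().query(drop_filter)["counters"]
assert 1 == len(drop_counters)
assert 1 == drop_counters[0].committed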
def _filter_intended_recipients(self, in_pcol, label_pfx=None):
    pfx = ""
    if label_pfx is not None:
        pfx = "[{}] ".format(label_pfx)

    def lbl(label):
        return "{}{}".format(pfx, label)

    # TODO: this "tagging by version then processing each version
    # differently" should only be temporary and removed once v2
    # migration is done
    version_lbl = lbl("Tag Message Versions")
    msg_version = in_pcol | version_lbl >> helpers._KlioTagMessageVersion()

    # tag each v1 message as 'process' or 'drop' depending on whether
    # this job should actually be handling the received message.
    v1_proc_lbl = lbl("Should Process v1 Message")
    v1_to_process = (
        msg_version.v1 | v1_proc_lbl >> helpers._KlioV1CheckRecipients()
    )
    v2_proc_lbl = lbl("Should Process v2 Message")
    v2_to_process = (
        msg_version.v2 | v2_proc_lbl >> helpers.KlioCheckRecipients()
    )

    flatten_ign_lbl = lbl("Flatten to Drop Messages to Ignore")
    to_drop_flatten = (v1_to_process.drop, v2_to_process.drop)
    to_drop = to_drop_flatten | flatten_ign_lbl >> beam.Flatten()

    # TODO: update me to `var.KlioRunner.DIRECT_GKE_RUNNER` once
    # direct_on_gke_runner_clean is merged
    if self.config.pipeline_options.runner == "DirectGKERunner":
        ack_inp_lbl = lbl("Ack Dropped Input Message")
        _ = to_drop | ack_inp_lbl >> beam.ParDo(
            helpers.KlioAckInputMessage()
        )

    ignore_lbl = lbl("Drop Messages to Ignore")
    _ = to_drop | ignore_lbl >> helpers.KlioDrop()

    flatten_proc_lbl = lbl("Flatten to Process Intended Messages")
    to_process_flatten = (v1_to_process.process, v2_to_process.process)
    to_process = to_process_flatten | flatten_proc_lbl >> beam.Flatten()
    return to_process
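Note that `to_drop` fans out to two consumers in this variant: the runner-conditional ack step and `KlioDrop`, each of whose outputs is discarded with `_ =`. Applying several transforms to one PCollection is an ordinary Beam branch; a small standalone sketch:

import apache_beam as beam

with beam.Pipeline() as p:
    shared = p | beam.Create([1, 2, 3])
    # Each branch below receives every element of `shared`; neither
    # consumes it exclusively, and both results can be discarded.
    _ = shared | "Branch A" >> beam.Map(print)
    _ = shared | "Branch B" >> beam.Map(lambda x: x * 2)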
def _setup_data_io_filters(self, in_pcol, label_prefix=None):
    # label prefixes are required for multiple inputs (to avoid label
    # name collisions in Beam)
    if self._has_multi_data_inputs or self._has_multi_data_outputs:
        logging.error(
            "Klio does not (yet) support multiple data inputs and outputs."
        )
        raise SystemExit(1)

    data_in_config, data_out_config = None, None
    if self._has_data_inputs:
        data_in_config = self.config.job_config.data.inputs[0]
    if self._has_data_outputs:
        data_out_config = self.config.job_config.data.outputs[0]

    pfx = ""
    if label_prefix is not None:
        pfx = "[{}] ".format(label_prefix)

    def lbl(label):
        return "{}{}".format(pfx, label)

    to_process_output = in_pcol
    pass_thru = None
    if data_in_config:
        pings = in_pcol | lbl("Ping Filter") >> helpers.KlioFilterPing()
        to_process_output = pings.process
        pass_thru = pings.pass_thru

    if data_out_config and not data_out_config.skip_klio_existence_check:
        output_exists = (
            to_process_output
            | lbl("Output Exists Filter")
            >> helpers.KlioGcsCheckOutputExists()
        )
        output_force = (
            output_exists.found
            | lbl("Output Force Filter") >> helpers.KlioFilterForce()
        )
        to_pass_thru_tuple = (pass_thru, output_force.pass_thru)
        to_pass_thru = (
            to_pass_thru_tuple
            | lbl("Flatten to Pass Thru") >> beam.Flatten()
        )
        to_filter_input_tuple = (
            output_exists.not_found,
            output_force.process,
        )
        to_filter_input = (
            to_filter_input_tuple
            | lbl("Flatten to Process") >> beam.Flatten()
        )
    else:
        to_pass_thru = pass_thru
        to_filter_input = to_process_output

    if data_in_config and not data_in_config.skip_klio_existence_check:
        input_exists = (
            to_filter_input
            | lbl("Input Exists Filter")
            >> helpers.KlioGcsCheckInputExists()
        )
        _ = (
            input_exists.not_found
            | lbl("Drop Not Found Data") >> helpers.KlioDrop()
        )
        to_process = input_exists.found
    else:
        to_process = to_filter_input

    return to_process, to_pass_thru
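One subtlety in the `Flatten to Pass Thru` step: when no data input is configured, `pass_thru` is still `None` at that point, and `beam.Flatten()` requires every member of its input tuple to be a PCollection, which is why the variant that follows guards with `pass_thru is not None`. A minimal standalone sketch of that guard:

import apache_beam as beam

with beam.Pipeline() as p:
    required = p | "Main Branch" >> beam.Create(["a", "b"])
    optional = None  # e.g. no data input was configured

    # beam.Flatten() raises if any member of its input tuple is not a
    # PCollection, so an optional branch must be guarded first.
    if optional is not None:
        merged = (required, optional) | beam.Flatten()
    else:
        merged = required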
def _setup_data_io_filters(self, in_pcol, label_prefix=None):
    # label prefixes are required for multiple inputs (to avoid label
    # name collisions in Beam)
    if self._has_multi_data_inputs or self._has_multi_data_outputs:
        logging.error(
            "Klio does not (yet) support multiple data inputs and outputs."
        )
        raise SystemExit(1)

    data_in_config, data_out_config = None, None
    if self._has_data_inputs:
        data_in_config = self.config.job_config.data.inputs[0]
    if self._has_data_outputs:
        data_out_config = self.config.job_config.data.outputs[0]

    pfx = ""
    if label_prefix is not None:
        pfx = "[{}] ".format(label_prefix)

    def lbl(label):
        return "{}{}".format(pfx, label)

    to_process_output = in_pcol
    pass_thru = None
    if data_in_config:
        pings = in_pcol | lbl("Ping Filter") >> helpers.KlioFilterPing()
        to_process_output = pings.process
        pass_thru = pings.pass_thru

    if data_out_config and not data_out_config.skip_klio_existence_check:
        output_exists = (
            to_process_output
            | lbl("Output Exists Filter")
            >> helpers.KlioGcsCheckOutputExists()
        )
        output_force = (
            output_exists.found
            | lbl("Output Force Filter") >> helpers.KlioFilterForce()
        )
        if pass_thru is not None:
            to_pass_thru_tuple = (pass_thru, output_force.pass_thru)
            to_pass_thru = (
                to_pass_thru_tuple
                | lbl("Flatten to Pass Thru") >> beam.Flatten()
            )
        else:
            to_pass_thru = output_force.pass_thru

        to_filter_input_tuple = (
            output_exists.not_found,
            output_force.process,
        )
        to_filter_input = (
            to_filter_input_tuple
            | lbl("Flatten to Process") >> beam.Flatten()
        )
    else:
        to_pass_thru = pass_thru
        to_filter_input = to_process_output

    if data_in_config and not data_in_config.skip_klio_existence_check:
        input_exists = (
            to_filter_input
            | lbl("Input Exists Filter")
            >> helpers.KlioGcsCheckInputExists()
        )

        # TODO: update me to `var.KlioRunner.DIRECT_GKE_RUNNER` once
        # direct_on_gke_runner_clean is merged
        if self.config.pipeline_options.runner == "DirectGKERunner":
            ack_inp_lbl = lbl("Ack Input Message from No Data Input Found")
            _ = input_exists.not_found | ack_inp_lbl >> beam.ParDo(
                helpers.KlioAckInputMessage()
            )

        _ = (
            input_exists.not_found
            | lbl("Drop Not Found Data") >> helpers.KlioDrop()
        )
        to_process = input_exists.found
    else:
        to_process = to_filter_input

    return to_process, to_pass_thru
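The hard-coded `"DirectGKERunner"` string is read off klio's own config object, and the TODO above flags it for replacement with a constant. For a standalone illustration of the same runner-conditional wiring, here is a hedged sketch using Beam's stock options API instead (klio itself reads `self.config.pipeline_options.runner`; the no-op Map stands in for the ack transform):

import apache_beam as beam
from apache_beam.options.pipeline_options import (
    PipelineOptions, StandardOptions)

options = PipelineOptions(runner="DirectRunner")
runner = options.view_as(StandardOptions).runner

with beam.Pipeline(options=options) as p:
    pcol = p | beam.Create([1, 2, 3])
    if runner == "DirectRunner":
        # extra bookkeeping step that only this runner needs; a no-op
        # Map stands in for the ack transform here
        _ = pcol | "Ack" >> beam.Map(lambda x: x)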