Beispiel #1
0
    def _filter_intended_recipients(self, in_pcol, label_pfx=None):
        """Split messages into "process" vs. "drop" by intended recipient.

        Tags each incoming message with its Klio message version, runs the
        version-specific recipient check, silently drops messages this job
        should ignore, and returns the PCollection of messages to process.

        Args:
            in_pcol: input PCollection of Klio messages.
            label_pfx (str): optional prefix for beam transform labels, to
                avoid label collisions when applied multiple times.
        Returns:
            PCollection of messages this job should process.
        """
        prefix = "" if label_pfx is None else "[{}] ".format(label_pfx)

        def _label(text):
            # Prefix every transform label so repeated application of this
            # method does not produce duplicate beam labels.
            return "{}{}".format(prefix, text)

        # TODO: this "tagging by version then processing each version
        # differently" should only be temporary and removed once v2
        # migration is done
        tagged = (
            in_pcol
            | _label("Tag Message Versions") >> helpers._KlioTagMessageVersion()
        )

        # Tag each message as 'process' or 'drop' depending on whether this
        # job should actually be handling the received message.
        checked_v1 = (
            tagged.v1
            | _label("Should Process v1 Message")
            >> helpers._KlioV1CheckRecipients()
        )
        checked_v2 = (
            tagged.v2
            | _label("Should Process v2 Message")
            >> helpers.KlioCheckRecipients()
        )

        dropped = (
            (checked_v1.drop, checked_v2.drop)
            | _label("Flatten to Drop Messages to Ignore") >> beam.Flatten()
        )
        _ = dropped | _label("Drop Messages to Ignore") >> helpers.KlioDrop()

        return (
            (checked_v1.process, checked_v2.process)
            | _label("Flatten to Process Intended Messages") >> beam.Flatten()
        )
Beispiel #2
0
def test_klio_drop(mock_config, caplog):
    """KlioDrop logs the drop and emits received/drop/success counters."""
    msg = klio_pb2.KlioMessage()

    with test_pipeline.TestPipeline() as p:
        p | beam.Create([msg.SerializeToString()]) | helpers.KlioDrop()

    # beam produces 50+ log messages so let's just iterate and find what
    # we're looking for *shrug*
    logged_drop = any(
        "Dropping KlioMessage" in rec.message for rec in caplog.records
    )
    assert logged_drop, "Expected log message not found"

    counters = p.result.metrics().query()["counters"]
    assert len(counters) == 3

    # Counters are expected in (received, drop, success) order.
    expected_metrics = (
        ("KlioDrop.process", "kmsg-received"),
        ("KlioDrop", "kmsg-drop"),
        ("KlioDrop.process", "kmsg-success"),
    )
    for counter, (namespace, name) in zip(counters, expected_metrics):
        assert counter.committed == 1
        assert counter.key.metric.namespace == namespace
        assert counter.key.metric.name == name
Beispiel #3
0
    def _filter_intended_recipients(self, in_pcol, label_pfx=None):
        """Split messages into "process" vs. "drop" by intended recipient.

        Tags each incoming message with its Klio message version, runs the
        version-specific recipient check, drops messages this job should
        ignore (first acking them when running on DirectGKERunner), and
        returns the PCollection of messages to process.

        Args:
            in_pcol: input PCollection of Klio messages.
            label_pfx (str): optional prefix for beam transform labels, to
                avoid label collisions when applied multiple times.
        Returns:
            PCollection of messages this job should process.
        """
        prefix = "" if label_pfx is None else "[{}] ".format(label_pfx)

        def _label(text):
            # Prefix every transform label so repeated application of this
            # method does not produce duplicate beam labels.
            return "{}{}".format(prefix, text)

        # TODO: this "tagging by version then processing each version
        # differently" should only be temporary and removed once v2
        # migration is done
        tagged = (
            in_pcol
            | _label("Tag Message Versions") >> helpers._KlioTagMessageVersion()
        )

        # Tag each message as 'process' or 'drop' depending on whether this
        # job should actually be handling the received message.
        checked_v1 = (
            tagged.v1
            | _label("Should Process v1 Message")
            >> helpers._KlioV1CheckRecipients()
        )
        checked_v2 = (
            tagged.v2
            | _label("Should Process v2 Message")
            >> helpers.KlioCheckRecipients()
        )

        dropped = (
            (checked_v1.drop, checked_v2.drop)
            | _label("Flatten to Drop Messages to Ignore") >> beam.Flatten()
        )

        # TODO: update me to `var.KlioRunner.DIRECT_GKE_RUNNER` once
        #       direct_on_gke_runner_clean is merged
        if self.config.pipeline_options.runner == "DirectGKERunner":
            # On direct-on-GKE the input message must be explicitly acked
            # before it is dropped.
            _ = dropped | _label("Ack Dropped Input Message") >> beam.ParDo(
                helpers.KlioAckInputMessage())

        _ = dropped | _label("Drop Messages to Ignore") >> helpers.KlioDrop()

        return (
            (checked_v1.process, checked_v2.process)
            | _label("Flatten to Process Intended Messages") >> beam.Flatten()
        )
Beispiel #4
0
    def _setup_data_io_filters(self, in_pcol, label_prefix=None):
        """Wire ping / output-existence / force / input-existence filters.

        Args:
            in_pcol: input PCollection of Klio messages.
            label_prefix (str): optional prefix for beam transform labels
                (required for multiple inputs to avoid label collisions).
        Returns:
            tuple: ``(to_process, to_pass_thru)`` PCollections — messages
            that should run through the job's transform, and messages that
            should be passed through untouched (``None`` when there is
            nothing to pass through).
        Raises:
            SystemExit: if the job declares multiple data inputs or outputs,
                which is not (yet) supported.
        """
        # label prefixes are required for multiple inputs (to avoid label
        # name collisions in Beam)
        if self._has_multi_data_inputs or self._has_multi_data_outputs:
            logging.error(
                "Klio does not (yet) support multiple data inputs and outputs."
            )
            raise SystemExit(1)

        data_in_config, data_out_config = None, None
        if self._has_data_inputs:
            data_in_config = self.config.job_config.data.inputs[0]
        if self._has_data_outputs:
            data_out_config = self.config.job_config.data.outputs[0]

        pfx = ""
        if label_prefix is not None:
            pfx = "[{}] ".format(label_prefix)

        def lbl(label):
            return "{}{}".format(pfx, label)

        to_process_output = in_pcol
        pass_thru = None
        if data_in_config:
            pings = in_pcol | lbl("Ping Filter") >> helpers.KlioFilterPing()
            to_process_output = pings.process
            pass_thru = pings.pass_thru

        if data_out_config and not data_out_config.skip_klio_existence_check:
            output_exists = (to_process_output
                             | lbl("Output Exists Filter") >>
                             helpers.KlioGcsCheckOutputExists())
            output_force = (
                output_exists.found
                | lbl("Output Force Filter") >> helpers.KlioFilterForce())
            # BUGFIX: when there is no data input config, `pass_thru` is
            # None and must not be included in the tuple fed to
            # beam.Flatten(); only flatten when both branches exist.
            if pass_thru is not None:
                to_pass_thru_tuple = (pass_thru, output_force.pass_thru)
                to_pass_thru = (to_pass_thru_tuple
                                | lbl("Flatten to Pass Thru") >> beam.Flatten())
            else:
                to_pass_thru = output_force.pass_thru

            to_filter_input_tuple = (
                output_exists.not_found,
                output_force.process,
            )
            to_filter_input = (to_filter_input_tuple
                               | lbl("Flatten to Process") >> beam.Flatten())
        else:
            to_pass_thru = pass_thru
            to_filter_input = to_process_output

        if data_in_config and not data_in_config.skip_klio_existence_check:
            input_exists = (to_filter_input
                            | lbl("Input Exists Filter") >>
                            helpers.KlioGcsCheckInputExists())
            # Messages whose input data is missing are dropped rather than
            # processed.
            _ = (input_exists.not_found
                 | lbl("Drop Not Found Data") >> helpers.KlioDrop())
            to_process = input_exists.found
        else:
            to_process = to_filter_input

        return to_process, to_pass_thru
Beispiel #5
0
    def _setup_data_io_filters(self, in_pcol, label_prefix=None):
        """Wire ping / output-existence / force / input-existence filters.

        Args:
            in_pcol: input PCollection of Klio messages.
            label_prefix (str): optional prefix for beam transform labels
                (required for multiple inputs to avoid label collisions).
        Returns:
            tuple: ``(to_process, to_pass_thru)`` PCollections — messages
            that should run through the job's transform, and messages that
            should be passed through untouched (``None`` when there is
            nothing to pass through).
        Raises:
            SystemExit: if the job declares multiple data inputs or outputs,
                which is not (yet) supported.
        """
        # label prefixes are required for multiple inputs (to avoid label
        # name collisions in Beam)
        if self._has_multi_data_inputs or self._has_multi_data_outputs:
            logging.error(
                "Klio does not (yet) support multiple data inputs and outputs."
            )
            raise SystemExit(1)

        data_in_config = (
            self.config.job_config.data.inputs[0]
            if self._has_data_inputs
            else None
        )
        data_out_config = (
            self.config.job_config.data.outputs[0]
            if self._has_data_outputs
            else None
        )

        prefix = "" if label_prefix is None else "[{}] ".format(label_prefix)

        def lbl(label):
            return "{}{}".format(prefix, label)

        to_process_output, pass_thru = in_pcol, None
        if data_in_config:
            ping_out = in_pcol | lbl("Ping Filter") >> helpers.KlioFilterPing()
            to_process_output = ping_out.process
            pass_thru = ping_out.pass_thru

        if data_out_config and not data_out_config.skip_klio_existence_check:
            output_exists = (
                to_process_output
                | lbl("Output Exists Filter")
                >> helpers.KlioGcsCheckOutputExists()
            )
            output_force = (
                output_exists.found
                | lbl("Output Force Filter") >> helpers.KlioFilterForce()
            )
            if pass_thru is not None:
                # Merge ping pass-thru messages with force-filter pass-thru.
                to_pass_thru = (
                    (pass_thru, output_force.pass_thru)
                    | lbl("Flatten to Pass Thru") >> beam.Flatten()
                )
            else:
                to_pass_thru = output_force.pass_thru

            to_filter_input = (
                (output_exists.not_found, output_force.process)
                | lbl("Flatten to Process") >> beam.Flatten()
            )
        else:
            to_pass_thru = pass_thru
            to_filter_input = to_process_output

        if data_in_config and not data_in_config.skip_klio_existence_check:
            input_exists = (
                to_filter_input
                | lbl("Input Exists Filter")
                >> helpers.KlioGcsCheckInputExists()
            )

            # TODO: update me to `var.KlioRunner.DIRECT_GKE_RUNNER` once
            #       direct_on_gke_runner_clean is merged
            if self.config.pipeline_options.runner == "DirectGKERunner":
                # On direct-on-GKE the input message must be explicitly
                # acked before it is dropped.
                ack_inp_lbl = lbl("Ack Input Message from No Data Input Found")
                _ = input_exists.not_found | ack_inp_lbl >> beam.ParDo(
                    helpers.KlioAckInputMessage())
            _ = (
                input_exists.not_found
                | lbl("Drop Not Found Data") >> helpers.KlioDrop()
            )
            to_process = input_exists.found
        else:
            to_process = to_filter_input

        return to_process, to_pass_thru