Example #1
0
def test_trigger_upstream_job(mock_config, mocker, caplog):
    """Check that input reported missing from GCS triggers the upstream job.

    The GCS client is patched so that ``exists`` returns ``False``. The test
    then asserts on the GCS lookup, the payload published through the patched
    Pub/Sub client, the two metric counters, and the emitted log message.
    """
    gcs_cls = mocker.patch("klio.transforms._helpers.gcsio.GcsIO")
    gcs_cls.return_value.exists.return_value = False
    publisher_cls = mocker.patch("google.cloud.pubsub.PublisherClient")

    element = b"does_not_exist"
    incoming = klio_pb2.KlioMessage()
    incoming.data.element = element

    # Expected outgoing message: limited to the upstream and current jobs,
    # with the current job recorded in ``trigger_children_of``.
    current_job = klio_pb2.KlioJob(
        job_name="a-job", gcp_project="not-a-real-project"
    )
    upstream_job = klio_pb2.KlioJob(
        job_name="upstream-job", gcp_project="upstream-project"
    )
    expected = klio_pb2.KlioMessage(version=klio_pb2.Version.V2)
    expected.data.element = element
    limited = expected.metadata.intended_recipients.limited
    limited.recipients.extend([upstream_job, current_job])
    limited.trigger_children_of.CopyFrom(current_job)

    opts = pipeline_options.PipelineOptions([])
    opts.view_as(pipeline_options.StandardOptions).streaming = True

    with test_pipeline.TestPipeline(options=opts) as pipeline:
        checked = (
            pipeline
            | beam.Create([incoming.SerializeToString()])
            | helpers.KlioGcsCheckInputExists()
        )
        trigger = helpers.KlioTriggerUpstream(
            upstream_job_name="upstream-job",
            upstream_topic="projects/upstream-project/topics/does-not-exist",
        )
        _ = checked.not_found | trigger

    gcs_cls.return_value.exists.assert_called_once_with(
        "gs://hopefully-this-bucket-doesnt-exist/does_not_exist"
    )
    publisher = publisher_cls.return_value
    publisher.publish.assert_called_once_with(
        publisher.topic_path.return_value,
        expected.SerializeToString(),
    )

    counters = pipeline.result.metrics().query()["counters"]
    assert 2 == len(counters)

    not_found_ctr, upstream_ctr = counters
    expected_counters = (
        (not_found_ctr, "KlioGcsCheckInputExists", "kmsg-data-not-found-input"),
        (upstream_ctr, "KlioTriggerUpstream", "kmsg-trigger-upstream"),
    )
    for counter, namespace, name in expected_counters:
        assert 1 == counter.committed
        assert namespace == counter.key.metric.namespace
        assert name == counter.key.metric.name

    expected_log_msg = "Triggering upstream upstream-job for does_not_exist"
    assert any(
        expected_log_msg in record.message for record in caplog.records
    ), "Expected log message not found"
Example #2
0
def test_trigger_upstream_job(mock_config, mocker, capsys):
    """Check that input reported missing from GCS triggers the upstream job.

    The GCS client is patched so that ``exists`` returns ``False``. The test
    asserts on the GCS lookup and on the payload published through the
    patched Pub/Sub client.
    """
    gcs_cls = mocker.patch("klio.transforms._helpers.gcsio.GcsIO")
    gcs_cls.return_value.exists.return_value = False
    publisher_cls = mocker.patch("google.cloud.pubsub.PublisherClient")

    element = b"does_not_exist"
    incoming = klio_pb2.KlioMessage()
    incoming.data.element = element

    # Expected outgoing message: limited to the upstream and current jobs,
    # with the current job recorded in ``trigger_children_of``.
    current_job = klio_pb2.KlioJob(
        job_name="a-job", gcp_project="not-a-real-project"
    )
    upstream_job = klio_pb2.KlioJob(
        job_name="upstream-job", gcp_project="upstream-project"
    )
    expected = klio_pb2.KlioMessage(version=klio_pb2.Version.V2)
    expected.data.element = element
    limited = expected.metadata.intended_recipients.limited
    limited.recipients.extend([upstream_job, current_job])
    limited.trigger_children_of.CopyFrom(current_job)

    opts = pipeline_options.PipelineOptions([])
    opts.view_as(pipeline_options.StandardOptions).streaming = True

    with test_pipeline.TestPipeline(options=opts) as pipeline:
        checked = (
            pipeline
            | beam.Create([incoming.SerializeToString()])
            | helpers.KlioGcsCheckInputExists()
        )
        trigger = helpers.KlioTriggerUpstream(
            upstream_job_name="upstream-job",
            upstream_topic="projects/upstream-project/topics/does-not-exist",
        )
        _ = checked.not_found | trigger

    gcs_cls.return_value.exists.assert_called_once_with(
        "gs://hopefully-this-bucket-doesnt-exist/does_not_exist"
    )
    publisher = publisher_cls.return_value
    publisher.publish.assert_called_once_with(
        publisher.topic_path.return_value,
        expected.SerializeToString(),
    )
Example #3
0
def assert_audit(actual):
    """Assert that ``actual`` matches the expected serialized audit message.

    The expected value is a serialized V2 ``KlioMessage`` whose job audit log
    holds one item for job "a-job" in project "not-a-real-project".

    Returns:
        ``actual``, unchanged, when the assertion passes.
    """
    expected_msg = klio_pb2.KlioMessage(version=klio_pb2.Version.V2)
    # ``add()`` appends a fresh audit log item and returns it for population.
    logged_job = expected_msg.metadata.job_audit_log.add().klio_job
    logged_job.job_name = "a-job"
    logged_job.gcp_project = "not-a-real-project"

    assert expected_msg.SerializeToString() == actual
    return actual
Example #4
0
    def _should_process(self, klio_message):
        """Decide whether this job should handle ``klio_message``.

        Returns ``True`` when the message lists no downstream jobs, or when
        the current job (parsed from ``self._klio.job``) is among them.
        Otherwise logs the drop at info level and returns ``False``.
        """
        downstream_jobs = klio_message.metadata.downstream

        # An empty downstream list means the message is in top-down mode and
        # should be handled, so only a non-empty list needs checking.
        if downstream_jobs:
            this_job = klio_pb2.KlioJob()
            this_job.ParseFromString(self._klio.job)

            if not _helpers._job_in_jobs(this_job, downstream_jobs):
                drop_template = (
                    "Dropping KlioMessage - job not an intended recipient "
                    "for message with entity_id {}."
                )
                self._klio.logger.info(
                    drop_template.format(klio_message.data.entity_id)
                )
                return False

        return True
Example #5
0
    def _should_process(self, klio_message):
        """Decide whether this job should handle ``klio_message``.

        The decision follows the ``recipients`` oneof of the message's
        ``intended_recipients``:

        * unset: log a warning and return ``False``;
        * ``anyone``: return ``True``;
        * ``limited``: return ``True`` only when the current job (parsed
          from ``self._klio.job``) is listed in the limited recipients.

        In the ``limited`` case, when the current job is also the
        ``trigger_children_of`` job, the message's intended recipients are
        switched to ``anyone`` before returning ``True``.
        """
        recipients_meta = klio_message.metadata.intended_recipients
        # One of "anyone", "limited", or None when the oneof is not set.
        which = recipients_meta.WhichOneof("recipients")

        if which == "anyone":
            return True

        if which is None:
            # Open question kept from the original author: is it safe to
            # assume that an unset value in a v2 message means top-down?
            # This is expected to be the case for batch.
            drop_template = (
                "Dropping KlioMessage - No 'intended_recipients' set in "
                "metadata of KlioMessage with element '{}'."
            )
            self._klio.logger.warning(
                drop_template.format(klio_message.data.element)
            )
            return False

        # From here on the oneof is "limited".
        limited = recipients_meta.limited
        this_job = klio_pb2.KlioJob()
        this_job.ParseFromString(self._klio.job)

        # Skip the message when this job is not an intended recipient.
        if not _helpers._job_in_jobs(this_job, limited.recipients):
            return False

        # Being both a recipient and the job in trigger_children_of means
        # the message was originally in top-down mode but was missing
        # dependencies, so its intended recipients are updated to "anyone"
        # to signify top-down again.
        if _helpers._job_in_jobs(this_job, [limited.trigger_children_of]):
            # FYI: 'anyone' is essentially an empty message, so it cannot
            # simply be assigned; SetInParent() marks it as the active
            # oneof member. See https://stackoverflow.com/a/29651069
            recipients_meta.anyone.SetInParent()

        return True
Example #6
0
 def _generate_current_job_object(self):
     """Build a ``KlioJob`` describing the current job.

     The job name and GCP project are read from ``self._klio.config``.
     """
     config = self._klio.config
     current_job = klio_pb2.KlioJob()
     current_job.job_name = config.job_name
     current_job.gcp_project = config.pipeline_options.project
     return current_job
Example #7
0
 def _generate_upstream_job_object(self):
     """Build a ``KlioJob`` describing the upstream job.

     Uses ``self.upstream_job_name`` and ``self.upstream_gcp_project``.
     """
     name = self.upstream_job_name
     job = klio_pb2.KlioJob()
     job.job_name = name
     project = self.upstream_gcp_project
     job.gcp_project = project
     return job
Example #8
0
 def _create_klio_job_obj(self):
     """Return the current job as a serialized ``KlioJob``.

     The job name and GCP project are read from ``self.config``; the result
     is the output of ``SerializeToString()``.
     """
     job = klio_pb2.KlioJob()
     job.job_name = self.config.job_name
     job.gcp_project = self.config.pipeline_options.project
     return job.SerializeToString()
Example #9
0
 def get_other_job(self):
     """Return a ``KlioJob`` named "other" in "not-a-real-project"."""
     return klio_pb2.KlioJob(
         job_name="other",
         gcp_project="not-a-real-project",
     )
Example #10
0
 def get_current_job(self):
     """Return a ``KlioJob`` named "a-job" in "not-a-real-project"."""
     return klio_pb2.KlioJob(
         job_name="a-job",
         gcp_project="not-a-real-project",
     )