Exemple #1
0
def namespace(namespace_name):
    """Create the namespace ``namespace_name`` via a Marquez client on localhost:8080.

    The namespace is created with a fixed owner and description, and the
    value returned by ``MarquezClient.create_namespace`` is handed back
    unchanged.
    """
    client = MarquezClient(host="localhost", port=8080)
    return client.create_namespace(
        namespace_name,
        "some_owner",
        "this is a very nice namespace.",
    )
Exemple #2
0
class DAG(airflow.models.DAG):
    """Airflow DAG that reports its runs to Marquez.

    ``create_dagrun`` registers the job and a job run with Marquez before
    delegating to Airflow, and ``handle_callback`` marks that run as completed
    or failed. Every Marquez interaction made from those two hooks is best
    effort: failures are logged and the Airflow call still goes through.
    """

    DEFAULT_NAMESPACE = 'default'
    # Store of Marquez run ids keyed by JobIdMapping.make_key(dag_id, run_id);
    # set per instance in __init__.
    _job_id_mapping = None
    # Marquez client, built on first use by get_marquez_client().
    _marquez_client = None

    def __init__(self, *args, **kwargs):
        """Build the DAG and read its Marquez settings.

        The namespace comes from the ``MARQUEZ_NAMESPACE`` environment
        variable (``DEFAULT_NAMESPACE`` when unset or empty). The job
        location and the input/output dataset URNs come from the
        ``marquez_location``, ``marquez_input_urns`` and
        ``marquez_output_urns`` entries of the ``default_args`` keyword.
        """
        super().__init__(*args, **kwargs)
        self.marquez_namespace = (os.environ.get('MARQUEZ_NAMESPACE')
                                  or DAG.DEFAULT_NAMESPACE)
        # Indexing kwargs['default_args'] directly raised KeyError when the
        # keyword was omitted and AttributeError when it was None; fall back
        # to an empty mapping so the defaults below apply instead.
        default_args = kwargs.get('default_args') or {}
        self.marquez_location = default_args.get('marquez_location', 'unknown')
        self.marquez_input_urns = default_args.get('marquez_input_urns', [])
        self.marquez_output_urns = default_args.get('marquez_output_urns', [])
        self._job_id_mapping = JobIdMapping()

    def create_dagrun(self, *args, **kwargs):
        """Record a Marquez job run, then create the Airflow DAG run.

        The Marquez run is created from ``kwargs['execution_date']``. When
        that succeeds, the Marquez run id is stored under the
        ``(dag_id, run_id)`` key of the new DAG run so that
        ``report_jobrun_change`` can look it up later.

        Returns:
            Whatever the parent ``create_dagrun`` returns.
        """
        run_args = "{}"  # TODO extract the run Args from the tasks
        marquez_jobrun_id = None
        try:
            marquez_jobrun_id = self.report_jobrun(run_args,
                                                   kwargs['execution_date'])
            log.info('Successfully recorded job run.',
                     airflow_dag_id=self.dag_id,
                     marquez_run_id=marquez_jobrun_id,
                     marquez_namespace=self.marquez_namespace)
        except Exception as e:
            # Deliberately broad: a reporting failure must not prevent the
            # DAG run from being created below.
            log.error(f'Failed to record job run: {e}',
                      airflow_dag_id=self.dag_id,
                      marquez_namespace=self.marquez_namespace)

        run = super().create_dagrun(*args, **kwargs)

        if marquez_jobrun_id:
            try:
                self._job_id_mapping.set(
                    JobIdMapping.make_key(run.dag_id, run.run_id),
                    marquez_jobrun_id)
            except Exception as e:
                # Best effort as well: the DAG run already exists, so only
                # the Marquez state update for it is lost.
                log.error(f'Failed job run lookup: {e}',
                          airflow_dag_id=self.dag_id,
                          airflow_run_id=run.run_id,
                          marquez_run_id=marquez_jobrun_id,
                          marquez_namespace=self.marquez_namespace)

        return run

    def handle_callback(self, *args, **kwargs):
        """Report the run's final state to Marquez, then run the parent callback.

        The first positional argument is treated as the DAG run. Reporting
        errors are logged and do not stop the parent ``handle_callback``.
        """
        try:
            self.report_jobrun_change(args[0], **kwargs)
        except Exception as e:
            log.error(f'Failed to record job run state change: {e}',
                      dag_id=self.dag_id)

        return super().handle_callback(*args, **kwargs)

    def report_jobrun(self, run_args, execution_date):
        """Create the job and a job run in Marquez and mark the run as running.

        Args:
            run_args: run arguments forwarded to ``create_job_run``.
            execution_date: nominal start of the run; must provide
                ``strftime``.

        Returns:
            The ``runId`` of the created Marquez run, or ``None`` when the
            response carries none.
        """
        started_ms = self._now_ms()
        time_format = "%Y-%m-%dT%H:%M:%SZ"

        job_name = self.dag_id
        # strftime, matching how end_time is rendered below; the previous
        # .format() call was handed strftime directives.
        start_time = execution_date.strftime(time_format)
        end_time = self.compute_endtime(execution_date)
        if end_time:
            end_time = end_time.strftime(time_format)
        marquez_client = self.get_marquez_client()

        marquez_client.create_job(job_name,
                                  self.marquez_location,
                                  self.marquez_input_urns,
                                  self.marquez_output_urns,
                                  description=self.description)
        log.info(f'Successfully recorded job: {job_name}',
                 airflow_dag_id=self.dag_id,
                 marquez_namespace=self.marquez_namespace)

        marquez_jobrun = marquez_client.create_job_run(
            job_name,
            run_args=run_args,
            nominal_start_time=start_time,
            nominal_end_time=end_time)

        marquez_jobrun_id = marquez_jobrun.get('runId')
        if marquez_jobrun_id:
            marquez_client.mark_job_run_as_running(marquez_jobrun_id)
            log.info(f'Successfully recorded job run: {job_name}',
                     airflow_dag_id=self.dag_id,
                     airflow_dag_execution_time=start_time,
                     marquez_run_id=marquez_jobrun_id,
                     marquez_namespace=self.marquez_namespace,
                     duration_ms=(self._now_ms() - started_ms))
        else:
            log.warning(f'Run id not found: {job_name}',
                        airflow_dag_id=self.dag_id,
                        airflow_dag_execution_time=start_time,
                        marquez_run_id=marquez_jobrun_id,
                        marquez_namespace=self.marquez_namespace,
                        duration_ms=(self._now_ms() - started_ms))

        return marquez_jobrun_id

    def compute_endtime(self, execution_date):
        """Return ``following_schedule(execution_date)`` as the nominal end time."""
        return self.following_schedule(execution_date)

    def report_jobrun_change(self, dagrun, **kwargs):
        """Mark the Marquez run mapped to ``dagrun`` as completed or failed.

        The Marquez run id is popped from the job id mapping using the DAG
        run's ``(dag_id, run_id)`` key and the optional ``session`` keyword.
        A truthy ``success`` keyword marks the run as completed; anything
        else marks it as failed. When no mapping exists nothing is marked.
        """
        session = kwargs.get('session')
        marquez_job_run_id = self._job_id_mapping.pop(
            JobIdMapping.make_key(dagrun.dag_id, dagrun.run_id), session)
        if not marquez_job_run_id:
            # Previously the "Marked job run as ..." line was logged even on
            # this path, although no Marquez call had been made.
            log.warning('Job run not found; state change not recorded.',
                        airflow_dag_id=dagrun.dag_id,
                        airflow_run_id=dagrun.run_id,
                        marquez_namespace=self.marquez_namespace)
            return

        log.info('Found job run.',
                 airflow_dag_id=dagrun.dag_id,
                 airflow_run_id=dagrun.run_id,
                 marquez_run_id=marquez_job_run_id,
                 marquez_namespace=self.marquez_namespace)

        if kwargs.get('success'):
            state = 'COMPLETED'
            self.get_marquez_client().mark_job_run_as_completed(
                marquez_job_run_id)
        else:
            state = 'FAILED'
            self.get_marquez_client().mark_job_run_as_failed(
                marquez_job_run_id)

        log.info(f'Marked job run as {state}.',
                 airflow_dag_id=dagrun.dag_id,
                 airflow_run_id=dagrun.run_id,
                 marquez_run_id=marquez_job_run_id,
                 marquez_namespace=self.marquez_namespace)

    def get_marquez_client(self):
        """Return the cached Marquez client, creating it on first use.

        On first use the DAG's namespace is also created with the owner
        ``"default_owner"``. The client is cached before that call, so a
        failing ``create_namespace`` is not retried on later calls.
        """
        if not self._marquez_client:
            self._marquez_client = MarquezClient(
                namespace_name=self.marquez_namespace)
            self._marquez_client.create_namespace(self.marquez_namespace,
                                                  "default_owner")
        return self._marquez_client

    @staticmethod
    def _now_ms():
        """Return the current ``time.time()`` in whole milliseconds."""
        return int(round(time.time() * 1000))
Exemple #3
0
class TestMarquezClient(unittest.TestCase):
    def setUp(self):
        """Build the ``MarquezClient`` (default arguments) used by each test."""
        self.client = MarquezClient()

    @mock.patch("marquez_client.MarquezClient._put")
    def test_create_namespace(self, mock_put):
        """Check that create_namespace's response carries the values fed to the mocked ``_put``."""
        owner = "me"
        summary = "my namespace for testing."
        canned_response = {
            "name": _NAMESPACE,
            "ownerName": owner,
            "description": summary,
        }
        mock_put.return_value = canned_response

        created = self.client.create_namespace(_NAMESPACE, owner, summary)

        assert _NAMESPACE == str(created['name'])
        assert owner == str(created['ownerName'])
        assert summary == str(created['description'])

    @mock.patch("marquez_client.MarquezClient._put")
    def test_create_dataset(self, mock_put):
        """Check the name and description of create_dataset's response against the inputs.

        ``_put`` is mocked, so the response is the canned payload below.
        """
        name = "my-dataset"
        summary = "My dataset for testing."

        # (name, type, description) triples describing the dataset columns.
        column_specs = (
            ("flight_id", "INTEGER", "flight id"),
            ("flight_name", "VARCHAR", "flight name"),
            ("flight_date", "TIMESTAMP", "flight date"),
        )
        columns = [
            {"name": col_name, "type": col_type, "description": col_doc}
            for col_name, col_type, col_doc in column_specs
        ]

        mock_put.return_value = {
            'id': {'namespace': 'my-namespace', 'name': 'my-dataset'},
            'type': 'DB_TABLE',
            'name': 'my-dataset',
            'physicalName': 'public.mytable',
            'createdAt': '2020-08-12T05:46:31.172877Z',
            'updatedAt': '2020-08-12T05:46:31.184934Z',
            'namespace': 'my-namespace',
            'sourceName': 'mydb',
            'fields': [
                {'name': 'my_date', 'type': 'TIMESTAMP',
                 'description': 'my date'},
                {'name': 'my_id', 'type': 'INTEGER',
                 'description': 'my id'},
                {'name': 'my_name', 'type': 'VARCHAR',
                 'description': 'my name'},
            ],
            'tags': [],
            'lastModifiedAt': None,
            'description': 'My dataset for testing.',
        }

        request = dict(
            namespace_name=_NAMESPACE,
            dataset_name=name,
            dataset_type=DatasetType.DB_TABLE,
            physical_name=name,
            source_name='my-source',
            description=summary,
            run_id=None,
            schema_location=None,
            fields=columns,
            tags=None,
        )
        created = self.client.create_dataset(**request)

        assert str(created['description']) == summary
        assert str(created['name']) == name

    # NOTE(review): this test is truncated in this copy of the file. The
    # source_url literal below runs straight into the decorator of the next
    # test, so the real URL and the rest of the body cannot be recovered from
    # here -- restore them from the original test suite before relying on it.
    @mock.patch("marquez_client.MarquezClient._put")
    def test_create_datasource(self, mock_put):
        source_name = "flight_schedules_db"
        source_type = SourceType.POSTGRESQL
        source_url = "jdbc:postgresql://*****:*****@mock.patch("marquez_client.MarquezClient._put")
    def test_create_job(self, mock_put):
        """Check that create_job's response has an id and the requested location.

        ``mock_put`` supplies the canned job payload that the response is
        checked against.
        """
        job_name = "my-job"
        input_dataset = [{
            "namespace": "my-namespace",
            "name": "public.mytable"
        }]
        # NOTE(review): a single dict here, whereas input_dataset is a list
        # and the canned response lists its outputs -- confirm which shape
        # create_job expects.
        output_dataset = {
            "namespace": "my-namespace",
            "name": "public.mytable"
        }

        location = "https://github.com/my-jobs/blob/" \
                   "07f3d2dfc8186cadae9146719e70294a4c7a8ee8"

        context = {"SQL": "SELECT * FROM public.mytable;"}

        mock_put.return_value = {
            "id": {
                "namespace": "my-namespace",
                "name": "my-job"
            },
            "type": "BATCH",
            "name": "my-job",
            "createdAt": "2020-08-12T07:30:55.321059Z",
            "updatedAt": "2020-08-12T07:30:55.333230Z",
            "namespace": "my-namespace",
            "inputs": [{
                "namespace": "my-namespace",
                "name": "public.mytable"
            }],
            "outputs": [{
                "namespace": "my-namespace",
                "name": "public.mytable"
            }],
            "location": "https://github.com/my-jobs/blob/"
            "07f3d2dfc8186cadae9146719e70294a4c7a8ee8",
            "context": {
                "SQL": "SELECT * FROM public.mytable;"
            },
            "description": "My first job.",
            "latestRun": None
        }

        response = self.client.create_job(namespace_name=_NAMESPACE,
                                          job_name=job_name,
                                          job_type=JobType.BATCH,
                                          location=location,
                                          input_dataset=input_dataset,
                                          output_dataset=output_dataset,
                                          context=context)

        # str(...) is never None, so the old `str(response['id']) is not None`
        # check could not fail; test the value itself.
        assert response['id'] is not None
        assert str(response['location']) == location

    @mock.patch("marquez_client.MarquezClient._post")
    def test_create_job_run(self, mock_post):
        """Check that create_job_run's response carries the id, run args and creation time.

        ``_post`` is mocked, so the response is the canned run payload built
        below from a fresh UUID and the current UTC time.
        """
        job_name = "my-job"
        run_args = {
            "email": "*****@*****.**",
            "emailOnFailure": "true",
            "emailOnRetry": "true",
            "retries": "1"
        }
        # Aware "now" in UTC; replaces the deprecated
        # utcnow().replace(tzinfo=...) idiom with the same instant.
        created_at = str(generate(datetime.datetime.now(pytz.utc)))

        mock_post.return_value = {
            'id': f'{uuid.uuid4()}',
            'createdAt': f'{created_at}',
            'updatedAt': '2020-08-12T22:33:02.787228Z',
            'nominalStartTime': None,
            'nominalEndTime': None,
            'state': 'NEW',
            'startedAt': None,
            'endedAt': None,
            'durationMs': None,
            'run_args': {
                "email": "*****@*****.**",
                "emailOnFailure": "true",
                "emailOnRetry": "true",
                "retries": "1"
            }
        }

        response = self.client.create_job_run(namespace_name=_NAMESPACE,
                                              job_name=job_name,
                                              nominal_start_time=None,
                                              nominal_end_time=None,
                                              run_args=run_args,
                                              mark_as_running=False)

        assert response['id'] is not None
        assert str(response['run_args']) == str(run_args)
        assert str(response['createdAt']) == created_at

    @mock.patch("marquez_client.MarquezClient._post")
    def test_mark_job_run_as_start(self, mock_post):
        """Check that mark_job_run_as_started's response has the run id and the RUNNING state."""
        run_id = str(uuid.uuid4())
        canned_run = {
            'id': run_id,
            'createdAt': '2020-08-12T22:36:50.739951Z',
            'updatedAt': '2020-08-13T17:56:39.516802Z',
            'nominalStartTime': None,
            'nominalEndTime': None,
            'state': 'RUNNING',
            'startedAt': '2020-08-13T17:56:39.516802Z',
            'endedAt': None,
            'durationMs': None,
            'args': {},
        }
        mock_post.return_value = canned_run

        result = self.client.mark_job_run_as_started(run_id=run_id)

        returned_id = str(result['id'])
        returned_state = str(result['state'])
        assert returned_id == run_id
        assert returned_state == RunState.RUNNING.value

    @mock.patch("marquez_client.MarquezClient._post")
    def test_mark_job_run_as_completed(self, mock_post):
        """Check that mark_job_run_as_completed's response has the run id and the COMPLETED state."""
        run_id = str(uuid.uuid4())
        canned_run = {
            'id': run_id,
            'createdAt': '2020-08-12T22:36:50.739951Z',
            'updatedAt': '2020-08-13T17:56:39.516802Z',
            'nominalStartTime': None,
            'nominalEndTime': None,
            'state': 'COMPLETED',
            'startedAt': '2020-08-13T17:56:39.516802Z',
            'endedAt': None,
            'durationMs': None,
            'args': {},
        }
        mock_post.return_value = canned_run

        result = self.client.mark_job_run_as_completed(run_id=run_id)

        returned_id = str(result['id'])
        returned_state = str(result['state'])
        assert returned_id == run_id
        assert returned_state == RunState.COMPLETED.value

    @mock.patch("marquez_client.MarquezClient._post")
    def test_mark_job_run_as_failed(self, mock_post):
        """Check that mark_job_run_as_failed's response has the run id and the FAILED state."""
        run_id = str(uuid.uuid4())
        canned_run = {
            'id': run_id,
            'createdAt': '2020-08-12T22:36:50.739951Z',
            'updatedAt': '2020-08-13T17:56:39.516802Z',
            'nominalStartTime': None,
            'nominalEndTime': None,
            'state': 'FAILED',
            'startedAt': '2020-08-13T17:56:39.516802Z',
            'endedAt': None,
            'durationMs': None,
            'args': {},
        }
        mock_post.return_value = canned_run

        result = self.client.mark_job_run_as_failed(run_id=run_id)

        returned_id = str(result['id'])
        returned_state = str(result['state'])
        assert returned_id == run_id
        assert returned_state == RunState.FAILED.value

    @mock.patch("marquez_client.MarquezClient._post")
    def test_mark_job_run_as_aborted(self, mock_post):
        """Check that mark_job_run_as_aborted's response has the run id and the ABORTED state."""
        run_id = str(uuid.uuid4())
        canned_run = {
            'id': run_id,
            'createdAt': '2020-08-12T22:36:50.739951Z',
            'updatedAt': '2020-08-13T17:56:39.516802Z',
            'nominalStartTime': None,
            'nominalEndTime': None,
            'state': 'ABORTED',
            'startedAt': '2020-08-13T17:56:39.516802Z',
            'endedAt': None,
            'durationMs': None,
            'args': {},
        }
        mock_post.return_value = canned_run

        result = self.client.mark_job_run_as_aborted(run_id=run_id)

        returned_id = str(result['id'])
        returned_state = str(result['state'])
        assert returned_id == run_id
        assert returned_state == RunState.ABORTED.value