Example #1
0
def job_default_ns(job_name):
    marquez_client = MarquezClient(host="localhost", port=5000)

    input_datsets = ['input1a', 'input2a']
    output_datsets = ['output1a', 'output2a']
    return marquez_client.create_job(
        job_name, 'BATCH', 'https://github.com/wework/jobs/commit/124f',
        input_datsets, output_datsets)
Example #2
0
def job_default_ns(job_name):
    marquez_client = MarquezClient(host="localhost", port=8080)

    input_datset_urns = ['input1a', 'input2a']
    output_datset_urns = ['output1a', 'output2a']
    return marquez_client.create_job(
        job_name, 'some_other_location',
        input_datset_urns,
        output_datset_urns)
Example #3
0
class TestMarquezClient(unittest.TestCase):
    def setUp(self):
        self.client = MarquezClient()

    @mock.patch("marquez_client.MarquezClient._put")
    def test_create_namespace(self, mock_put):
        owner_name = "me"
        description = "my namespace for testing."

        mock_put.return_value = {
            "name": _NAMESPACE,
            "ownerName": owner_name,
            "description": description
        }

        response = self.client.create_namespace(_NAMESPACE, owner_name,
                                                description)

        assert _NAMESPACE == str(response['name'])
        assert owner_name == str(response['ownerName'])
        assert description == str(response['description'])

    @mock.patch("marquez_client.MarquezClient._put")
    def test_create_dataset(self, mock_put):
        dataset_name = "my-dataset"
        description = "My dataset for testing."

        fields = [{
            "name": "flight_id",
            "type": "INTEGER",
            "description": "flight id"
        }, {
            "name": "flight_name",
            "type": "VARCHAR",
            "description": "flight name"
        }, {
            "name": "flight_date",
            "type": "TIMESTAMP",
            "description": "flight date"
        }]

        mock_put.return_value = {
            'id': {
                'namespace': 'my-namespace',
                'name': 'my-dataset'
            },
            'type':
            'DB_TABLE',
            'name':
            'my-dataset',
            'physicalName':
            'public.mytable',
            'createdAt':
            '2020-08-12T05:46:31.172877Z',
            'updatedAt':
            '2020-08-12T05:46:31.184934Z',
            'namespace':
            'my-namespace',
            'sourceName':
            'mydb',
            'fields': [{
                'name': 'my_date',
                'type': 'TIMESTAMP',
                'description': 'my date'
            }, {
                'name': 'my_id',
                'type': 'INTEGER',
                'description': 'my id'
            }, {
                'name': 'my_name',
                'type': 'VARCHAR',
                'description': 'my name'
            }],
            'tags': [],
            'lastModifiedAt':
            None,
            'description':
            'My dataset for testing.'
        }

        response = self.client.create_dataset(
            namespace_name=_NAMESPACE,
            dataset_name=dataset_name,
            dataset_type=DatasetType.DB_TABLE,
            physical_name=dataset_name,
            source_name='my-source',
            description=description,
            run_id=None,
            schema_location=None,
            fields=fields,
            tags=None)

        assert str(response['description']) == description
        assert str(response['name']) == dataset_name

    @mock.patch("marquez_client.MarquezClient._put")
    def test_create_datasource(self, mock_put):
        source_name = "flight_schedules_db"
        source_type = SourceType.POSTGRESQL
        source_url = "jdbc:postgresql://*****:*****@mock.patch("marquez_client.MarquezClient._put")
    def test_create_job(self, mock_put):
        job_name = "my-job"
        input_dataset = [{
            "namespace": "my-namespace",
            "name": "public.mytable"
        }]
        output_dataset = {
            "namespace": "my-namespace",
            "name": "public.mytable"
        }

        location = "https://github.com/my-jobs/blob/" \
                   "07f3d2dfc8186cadae9146719e70294a4c7a8ee8"

        context = {"SQL": "SELECT * FROM public.mytable;"}

        mock_put.return_value = {
            "id": {
                "namespace": "my-namespace",
                "name": "my-job"
            },
            "type": "BATCH",
            "name": "my-job",
            "createdAt": "2020-08-12T07:30:55.321059Z",
            "updatedAt": "2020-08-12T07:30:55.333230Z",
            "namespace": "my-namespace",
            "inputs": [{
                "namespace": "my-namespace",
                "name": "public.mytable"
            }],
            "outputs": [{
                "namespace": "my-namespace",
                "name": "public.mytable"
            }],
            "location": "https://github.com/my-jobs/blob/"
            "07f3d2dfc8186cadae9146719e70294a4c7a8ee8",
            "context": {
                "SQL": "SELECT * FROM public.mytable;"
            },
            "description": "My first job.",
            "latestRun": None
        }

        response = self.client.create_job(namespace_name=_NAMESPACE,
                                          job_name=job_name,
                                          job_type=JobType.BATCH,
                                          location=location,
                                          input_dataset=input_dataset,
                                          output_dataset=output_dataset,
                                          context=context)

        assert str(response['id']) is not None
        assert str(response['location']) == location

    @mock.patch("marquez_client.MarquezClient._post")
    def test_create_job_run(self, mock_post):
        job_name = "my-job"
        run_args = {
            "email": "*****@*****.**",
            "emailOnFailure": "true",
            "emailOnRetry": "true",
            "retries": "1"
        }
        created_at = str(
            generate(datetime.datetime.utcnow().replace(tzinfo=pytz.utc)))

        mock_post.return_value = {
            'id': f'{uuid.uuid4()}',
            'createdAt': f'{created_at}',
            'updatedAt': '2020-08-12T22:33:02.787228Z',
            'nominalStartTime': None,
            'nominalEndTime': None,
            'state': 'NEW',
            'startedAt': None,
            'endedAt': None,
            'durationMs': None,
            'run_args': {
                "email": "*****@*****.**",
                "emailOnFailure": "true",
                "emailOnRetry": "true",
                "retries": "1"
            }
        }

        response = self.client.create_job_run(namespace_name=_NAMESPACE,
                                              job_name=job_name,
                                              nominal_start_time=None,
                                              nominal_end_time=None,
                                              run_args=run_args,
                                              mark_as_running=False)

        assert response['id'] is not None
        assert str(response['run_args']) == str(run_args)
        assert str(response['createdAt']) == created_at

    @mock.patch("marquez_client.MarquezClient._post")
    def test_mark_job_run_as_start(self, mock_post):
        run_id = str(uuid.uuid4())

        mock_post.return_value = {
            'id': f'{run_id}',
            'createdAt': '2020-08-12T22:36:50.739951Z',
            'updatedAt': '2020-08-13T17:56:39.516802Z',
            'nominalStartTime': None,
            'nominalEndTime': None,
            'state': 'RUNNING',
            'startedAt': '2020-08-13T17:56:39.516802Z',
            'endedAt': None,
            'durationMs': None,
            'args': {}
        }

        response = self.client.mark_job_run_as_started(run_id=run_id)

        assert str(response['id']) == run_id
        assert str(response['state']) == RunState.RUNNING.value

    @mock.patch("marquez_client.MarquezClient._post")
    def test_mark_job_run_as_completed(self, mock_post):
        run_id = str(uuid.uuid4())

        mock_post.return_value = {
            'id': f'{run_id}',
            'createdAt': '2020-08-12T22:36:50.739951Z',
            'updatedAt': '2020-08-13T17:56:39.516802Z',
            'nominalStartTime': None,
            'nominalEndTime': None,
            'state': 'COMPLETED',
            'startedAt': '2020-08-13T17:56:39.516802Z',
            'endedAt': None,
            'durationMs': None,
            'args': {}
        }

        response = self.client.mark_job_run_as_completed(run_id=run_id)

        assert str(response['id']) == run_id
        assert str(response['state']) == RunState.COMPLETED.value

    @mock.patch("marquez_client.MarquezClient._post")
    def test_mark_job_run_as_failed(self, mock_post):
        run_id = str(uuid.uuid4())

        mock_post.return_value = {
            'id': f'{run_id}',
            'createdAt': '2020-08-12T22:36:50.739951Z',
            'updatedAt': '2020-08-13T17:56:39.516802Z',
            'nominalStartTime': None,
            'nominalEndTime': None,
            'state': 'FAILED',
            'startedAt': '2020-08-13T17:56:39.516802Z',
            'endedAt': None,
            'durationMs': None,
            'args': {}
        }

        response = self.client.mark_job_run_as_failed(run_id=run_id)

        assert str(response['id']) == run_id
        assert str(response['state']) == RunState.FAILED.value

    @mock.patch("marquez_client.MarquezClient._post")
    def test_mark_job_run_as_aborted(self, mock_post):
        run_id = str(uuid.uuid4())

        mock_post.return_value = {
            'id': f'{run_id}',
            'createdAt': '2020-08-12T22:36:50.739951Z',
            'updatedAt': '2020-08-13T17:56:39.516802Z',
            'nominalStartTime': None,
            'nominalEndTime': None,
            'state': 'ABORTED',
            'startedAt': '2020-08-13T17:56:39.516802Z',
            'endedAt': None,
            'durationMs': None,
            'args': {}
        }

        response = self.client.mark_job_run_as_aborted(run_id=run_id)

        assert str(response['id']) == run_id
        assert str(response['state']) == RunState.ABORTED.value