def namespace(namespace_name):
    """Create and return a namespace named *namespace_name* on a local Marquez."""
    client = MarquezClient(host="localhost", port=8080)
    return client.create_namespace(
        namespace_name,
        "some_owner",
        "this is a very nice namespace.",
    )
def get_marquez_client(self):
    """Return the cached Marquez client, building it (and its namespace) on first use."""
    if self._marquez_client:
        return self._marquez_client
    # First call: construct the client, then make sure the namespace exists.
    self._marquez_client = MarquezClient(
        namespace_name=self.marquez_namespace)
    self._marquez_client.create_namespace(self.marquez_namespace,
                                          "default_owner")
    return self._marquez_client
def job_default_ns(job_name):
    """Create *job_name* as a BATCH job with canned inputs/outputs in the default namespace."""
    client = MarquezClient(host="localhost", port=5000)
    input_datasets = ['input1a', 'input2a']
    output_datasets = ['output1a', 'output2a']
    return client.create_job(
        job_name,
        'BATCH',
        'https://github.com/wework/jobs/commit/124f',
        input_datasets,
        output_datasets,
    )
def job_default_ns(job_name):
    """Create *job_name* with canned input/output URNs in the default namespace."""
    client = MarquezClient(host="localhost", port=8080)
    input_urns = ['input1a', 'input2a']
    output_urns = ['output1a', 'output2a']
    return client.create_job(job_name,
                             'some_other_location',
                             input_urns,
                             output_urns)
def test_data_in_marquez(wait_for_marquez, init_airflow_db):
    """End-to-end: trigger the test DAG, then verify its namespace and job in Marquez."""
    dag_id = "test_dag_v2"
    execution_date = "2019-02-01T00:00:00"
    namespace = "integration-test"
    client = MarquezClient(namespace_name=namespace)

    # Run the DAG and wait for it to reach a terminal state.
    assert trigger_dag(dag_id, execution_date)
    assert check_dag_state(dag_id, execution_date)

    # The namespace and the job must now be visible in Marquez.
    ns = client.get_namespace(namespace)
    assert ns and ns['name'] == namespace
    expected_job = "test_dag_v2"
    job = client.get_job(expected_job)
    assert job and job['name'] == expected_job
def test_namespace_from_constructor(clear_env):
    """A namespace passed to the constructor overrides MARQUEZ_NAMESPACE.

    Fix: the env cleanup previously ran only when the assertion passed,
    leaking MARQUEZ_NAMESPACE into later tests on failure; try/finally
    guarantees cleanup either way.
    """
    os.environ['MARQUEZ_NAMESPACE'] = 'from_env'
    try:
        client = MarquezClient(namespace_name='from_constructor')
        assert client.namespace == 'from_constructor'
    finally:
        # TODO: https://github.com/MarquezProject/marquez-python/issues/59
        os.environ.clear()
def marquez_client(namespace_name):
    """Client fixture bound to *namespace_name*, pointed at localhost:8080."""
    return MarquezClient(
        port=8080,
        host="localhost",
        namespace_name=namespace_name,
    )
def marquez_client_default_ns():
    """Client fixture against localhost:8080 using the default namespace."""
    return MarquezClient(port=8080, host="localhost")
def test_timeout_from_constructor(clear_env):
    """A timeout_ms given to the constructor beats MARQUEZ_TIMEOUT_MS (ms -> s)."""
    os.environ['MARQUEZ_TIMEOUT_MS'] = '2000'
    # 3500 ms from the constructor wins over 2000 ms from the env.
    assert MarquezClient(timeout_ms=3500)._timeout == 3.5
def test_timeout_default(clear_env):
    """With no overrides, the timeout falls back to DEFAULT_TIMEOUT_MS in seconds."""
    assert MarquezClient()._timeout == DEFAULT_TIMEOUT_MS / 1000.0
def test_port_default(clear_env):
    """With no overrides, the API base URL is built from DEFAULT_HOST/DEFAULT_PORT."""
    expected = f'http://{DEFAULT_HOST}:{DEFAULT_PORT}/api/v1'
    assert MarquezClient()._api_base == expected
def test_host_from_env(clear_env):
    """The MARQUEZ_HOST env var is reflected in the client's API base URL.

    Fix: the expected value had an f-string prefix with no placeholders
    (ruff F541); it is a plain literal.
    """
    os.environ['MARQUEZ_HOST'] = 'marquez.dev'
    client = MarquezClient()
    assert client._api_base == 'http://marquez.dev:8080/api/v1'
def marquez_client(namespace):
    """Client fixture bound to the given namespace record, against localhost:5000."""
    ns_name = namespace['name']
    return MarquezClient(host="localhost", port=5000, namespace_name=ns_name)
def setUp(self):
    # Fresh client per test; relies entirely on MarquezClient's defaults.
    self.client = MarquezClient()
class TestMarquezClient(unittest.TestCase):
    """Unit tests for MarquezClient with the HTTP layer (_put/_post) mocked out.

    Each test stubs the transport's return value and asserts the client
    passes the payload through unchanged.

    NOTE(review): this source was whitespace-collapsed and credential-scrubbed;
    test_create_datasource below was destroyed by the scrubber (its body was
    swallowed into a string literal) -- restore it from version control.
    """

    def setUp(self):
        # Fresh client per test; relies entirely on MarquezClient's defaults.
        self.client = MarquezClient()

    @mock.patch("marquez_client.MarquezClient._put")
    def test_create_namespace(self, mock_put):
        """create_namespace returns the namespace document from the API."""
        owner_name = "me"
        description = "my namespace for testing."
        mock_put.return_value = {
            "name": _NAMESPACE,
            "ownerName": owner_name,
            "description": description
        }
        response = self.client.create_namespace(_NAMESPACE, owner_name,
                                                description)
        assert _NAMESPACE == str(response['name'])
        assert owner_name == str(response['ownerName'])
        assert description == str(response['description'])

    @mock.patch("marquez_client.MarquezClient._put")
    def test_create_dataset(self, mock_put):
        """create_dataset returns the dataset document from the API."""
        dataset_name = "my-dataset"
        description = "My dataset for testing."
        fields = [{
            "name": "flight_id",
            "type": "INTEGER",
            "description": "flight id"
        }, {
            "name": "flight_name",
            "type": "VARCHAR",
            "description": "flight name"
        }, {
            "name": "flight_date",
            "type": "TIMESTAMP",
            "description": "flight date"
        }]
        mock_put.return_value = {
            'id': {
                'namespace': 'my-namespace',
                'name': 'my-dataset'
            },
            'type': 'DB_TABLE',
            'name': 'my-dataset',
            'physicalName': 'public.mytable',
            'createdAt': '2020-08-12T05:46:31.172877Z',
            'updatedAt': '2020-08-12T05:46:31.184934Z',
            'namespace': 'my-namespace',
            'sourceName': 'mydb',
            'fields': [{
                'name': 'my_date',
                'type': 'TIMESTAMP',
                'description': 'my date'
            }, {
                'name': 'my_id',
                'type': 'INTEGER',
                'description': 'my id'
            }, {
                'name': 'my_name',
                'type': 'VARCHAR',
                'description': 'my name'
            }],
            'tags': [],
            'lastModifiedAt': None,
            'description': 'My dataset for testing.'
        }
        response = self.client.create_dataset(
            namespace_name=_NAMESPACE,
            dataset_name=dataset_name,
            dataset_type=DatasetType.DB_TABLE,
            physical_name=dataset_name,
            source_name='my-source',
            description=description,
            run_id=None,
            schema_location=None,
            fields=fields,
            tags=None)
        assert str(response['description']) == description
        assert str(response['name']) == dataset_name

    @mock.patch("marquez_client.MarquezClient._put")
    def test_create_datasource(self, mock_put):
        """create_source smoke test.

        NOTE(review): the credential scrubber replaced the JDBC user:password
        and in doing so swallowed the remainder of this method (mock setup and
        assertions) plus the decorator of test_create_job into the source_url
        string literal -- the next line is NOT valid Python as written.
        Restore this test from version control.
        """
        source_name = "flight_schedules_db"
        source_type = SourceType.POSTGRESQL
        source_url = "jdbc:postgresql://*****:*****@mock.patch("marquez_client.MarquezClient._put")

    def test_create_job(self, mock_put):
        """create_job returns the job document from the API.

        NOTE(review): this test's @mock.patch decorator was consumed by the
        scrubbing damage in test_create_datasource above.
        """
        job_name = "my-job"
        input_dataset = [{
            "namespace": "my-namespace",
            "name": "public.mytable"
        }]
        output_dataset = {
            "namespace": "my-namespace",
            "name": "public.mytable"
        }
        location = "https://github.com/my-jobs/blob/" \
                   "07f3d2dfc8186cadae9146719e70294a4c7a8ee8"
        context = {"SQL": "SELECT * FROM public.mytable;"}
        mock_put.return_value = {
            "id": {
                "namespace": "my-namespace",
                "name": "my-job"
            },
            "type": "BATCH",
            "name": "my-job",
            "createdAt": "2020-08-12T07:30:55.321059Z",
            "updatedAt": "2020-08-12T07:30:55.333230Z",
            "namespace": "my-namespace",
            "inputs": [{
                "namespace": "my-namespace",
                "name": "public.mytable"
            }],
            "outputs": [{
                "namespace": "my-namespace",
                "name": "public.mytable"
            }],
            "location": "https://github.com/my-jobs/blob/"
                        "07f3d2dfc8186cadae9146719e70294a4c7a8ee8",
            "context": {
                "SQL": "SELECT * FROM public.mytable;"
            },
            "description": "My first job.",
            "latestRun": None
        }
        response = self.client.create_job(namespace_name=_NAMESPACE,
                                          job_name=job_name,
                                          job_type=JobType.BATCH,
                                          location=location,
                                          input_dataset=input_dataset,
                                          output_dataset=output_dataset,
                                          context=context)
        assert str(response['id']) is not None
        assert str(response['location']) == location

    @mock.patch("marquez_client.MarquezClient._post")
    def test_create_job_run(self, mock_post):
        """create_job_run echoes run_args and createdAt from the API response."""
        job_name = "my-job"
        run_args = {
            "email": "*****@*****.**",
            "emailOnFailure": "true",
            "emailOnRetry": "true",
            "retries": "1"
        }
        created_at = str(
            generate(datetime.datetime.utcnow().replace(tzinfo=pytz.utc)))
        mock_post.return_value = {
            'id': f'{uuid.uuid4()}',
            'createdAt': f'{created_at}',
            'updatedAt': '2020-08-12T22:33:02.787228Z',
            'nominalStartTime': None,
            'nominalEndTime': None,
            'state': 'NEW',
            'startedAt': None,
            'endedAt': None,
            'durationMs': None,
            'run_args': {
                "email": "*****@*****.**",
                "emailOnFailure": "true",
                "emailOnRetry": "true",
                "retries": "1"
            }
        }
        response = self.client.create_job_run(namespace_name=_NAMESPACE,
                                              job_name=job_name,
                                              nominal_start_time=None,
                                              nominal_end_time=None,
                                              run_args=run_args,
                                              mark_as_running=False)
        assert response['id'] is not None
        assert str(response['run_args']) == str(run_args)
        assert str(response['createdAt']) == created_at

    @mock.patch("marquez_client.MarquezClient._post")
    def test_mark_job_run_as_start(self, mock_post):
        """mark_job_run_as_started transitions the run to RUNNING."""
        run_id = str(uuid.uuid4())
        mock_post.return_value = {
            'id': f'{run_id}',
            'createdAt': '2020-08-12T22:36:50.739951Z',
            'updatedAt': '2020-08-13T17:56:39.516802Z',
            'nominalStartTime': None,
            'nominalEndTime': None,
            'state': 'RUNNING',
            'startedAt': '2020-08-13T17:56:39.516802Z',
            'endedAt': None,
            'durationMs': None,
            'args': {}
        }
        response = self.client.mark_job_run_as_started(run_id=run_id)
        assert str(response['id']) == run_id
        assert str(response['state']) == RunState.RUNNING.value

    @mock.patch("marquez_client.MarquezClient._post")
    def test_mark_job_run_as_completed(self, mock_post):
        """mark_job_run_as_completed transitions the run to COMPLETED."""
        run_id = str(uuid.uuid4())
        mock_post.return_value = {
            'id': f'{run_id}',
            'createdAt': '2020-08-12T22:36:50.739951Z',
            'updatedAt': '2020-08-13T17:56:39.516802Z',
            'nominalStartTime': None,
            'nominalEndTime': None,
            'state': 'COMPLETED',
            'startedAt': '2020-08-13T17:56:39.516802Z',
            'endedAt': None,
            'durationMs': None,
            'args': {}
        }
        response = self.client.mark_job_run_as_completed(run_id=run_id)
        assert str(response['id']) == run_id
        assert str(response['state']) == RunState.COMPLETED.value

    @mock.patch("marquez_client.MarquezClient._post")
    def test_mark_job_run_as_failed(self, mock_post):
        """mark_job_run_as_failed transitions the run to FAILED."""
        run_id = str(uuid.uuid4())
        mock_post.return_value = {
            'id': f'{run_id}',
            'createdAt': '2020-08-12T22:36:50.739951Z',
            'updatedAt': '2020-08-13T17:56:39.516802Z',
            'nominalStartTime': None,
            'nominalEndTime': None,
            'state': 'FAILED',
            'startedAt': '2020-08-13T17:56:39.516802Z',
            'endedAt': None,
            'durationMs': None,
            'args': {}
        }
        response = self.client.mark_job_run_as_failed(run_id=run_id)
        assert str(response['id']) == run_id
        assert str(response['state']) == RunState.FAILED.value

    @mock.patch("marquez_client.MarquezClient._post")
    def test_mark_job_run_as_aborted(self, mock_post):
        """mark_job_run_as_aborted transitions the run to ABORTED."""
        run_id = str(uuid.uuid4())
        mock_post.return_value = {
            'id': f'{run_id}',
            'createdAt': '2020-08-12T22:36:50.739951Z',
            'updatedAt': '2020-08-13T17:56:39.516802Z',
            'nominalStartTime': None,
            'nominalEndTime': None,
            'state': 'ABORTED',
            'startedAt': '2020-08-13T17:56:39.516802Z',
            'endedAt': None,
            'durationMs': None,
            'args': {}
        }
        response = self.client.mark_job_run_as_aborted(run_id=run_id)
        assert str(response['id']) == run_id
        assert str(response['state']) == RunState.ABORTED.value
def test_namespace_not_found(wait_for_marquez):
    """Fetching a namespace that was never created raises APIError."""
    client = MarquezClient(host=MARQUEZ_HOST, port=MARQUEZ_PORT)
    with pytest.raises(errors.APIError):
        client.get_namespace("not_found")
def test_timeout(wait_for_marquez, broken_network):
    """A 1 ms timeout over a broken network surfaces as ReadTimeout."""
    client = MarquezClient(host=MARQUEZ_HOST, port=MARQUEZ_PORT, timeout_ms=1)
    with pytest.raises(ReadTimeout):
        client.get_namespace("timeout_test")
def test_bad_port(wait_for_marquez):
    """Connecting to the wrong port fails with ConnectionError wrapping MaxRetryError."""
    client = MarquezClient(host=MARQUEZ_HOST, port="6000")
    with pytest.raises(requests.exceptions.ConnectionError) as excinfo:
        client.get_namespace("no_connection")
    # urllib3 retries are exhausted before requests raises.
    assert isinstance(excinfo.value.args[0], MaxRetryError)
def test_host_default(clear_env):
    """With no overrides, the API base URL uses DEFAULT_HOST and DEFAULT_PORT.

    Consistency fix: the port was hard-coded as 8080 while the sibling
    test_port_default uses the DEFAULT_PORT constant; use the constant here
    too so the test tracks the library's actual default.
    """
    client = MarquezClient()
    assert client._api_base == f'http://{DEFAULT_HOST}:{DEFAULT_PORT}/api/v1'
def marquez_client():
    """Client fixture against a local Marquez instance on port 5000."""
    return MarquezClient(port=5000, host="localhost")
def test_host_from_constructor(clear_env):
    """A host passed to the constructor overrides MARQUEZ_HOST from the env.

    Fix: the expected value had an f-string prefix with no placeholders
    (ruff F541); it is a plain literal.
    """
    os.environ['MARQUEZ_HOST'] = 'marquez.dev'
    client = MarquezClient(host='marquez.staging')
    assert client._api_base == 'http://marquez.staging:8080/api/v1'
def marquez_client_with_timeout():
    """Client fixture on localhost:5000 with a 4 second request timeout."""
    return MarquezClient(timeout_ms=4000, port=5000, host="localhost")
def test_port_from_constructor(clear_env):
    """A port passed to the constructor overrides MARQUEZ_PORT from the env."""
    os.environ['MARQUEZ_PORT'] = '5000'
    client = MarquezClient(port=5001)
    expected = f'http://{DEFAULT_HOST}:5001/api/v1'
    assert client._api_base == expected
def get_marquez_client(self):
    """Return the memoized Marquez client, constructing it on first call."""
    if self._marquez_client:
        return self._marquez_client
    self._marquez_client = MarquezClient()
    return self._marquez_client
def test_timeout_from_env(clear_env):
    """MARQUEZ_TIMEOUT_MS from the env is converted from ms to seconds."""
    os.environ['MARQUEZ_TIMEOUT_MS'] = '2000'
    assert MarquezClient()._timeout == 2.0
def client():
    """Client fixture for a local Marquez instance.

    Bug fix: the URL scheme separator was 'http;//' (semicolon) instead of
    'http://', producing a malformed base URL.
    """
    return MarquezClient(url='http://localhost:5000')
def test_namespace_default(clear_env):
    """With no overrides, the client falls back to DEFAULT_NAMESPACE_NAME."""
    assert MarquezClient().namespace == DEFAULT_NAMESPACE_NAME
'type': 'VARCHAR', 'tags': [], 'description': None }, { 'name': 'order_placed_on', 'type': 'TIMESTAMP', 'tags': [], 'description': None }, { 'name': 'orders_placed', 'type': 'INT4', 'tags': [], 'description': None }] client = MarquezClient(url='http://marquez:5000') airflow_db_conn = psycopg2.connect(host="postgres", database="airflow", user="******", password="******") airflow_db_conn.autocommit = True @retry(wait_exponential_multiplier=1000, wait_exponential_max=10000) def wait_for_dag(): log.info(f"Waiting for DAG '{DAG_ID}'...") cur = airflow_db_conn.cursor() cur.execute(f""" SELECT dag_id, state
def new_client():
    """Build a client from MARQUEZ_URL / MARQUEZ_API_KEY, with a URL fallback."""
    url = os.environ.get('MARQUEZ_URL', DEFAULT_MARQUEZ_URL)
    api_key = os.environ.get('MARQUEZ_API_KEY')
    return MarquezClient(url=url, api_key=api_key)
class DAG(airflow.models.DAG):
    """Airflow DAG subclass that reports DAG/job-run metadata to Marquez.

    Reporting is strictly best-effort: every Marquez call is wrapped so that
    a metadata failure never blocks the underlying Airflow run.

    NOTE(review): this source was whitespace-collapsed; indentation has been
    reconstructed and a few ambiguous spans are flagged inline.
    """

    DEFAULT_NAMESPACE = 'default'
    # Maps Airflow run ids to Marquez job-run ids (set in __init__).
    _job_id_mapping = None
    # Lazily-constructed Marquez client (see get_marquez_client()).
    _marquez_client = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Namespace comes from the environment, else the class default.
        self.marquez_namespace = os.environ.get('MARQUEZ_NAMESPACE') or \
            DAG.DEFAULT_NAMESPACE
        # Marquez settings ride in via default_args.
        # NOTE(review): assumes callers always pass 'default_args' -- a DAG
        # built without it raises KeyError here; TODO confirm intended.
        self.marquez_location = kwargs['default_args'].get(
            'marquez_location', 'unknown')
        self.marquez_input_urns = kwargs['default_args'].get(
            'marquez_input_urns', [])
        self.marquez_output_urns = kwargs['default_args'].get(
            'marquez_output_urns', [])
        self._job_id_mapping = JobIdMapping()

    def create_dagrun(self, *args, **kwargs):
        """Create the Airflow dagrun, first reporting the job run to Marquez.

        Marquez failures are logged and swallowed so the dagrun still runs.
        """
        run_args = "{}"  # TODO extract the run Args from the tasks
        marquez_jobrun_id = None
        try:
            marquez_jobrun_id = self.report_jobrun(run_args,
                                                   kwargs['execution_date'])
            log.info(f'Successfully recorded job run.',
                     airflow_dag_id=self.dag_id,
                     marquez_run_id=marquez_jobrun_id,
                     marquez_namespace=self.marquez_namespace)
        except Exception as e:
            log.error(f'Failed to record job run: {e}',
                      airflow_dag_id=self.dag_id,
                      marquez_namespace=self.marquez_namespace)
            pass
        run = super(DAG, self).create_dagrun(*args, **kwargs)
        # Remember the Airflow-run -> Marquez-run mapping so the completion
        # callback can resolve the Marquez run later.
        if marquez_jobrun_id:
            try:
                self._job_id_mapping.set(
                    JobIdMapping.make_key(run.dag_id, run.run_id),
                    marquez_jobrun_id)
            except Exception as e:
                log.error(f'Failed job run lookup: {e}',
                          airflow_dag_id=self.dag_id,
                          airflow_run_id=run.run_id,
                          marquez_run_id=marquez_jobrun_id,
                          marquez_namespace=self.marquez_namespace)
                pass
        return run

    def handle_callback(self, *args, **kwargs):
        """Report the run's final state to Marquez, then defer to Airflow."""
        try:
            self.report_jobrun_change(args[0], **kwargs)
        except Exception as e:
            log.error(f'Failed to record job run state change: {e}',
                      dag_id=self.dag_id)
        return super().handle_callback(*args, **kwargs)

    def report_jobrun(self, run_args, execution_date):
        """Register the job and a new job run in Marquez.

        Returns the Marquez run id, or None if Marquez did not return one.
        """
        now_ms = self._now_ms()
        job_name = self.dag_id
        # NOTE(review): .format(...) with a strftime pattern suggests
        # execution_date is a pendulum datetime -- TODO confirm.
        start_time = execution_date.format("%Y-%m-%dT%H:%M:%SZ")
        end_time = self.compute_endtime(execution_date)
        if end_time:
            end_time = end_time.strftime("%Y-%m-%dT%H:%M:%SZ")
        marquez_client = self.get_marquez_client()
        marquez_client.create_job(job_name,
                                  self.marquez_location,
                                  self.marquez_input_urns,
                                  self.marquez_output_urns,
                                  description=self.description)
        log.info(f'Successfully recorded job: {job_name}',
                 airflow_dag_id=self.dag_id,
                 marquez_namespace=self.marquez_namespace)
        marquez_jobrun = marquez_client.create_job_run(
            job_name,
            run_args=run_args,
            nominal_start_time=start_time,
            nominal_end_time=end_time)
        marquez_jobrun_id = marquez_jobrun.get('runId')
        if marquez_jobrun_id:
            marquez_client.mark_job_run_as_running(marquez_jobrun_id)
            log.info(f'Successfully recorded job run: {job_name}',
                     airflow_dag_id=self.dag_id,
                     airflow_dag_execution_time=start_time,
                     marquez_run_id=marquez_jobrun_id,
                     marquez_namespace=self.marquez_namespace,
                     duration_ms=(self._now_ms() - now_ms))
        else:
            log.warn(f'Run id found not found: {job_name}',
                     airflow_dag_id=self.dag_id,
                     airflow_dag_execution_time=start_time,
                     marquez_run_id=marquez_jobrun_id,
                     marquez_namespace=self.marquez_namespace,
                     duration_ms=(self._now_ms() - now_ms))
        return marquez_jobrun_id

    def compute_endtime(self, execution_date):
        """Nominal end time = the next scheduled run after execution_date."""
        return self.following_schedule(execution_date)

    def report_jobrun_change(self, dagrun, **kwargs):
        """Mark the Marquez run COMPLETED or FAILED from the dagrun outcome."""
        session = kwargs.get('session')
        marquez_job_run_id = self._job_id_mapping.pop(
            JobIdMapping.make_key(dagrun.dag_id, dagrun.run_id), session)
        if marquez_job_run_id:
            log.info(f'Found job run.',
                     airflow_dag_id=dagrun.dag_id,
                     airflow_run_id=dagrun.run_id,
                     marquez_run_id=marquez_job_run_id,
                     marquez_namespace=self.marquez_namespace)
            if kwargs.get('success'):
                self.get_marquez_client().mark_job_run_as_completed(
                    marquez_job_run_id)
            else:
                self.get_marquez_client().mark_job_run_as_failed(
                    marquez_job_run_id)
            # NOTE(review): indentation reconstructed from collapsed source --
            # this summary log is treated as part of the found-run branch;
            # confirm against version control.
            state = 'COMPLETED' if kwargs.get('success') else 'FAILED'
            log.info(f'Marked job run as {state}.',
                     airflow_dag_id=dagrun.dag_id,
                     airflow_run_id=dagrun.run_id,
                     marquez_run_id=marquez_job_run_id,
                     marquez_namespace=self.marquez_namespace)

    def get_marquez_client(self):
        """Lazily create the Marquez client and ensure the namespace exists."""
        if not self._marquez_client:
            self._marquez_client = MarquezClient(
                namespace_name=self.marquez_namespace)
            self._marquez_client.create_namespace(self.marquez_namespace,
                                                  "default_owner")
        return self._marquez_client

    @staticmethod
    def _now_ms():
        """Current wall-clock time in whole milliseconds."""
        return int(round(time.time() * 1000))