def __mark_job_run_as(self, run_id, action, at=None):
    Utils.is_valid_uuid(run_id, 'run_id')
    return self._post(
        self._url('/jobs/runs/{0}/{1}?at={2}', run_id, action,
                  at if at else Utils.utc_now())
    )
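# A hedged sketch of the public wrappers that presumably delegate to
# __mark_job_run_as on the same client class. Only the 'fail' and 'abort'
# actions appear in the tests in this section; the 'start' action name is
# an assumption.
def mark_job_run_as_started(self, run_id, at=None):
    return self.__mark_job_run_as(run_id, 'start', at)

def mark_job_run_as_failed(self, run_id, at=None):
    return self.__mark_job_run_as(run_id, 'fail', at)

def mark_job_run_as_aborted(self, run_id, at=None):
    return self.__mark_job_run_as(run_id, 'abort', at)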
def get_job(self, namespace_name, job_name):
    Utils.check_name_length(namespace_name, 'namespace_name')
    Utils.check_name_length(job_name, 'job_name')
    return self._get(
        self._url('/namespaces/{0}/jobs/{1}', namespace_name, job_name)
    )
def get_dataset(self, namespace_name, dataset_name):
    Utils.check_name_length(namespace_name, 'namespace_name')
    Utils.check_name_length(dataset_name, 'dataset_name')
    return self._get(
        self._url('/namespaces/{0}/datasets/{1}',
                  namespace_name, dataset_name))
def create_job_run(self, namespace_name, job_name, run_id=None,
                   nominal_start_time=None, nominal_end_time=None,
                   run_args=None, mark_as_running=False):
    Utils.check_name_length(namespace_name, 'namespace_name')
    Utils.check_name_length(job_name, 'job_name')

    payload = {}

    if run_id:
        payload['id'] = run_id

    if nominal_start_time:
        payload['nominalStartTime'] = nominal_start_time

    if nominal_end_time:
        payload['nominalEndTime'] = nominal_end_time

    if run_args:
        payload['args'] = run_args

    response = self._post(
        self._url('/namespaces/{0}/jobs/{1}/runs',
                  namespace_name, job_name),
        payload=payload)

    if mark_as_running:
        # Fall back to the server-assigned run id when the caller did not
        # supply one; the original passed run_id through unconditionally.
        run_id = run_id or response['id']
        response = self.mark_job_run_as_started(run_id)

    return response
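# Usage sketch for create_job_run: start a run for an existing job and mark
# it RUNNING in one call. The client URL and all names here are illustrative
# assumptions.
client = MarquezClient(url='http://localhost:5000')
run = client.create_job_run(
    namespace_name='my-namespace',
    job_name='my-job',
    run_args={'retries': '1'},
    mark_as_running=True)  # also transitions the new run to RUNNING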
def test_mk_fields_from():
    fields_name_error = [{}]
    fields_valid = [{
        "name": "flight_id",
        "type": "INTEGER",
        "description": "flight id",
        "tags": ["tag1", "tag2"]
    }, {
        "name": "flight_name",
        "type": "VARCHAR",
        "description": "flight name",
        "tags": ["tag3", "tag4"]
    }, {
        "name": "flight_date",
        "type": "TIMESTAMP",
        "description": "flight date"
    }]
    new_fields_valid = [{
        "name": "flight_id",
        "type": "INTEGER",
        "description": "flight id",
        "tags": ["tag1", "tag2"]
    }, {
        "name": "flight_name",
        "type": "VARCHAR",
        "description": "flight name",
        "tags": ["tag3", "tag4"]
    }, {
        "name": "flight_date",
        "type": "TIMESTAMP",
        "description": "flight date"
    }]

    assert Utils.mk_fields_from(fields=fields_valid) == new_fields_valid

    with pytest.raises(ValueError):
        Utils.mk_fields_from(fields=fields_name_error)
def list_jobs(self, namespace_name, limit=None, offset=None):
    Utils.check_name_length(namespace_name, 'namespace_name')
    return self._get(
        self._url('/namespaces/{0}/jobs', namespace_name),
        params={
            'limit': limit,
            'offset': offset
        })
def __init__(self, url, timeout_ms=None, api_key: str = None):
    self._timeout = Utils.to_seconds(
        timeout_ms or os.environ.get('MARQUEZ_TIMEOUT_MS',
                                     DEFAULT_TIMEOUT_MS)
    )
    self._api_base = f"{url}{API_PATH_V1}"
    if api_key:
        Utils.add_auth_to(_HEADERS, api_key)
def list_datasets(self, namespace_name, limit=None, offset=None):
    Utils.check_name_length(namespace_name, 'namespace_name')
    return self._get(
        self._url('/namespaces/{0}/datasets', namespace_name),
        params={
            'limit': limit or DEFAULT_LIMIT,
            'offset': offset or DEFAULT_OFFSET
        })
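# Hedged pagination sketch built on list_datasets above, assuming the API
# wraps results under a 'datasets' key and reusing the client constructed
# earlier; the page size of 100 is arbitrary.
offset = 0
while True:
    page = client.list_datasets('my-namespace', limit=100, offset=offset)
    datasets = page.get('datasets', [])
    if not datasets:
        break
    for dataset in datasets:
        print(dataset['name'])
    offset += len(datasets)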
def tag_dataset(self, namespace_name, dataset_name, tag_name):
    Utils.check_name_length(namespace_name, 'namespace_name')
    Utils.check_name_length(dataset_name, 'dataset_name')
    if not tag_name:
        raise ValueError('tag_name must not be None')
    return self._post(
        self._url('/namespaces/{0}/datasets/{1}/tags/{2}',
                  namespace_name, dataset_name, tag_name))
def list_job_runs(self, namespace_name, job_name, limit=None, offset=None):
    Utils.check_name_length(namespace_name, 'namespace_name')
    Utils.check_name_length(job_name, 'job_name')
    return self._get(
        self._url('/namespaces/{0}/jobs/{1}/runs',
                  namespace_name, job_name),
        params={
            'limit': limit or DEFAULT_LIMIT,
            'offset': offset or DEFAULT_OFFSET
        })
def get_dataset_version(self, namespace_name, dataset_name, version):
    Utils.check_name_length(namespace_name, 'namespace_name')
    Utils.check_name_length(dataset_name, 'dataset_name')
    if not version:
        raise ValueError('version must not be None')
    return self._get(
        self._url('/namespaces/{0}/datasets/{1}/versions/{2}',
                  namespace_name, dataset_name, version))
def create_namespace(self, namespace_name, owner_name, description=None):
    Utils.check_name_length(namespace_name, 'namespace_name')
    Utils.check_name_length(owner_name, 'owner_name')

    payload = {'ownerName': owner_name}

    if description:
        payload['description'] = description

    return self._put(self._url('/namespaces/{0}', namespace_name),
                     payload=payload)
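# Minimal usage sketch for create_namespace; the namespace and owner names
# are illustrative.
client.create_namespace(
    namespace_name='my-namespace',
    owner_name='data-platform',
    description='Datasets and jobs owned by the data platform team.')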
def post(self, path, headers, payload=None):
    if self._api_key:
        Utils.add_auth_to(headers, self._api_key)
    response = requests.post(
        url=f"{self._api_base}{path}",
        headers=headers,
        json=payload,
        timeout=self._timeout
    )
    return self._response(response, as_json=True)
def test_new_client():
    os.environ['MARQUEZ_API_KEY'] = API_KEY
    from marquez_client.client import _USER_AGENT, _HEADERS

    headers_with_auth = {'User-Agent': _USER_AGENT}
    # Add the API key to the expected headers
    Utils.add_auth_to(headers_with_auth, API_KEY)

    client = Clients.new_client()

    assert client._api_base == API_BASE
    assert _HEADERS == headers_with_auth

    del os.environ['MARQUEZ_API_KEY']
def create_source(self, source_name, source_type, connection_url,
                  description=None):
    Utils.check_name_length(source_name, 'source_name')
    Utils.is_valid_connection_url(connection_url)

    payload = {
        'type': source_type.upper(),
        'connectionUrl': connection_url
    }

    if description:
        payload['description'] = description

    return self._put(self._url('/sources/{0}', source_name),
                     payload=payload)
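# Usage sketch for create_source; the PostgreSQL connection URL is an
# illustrative assumption (note that source_type is upper-cased before
# being sent).
client.create_source(
    source_name='analytics-db',
    source_type='postgresql',
    connection_url='jdbc:postgresql://localhost:5432/analytics',
    description='Analytics warehouse.')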
def __init__(self, url, timeout_ms=None):
    self._timeout = Utils.to_seconds(
        timeout_ms or os.environ.get('MARQUEZ_TIMEOUT_MS',
                                     DEFAULT_TIMEOUT_MS))
    self._api_base = f'{url}{_API_PATH}'
    log.debug(self._api_base)
def tag_dataset_field(self, namespace_name, dataset_name, field_name,
                      tag_name):
    Utils.check_name_length(namespace_name, 'namespace_name')
    Utils.check_name_length(dataset_name, 'dataset_name')
    Utils.check_name_length(field_name, 'field_name')
    Utils.check_name_length(tag_name, 'tag_name')
    return self._post(
        self._url('/namespaces/{0}/datasets/{1}/fields/{2}/tags/{3}',
                  namespace_name, dataset_name, field_name, tag_name))
def create_dataset(self, namespace_name, dataset_name, dataset_type,
                   physical_name, source_name,
                   description=None, run_id=None,
                   schema_location=None,
                   fields=None, tags=None):
    Utils.check_name_length(namespace_name, 'namespace_name')
    Utils.check_name_length(dataset_name, 'dataset_name')
    Utils.is_instance_of(dataset_type, DatasetType)

    if dataset_type == DatasetType.STREAM:
        MarquezClient._is_none(schema_location, 'schema_location')

    Utils.check_name_length(physical_name, 'physical_name')
    Utils.check_name_length(source_name, 'source_name')

    payload = {
        'type': dataset_type.value,
        'physicalName': physical_name,
        'sourceName': source_name,
    }

    if description:
        payload['description'] = description

    if run_id:
        payload['runId'] = run_id

    if fields:
        payload['fields'] = fields

    if tags:
        payload['tags'] = tags

    if schema_location:
        payload['schemaLocation'] = schema_location

    return self._put(
        self._url('/namespaces/{0}/datasets/{1}',
                  namespace_name, dataset_name),
        payload=payload)
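# Usage sketch: registering a DB_TABLE dataset with a field schema. The
# field dicts mirror the shape exercised by test_mk_fields_from above; all
# names are illustrative. A schema_location is required only for STREAM
# datasets.
client.create_dataset(
    namespace_name='my-namespace',
    dataset_name='public.flights',
    dataset_type=DatasetType.DB_TABLE,
    physical_name='public.flights',
    source_name='analytics-db',
    fields=[
        {'name': 'flight_id', 'type': 'INTEGER', 'description': 'flight id'},
        {'name': 'flight_date', 'type': 'TIMESTAMP'}
    ],
    tags=['flights'])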
def _get(self, url, params=None, as_json=True):
    now_ms = Utils.now_ms()
    response = requests.get(url=url, params=params, headers=_HEADERS,
                            timeout=self._timeout)
    log.info(f"{url} method=GET duration_ms={Utils.now_ms() - now_ms}")
    return self._response(response, as_json)
def _put(self, url, payload=None, as_json=True):
    now_ms = Utils.now_ms()
    response = requests.put(url=url, headers=_HEADERS, json=payload,
                            timeout=self._timeout)
    log.info(f"{url} method=PUT payload={json.dumps(payload)} "
             f"duration_ms={Utils.now_ms() - now_ms}")
    return self._response(response, as_json)
def test_mark_job_run_as_aborted(wo_client):
    aborted_at = Utils.utc_now()
    wo_client.mark_job_run_as_aborted(
        run_id='71feb41b-be50-428c-8470-37b9c292f787', at=aborted_at)
    wo_client._backend.post.assert_called_once_with(
        path=MarquezWriteOnlyClient._path(
            '/jobs/runs/71feb41b-be50-428c-8470-37b9c292f787/abort?at={0}',
            aborted_at),
        headers=mock.ANY)
def create_job(self, namespace_name, job_name, job_type, location=None,
               inputs: [DatasetId] = None, outputs: [DatasetId] = None,
               description=None, context=None, run_id=None):
    Utils.check_name_length(namespace_name, 'namespace_name')
    Utils.check_name_length(job_name, 'job_name')
    Utils.is_instance_of(job_type, JobType)

    payload = {
        'type': job_type.value,
        'inputs': [
            input.__dict__ for input in inputs
        ] if inputs else [],
        'outputs': [
            output.__dict__ for output in outputs
        ] if outputs else []
    }

    if run_id:
        payload['runId'] = run_id

    if context:
        payload['context'] = context

    if location:
        payload['location'] = location

    if description:
        payload['description'] = description

    return self._put(
        self._url('/namespaces/{0}/jobs/{1}', namespace_name, job_name),
        payload=payload
    )
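# Usage sketch for the DatasetId-based create_job above, assuming DatasetId
# is the (namespace, name) pair model whose __dict__ is serialized into the
# payload; all names are illustrative.
client.create_job(
    namespace_name='my-namespace',
    job_name='my-job',
    job_type=JobType.BATCH,
    inputs=[DatasetId('my-namespace', 'public.flights')],
    outputs=[DatasetId('my-namespace', 'public.flight_stats')],
    description='Aggregates daily flight statistics.')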
def create_job(self, namespace_name, job_name, job_type, location=None,
               input_dataset=None, output_dataset=None,
               description=None, context=None):
    Utils.check_name_length(namespace_name, 'namespace_name')
    Utils.check_name_length(job_name, 'job_name')
    Utils.is_instance_of(job_type, JobType)

    payload = {
        'inputs': input_dataset or [],
        'outputs': output_dataset or [],
        'type': job_type.name
    }

    if context:
        payload['context'] = context

    if location:
        payload['location'] = location

    if description:
        payload['description'] = description

    return self._put(
        self._url('/namespaces/{0}/jobs/{1}', namespace_name, job_name),
        payload=payload)
def _backend_from_env():
    backend = \
        os.environ.get('MARQUEZ_BACKEND', DEFAULT_MARQUEZ_BACKEND).upper()
    if backend == 'HTTP':
        url = os.environ.get('MARQUEZ_URL', DEFAULT_MARQUEZ_URL)
        api_key = os.environ.get('MARQUEZ_API_KEY')
        timeout = Utils.to_seconds(
            os.environ.get('MARQUEZ_TIMEOUT_MS', DEFAULT_TIMEOUT_MS))
        return HttpBackend(url, timeout, api_key)
    elif backend == 'FILE':
        file = os.environ.get('MARQUEZ_FILE', DEFAULT_MARQUEZ_FILE)
        return FileBackend(file)
    elif backend == 'LOG':
        return LogBackend()
    # Any unrecognized MARQUEZ_BACKEND value falls through and returns None.
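# Hedged sketch of selecting a backend through the environment variables
# read by _backend_from_env; the file path is illustrative.
os.environ['MARQUEZ_BACKEND'] = 'FILE'
os.environ['MARQUEZ_FILE'] = '/tmp/marquez-events.log'
backend = _backend_from_env()  # -> FileBackend('/tmp/marquez-events.log')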
def test_is_instance_of():
    with pytest.raises(ValueError):
        Utils.is_instance_of(variable_value=JobType.BATCH,
                             variable_enum_type=DatasetType)

    with pytest.raises(ValueError):
        Utils.is_instance_of(variable_value=DatasetType.DB_TABLE,
                             variable_enum_type=JobType)

    with pytest.raises(ValueError):
        Utils.is_instance_of(variable_value=JobType.BATCH,
                             variable_enum_type=RunState)
def _get(self, url, params=None, as_json=True):
    now_ms = Utils.now_ms()
    response = requests.get(url, params=params, headers=_HEADERS,
                            timeout=self._timeout)
    get_details = {
        'url': url,
        'http_method': 'GET',
        'http_headers': _HEADERS,
        'payload': params,
        'duration_ms': (Utils.now_ms() - now_ms)
    }
    log.info(get_details)
    return self._response(response, as_json)
def _put(self, url, payload=None, as_json=True):
    now_ms = Utils.now_ms()
    response = requests.put(url=url, headers=_HEADERS, json=payload,
                            timeout=self._timeout)
    put_details = {
        'url': url,
        'http_method': 'PUT',
        'http_headers': _HEADERS,
        'payload': payload,
        'duration_ms': (Utils.now_ms() - now_ms)
    }
    log.info(put_details)
    return self._response(response, as_json)
def test_mark_job_run_as_failed(mock_post, client):
    mock_post.return_value.status_code = HTTPStatus.OK
    mock_post.return_value.json.return_value = FAILED

    now = Utils.utc_now()
    run = client.mark_job_run_as_failed(RUN_ID, at=now)

    assert run['id'] == RUN_ID
    assert run['nominalStartTime'] == NOMINAL_START_TIME
    assert run['nominalEndTime'] == NOMINAL_END_TIME
    assert run['state'] == RunState.FAILED
    assert run['startedAt'] == START_AT
    assert run['endedAt'] == ENDED_AT
    assert run['durationMs'] == DURATION_MS

    mock_post.assert_called_once_with(
        url=client._url('/jobs/runs/{0}/fail?at={1}', RUN_ID, now),
        headers=mock.ANY,
        json=None,
        timeout=mock.ANY)
def test_create_job_run(self, mock_post):
    run_id = str(uuid.uuid4())
    action_at = Utils.utc_now()
    job_name = "my-job"
    run_args = {
        "email": "*****@*****.**",
        "emailOnFailure": "true",
        "emailOnRetry": "true",
        "retries": "1"
    }

    mock_post.return_value = {
        'id': f'{run_id}',
        'createdAt': '2020-08-12T22:33:02.787228Z',
        'updatedAt': '2020-08-12T22:33:02.787228Z',
        'nominalStartTime': None,
        'nominalEndTime': None,
        'state': 'NEW',
        'startedAt': f'{action_at}',
        'endedAt': None,
        'durationMs': None,
        'args': {
            "email": "*****@*****.**",
            "emailOnFailure": "true",
            "emailOnRetry": "true",
            "retries": "1"
        }
    }

    response = self.client.create_job_run(
        namespace_name=_NAMESPACE,
        job_name=job_name,
        run_id=run_id,
        nominal_start_time=None,
        nominal_end_time=None,
        run_args=run_args,
        mark_as_running=True)

    assert response['id'] is not None
    assert str(response['args']) == str(run_args)
    assert str(response['startedAt']) == action_at
def test_mark_job_run_as_aborted(self, mock_post):
    run_id = str(uuid.uuid4())
    action_at = Utils.utc_now()

    mock_post.return_value = {
        'id': f'{run_id}',
        'createdAt': '2020-08-12T22:36:50.739951Z',
        'updatedAt': '2020-08-13T17:56:39.516802Z',
        'nominalStartTime': None,
        'nominalEndTime': None,
        'state': 'ABORTED',
        'startedAt': f'{action_at}',
        'endedAt': None,
        'durationMs': None,
        'args': {}
    }

    response = self.client.mark_job_run_as_aborted(run_id=run_id,
                                                   at=action_at)

    assert str(response['id']) == run_id
    assert str(response['state']) == RunState.ABORTED.value
    assert str(response['startedAt']) == action_at