def test_build_cluster_data(self):
    for suffix, dataproc_operator in enumerate(self.dataproc_operators):
        cluster_data = dataproc_operator._build_cluster_data()
        self.assertEqual(cluster_data['clusterName'], CLUSTER_NAME)
        self.assertEqual(cluster_data['projectId'], PROJECT_ID)
        self.assertEqual(cluster_data['config']['softwareConfig'],
                         {'imageVersion': IMAGE_VERSION})
        self.assertEqual(cluster_data['config']['configBucket'], STORAGE_BUCKET)
        self.assertEqual(cluster_data['config']['workerConfig']['numInstances'],
                         NUM_WORKERS)
        self.assertEqual(cluster_data['config']['secondaryWorkerConfig']['numInstances'],
                         NUM_PREEMPTIBLE_WORKERS)
        self.assertEqual(cluster_data['config']['gceClusterConfig']['serviceAccountScopes'],
                         SERVICE_ACCOUNT_SCOPES)
        self.assertEqual(cluster_data['config']['gceClusterConfig']['subnetworkUri'],
                         SUBNETWORK_URI)
        self.assertEqual(cluster_data['config']['gceClusterConfig']['networkUri'],
                         NETWORK_URI)
        self.assertEqual(cluster_data['config']['gceClusterConfig']['tags'], TAGS)
        # test whether the default airflow-version label has been properly
        # set to the dataproc operator.
        merged_labels = {}
        merged_labels.update(self.labels[suffix])
        merged_labels.update({'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')})
        self.assertTrue(re.match(r'[a-z]([-a-z0-9]*[a-z0-9])?',
                                 cluster_data['labels']['airflow-version']))
        self.assertEqual(cluster_data['labels'], merged_labels)
def test_deploy_execute(self, mock_hook):
    mock_hook.return_value.get_function.side_effect = mock.Mock(
        side_effect=HttpError(resp=MOCK_RESP_404, content=b'not found'))
    mock_hook.return_value.create_new_function.return_value = True
    op = GcfFunctionDeployOperator(
        project_id=GCP_PROJECT_ID,
        location=GCP_LOCATION,
        body=deepcopy(VALID_BODY),
        task_id="id"
    )
    op.execute(None)
    mock_hook.assert_called_once_with(api_version='v1',
                                      gcp_conn_id='google_cloud_default')
    mock_hook.return_value.get_function.assert_called_once_with(
        'projects/test_project_id/locations/test_region/functions/helloWorld'
    )
    expected_body = deepcopy(VALID_BODY)
    expected_body['labels'] = {
        'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')
    }
    mock_hook.return_value.create_new_function.assert_called_once_with(
        project_id='test_project_id',
        location='test_region',
        body=expected_body
    )
def __init__(
        self,
        py_file,
        job_name='{{task.task_id}}',
        py_options=None,
        dataflow_default_options=None,
        options=None,
        gcp_conn_id='google_cloud_default',
        delegate_to=None,
        poll_sleep=10,
        *args,
        **kwargs):
    super(DataFlowPythonOperator, self).__init__(*args, **kwargs)

    self.py_file = py_file
    self.job_name = job_name
    self.py_options = py_options or []
    self.dataflow_default_options = dataflow_default_options or {}
    self.options = options or {}
    self.options.setdefault('labels', {}).update(
        {'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')})
    self.gcp_conn_id = gcp_conn_id
    self.delegate_to = delegate_to
    self.poll_sleep = poll_sleep
def __init__(
        self,
        jar,
        dataflow_default_options=None,
        options=None,
        gcp_conn_id='google_cloud_default',
        delegate_to=None,
        poll_sleep=10,
        job_class=None,
        *args,
        **kwargs):
    """
    Create a new DataFlowJavaOperator. Note that both
    dataflow_default_options and options will be merged to specify pipeline
    execution parameters, and dataflow_default_options is expected to save
    high-level options, for instance, project and zone information, which
    apply to all dataflow operators in the DAG.

    .. seealso::
        For more detail on job submission have a look at the reference:
        https://cloud.google.com/dataflow/pipelines/specifying-exec-params

    :param jar: The reference to a self-executing DataFlow jar.
    :type jar: string
    :param dataflow_default_options: Map of default job options.
    :type dataflow_default_options: dict
    :param options: Map of job specific options.
    :type options: dict
    :param gcp_conn_id: The connection ID to use connecting to Google Cloud
        Platform.
    :type gcp_conn_id: string
    :param delegate_to: The account to impersonate, if any.
        For this to work, the service account making the request must have
        domain-wide delegation enabled.
    :type delegate_to: string
    :param poll_sleep: The time in seconds to sleep between polling Google
        Cloud Platform for the dataflow job status while the job is in the
        JOB_STATE_RUNNING state.
    :type poll_sleep: int
    :param job_class: The name of the dataflow job class to be executed, it
        is often not the main class configured in the dataflow jar file.
    :type job_class: string
    """
    super(DataFlowJavaOperator, self).__init__(*args, **kwargs)

    dataflow_default_options = dataflow_default_options or {}
    options = options or {}
    options.setdefault('labels', {}).update(
        {'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')})
    self.gcp_conn_id = gcp_conn_id
    self.delegate_to = delegate_to
    self.jar = jar
    self.dataflow_default_options = dataflow_default_options
    self.options = options
    self.poll_sleep = poll_sleep
    self.job_class = job_class
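# Hedged usage sketch (not part of the source above): how the dataflow_default_options
# vs. options split described in the docstring is typically wired up in a DAG.
# The jar path, project, zone, and bucket names are placeholder assumptions; the
# import path follows the contrib-era module referenced elsewhere in this collection.
from datetime import datetime

from airflow import DAG
from airflow.contrib.operators.dataflow_operator import DataFlowJavaOperator

with DAG('dataflow_java_example', start_date=datetime(2021, 1, 1),
         schedule_interval=None) as dag:
    start_java_job = DataFlowJavaOperator(
        task_id='start_java_job',
        jar='gs://my-bucket/pipeline/wordcount.jar',  # placeholder jar path
        # High-level options shared by all Dataflow tasks in the DAG.
        dataflow_default_options={'project': 'my-project',
                                  'zone': 'europe-west1-d'},
        # Job-specific options; the operator adds the airflow-version label itself.
        options={'output': 'gs://my-bucket/output'},
    )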
def __init__(
        self,
        py_file,
        py_options=None,
        dataflow_default_options=None,
        options=None,
        gcp_conn_id='google_cloud_default',
        delegate_to=None,
        poll_sleep=10,
        *args,
        **kwargs):
    """
    Create a new DataFlowPythonOperator. Note that both
    dataflow_default_options and options will be merged to specify pipeline
    execution parameters, and dataflow_default_options is expected to save
    high-level options, for instance, project and zone information, which
    apply to all dataflow operators in the DAG.

    .. seealso::
        For more detail on job submission have a look at the reference:
        https://cloud.google.com/dataflow/pipelines/specifying-exec-params

    :param py_file: Reference to the python dataflow pipeline file.py, e.g.,
        /some/local/file/path/to/your/python/pipeline/file.
    :type py_file: string
    :param py_options: Additional python options.
    :type py_options: list of strings, e.g., ["-m", "-v"].
    :param dataflow_default_options: Map of default job options.
    :type dataflow_default_options: dict
    :param options: Map of job specific options.
    :type options: dict
    :param gcp_conn_id: The connection ID to use connecting to Google Cloud
        Platform.
    :type gcp_conn_id: string
    :param delegate_to: The account to impersonate, if any.
        For this to work, the service account making the request must have
        domain-wide delegation enabled.
    :type delegate_to: string
    :param poll_sleep: The time in seconds to sleep between polling Google
        Cloud Platform for the dataflow job status while the job is in the
        JOB_STATE_RUNNING state.
    :type poll_sleep: int
    """
    super(DataFlowPythonOperator, self).__init__(*args, **kwargs)

    self.py_file = py_file
    self.py_options = py_options or []
    self.dataflow_default_options = dataflow_default_options or {}
    self.options = options or {}
    self.options.setdefault('labels', {}).update(
        {'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')})
    self.gcp_conn_id = gcp_conn_id
    self.delegate_to = delegate_to
    self.poll_sleep = poll_sleep
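# Hedged usage sketch (assumption, not part of the source): instantiating
# DataFlowPythonOperator with the parameters documented above. The pipeline
# file path, project, and output bucket are placeholders.
from datetime import datetime

from airflow import DAG
from airflow.contrib.operators.dataflow_operator import DataFlowPythonOperator

with DAG('dataflow_python_example', start_date=datetime(2021, 1, 1),
         schedule_interval=None) as dag:
    start_python_job = DataFlowPythonOperator(
        task_id='start_python_job',
        py_file='/path/to/pipeline.py',                 # placeholder pipeline file
        py_options=['-m'],                              # additional python options
        dataflow_default_options={'project': 'my-project'},
        options={'output': 'gs://my-bucket/output'},    # job-specific options
    )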
def test_build_cluster_data(self):
    for suffix, dataproc_operator in enumerate(self.dataproc_operators):
        cluster_data = dataproc_operator._build_cluster_data()
        self.assertEqual(cluster_data['clusterName'], CLUSTER_NAME)
        self.assertEqual(cluster_data['projectId'], GCP_PROJECT_ID)
        self.assertEqual(cluster_data['config']['softwareConfig'],
                         {'imageVersion': IMAGE_VERSION})
        self.assertEqual(cluster_data['config']['configBucket'], STORAGE_BUCKET)
        self.assertEqual(
            cluster_data['config']['workerConfig']['numInstances'],
            NUM_WORKERS)
        self.assertEqual(
            cluster_data['config']['secondaryWorkerConfig']['numInstances'],
            NUM_PREEMPTIBLE_WORKERS)
        self.assertEqual(
            cluster_data['config']['gceClusterConfig']['serviceAccountScopes'],
            SERVICE_ACCOUNT_SCOPES)
        self.assertEqual(
            cluster_data['config']['gceClusterConfig']['internalIpOnly'],
            INTERNAL_IP_ONLY)
        self.assertEqual(
            cluster_data['config']['gceClusterConfig']['subnetworkUri'],
            SUBNETWORK_URI)
        self.assertEqual(
            cluster_data['config']['gceClusterConfig']['networkUri'],
            NETWORK_URI)
        self.assertEqual(
            cluster_data['config']['gceClusterConfig']['tags'],
            TAGS)
        self.assertEqual(
            cluster_data['config']['lifecycleConfig']['idleDeleteTtl'],
            "321s")
        self.assertEqual(
            cluster_data['config']['lifecycleConfig']['autoDeleteTime'],
            "2017-06-07T00:00:00.000000Z")
        self.assertEqual(
            cluster_data['config']['autoscalingConfig']['policyUri'],
            SCALING_POLICY)
        # test whether the default airflow-version label has been properly
        # set to the dataproc operator.
        merged_labels = {}
        merged_labels.update(self.labels[suffix])
        merged_labels.update({
            'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')
        })
        self.assertTrue(
            re.match(r'[a-z]([-a-z0-9]*[a-z0-9])?',
                     cluster_data['labels']['airflow-version']))
        self.assertEqual(cluster_data['labels'], merged_labels)
def __get_dataflow_pipeline_options(
    self, pipeline_options: dict, job_name: str, job_name_key: Optional[str] = None
) -> dict:
    pipeline_options = copy.deepcopy(pipeline_options)
    if job_name_key is not None:
        pipeline_options[job_name_key] = job_name
    pipeline_options["project"] = self.dataflow_config.project_id
    pipeline_options["region"] = self.dataflow_config.location
    pipeline_options.setdefault("labels", {}).update({
        "airflow-version": "v" + version.replace(".", "-").replace("+", "-")
    })
    return pipeline_options
def test_empty_project_id_is_ok(self, mock_hook):
    mock_hook.return_value.get_function.side_effect = \
        HttpError(resp=MOCK_RESP_404, content=b'not found')
    operator = CloudFunctionDeployFunctionOperator(
        location="test_region", body=deepcopy(VALID_BODY), task_id="id")
    operator.execute(None)
    mock_hook.assert_called_once_with(api_version='v1',
                                      gcp_conn_id='google_cloud_default')
    new_body = deepcopy(VALID_BODY)
    new_body['labels'] = {
        'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')
    }
    mock_hook.return_value.create_new_function.assert_called_once_with(
        project_id=None, location="test_region", body=new_body)
def setUp(self):
    self.maxDiff = None  # pylint: disable=invalid-name
    self.api_client = ApiClient()
    self.expected_pod = {
        'apiVersion': 'v1',
        'kind': 'Pod',
        'metadata': {
            'namespace': 'default',
            'name': mock.ANY,
            'annotations': {},
            'labels': {
                'foo': 'bar',
                'kubernetes_pod_operator': 'True',
                'airflow_version': airflow_version.replace('+', '-'),
                'execution_date': '2016-01-01T0100000100-a2f50a31f',
                'dag_id': 'dag',
                'task_id': 'task',
                'try_number': '1',
            },
        },
        'spec': {
            'affinity': {},
            'containers': [{
                'image': 'ubuntu:16.04',
                'imagePullPolicy': 'IfNotPresent',
                'args': ["echo 10"],
                'command': ["bash", "-cx"],
                'env': [],
                'envFrom': [],
                'resources': {},
                'name': 'base',
                'ports': [],
                'volumeMounts': [],
            }],
            'hostNetwork': False,
            'imagePullSecrets': [],
            'initContainers': [],
            'nodeSelector': {},
            'restartPolicy': 'Never',
            'securityContext': {},
            'serviceAccountName': 'default',
            'tolerations': [],
            'volumes': [],
        },
    }
def __init__(
    self,
    *,
    py_file: str,
    runner: str = "DirectRunner",
    default_pipeline_options: Optional[dict] = None,
    pipeline_options: Optional[dict] = None,
    py_interpreter: str = "python3",
    py_options: Optional[List[str]] = None,
    py_requirements: Optional[List[str]] = None,
    py_system_site_packages: bool = False,
    gcp_conn_id: str = "google_cloud_default",
    delegate_to: Optional[str] = None,
    dataflow_config: Optional[Union[DataflowConfiguration, dict]] = None,
    **kwargs,
) -> None:
    super().__init__(**kwargs)

    self.py_file = py_file
    self.runner = runner
    self.py_options = py_options or []
    self.default_pipeline_options = default_pipeline_options or {}
    self.pipeline_options = pipeline_options or {}
    self.pipeline_options.setdefault("labels", {}).update({
        "airflow-version": "v" + version.replace(".", "-").replace("+", "-")
    })
    self.py_interpreter = py_interpreter
    self.py_requirements = py_requirements
    self.py_system_site_packages = py_system_site_packages
    self.gcp_conn_id = gcp_conn_id
    self.delegate_to = delegate_to
    self.beam_hook: Optional[BeamHook] = None
    self.dataflow_hook: Optional[DataflowHook] = None
    self.dataflow_job_id: Optional[str] = None

    if dataflow_config is None:
        self.dataflow_config = DataflowConfiguration()
    elif isinstance(dataflow_config, dict):
        self.dataflow_config = DataflowConfiguration(**dataflow_config)
    else:
        self.dataflow_config = dataflow_config

    if self.dataflow_config and self.runner.lower() != BeamRunnerType.DataflowRunner.lower():
        self.log.warning(
            "dataflow_config is defined but runner is different than DataflowRunner (%s)",
            self.runner)
def execute(self, context):
    if self.labels is not None:
        self.labels.update(
            {'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')}
        )

    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to
    )

    hook.create_bucket(bucket_name=self.bucket_name,
                       storage_class=self.storage_class,
                       location=self.location,
                       project_id=self.project_id,
                       labels=self.labels)
def __init__(  # pylint: disable=too-many-arguments
    self,
    *,
    py_file: str,
    job_name: str = "{{task.task_id}}",
    dataflow_default_options: Optional[dict] = None,
    options: Optional[dict] = None,
    py_interpreter: str = "python3",
    py_options: Optional[List[str]] = None,
    py_requirements: Optional[List[str]] = None,
    py_system_site_packages: bool = False,
    project_id: Optional[str] = None,
    location: str = DEFAULT_DATAFLOW_LOCATION,
    gcp_conn_id: str = "google_cloud_default",
    delegate_to: Optional[str] = None,
    poll_sleep: int = 10,
    drain_pipeline: bool = False,
    cancel_timeout: Optional[int] = 10 * 60,
    wait_until_finished: Optional[bool] = None,
    **kwargs,
) -> None:
    super().__init__(**kwargs)

    self.py_file = py_file
    self.job_name = job_name
    self.py_options = py_options or []
    self.dataflow_default_options = dataflow_default_options or {}
    self.options = options or {}
    self.options.setdefault("labels", {}).update({
        "airflow-version": "v" + version.replace(".", "-").replace("+", "-")
    })
    self.py_interpreter = py_interpreter
    self.py_requirements = py_requirements
    self.py_system_site_packages = py_system_site_packages
    self.project_id = project_id
    self.location = location
    self.gcp_conn_id = gcp_conn_id
    self.delegate_to = delegate_to
    self.poll_sleep = poll_sleep
    self.drain_pipeline = drain_pipeline
    self.cancel_timeout = cancel_timeout
    self.wait_until_finished = wait_until_finished
    self.job_id = None
    self.hook: Optional[DataflowHook] = None
def test_make_pod_assert_labels(self):
    # Tests the pod created has all the expected labels set
    self.kube_config.dags_folder = 'dags'

    worker_config = WorkerConfiguration(self.kube_config)
    pod = worker_config.make_pod("default", "sample-uuid", "test_pod_id", "test_dag_id",
                                 "test_task_id", "2019-11-21 11:08:22.920875", 1,
                                 "bash -c 'ls /'")
    expected_labels = {
        'airflow-worker': 'sample-uuid',
        'airflow_version': airflow_version.replace('+', '-'),
        'dag_id': 'test_dag_id',
        'execution_date': '2019-11-21 11:08:22.920875',
        'kubernetes_executor': 'True',
        'task_id': 'test_task_id',
        'try_number': '1'
    }
    self.assertEqual(pod.metadata.labels, expected_labels)
def test_execute(self, mock_hook):
    operator = GoogleCloudStorageCreateBucketOperator(
        task_id=TASK_ID,
        bucket_name=TEST_BUCKET,
        resource={
            "lifecycle": {
                "rule": [{
                    "action": {"type": "Delete"},
                    "condition": {"age": 7}
                }]
            }
        },
        storage_class='MULTI_REGIONAL',
        location='EU',
        labels={'env': 'prod'},
        project_id=TEST_PROJECT)

    operator.execute(None)
    mock_hook.return_value.create_bucket.assert_called_once_with(
        bucket_name=TEST_BUCKET,
        storage_class='MULTI_REGIONAL',
        location='EU',
        labels={
            'airflow-version': 'v' + version.replace('.', '-').replace('+', '-'),
            'env': 'prod'
        },
        project_id=TEST_PROJECT,
        resource={
            'lifecycle': {
                'rule': [{
                    'action': {'type': 'Delete'},
                    'condition': {'age': 7}
                }]
            }
        })
def test_execute(self, mock_hook):
    operator = GoogleCloudStorageCreateBucketOperator(
        task_id=TASK_ID,
        bucket_name=TEST_BUCKET,
        storage_class='MULTI_REGIONAL',
        location='EU',
        labels={'env': 'prod'},
        project_id=TEST_PROJECT
    )

    operator.execute(None)
    mock_hook.return_value.create_bucket.assert_called_once_with(
        bucket_name=TEST_BUCKET,
        storage_class='MULTI_REGIONAL',
        location='EU',
        labels={
            'airflow-version': 'v' + version.replace('.', '-').replace('+', '-'),
            'env': 'prod'
        },
        project_id=TEST_PROJECT
    )
def test_empty_project_id_is_ok(self, mock_hook):
    operator = GcfFunctionDeployOperator(
        location="test_region",
        body=deepcopy(VALID_BODY),
        task_id="id"
    )
    operator._hook.get_function.side_effect = \
        HttpError(resp=MOCK_RESP_404, content=b'not found')
    operator.execute(None)
    mock_hook.assert_called_once_with(api_version='v1',
                                      gcp_conn_id='google_cloud_default')
    new_body = deepcopy(VALID_BODY)
    new_body['labels'] = {
        'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')}
    mock_hook.return_value.create_new_function.assert_called_once_with(
        project_id=None, location="test_region", body=new_body)
def test_build_cluster_data(self):
    for suffix, dataproc_operator in enumerate(self.dataproc_operators):
        cluster_data = dataproc_operator._build_cluster_data()
        self.assertEqual(cluster_data['clusterName'], CLUSTER_NAME)
        self.assertEqual(cluster_data['projectId'], PROJECT_ID)
        self.assertEqual(cluster_data['config']['softwareConfig'],
                         {'imageVersion': IMAGE_VERSION})
        self.assertEqual(cluster_data['config']['configBucket'], STORAGE_BUCKET)
        self.assertEqual(cluster_data['config']['workerConfig']['numInstances'],
                         NUM_WORKERS)
        self.assertEqual(cluster_data['config']['secondaryWorkerConfig']['numInstances'],
                         NUM_PREEMPTIBLE_WORKERS)
        # test whether the default airflow-version label has been properly
        # set to the dataproc operator.
        merged_labels = {}
        merged_labels.update(self.labels[suffix])
        merged_labels.update({'airflow-version': 'v' + version.replace('.', '-')})
        self.assertTrue(re.match(r'[a-z]([-a-z0-9]*[a-z0-9])?',
                                 cluster_data['labels']['airflow-version']))
        self.assertEqual(cluster_data['labels'], merged_labels)
def construct_pod(  # pylint: disable=too-many-arguments
        dag_id: str,
        task_id: str,
        pod_id: str,
        try_number: int,
        kube_image: str,
        date: datetime.datetime,
        command: List[str],
        pod_override_object: Optional[k8s.V1Pod],
        base_worker_pod: k8s.V1Pod,
        namespace: str,
        worker_uuid: str) -> k8s.V1Pod:
    """
    Construct a pod by gathering and consolidating the configuration from 3 places:
        - airflow.cfg
        - executor_config
        - dynamic arguments
    """
    dynamic_pod = PodGenerator(
        namespace=namespace,
        image=kube_image,
        labels={
            'airflow-worker': worker_uuid,
            'dag_id': make_safe_label_value(dag_id),
            'task_id': make_safe_label_value(task_id),
            'execution_date': datetime_to_label_safe_datestring(date),
            'try_number': str(try_number),
            'airflow_version': airflow_version.replace('+', '-'),
            'kubernetes_executor': 'True',
        },
        annotations={
            'dag_id': dag_id,
            'task_id': task_id,
            'execution_date': date.isoformat(),
            'try_number': str(try_number),
        },
        cmds=command,
        name=pod_id).gen_pod()

    # Reconcile the pods starting with the first chronologically,
    # Pod from the pod_template_file -> Pod from executor_config arg -> Pod from the K8s executor
    pod_list = [base_worker_pod, pod_override_object, dynamic_pod]

    return reduce(PodGenerator.reconcile_pods, pod_list)
def create_new_pod_for_operator(
        self, labels, launcher) -> Tuple[State, k8s.V1Pod, Optional[str]]:
    """
    Creates a new pod and monitors for duration of task

    :param labels: labels used to track pod
    :param launcher: pod launcher that will manage launching and monitoring pods
    :return:
    """
    self.log.debug(
        "Adding KubernetesPodOperator labels to pod before launch for task %s",
        self.task_id)

    # Merge Pod Identifying labels with labels passed to operator
    self.pod.metadata.labels.update(labels)
    # Add Airflow Version to the label
    # And a label to identify that pod is launched by KubernetesPodOperator
    self.pod.metadata.labels.update({
        'airflow_version': airflow_version.replace('+', '-'),
        'kubernetes_pod_operator': 'True',
    })

    self.log.debug("Starting pod:\n%s", yaml.safe_dump(self.pod.to_dict()))
    final_state = None
    try:
        launcher.start_pod(self.pod, startup_timeout=self.startup_timeout_seconds)
        final_state, remote_pod, result = launcher.monitor_pod(pod=self.pod,
                                                               get_logs=self.get_logs)
    except AirflowException:
        if self.log_events_on_failure:
            for event in launcher.read_pod_events(self.pod).items:
                self.log.error("Pod Event: %s - %s", event.reason, event.message)
        raise
    finally:
        if self.is_delete_operator_pod:
            self.log.debug("Deleting pod for task %s", self.task_id)
            launcher.delete_pod(self.pod)
        elif final_state != State.SUCCESS:
            self.patch_already_checked(self.pod)
    return final_state, remote_pod, result
def __init__(
    self,
    *,
    jar: str,
    job_name: str = "{{task.task_id}}",
    dataflow_default_options: Optional[dict] = None,
    options: Optional[dict] = None,
    project_id: Optional[str] = None,
    location: str = DEFAULT_DATAFLOW_LOCATION,
    gcp_conn_id: str = "google_cloud_default",
    delegate_to: Optional[str] = None,
    poll_sleep: int = 10,
    job_class: Optional[str] = None,
    check_if_running: CheckJobRunning = CheckJobRunning.WaitForRun,
    multiple_jobs: Optional[bool] = None,
    cancel_timeout: Optional[int] = 10 * 60,
    wait_until_finished: Optional[bool] = None,
    **kwargs,
) -> None:
    super().__init__(**kwargs)

    dataflow_default_options = dataflow_default_options or {}
    options = options or {}
    options.setdefault("labels", {}).update({
        "airflow-version": "v" + version.replace(".", "-").replace("+", "-")
    })
    self.project_id = project_id
    self.location = location
    self.gcp_conn_id = gcp_conn_id
    self.delegate_to = delegate_to
    self.jar = jar
    self.multiple_jobs = multiple_jobs
    self.job_name = job_name
    self.dataflow_default_options = dataflow_default_options
    self.options = options
    self.poll_sleep = poll_sleep
    self.job_class = job_class
    self.check_if_running = check_if_running
    self.cancel_timeout = cancel_timeout
    self.wait_until_finished = wait_until_finished
    self.job_id = None
    self.hook = None
def setUp(self):
    self.maxDiff = None  # pylint: disable=invalid-name
    self.api_client = ApiClient()
    self.expected_pod = {
        'apiVersion': 'v1',
        'kind': 'Pod',
        'metadata': {
            'namespace': 'default',
            'name': ANY,
            'annotations': {},
            'labels': {
                'foo': 'bar',
                'kubernetes_pod_operator': 'True',
                'airflow_version': airflow_version.replace('+', '-')
            }
        },
        'spec': {
            'affinity': {},
            'containers': [{
                'image': 'ubuntu:16.04',
                'args': ["echo 10"],
                'command': ["bash", "-cx"],
                'env': [],
                'imagePullPolicy': 'IfNotPresent',
                'envFrom': [],
                'name': 'base',
                'ports': [],
                'resources': {
                    'limits': {'cpu': None, 'memory': None, 'nvidia.com/gpu': None},
                    'requests': {'cpu': None, 'memory': None}
                },
                'volumeMounts': [],
            }],
            'hostNetwork': False,
            'imagePullSecrets': [],
            'nodeSelector': {},
            'restartPolicy': 'Never',
            'securityContext': {},
            'serviceAccountName': 'default',
            'tolerations': [],
            'volumes': [],
        }
    }
def __init__(
    self,
    project_id: str,
    task_id: str,
    cluster_name: str,
    job_type: str,
    properties: Optional[Dict[str, str]] = None,
) -> None:
    name = task_id + "_" + str(uuid.uuid4())[:8]
    self.job_type = job_type
    self.job = {
        "job": {
            "reference": {"project_id": project_id, "job_id": name},
            "placement": {"cluster_name": cluster_name},
            "labels": {'airflow-version': 'v' + airflow_version.replace('.', '-').replace('+', '-')},
            job_type: {},
        }
    }  # type: Dict[str, Any]
    if properties is not None:
        self.job["job"][job_type]["properties"] = properties
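# Hedged illustration (assumption: the __init__ above belongs to Dataproc's
# job-builder helper; project, cluster, job type, properties, and the Airflow
# version shown are placeholder values). For those inputs the constructor
# yields a payload shaped like this, with a random 8-character uuid suffix
# appended to the job id:
EXAMPLE_JOB_SHAPE = {
    "job": {
        "reference": {"project_id": "my-project", "job_id": "wordcount_1a2b3c4d"},
        "placement": {"cluster_name": "cluster-1"},
        "labels": {"airflow-version": "v2-0-0"},  # derived from the installed Airflow version
        "pyspark_job": {"properties": {"spark.executor.cores": "2"}},
    }
}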
def __init__(  # pylint: disable=too-many-arguments
    self,
    *,
    py_file: str,
    job_name: str = '{{task.task_id}}',
    dataflow_default_options: Optional[dict] = None,
    options: Optional[dict] = None,
    py_interpreter: str = "python3",
    py_options: Optional[List[str]] = None,
    py_requirements: Optional[List[str]] = None,
    py_system_site_packages: bool = False,
    project_id: Optional[str] = None,
    location: str = DEFAULT_DATAFLOW_LOCATION,
    gcp_conn_id: str = 'google_cloud_default',
    delegate_to: Optional[str] = None,
    poll_sleep: int = 10,
    drain_pipeline: bool = False,
    **kwargs,
) -> None:
    super().__init__(**kwargs)

    self.py_file = py_file
    self.job_name = job_name
    self.py_options = py_options or []
    self.dataflow_default_options = dataflow_default_options or {}
    self.options = options or {}
    self.options.setdefault('labels', {}).update(
        {'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')}
    )
    self.py_interpreter = py_interpreter
    self.py_requirements = py_requirements
    self.py_system_site_packages = py_system_site_packages
    self.project_id = project_id
    self.location = location
    self.gcp_conn_id = gcp_conn_id
    self.delegate_to = delegate_to
    self.poll_sleep = poll_sleep
    self.drain_pipeline = drain_pipeline
    self.job_id = None
    self.hook = None
def test_deploy_execute(self, mock_hook):
    mock_hook.return_value.get_function.side_effect = mock.Mock(
        side_effect=HttpError(resp=MOCK_RESP_404, content=b'not found'))
    mock_hook.return_value.create_new_function.return_value = True
    op = GcfFunctionDeployOperator(project_id=GCP_PROJECT_ID,
                                   location=GCP_LOCATION,
                                   body=deepcopy(VALID_BODY),
                                   task_id="id")
    op.execute(None)
    mock_hook.assert_called_once_with(api_version='v1',
                                      gcp_conn_id='google_cloud_default')
    mock_hook.return_value.get_function.assert_called_once_with(
        'projects/test_project_id/locations/test_region/functions/helloWorld'
    )
    expected_body = deepcopy(VALID_BODY)
    expected_body['labels'] = {
        'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')
    }
    mock_hook.return_value.create_new_function.assert_called_once_with(
        'projects/test_project_id/locations/test_region', expected_body)
def test_update_function_if_exists(self, mock_hook):
    mock_hook.return_value.get_function.return_value = True
    mock_hook.return_value.update_function.return_value = True
    op = GcfFunctionDeployOperator(project_id=GCP_PROJECT_ID,
                                   location=GCP_LOCATION,
                                   body=deepcopy(VALID_BODY),
                                   task_id="id")
    op.execute(None)
    mock_hook.assert_called_once_with(api_version='v1',
                                      gcp_conn_id='google_cloud_default')
    mock_hook.return_value.get_function.assert_called_once_with(
        'projects/test_project_id/locations/test_region/functions/helloWorld'
    )
    expected_body = deepcopy(VALID_BODY)
    expected_body['labels'] = {
        'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')
    }
    mock_hook.return_value.update_function.assert_called_once_with(
        'projects/test_project_id/locations/test_region/functions/helloWorld',
        expected_body, expected_body.keys())
    mock_hook.return_value.create_new_function.assert_not_called()
def __init__(self, project_id, task_id, cluster_name, job_type, properties):
    name = task_id + "_" + str(uuid.uuid4())[:8]
    self.job_type = job_type
    self.job = {
        "job": {
            "reference": {
                "projectId": project_id,
                "jobId": name,
            },
            "placement": {
                "clusterName": cluster_name
            },
            "labels": {
                'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')
            },
            job_type: {}
        }
    }
    if properties is not None:
        self.job["job"][job_type]["properties"] = properties
def __init__(
    self,
    *,
    jar: str,
    job_name: str = '{{task.task_id}}',
    dataflow_default_options: Optional[dict] = None,
    options: Optional[dict] = None,
    project_id: Optional[str] = None,
    location: str = DEFAULT_DATAFLOW_LOCATION,
    gcp_conn_id: str = 'google_cloud_default',
    delegate_to: Optional[str] = None,
    poll_sleep: int = 10,
    job_class: Optional[str] = None,
    check_if_running: CheckJobRunning = CheckJobRunning.WaitForRun,
    multiple_jobs: Optional[bool] = None,
    **kwargs,
) -> None:
    super().__init__(**kwargs)

    dataflow_default_options = dataflow_default_options or {}
    options = options or {}
    options.setdefault('labels', {}).update({
        'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')
    })
    self.project_id = project_id
    self.location = location
    self.gcp_conn_id = gcp_conn_id
    self.delegate_to = delegate_to
    self.jar = jar
    self.multiple_jobs = multiple_jobs
    self.job_name = job_name
    self.dataflow_default_options = dataflow_default_options
    self.options = options
    self.poll_sleep = poll_sleep
    self.job_class = job_class
    self.check_if_running = check_if_running
    self.job_id = None
    self.hook = None
def construct_pod(
        dag_id: str,
        task_id: str,
        pod_id: str,
        try_number: int,
        date: str,
        command: List[str],
        kube_executor_config: Optional[k8s.V1Pod],
        worker_config: k8s.V1Pod,
        namespace: str,
        worker_uuid: str
) -> k8s.V1Pod:
    """
    Construct a pod by gathering and consolidating the configuration from 3 places:
        - airflow.cfg
        - executor_config
        - dynamic arguments
    """
    dynamic_pod = PodGenerator(
        namespace=namespace,
        labels={
            'airflow-worker': worker_uuid,
            'dag_id': dag_id,
            'task_id': task_id,
            'execution_date': date,
            'try_number': str(try_number),
            'airflow_version': airflow_version.replace('+', '-'),
            'kubernetes_executor': 'True',
        },
        cmds=command,
        name=pod_id
    ).gen_pod()

    # Reconcile the pods starting with the first chronologically,
    # Pod from the airflow.cfg -> Pod from executor_config arg -> Pod from the K8s executor
    pod_list = [worker_config, kube_executor_config, dynamic_pod]

    return reduce(PodGenerator.reconcile_pods, pod_list)
def construct_pod(dag_id: str,
                  task_id: str,
                  pod_id: str,
                  try_number: int,
                  date: str,
                  command: List[str],
                  kube_executor_config: Optional[k8s.V1Pod],
                  worker_config: k8s.V1Pod,
                  namespace: str,
                  worker_uuid: str) -> k8s.V1Pod:
    """
    Construct a pod by gathering and consolidating the configuration from 3 places:
        - airflow.cfg
        - executor_config
        - dynamic arguments
    """
    dynamic_pod = PodGenerator(
        namespace=namespace,
        labels={
            'airflow-worker': worker_uuid,
            'dag_id': dag_id,
            'task_id': task_id,
            'execution_date': date,
            'try_number': str(try_number),
            'airflow_version': airflow_version.replace('+', '-'),
            'kubernetes_executor': 'True',
        },
        cmds=command,
        name=pod_id).gen_pod()

    # Reconcile the pod generated by the Operator and the Pod
    # generated by the .cfg file
    pod_with_executor_config = PodGenerator.reconcile_pods(worker_config,
                                                           kube_executor_config)
    # Reconcile that pod with the dynamic fields.
    return PodGenerator.reconcile_pods(pod_with_executor_config, dynamic_pod)
def __init__(
        self,
        py_file,
        py_options=None,
        dataflow_default_options=None,
        options=None,
        gcp_conn_id='google_cloud_default',
        delegate_to=None,
        poll_sleep=10,
        *args,
        **kwargs):
    super(DataFlowPythonOperator, self).__init__(*args, **kwargs)

    self.py_file = py_file
    self.py_options = py_options or []
    self.dataflow_default_options = dataflow_default_options or {}
    self.options = options or {}
    self.options.setdefault('labels', {}).update(
        {'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')})
    self.gcp_conn_id = gcp_conn_id
    self.delegate_to = delegate_to
    self.poll_sleep = poll_sleep
def create_new_pod_for_operator(self, labels, launcher):
    """
    Creates a new pod and monitors for duration of task

    @param labels: labels used to track pod
    @param launcher: pod launcher that will manage launching and monitoring pods
    @return:
    """
    if not (self.full_pod_spec or self.pod_template_file):
        # Add Airflow Version to the label
        # And a label to identify that pod is launched by KubernetesPodOperator
        self.labels.update({
            'airflow_version': airflow_version.replace('+', '-'),
            'kubernetes_pod_operator': 'True',
        })
        self.labels.update(labels)
        self.pod.metadata.labels = self.labels
    self.log.debug("Starting pod:\n%s", yaml.safe_dump(self.pod.to_dict()))
    try:
        launcher.start_pod(self.pod, startup_timeout=self.startup_timeout_seconds)
        final_state, result = launcher.monitor_pod(pod=self.pod, get_logs=self.get_logs)
    except AirflowException as ex:
        if self.log_events_on_failure:
            for event in launcher.read_pod_events(self.pod).items:
                self.log.error("Pod Event: %s - %s", event.reason, event.message)
        raise AirflowException('Pod Launching failed: {error}'.format(error=ex))
    finally:
        if self.is_delete_operator_pod:
            launcher.delete_pod(self.pod)
    return final_state, self.pod, result
def make_pod(self, namespace, worker_uuid, pod_id, dag_id, task_id, execution_date,
             try_number, airflow_command) -> k8s.V1Pod:
    """Creates POD."""
    pod_generator = PodGenerator(
        namespace=namespace,
        name=pod_id,
        image=self.kube_config.kube_image,
        image_pull_policy=self.kube_config.kube_image_pull_policy,
        image_pull_secrets=self.kube_config.image_pull_secrets,
        labels={
            'airflow-worker': worker_uuid,
            'dag_id': dag_id,
            'task_id': task_id,
            'execution_date': execution_date,
            'try_number': str(try_number),
            'airflow_version': airflow_version.replace('+', '-'),
            'kubernetes_executor': 'True',
        },
        cmds=airflow_command,
        volumes=self._get_volumes(),
        volume_mounts=self._get_volume_mounts(),
        init_containers=self._get_init_containers(),
        annotations=self.kube_config.kube_annotations,
        affinity=self.kube_config.kube_affinity,
        tolerations=self.kube_config.kube_tolerations,
        envs=self._get_environment(),
        node_selectors=self.kube_config.kube_node_selectors,
        service_account_name=self.kube_config.worker_service_account_name,
    )
    pod = pod_generator.gen_pod()
    pod.spec.containers[0].env_from = pod.spec.containers[0].env_from or []
    pod.spec.containers[0].env_from.extend(self._get_env_from())
    pod.spec.security_context = self._get_security_context()

    return append_to_pod(pod, self._get_secrets())
def create_subscription(
    self,
    topic: str,
    project_id: str,
    subscription: Optional[str] = None,
    subscription_project_id: Optional[str] = None,
    ack_deadline_secs: int = 10,
    fail_if_exists: bool = False,
    push_config: Optional[Union[dict, PushConfig]] = None,
    retain_acked_messages: Optional[bool] = None,
    message_retention_duration: Optional[Union[dict, Duration]] = None,
    labels: Optional[Dict[str, str]] = None,
    enable_message_ordering: bool = False,
    expiration_policy: Optional[Union[dict, ExpirationPolicy]] = None,
    filter_: Optional[str] = None,
    dead_letter_policy: Optional[Union[dict, DeadLetterPolicy]] = None,
    retry_policy: Optional[Union[dict, RetryPolicy]] = None,
    retry: Optional[Retry] = None,
    timeout: Optional[float] = None,
    metadata: Optional[Sequence[Tuple[str, str]]] = None,
) -> str:
    """
    Creates a Pub/Sub subscription, if it does not already exist.

    :param topic: the Pub/Sub topic name that the subscription will be bound to;
        do not include the ``projects/{project}/topics/`` prefix.
    :type topic: str
    :param project_id: Optional, the Google Cloud project ID of the topic that the
        subscription will be bound to. If set to None or missing, the default
        project_id from the Google Cloud connection is used.
    :type project_id: str
    :param subscription: the Pub/Sub subscription name. If empty, a random
        name will be generated using the uuid module
    :type subscription: str
    :param subscription_project_id: the Google Cloud project ID where the subscription
        will be created. If unspecified, ``project_id`` will be used.
    :type subscription_project_id: str
    :param ack_deadline_secs: Number of seconds that a subscriber has to
        acknowledge each message pulled from the subscription
    :type ack_deadline_secs: int
    :param fail_if_exists: if set, raise an exception if the subscription already exists
    :type fail_if_exists: bool
    :param push_config: If push delivery is used with this subscription, this field is
        used to configure it. An empty ``pushConfig`` signifies that the subscriber will
        pull and ack messages using API methods.
    :type push_config: Union[Dict, google.cloud.pubsub_v1.types.PushConfig]
    :param retain_acked_messages: Indicates whether to retain acknowledged messages. If
        true, then messages are not expunged from the subscription's backlog, even if
        they are acknowledged, until they fall out of the ``message_retention_duration``
        window. This must be true if you would like to Seek to a timestamp.
    :type retain_acked_messages: bool
    :param message_retention_duration: How long to retain unacknowledged messages in the
        subscription's backlog, from the moment a message is published. If
        ``retain_acked_messages`` is true, then this also configures the retention of
        acknowledged messages, and thus configures how far back in time a ``Seek`` can be
        done. Defaults to 7 days. Cannot be more than 7 days or less than 10 minutes.
    :type message_retention_duration: Union[Dict, google.cloud.pubsub_v1.types.Duration]
    :param labels: Client-assigned labels; see
        https://cloud.google.com/pubsub/docs/labels
    :type labels: Dict[str, str]
    :param enable_message_ordering: If true, messages published with the same
        ordering_key in PubsubMessage will be delivered to the subscribers in the order
        in which they are received by the Pub/Sub system. Otherwise, they may be
        delivered in any order.
    :type enable_message_ordering: bool
    :param expiration_policy: A policy that specifies the conditions for this
        subscription's expiration. A subscription is considered active as long as any
        connected subscriber is successfully consuming messages from the subscription or
        is issuing operations on the subscription. If expiration_policy is not set, a
        default policy with ttl of 31 days will be used. The minimum allowed value for
        expiration_policy.ttl is 1 day.
    :type expiration_policy: Union[Dict, google.cloud.pubsub_v1.types.ExpirationPolicy]
    :param filter_: An expression written in the Cloud Pub/Sub filter language. If
        non-empty, then only PubsubMessages whose attributes field matches the filter are
        delivered on this subscription. If empty, then no messages are filtered out.
    :type filter_: str
    :param dead_letter_policy: A policy that specifies the conditions for dead lettering
        messages in this subscription. If dead_letter_policy is not set, dead lettering
        is disabled.
    :type dead_letter_policy: Union[Dict, google.cloud.pubsub_v1.types.DeadLetterPolicy]
    :param retry_policy: A policy that specifies how Pub/Sub retries message delivery for
        this subscription. If not set, the default retry policy is applied. This
        generally implies that messages will be retried as soon as possible for healthy
        subscribers. RetryPolicy will be triggered on NACKs or acknowledgement deadline
        exceeded events for a given message.
    :type retry_policy: Union[Dict, google.cloud.pubsub_v1.types.RetryPolicy]
    :param retry: (Optional) A retry object used to retry requests.
        If None is specified, requests will not be retried.
    :type retry: google.api_core.retry.Retry
    :param timeout: (Optional) The amount of time, in seconds, to wait for the request
        to complete. Note that if retry is specified, the timeout applies to each
        individual attempt.
    :type timeout: float
    :param metadata: (Optional) Additional metadata that is provided to the method.
    :type metadata: Sequence[Tuple[str, str]]
    :return: subscription name which will be the system-generated value if
        the ``subscription`` parameter is not supplied
    :rtype: str
    """
    subscriber = self.subscriber_client

    if not subscription:
        subscription = 'sub-{}'.format(uuid4())
    if not subscription_project_id:
        subscription_project_id = project_id

    # Add airflow-version label to the subscription
    labels = labels or {}
    labels['airflow-version'] = 'v' + version.replace('.', '-').replace('+', '-')

    # pylint: disable=no-member
    subscription_path = SubscriberClient.subscription_path(subscription_project_id, subscription)
    topic_path = SubscriberClient.topic_path(project_id, topic)

    self.log.info("Creating subscription (path) %s for topic (path) %s",
                  subscription_path, topic_path)
    try:
        subscriber.create_subscription(
            name=subscription_path,
            topic=topic_path,
            push_config=push_config,
            ack_deadline_seconds=ack_deadline_secs,
            retain_acked_messages=retain_acked_messages,
            message_retention_duration=message_retention_duration,
            labels=labels,
            enable_message_ordering=enable_message_ordering,
            expiration_policy=expiration_policy,
            filter_=filter_,
            dead_letter_policy=dead_letter_policy,
            retry_policy=retry_policy,
            retry=retry,
            timeout=timeout,
            metadata=metadata,
        )
    except AlreadyExists:
        self.log.warning('Subscription already exists: %s', subscription_path)
        if fail_if_exists:
            raise PubSubException('Subscription already exists: {}'.format(subscription_path))
    except GoogleAPICallError as e:
        raise PubSubException('Error creating subscription {}'.format(subscription_path), e)

    self.log.info("Created subscription (path) %s for topic (path) %s",
                  subscription_path, topic_path)
    return subscription
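# Hedged usage sketch (assumption: the method above lives on the provider's
# PubSubHook; the project and topic names are placeholders). With no explicit
# `subscription`, a random 'sub-<uuid>' name is generated and returned, and the
# hook injects the 'airflow-version' label before calling the API.
from airflow.providers.google.cloud.hooks.pubsub import PubSubHook

hook = PubSubHook(gcp_conn_id='google_cloud_default')
subscription_name = hook.create_subscription(
    project_id='my-project',      # placeholder: project owning the topic
    topic='my-topic',             # placeholder: bare topic name, no path prefix
    ack_deadline_secs=30,
)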
def create_topic(
    self,
    topic: str,
    project_id: str,
    fail_if_exists: bool = False,
    labels: Optional[Dict[str, str]] = None,
    message_storage_policy: Union[Dict, MessageStoragePolicy] = None,
    kms_key_name: Optional[str] = None,
    retry: Optional[Retry] = None,
    timeout: Optional[float] = None,
    metadata: Optional[Sequence[Tuple[str, str]]] = None,
) -> None:
    """
    Creates a Pub/Sub topic, if it does not already exist.

    :param topic: the Pub/Sub topic name to create; do not
        include the ``projects/{project}/topics/`` prefix.
    :type topic: str
    :param project_id: Optional, the Google Cloud project ID in which to create the
        topic. If set to None or missing, the default project_id from the Google Cloud
        connection is used.
    :type project_id: str
    :param fail_if_exists: if set, raise an exception if the topic already exists
    :type fail_if_exists: bool
    :param labels: Client-assigned labels; see
        https://cloud.google.com/pubsub/docs/labels
    :type labels: Dict[str, str]
    :param message_storage_policy: Policy constraining the set of Google Cloud regions
        where messages published to the topic may be stored. If not present, then no
        constraints are in effect.
    :type message_storage_policy:
        Union[Dict, google.cloud.pubsub_v1.types.MessageStoragePolicy]
    :param kms_key_name: The resource name of the Cloud KMS CryptoKey to be used to
        protect access to messages published on this topic. The expected format is
        ``projects/*/locations/*/keyRings/*/cryptoKeys/*``.
    :type kms_key_name: str
    :param retry: (Optional) A retry object used to retry requests.
        If None is specified, requests will not be retried.
    :type retry: google.api_core.retry.Retry
    :param timeout: (Optional) The amount of time, in seconds, to wait for the request
        to complete. Note that if retry is specified, the timeout applies to each
        individual attempt.
    :type timeout: float
    :param metadata: (Optional) Additional metadata that is provided to the method.
    :type metadata: Sequence[Tuple[str, str]]
    """
    publisher = self.get_conn()
    topic_path = PublisherClient.topic_path(project_id, topic)  # pylint: disable=no-member

    # Add airflow-version label to the topic
    labels = labels or {}
    labels['airflow-version'] = 'v' + version.replace('.', '-').replace('+', '-')

    self.log.info("Creating topic (path) %s", topic_path)
    try:
        # pylint: disable=no-member
        publisher.create_topic(
            name=topic_path,
            labels=labels,
            message_storage_policy=message_storage_policy,
            kms_key_name=kms_key_name,
            retry=retry,
            timeout=timeout,
            metadata=metadata,
        )
    except AlreadyExists:
        self.log.warning('Topic already exists: %s', topic)
        if fail_if_exists:
            raise PubSubException('Topic already exists: {}'.format(topic))
    except GoogleAPICallError as e:
        raise PubSubException('Error creating topic {}'.format(topic), e)

    self.log.info("Created topic (path) %s", topic_path)
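# Hedged companion sketch for create_topic (same PubSubHook assumption as the
# create_subscription example above; project, topic, and label values are placeholders).
from airflow.providers.google.cloud.hooks.pubsub import PubSubHook

hook = PubSubHook(gcp_conn_id='google_cloud_default')
hook.create_topic(
    project_id='my-project',      # placeholder project
    topic='my-topic',             # placeholder topic name
    labels={'team': 'data'},      # 'airflow-version' is appended by the hook
    fail_if_exists=False,         # log-and-continue instead of raising on AlreadyExists
)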
def create_new_pod_for_operator(
        self, labels, launcher) -> Tuple[State, k8s.V1Pod, Optional[str]]:
    """
    Creates a new pod and monitors for duration of task

    :param labels: labels used to track pod
    :param launcher: pod launcher that will manage launching and monitoring pods
    :return:
    """
    if not (self.full_pod_spec or self.pod_template_file):
        # Add Airflow Version to the label
        # And a label to identify that pod is launched by KubernetesPodOperator
        self.labels.update({
            'airflow_version': airflow_version.replace('+', '-'),
            'kubernetes_pod_operator': 'True',
        })
        self.labels.update(labels)
    pod = pod_generator.PodGenerator(
        image=self.image,
        namespace=self.namespace,
        cmds=self.cmds,
        args=self.arguments,
        labels=self.labels,
        name=self.name,
        envs=self.env_vars,
        extract_xcom=self.do_xcom_push,
        image_pull_policy=self.image_pull_policy,
        node_selectors=self.node_selectors,
        annotations=self.annotations,
        affinity=self.affinity,
        image_pull_secrets=self.image_pull_secrets,
        service_account_name=self.service_account_name,
        hostnetwork=self.hostnetwork,
        tolerations=self.tolerations,
        configmaps=self.configmaps,
        security_context=self.security_context,
        dnspolicy=self.dnspolicy,
        schedulername=self.schedulername,
        init_containers=self.init_containers,
        restart_policy='Never',
        priority_class_name=self.priority_class_name,
        pod_template_file=self.pod_template_file,
        pod=self.full_pod_spec,
    ).gen_pod()

    # noinspection PyTypeChecker
    pod = append_to_pod(
        pod,
        self.pod_runtime_info_envs + self.ports +  # type: ignore
        self.resources + self.secrets +  # type: ignore
        self.volumes +  # type: ignore
        self.volume_mounts  # type: ignore
    )

    self.pod = pod
    self.log.debug("Starting pod:\n%s", yaml.safe_dump(pod.to_dict()))
    try:
        launcher.start_pod(pod, startup_timeout=self.startup_timeout_seconds)
        final_state, result = launcher.monitor_pod(pod=pod, get_logs=self.get_logs)
    except AirflowException:
        if self.log_events_on_failure:
            for event in launcher.read_pod_events(pod).items:
                self.log.error("Pod Event: %s - %s", event.reason, event.message)
        raise
    finally:
        if self.is_delete_operator_pod:
            launcher.delete_pod(pod)
    return final_state, pod, result
def create_bucket(
    self,
    bucket_name: str,
    resource: Optional[dict] = None,
    storage_class: str = 'MULTI_REGIONAL',
    location: str = 'US',
    project_id: Optional[str] = None,
    labels: Optional[dict] = None,
) -> str:
    """
    Creates a new bucket. Google Cloud Storage uses a flat namespace, so
    you can't create a bucket with a name that is already in use.

    .. seealso::
        For more information, see Bucket Naming Guidelines:
        https://cloud.google.com/storage/docs/bucketnaming.html#requirements

    :param bucket_name: The name of the bucket.
    :type bucket_name: str
    :param resource: An optional dict with parameters for creating the bucket.
        For information on available parameters, see Cloud Storage API doc:
        https://cloud.google.com/storage/docs/json_api/v1/buckets/insert
    :type resource: dict
    :param storage_class: This defines how objects in the bucket are stored
        and determines the SLA and the cost of storage. Values include

        - ``MULTI_REGIONAL``
        - ``REGIONAL``
        - ``STANDARD``
        - ``NEARLINE``
        - ``COLDLINE``.

        If this value is not specified when the bucket is
        created, it will default to STANDARD.
    :type storage_class: str
    :param location: The location of the bucket.
        Object data for objects in the bucket resides in physical storage
        within this region. Defaults to US.

        .. seealso:: https://developers.google.com/storage/docs/bucket-locations

    :type location: str
    :param project_id: The ID of the Google Cloud Project.
    :type project_id: str
    :param labels: User-provided labels, in key/value pairs.
    :type labels: dict
    :return: If successful, it returns the ``id`` of the bucket.
    """
    self.log.info('Creating Bucket: %s; Location: %s; Storage Class: %s',
                  bucket_name, location, storage_class)

    # Add airflow-version label to the bucket
    labels = labels or {}
    labels['airflow-version'] = 'v' + version.replace('.', '-').replace('+', '-')

    client = self.get_conn()
    bucket = client.bucket(bucket_name=bucket_name)
    bucket_resource = resource or {}

    for item in bucket_resource:
        if item != "name":
            bucket._patch_property(  # pylint: disable=protected-access
                name=item, value=resource[item]  # type: ignore[index]
            )

    bucket.storage_class = storage_class
    bucket.labels = labels
    bucket.create(project=project_id, location=location)
    return bucket.id
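# Hedged usage sketch (assumption: the method above belongs to the provider's
# GCSHook; the bucket and project names are placeholders and must be globally
# unique / valid in your environment).
from airflow.providers.google.cloud.hooks.gcs import GCSHook

hook = GCSHook(gcp_conn_id='google_cloud_default')
bucket_id = hook.create_bucket(
    bucket_name='my-unique-bucket-name',   # placeholder, must be globally unique
    storage_class='REGIONAL',
    location='EU',
    project_id='my-project',               # placeholder project
    labels={'env': 'prod'},                # 'airflow-version' is added automatically
)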
PY_OPTIONS = ['-m']
DEFAULT_OPTIONS_PYTHON = DEFAULT_OPTIONS_JAVA = {
    'project': 'test',
    'stagingLocation': 'gs://test/staging',
}
DEFAULT_OPTIONS_TEMPLATE = {
    'project': 'test',
    'stagingLocation': 'gs://test/staging',
    'tempLocation': 'gs://test/temp',
    'zone': 'us-central1-f'
}
ADDITIONAL_OPTIONS = {
    'output': 'gs://test/output',
    'labels': {'foo': 'bar'}
}
TEST_VERSION = 'v{}'.format(version.replace('.', '-').replace('+', '-'))
EXPECTED_ADDITIONAL_OPTIONS = {
    'output': 'gs://test/output',
    'labels': {'foo': 'bar', 'airflow-version': TEST_VERSION}
}
POLL_SLEEP = 30
GCS_HOOK_STRING = 'airflow.contrib.operators.dataflow_operator.{}'


class DataFlowPythonOperatorTest(unittest.TestCase):

    def setUp(self):
        self.dataflow = DataFlowPythonOperator(
            task_id=TASK_ID,
            py_file=PY_FILE,
            job_name=JOB_NAME,
def _build_cluster_data(self):
    zone_uri = \
        'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format(
            self.project_id, self.zone
        )
    master_type_uri = \
        "https://www.googleapis.com/compute/v1/projects/{}/zones/{}/machineTypes/{}".format(
            self.project_id, self.zone, self.master_machine_type
        )
    worker_type_uri = \
        "https://www.googleapis.com/compute/v1/projects/{}/zones/{}/machineTypes/{}".format(
            self.project_id, self.zone, self.worker_machine_type
        )
    cluster_data = {
        'projectId': self.project_id,
        'clusterName': self.cluster_name,
        'config': {
            'gceClusterConfig': {
                'zoneUri': zone_uri
            },
            'masterConfig': {
                'numInstances': 1,
                'machineTypeUri': master_type_uri,
                'diskConfig': {
                    'bootDiskSizeGb': self.master_disk_size
                }
            },
            'workerConfig': {
                'numInstances': self.num_workers,
                'machineTypeUri': worker_type_uri,
                'diskConfig': {
                    'bootDiskSizeGb': self.worker_disk_size
                }
            },
            'secondaryWorkerConfig': {},
            'softwareConfig': {}
        }
    }
    if self.num_preemptible_workers > 0:
        cluster_data['config']['secondaryWorkerConfig'] = {
            'numInstances': self.num_preemptible_workers,
            'machineTypeUri': worker_type_uri,
            'diskConfig': {
                'bootDiskSizeGb': self.worker_disk_size
            },
            'isPreemptible': True
        }

    cluster_data['labels'] = self.labels if self.labels else {}
    # Dataproc labels must conform to the following regex:
    # [a-z]([-a-z0-9]*[a-z0-9])? (current airflow version string follows
    # semantic versioning spec: x.y.z).
    cluster_data['labels'].update(
        {'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')})
    if self.storage_bucket:
        cluster_data['config']['configBucket'] = self.storage_bucket
    if self.metadata:
        cluster_data['config']['gceClusterConfig']['metadata'] = self.metadata
    if self.network_uri:
        cluster_data['config']['gceClusterConfig']['networkUri'] = self.network_uri
    if self.subnetwork_uri:
        cluster_data['config']['gceClusterConfig']['subnetworkUri'] = self.subnetwork_uri
    if self.tags:
        cluster_data['config']['gceClusterConfig']['tags'] = self.tags
    if self.image_version:
        cluster_data['config']['softwareConfig']['imageVersion'] = self.image_version
    if self.properties:
        cluster_data['config']['softwareConfig']['properties'] = self.properties
    if self.init_actions_uris:
        init_actions_dict = [
            {'executableFile': uri} for uri in self.init_actions_uris
        ]
        cluster_data['config']['initializationActions'] = init_actions_dict
    if self.service_account:
        cluster_data['config']['gceClusterConfig']['serviceAccount'] = \
            self.service_account
    if self.service_account_scopes:
        cluster_data['config']['gceClusterConfig']['serviceAccountScopes'] = \
            self.service_account_scopes
    return cluster_data
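# Hedged check mirroring the tests in this collection: the airflow-version
# label value produced above must satisfy the Dataproc label pattern cited in
# the comment. The regex and transformation come from the source; only this
# standalone arrangement is new.
import re

from airflow.version import version

label_value = 'v' + version.replace('.', '-').replace('+', '-')
assert re.match(r'[a-z]([-a-z0-9]*[a-z0-9])?', label_value)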
def _set_airflow_version_label(self):
    if 'labels' not in self.body.keys():
        self.body['labels'] = {}
    self.body['labels'].update(
        {'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')})
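# Minimal standalone sketch (assumption: mirrors the method above outside its
# class) showing the effect on a hypothetical Cloud Functions deploy body for
# an Airflow version string of '2.0.0'.
body = {'name': 'projects/p/locations/l/functions/helloWorld'}  # hypothetical body
version = '2.0.0'  # stand-in for airflow.version.version

if 'labels' not in body:
    body['labels'] = {}
body['labels'].update(
    {'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')})

assert body['labels'] == {'airflow-version': 'v2-0-0'}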