def test_build_cluster_data(self):
     for suffix, dataproc_operator in enumerate(self.dataproc_operators):
         cluster_data = dataproc_operator._build_cluster_data()
         self.assertEqual(cluster_data['clusterName'], CLUSTER_NAME)
         self.assertEqual(cluster_data['projectId'], PROJECT_ID)
         self.assertEqual(cluster_data['config']['softwareConfig'], {'imageVersion': IMAGE_VERSION})
         self.assertEqual(cluster_data['config']['configBucket'], STORAGE_BUCKET)
         self.assertEqual(cluster_data['config']['workerConfig']['numInstances'], NUM_WORKERS)
         self.assertEqual(cluster_data['config']['secondaryWorkerConfig']['numInstances'],
                          NUM_PREEMPTIBLE_WORKERS)
         self.assertEqual(cluster_data['config']['gceClusterConfig']['serviceAccountScopes'],
             SERVICE_ACCOUNT_SCOPES)
         self.assertEqual(cluster_data['config']['gceClusterConfig']['subnetworkUri'],
             SUBNETWORK_URI)
         self.assertEqual(cluster_data['config']['gceClusterConfig']['networkUri'],
             NETWORK_URI)
         self.assertEqual(cluster_data['config']['gceClusterConfig']['tags'],
             TAGS)
         # test whether the default airflow-version label has been properly
         # set by the dataproc operator.
         merged_labels = {}
         merged_labels.update(self.labels[suffix])
         merged_labels.update({'airflow-version': 'v' + version.replace('.', '-').replace('+','-')})
         self.assertTrue(re.match(r'[a-z]([-a-z0-9]*[a-z0-9])?',
                                  cluster_data['labels']['airflow-version']))
         self.assertEqual(cluster_data['labels'], merged_labels)
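
The 'v' + version.replace('.', '-').replace('+', '-') expression asserted above turns the Airflow version string into a value that satisfies GCP's label rules (lowercase letters, digits and hyphens only). A minimal, self-contained sketch of that conversion; the helper name is made up for illustration:

import re

def airflow_version_label(version_string):
    # Dots and plus signs are not allowed in GCP label values, so both become hyphens.
    return 'v' + version_string.replace('.', '-').replace('+', '-')

label = airflow_version_label('1.10.12+composer')
assert label == 'v1-10-12-composer'
# Same pattern the test above uses to check that the label is well formed.
assert re.match(r'[a-z]([-a-z0-9]*[a-z0-9])?$', label)
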
 def test_deploy_execute(self, mock_hook):
     mock_hook.return_value.get_function.side_effect = mock.Mock(
         side_effect=HttpError(resp=MOCK_RESP_404, content=b'not found'))
     mock_hook.return_value.create_new_function.return_value = True
     op = GcfFunctionDeployOperator(
         project_id=GCP_PROJECT_ID,
         location=GCP_LOCATION,
         body=deepcopy(VALID_BODY),
         task_id="id"
     )
     op.execute(None)
     mock_hook.assert_called_once_with(api_version='v1',
                                       gcp_conn_id='google_cloud_default')
     mock_hook.return_value.get_function.assert_called_once_with(
         'projects/test_project_id/locations/test_region/functions/helloWorld'
     )
     expected_body = deepcopy(VALID_BODY)
     expected_body['labels'] = {
         'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')
     }
     mock_hook.return_value.create_new_function.assert_called_once_with(
         project_id='test_project_id',
         location='test_region',
         body=expected_body
     )
    def __init__(
            self,
            py_file,
            job_name='{{task.task_id}}',
            py_options=None,
            dataflow_default_options=None,
            options=None,
            gcp_conn_id='google_cloud_default',
            delegate_to=None,
            poll_sleep=10,
            *args,
            **kwargs):

        super(DataFlowPythonOperator, self).__init__(*args, **kwargs)

        self.py_file = py_file
        self.job_name = job_name
        self.py_options = py_options or []
        self.dataflow_default_options = dataflow_default_options or {}
        self.options = options or {}
        self.options.setdefault('labels', {}).update(
            {'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')})
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.poll_sleep = poll_sleep
    def __init__(
            self,
            jar,
            dataflow_default_options=None,
            options=None,
            gcp_conn_id='google_cloud_default',
            delegate_to=None,
            poll_sleep=10,
            job_class=None,
            *args,
            **kwargs):
        """
        Create a new DataFlowJavaOperator. Note that both
        dataflow_default_options and options will be merged to specify pipeline
        execution parameters, and dataflow_default_options is expected to hold
        high-level options, for instance, project and zone information, which
        apply to all dataflow operators in the DAG.

        .. seealso::
            For more detail on job submission have a look at the reference:
            https://cloud.google.com/dataflow/pipelines/specifying-exec-params

        :param jar: The reference to a self-executing DataFlow jar.
        :type jar: string
        :param dataflow_default_options: Map of default job options.
        :type dataflow_default_options: dict
        :param options: Map of job specific options.
        :type options: dict
        :param gcp_conn_id: The connection ID to use connecting to Google Cloud
            Platform.
        :type gcp_conn_id: string
        :param delegate_to: The account to impersonate, if any.
            For this to work, the service account making the request must have
            domain-wide delegation enabled.
        :type delegate_to: string
        :param poll_sleep: The time in seconds to sleep between polling Google
            Cloud Platform for the dataflow job status while the job is in the
            JOB_STATE_RUNNING state.
        :type poll_sleep: int
        :param job_class: The name of the dataflow job class to be executed; it
            is often not the main class configured in the dataflow jar file.
        :type job_class: string
        """
        super(DataFlowJavaOperator, self).__init__(*args, **kwargs)

        dataflow_default_options = dataflow_default_options or {}
        options = options or {}
        options.setdefault('labels', {}).update(
            {'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')})
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.jar = jar
        self.dataflow_default_options = dataflow_default_options
        self.options = options
        self.poll_sleep = poll_sleep
        self.job_class = job_class
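
A usage sketch for the operator above, assuming the older contrib import path; the project, bucket and task names are placeholders. dataflow_default_options carries settings shared by all Dataflow tasks in the DAG, options the per-job parameters, and the constructor injects the airflow-version label by itself:

from airflow.contrib.operators.dataflow_operator import DataFlowJavaOperator  # assumed import path

wordcount = DataFlowJavaOperator(
    task_id='dataflow-wordcount',
    jar='gs://example-bucket/pipelines/wordcount-bundled.jar',
    dataflow_default_options={
        'project': 'example-project',
        'zone': 'europe-west1-d',
        'stagingLocation': 'gs://example-bucket/staging/',
    },
    options={
        'output': 'gs://example-bucket/output/',
    },
)
# After construction, wordcount.options also contains
# {'labels': {'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')}}.
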
    def __init__(
            self,
            py_file,
            py_options=None,
            dataflow_default_options=None,
            options=None,
            gcp_conn_id='google_cloud_default',
            delegate_to=None,
            poll_sleep=10,
            *args,
            **kwargs):
        """
        Create a new DataFlowPythonOperator. Note that both
        dataflow_default_options and options will be merged to specify pipeline
        execution parameters, and dataflow_default_options is expected to hold
        high-level options, for instance, project and zone information, which
        apply to all dataflow operators in the DAG.

        .. seealso::
            For more detail on job submission have a look at the reference:
            https://cloud.google.com/dataflow/pipelines/specifying-exec-params

        :param py_file: Reference to the python dataflow pipeline file.py, e.g.,
            /some/local/file/path/to/your/python/pipeline/file.
        :type py_file: string
        :param py_options: Additional python options.
        :type py_options: list of strings, e.g., ["-m", "-v"].
        :param dataflow_default_options: Map of default job options.
        :type dataflow_default_options: dict
        :param options: Map of job specific options.
        :type options: dict
        :param gcp_conn_id: The connection ID to use connecting to Google Cloud
            Platform.
        :type gcp_conn_id: string
        :param delegate_to: The account to impersonate, if any.
            For this to work, the service account making the request must have
            domain-wide delegation enabled.
        :type delegate_to: string
        :param poll_sleep: The time in seconds to sleep between polling Google
            Cloud Platform for the dataflow job status while the job is in the
            JOB_STATE_RUNNING state.
        :type poll_sleep: int
        """
        super(DataFlowPythonOperator, self).__init__(*args, **kwargs)

        self.py_file = py_file
        self.py_options = py_options or []
        self.dataflow_default_options = dataflow_default_options or {}
        self.options = options or {}
        self.options.setdefault('labels', {}).update(
            {'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')})
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.poll_sleep = poll_sleep
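
The setdefault('labels', {}).update(...) call above keeps any labels the caller passes and only adds the airflow-version entry. A short sketch, with the same assumed contrib import path and a placeholder file path and label:

from airflow.contrib.operators.dataflow_operator import DataFlowPythonOperator  # assumed import path

task = DataFlowPythonOperator(
    task_id='dataflow-python-job',
    py_file='/home/airflow/dags/pipelines/wordcount.py',
    options={'labels': {'team': 'data-eng'}},
)
# task.options['labels'] now holds both the user label and the injected one:
# {'team': 'data-eng', 'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')}
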
Example #6
 def test_build_cluster_data(self):
     for suffix, dataproc_operator in enumerate(self.dataproc_operators):
         cluster_data = dataproc_operator._build_cluster_data()
         self.assertEqual(cluster_data['clusterName'], CLUSTER_NAME)
         self.assertEqual(cluster_data['projectId'], GCP_PROJECT_ID)
         self.assertEqual(cluster_data['config']['softwareConfig'],
                          {'imageVersion': IMAGE_VERSION})
         self.assertEqual(cluster_data['config']['configBucket'],
                          STORAGE_BUCKET)
         self.assertEqual(
             cluster_data['config']['workerConfig']['numInstances'],
             NUM_WORKERS)
         self.assertEqual(
             cluster_data['config']['secondaryWorkerConfig']
             ['numInstances'], NUM_PREEMPTIBLE_WORKERS)
         self.assertEqual(
             cluster_data['config']['gceClusterConfig']
             ['serviceAccountScopes'], SERVICE_ACCOUNT_SCOPES)
         self.assertEqual(
             cluster_data['config']['gceClusterConfig']['internalIpOnly'],
             INTERNAL_IP_ONLY)
         self.assertEqual(
             cluster_data['config']['gceClusterConfig']['subnetworkUri'],
             SUBNETWORK_URI)
         self.assertEqual(
             cluster_data['config']['gceClusterConfig']['networkUri'],
             NETWORK_URI)
         self.assertEqual(
             cluster_data['config']['gceClusterConfig']['tags'], TAGS)
         self.assertEqual(
             cluster_data['config']['lifecycleConfig']['idleDeleteTtl'],
             "321s")
         self.assertEqual(
             cluster_data['config']['lifecycleConfig']['autoDeleteTime'],
             "2017-06-07T00:00:00.000000Z")
         self.assertEqual(
             cluster_data['config']['autoscalingConfig']['policyUri'],
             SCALING_POLICY)
         # test whether the default airflow-version label has been properly
         # set by the dataproc operator.
         merged_labels = {}
         merged_labels.update(self.labels[suffix])
         merged_labels.update({
             'airflow-version':
             'v' + version.replace('.', '-').replace('+', '-')
         })
         self.assertTrue(
             re.match(r'[a-z]([-a-z0-9]*[a-z0-9])?',
                      cluster_data['labels']['airflow-version']))
         self.assertEqual(cluster_data['labels'], merged_labels)
Example #7
 def __get_dataflow_pipeline_options(
         self,
         pipeline_options: dict,
         job_name: str,
         job_name_key: Optional[str] = None) -> dict:
     pipeline_options = copy.deepcopy(pipeline_options)
     if job_name_key is not None:
         pipeline_options[job_name_key] = job_name
     pipeline_options["project"] = self.dataflow_config.project_id
     pipeline_options["region"] = self.dataflow_config.location
     pipeline_options.setdefault("labels", {}).update({
         "airflow-version":
         "v" + version.replace(".", "-").replace("+", "-")
     })
     return pipeline_options
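
A standalone sketch of what this helper produces, with the values that normally come from self.dataflow_config passed in explicitly; every name and value below is illustrative:

import copy

def prepare_pipeline_options(pipeline_options, job_name, project_id, region,
                             airflow_version, job_name_key='job_name'):
    # Mirrors the option preparation above: copy, then inject job name, project, region and label.
    options = copy.deepcopy(pipeline_options)
    options[job_name_key] = job_name
    options['project'] = project_id
    options['region'] = region
    options.setdefault('labels', {}).update(
        {'airflow-version': 'v' + airflow_version.replace('.', '-').replace('+', '-')})
    return options

prepare_pipeline_options({'tempLocation': 'gs://example-bucket/tmp/'},
                         'example-job', 'example-project', 'us-central1', '2.0.1')
# -> {'tempLocation': 'gs://example-bucket/tmp/', 'job_name': 'example-job',
#     'project': 'example-project', 'region': 'us-central1',
#     'labels': {'airflow-version': 'v2-0-1'}}
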
 def test_empty_project_id_is_ok(self, mock_hook):
     mock_hook.return_value.get_function.side_effect = \
         HttpError(resp=MOCK_RESP_404, content=b'not found')
     operator = CloudFunctionDeployFunctionOperator(
         location="test_region", body=deepcopy(VALID_BODY), task_id="id")
     operator.execute(None)
     mock_hook.assert_called_once_with(api_version='v1',
                                       gcp_conn_id='google_cloud_default')
     new_body = deepcopy(VALID_BODY)
     new_body['labels'] = {
         'airflow-version':
         'v' + version.replace('.', '-').replace('+', '-')
     }
     mock_hook.return_value.create_new_function.assert_called_once_with(
         project_id=None, location="test_region", body=new_body)
Example #9
 def setUp(self):
     self.maxDiff = None  # pylint: disable=invalid-name
     self.api_client = ApiClient()
     self.expected_pod = {
         'apiVersion': 'v1',
         'kind': 'Pod',
         'metadata': {
             'namespace': 'default',
             'name': mock.ANY,
             'annotations': {},
             'labels': {
                 'foo': 'bar',
                 'kubernetes_pod_operator': 'True',
                 'airflow_version': airflow_version.replace('+', '-'),
                 'execution_date': '2016-01-01T0100000100-a2f50a31f',
                 'dag_id': 'dag',
                 'task_id': 'task',
                 'try_number': '1',
             },
         },
         'spec': {
             'affinity': {},
             'containers': [{
                 'image': 'ubuntu:16.04',
                 'imagePullPolicy': 'IfNotPresent',
                 'args': ["echo 10"],
                 'command': ["bash", "-cx"],
                 'env': [],
                 'envFrom': [],
                 'resources': {},
                 'name': 'base',
                 'ports': [],
                 'volumeMounts': [],
             }],
             'hostNetwork':
             False,
             'imagePullSecrets': [],
             'initContainers': [],
             'nodeSelector': {},
             'restartPolicy':
             'Never',
             'securityContext': {},
             'serviceAccountName':
             'default',
             'tolerations': [],
             'volumes': [],
         },
     }
Example #10
    def __init__(
        self,
        *,
        py_file: str,
        runner: str = "DirectRunner",
        default_pipeline_options: Optional[dict] = None,
        pipeline_options: Optional[dict] = None,
        py_interpreter: str = "python3",
        py_options: Optional[List[str]] = None,
        py_requirements: Optional[List[str]] = None,
        py_system_site_packages: bool = False,
        gcp_conn_id: str = "google_cloud_default",
        delegate_to: Optional[str] = None,
        dataflow_config: Optional[Union[DataflowConfiguration, dict]] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)

        self.py_file = py_file
        self.runner = runner
        self.py_options = py_options or []
        self.default_pipeline_options = default_pipeline_options or {}
        self.pipeline_options = pipeline_options or {}
        self.pipeline_options.setdefault("labels", {}).update({
            "airflow-version":
            "v" + version.replace(".", "-").replace("+", "-")
        })
        self.py_interpreter = py_interpreter
        self.py_requirements = py_requirements
        self.py_system_site_packages = py_system_site_packages
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.beam_hook: Optional[BeamHook] = None
        self.dataflow_hook: Optional[DataflowHook] = None
        self.dataflow_job_id: Optional[str] = None

        if dataflow_config is None:
            self.dataflow_config = DataflowConfiguration()
        elif isinstance(dataflow_config, dict):
            self.dataflow_config = DataflowConfiguration(**dataflow_config)
        else:
            self.dataflow_config = dataflow_config

        if self.dataflow_config and self.runner.lower(
        ) != BeamRunnerType.DataflowRunner.lower():
            self.log.warning(
                "dataflow_config is defined but runner is different than DataflowRunner (%s)",
                self.runner)
Example #11
    def execute(self, context):
        if self.labels is not None:
            self.labels.update(
                {'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')}
            )

        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to
        )

        hook.create_bucket(bucket_name=self.bucket_name,
                           storage_class=self.storage_class,
                           location=self.location,
                           project_id=self.project_id,
                           labels=self.labels)
    def execute(self, context):
        if self.labels is not None:
            self.labels.update({
                'airflow-version':
                'v' + version.replace('.', '-').replace('+', '-')
            })

        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)

        hook.create_bucket(bucket_name=self.bucket_name,
                           storage_class=self.storage_class,
                           location=self.location,
                           project_id=self.project_id,
                           labels=self.labels)
    def __init__(  # pylint: disable=too-many-arguments
        self,
        *,
        py_file: str,
        job_name: str = "{{task.task_id}}",
        dataflow_default_options: Optional[dict] = None,
        options: Optional[dict] = None,
        py_interpreter: str = "python3",
        py_options: Optional[List[str]] = None,
        py_requirements: Optional[List[str]] = None,
        py_system_site_packages: bool = False,
        project_id: Optional[str] = None,
        location: str = DEFAULT_DATAFLOW_LOCATION,
        gcp_conn_id: str = "google_cloud_default",
        delegate_to: Optional[str] = None,
        poll_sleep: int = 10,
        drain_pipeline: bool = False,
        cancel_timeout: Optional[int] = 10 * 60,
        wait_until_finished: Optional[bool] = None,
        **kwargs,
    ) -> None:

        super().__init__(**kwargs)

        self.py_file = py_file
        self.job_name = job_name
        self.py_options = py_options or []
        self.dataflow_default_options = dataflow_default_options or {}
        self.options = options or {}
        self.options.setdefault("labels", {}).update({
            "airflow-version":
            "v" + version.replace(".", "-").replace("+", "-")
        })
        self.py_interpreter = py_interpreter
        self.py_requirements = py_requirements
        self.py_system_site_packages = py_system_site_packages
        self.project_id = project_id
        self.location = location
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.poll_sleep = poll_sleep
        self.drain_pipeline = drain_pipeline
        self.cancel_timeout = cancel_timeout
        self.wait_until_finished = wait_until_finished
        self.job_id = None
        self.hook: Optional[DataflowHook] = None
Example #14
    def test_make_pod_assert_labels(self):
        # Tests the pod created has all the expected labels set
        self.kube_config.dags_folder = 'dags'

        worker_config = WorkerConfiguration(self.kube_config)
        pod = worker_config.make_pod("default", "sample-uuid", "test_pod_id", "test_dag_id",
                                     "test_task_id", "2019-11-21 11:08:22.920875", 1, "bash -c 'ls /'")
        expected_labels = {
            'airflow-worker': 'sample-uuid',
            'airflow_version': airflow_version.replace('+', '-'),
            'dag_id': 'test_dag_id',
            'execution_date': '2019-11-21 11:08:22.920875',
            'kubernetes_executor': 'True',
            'task_id': 'test_task_id',
            'try_number': '1'
        }
        self.assertEqual(pod.metadata.labels, expected_labels)
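
Note that the Kubernetes snippets only replace '+' in the version string: dots are legal in Kubernetes label values, unlike in GCP labels. A quick check against the label-value rule from the Kubernetes documentation (at most 63 characters, alphanumeric at both ends, with '-', '_' and '.' allowed in between):

import re

# Label-value pattern as documented for Kubernetes.
K8S_LABEL_VALUE = re.compile(r'^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$')

value = '1.10.12+composer'.replace('+', '-')  # '1.10.12-composer'
assert K8S_LABEL_VALUE.match(value) and len(value) <= 63
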
    def test_execute(self, mock_hook):
        operator = GoogleCloudStorageCreateBucketOperator(
            task_id=TASK_ID,
            bucket_name=TEST_BUCKET,
            resource={
                "lifecycle": {
                    "rule": [{
                        "action": {
                            "type": "Delete"
                        },
                        "condition": {
                            "age": 7
                        }
                    }]
                }
            },
            storage_class='MULTI_REGIONAL',
            location='EU',
            labels={'env': 'prod'},
            project_id=TEST_PROJECT)

        operator.execute(None)
        mock_hook.return_value.create_bucket.assert_called_once_with(
            bucket_name=TEST_BUCKET,
            storage_class='MULTI_REGIONAL',
            location='EU',
            labels={
                'airflow-version':
                'v' + version.replace('.', '-').replace('+', '-'),
                'env':
                'prod'
            },
            project_id=TEST_PROJECT,
            resource={
                'lifecycle': {
                    'rule': [{
                        'action': {
                            'type': 'Delete'
                        },
                        'condition': {
                            'age': 7
                        }
                    }]
                }
            })
    def test_execute(self, mock_hook):
        operator = GoogleCloudStorageCreateBucketOperator(
            task_id=TASK_ID,
            bucket_name=TEST_BUCKET,
            storage_class='MULTI_REGIONAL',
            location='EU',
            labels={'env': 'prod'},
            project_id=TEST_PROJECT
        )

        operator.execute(None)
        mock_hook.return_value.create_bucket.assert_called_once_with(
            bucket_name=TEST_BUCKET, storage_class='MULTI_REGIONAL',
            location='EU', labels={
                'airflow-version': 'v' + version.replace('.', '-').replace('+', '-'),
                'env': 'prod'
            }, project_id=TEST_PROJECT
        )
 def test_empty_project_id_is_ok(self, mock_hook):
     operator = GcfFunctionDeployOperator(
         location="test_region",
         body=deepcopy(VALID_BODY),
         task_id="id"
     )
     operator._hook.get_function.side_effect = \
         HttpError(resp=MOCK_RESP_404, content=b'not found')
     operator.execute(None)
     mock_hook.assert_called_once_with(api_version='v1',
                                       gcp_conn_id='google_cloud_default')
     new_body = deepcopy(VALID_BODY)
     new_body['labels'] = {
         'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')}
     mock_hook.return_value.create_new_function.assert_called_once_with(
         project_id=None,
         location="test_region",
         body=new_body)
 def test_build_cluster_data(self):
     for suffix, dataproc_operator in enumerate(self.dataproc_operators):
         cluster_data = dataproc_operator._build_cluster_data()
         self.assertEqual(cluster_data['clusterName'], CLUSTER_NAME)
         self.assertEqual(cluster_data['projectId'], PROJECT_ID)
         self.assertEqual(cluster_data['config']['softwareConfig'], {'imageVersion': IMAGE_VERSION})
         self.assertEqual(cluster_data['config']['configBucket'], STORAGE_BUCKET)
         self.assertEqual(cluster_data['config']['workerConfig']['numInstances'], NUM_WORKERS)
         self.assertEqual(cluster_data['config']['secondaryWorkerConfig']['numInstances'],
                          NUM_PREEMPTIBLE_WORKERS)
         # test whether the default airflow-version label has been properly
         # set by the dataproc operator.
         merged_labels = {}
         merged_labels.update(self.labels[suffix])
         merged_labels.update({'airflow-version': 'v' + version.replace('.', '-')})
         self.assertTrue(re.match(r'[a-z]([-a-z0-9]*[a-z0-9])?',
                                  cluster_data['labels']['airflow-version']))
         self.assertEqual(cluster_data['labels'], merged_labels)
Example #19
    def construct_pod(  # pylint: disable=too-many-arguments
            dag_id: str, task_id: str, pod_id: str, try_number: int,
            kube_image: str, date: datetime.datetime, command: List[str],
            pod_override_object: Optional[k8s.V1Pod],
            base_worker_pod: k8s.V1Pod, namespace: str,
            worker_uuid: str) -> k8s.V1Pod:
        """
        Construct a pod by gathering and consolidating the configuration from 3 places:
            - airflow.cfg
            - executor_config
            - dynamic arguments
        """
        dynamic_pod = PodGenerator(namespace=namespace,
                                   image=kube_image,
                                   labels={
                                       'airflow-worker':
                                       worker_uuid,
                                       'dag_id':
                                       make_safe_label_value(dag_id),
                                       'task_id':
                                       make_safe_label_value(task_id),
                                       'execution_date':
                                       datetime_to_label_safe_datestring(date),
                                       'try_number':
                                       str(try_number),
                                       'airflow_version':
                                       airflow_version.replace('+', '-'),
                                       'kubernetes_executor':
                                       'True',
                                   },
                                   annotations={
                                       'dag_id': dag_id,
                                       'task_id': task_id,
                                       'execution_date': date.isoformat(),
                                       'try_number': str(try_number),
                                   },
                                   cmds=command,
                                   name=pod_id).gen_pod()

        # Reconcile the pods starting with the first chronologically,
        # Pod from the pod_template_file -> Pod from the executor_config arg -> Pod from the K8s executor
        pod_list = [base_worker_pod, pod_override_object, dynamic_pod]

        return reduce(PodGenerator.reconcile_pods, pod_list)
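
reduce applies reconcile_pods pairwise from left to right, so fields set by later entries in pod_list win. A toy illustration of that precedence using plain dicts; reconcile_dicts is a stand-in for PodGenerator.reconcile_pods, not the real API:

from functools import reduce

def reconcile_dicts(base, override):
    # Later values win; a missing override (None) leaves the base untouched.
    if override is None:
        return base
    merged = dict(base or {})
    merged.update(override)
    return merged

pod_list = [
    {'image': 'template-image:1.0', 'labels': {'airflow-worker': 'sample-uuid'}},  # base worker pod
    None,                                                                          # no executor_config override
    {'image': 'task-image:2.0'},                                                   # dynamic pod
]
assert reduce(reconcile_dicts, pod_list) == {
    'image': 'task-image:2.0',
    'labels': {'airflow-worker': 'sample-uuid'},
}
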
    def create_new_pod_for_operator(
            self, labels, launcher) -> Tuple[State, k8s.V1Pod, Optional[str]]:
        """
        Creates a new pod and monitors it for the duration of the task.

        :param labels: labels used to track pod
        :param launcher: pod launcher that will manage launching and monitoring pods
        :return:
        """
        self.log.debug(
            "Adding KubernetesPodOperator labels to pod before launch for task %s",
            self.task_id)

        # Merge Pod Identifying labels with labels passed to operator
        self.pod.metadata.labels.update(labels)
        # Add Airflow Version to the label
        # And a label to identify that pod is launched by KubernetesPodOperator
        self.pod.metadata.labels.update({
            'airflow_version':
            airflow_version.replace('+', '-'),
            'kubernetes_pod_operator':
            'True',
        })

        self.log.debug("Starting pod:\n%s", yaml.safe_dump(self.pod.to_dict()))
        final_state = None
        try:
            launcher.start_pod(self.pod,
                               startup_timeout=self.startup_timeout_seconds)
            final_state, remote_pod, result = launcher.monitor_pod(
                pod=self.pod, get_logs=self.get_logs)
        except AirflowException:
            if self.log_events_on_failure:
                for event in launcher.read_pod_events(self.pod).items:
                    self.log.error("Pod Event: %s - %s", event.reason,
                                   event.message)
            raise
        finally:
            if self.is_delete_operator_pod:
                self.log.debug("Deleting pod for task %s", self.task_id)
                launcher.delete_pod(self.pod)
            elif final_state != State.SUCCESS:
                self.patch_already_checked(self.pod)
        return final_state, remote_pod, result
    def __init__(
        self,
        *,
        jar: str,
        job_name: str = "{{task.task_id}}",
        dataflow_default_options: Optional[dict] = None,
        options: Optional[dict] = None,
        project_id: Optional[str] = None,
        location: str = DEFAULT_DATAFLOW_LOCATION,
        gcp_conn_id: str = "google_cloud_default",
        delegate_to: Optional[str] = None,
        poll_sleep: int = 10,
        job_class: Optional[str] = None,
        check_if_running: CheckJobRunning = CheckJobRunning.WaitForRun,
        multiple_jobs: Optional[bool] = None,
        cancel_timeout: Optional[int] = 10 * 60,
        wait_until_finished: Optional[bool] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)

        dataflow_default_options = dataflow_default_options or {}
        options = options or {}
        options.setdefault("labels", {}).update({
            "airflow-version":
            "v" + version.replace(".", "-").replace("+", "-")
        })
        self.project_id = project_id
        self.location = location
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.jar = jar
        self.multiple_jobs = multiple_jobs
        self.job_name = job_name
        self.dataflow_default_options = dataflow_default_options
        self.options = options
        self.poll_sleep = poll_sleep
        self.job_class = job_class
        self.check_if_running = check_if_running
        self.cancel_timeout = cancel_timeout
        self.wait_until_finished = wait_until_finished
        self.job_id = None
        self.hook = None
Example #22
 def setUp(self):
     self.maxDiff = None  # pylint: disable=invalid-name
     self.api_client = ApiClient()
     self.expected_pod = {
         'apiVersion': 'v1',
         'kind': 'Pod',
         'metadata': {
             'namespace': 'default',
             'name': ANY,
             'annotations': {},
             'labels': {
                 'foo': 'bar', 'kubernetes_pod_operator': 'True',
                 'airflow_version': airflow_version.replace('+', '-')
             }
         },
         'spec': {
             'affinity': {},
             'containers': [{
                 'image': 'ubuntu:16.04',
                 'args': ["echo 10"],
                 'command': ["bash", "-cx"],
                 'env': [],
                 'imagePullPolicy': 'IfNotPresent',
                 'envFrom': [],
                 'name': 'base',
                 'ports': [],
                 'resources': {'limits': {'cpu': None,
                                          'memory': None,
                                          'nvidia.com/gpu': None},
                               'requests': {'cpu': None,
                                            'memory': None}},
                 'volumeMounts': [],
             }],
             'hostNetwork': False,
             'imagePullSecrets': [],
             'nodeSelector': {},
             'restartPolicy': 'Never',
             'securityContext': {},
             'serviceAccountName': 'default',
             'tolerations': [],
             'volumes': [],
         }
     }
Example #23
    def test_execute(self, mock_hook):
        operator = GoogleCloudStorageCreateBucketOperator(
            task_id=TASK_ID,
            bucket_name=TEST_BUCKET,
            resource={"lifecycle": {"rule": [{"action": {"type": "Delete"}, "condition": {"age": 7}}]}},
            storage_class='MULTI_REGIONAL',
            location='EU',
            labels={'env': 'prod'},
            project_id=TEST_PROJECT
        )

        operator.execute(None)
        mock_hook.return_value.create_bucket.assert_called_once_with(
            bucket_name=TEST_BUCKET, storage_class='MULTI_REGIONAL',
            location='EU', labels={
                'airflow-version': 'v' + version.replace('.', '-').replace('+', '-'),
                'env': 'prod'
            }, project_id=TEST_PROJECT,
            resource={'lifecycle': {'rule': [{'action': {'type': 'Delete'}, 'condition': {'age': 7}}]}}
        )
Example #24
 def __init__(
     self,
     project_id: str,
     task_id: str,
     cluster_name: str,
     job_type: str,
     properties: Optional[Dict[str, str]] = None,
 ) -> None:
     name = task_id + "_" + str(uuid.uuid4())[:8]
     self.job_type = job_type
     self.job = {
         "job": {
             "reference": {"project_id": project_id, "job_id": name},
             "placement": {"cluster_name": cluster_name},
             "labels": {'airflow-version': 'v' + airflow_version.replace('.', '-').replace('+', '-')},
             job_type: {},
         }
     }  # type: Dict[str, Any]
     if properties is not None:
         self.job["job"][job_type]["properties"] = properties
Example #25
    def __init__(  # pylint: disable=too-many-arguments
        self,
        *,
        py_file: str,
        job_name: str = '{{task.task_id}}',
        dataflow_default_options: Optional[dict] = None,
        options: Optional[dict] = None,
        py_interpreter: str = "python3",
        py_options: Optional[List[str]] = None,
        py_requirements: Optional[List[str]] = None,
        py_system_site_packages: bool = False,
        project_id: Optional[str] = None,
        location: str = DEFAULT_DATAFLOW_LOCATION,
        gcp_conn_id: str = 'google_cloud_default',
        delegate_to: Optional[str] = None,
        poll_sleep: int = 10,
        drain_pipeline: bool = False,
        **kwargs,
    ) -> None:

        super().__init__(**kwargs)

        self.py_file = py_file
        self.job_name = job_name
        self.py_options = py_options or []
        self.dataflow_default_options = dataflow_default_options or {}
        self.options = options or {}
        self.options.setdefault('labels', {}).update(
            {'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')}
        )
        self.py_interpreter = py_interpreter
        self.py_requirements = py_requirements
        self.py_system_site_packages = py_system_site_packages
        self.project_id = project_id
        self.location = location
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.poll_sleep = poll_sleep
        self.drain_pipeline = drain_pipeline
        self.job_id = None
        self.hook = None
Example #26
    def test_execute(self, mock_hook):
        operator = GoogleCloudStorageCreateBucketOperator(
            task_id=TASK_ID,
            bucket_name=TEST_BUCKET,
            storage_class='MULTI_REGIONAL',
            location='EU',
            labels={'env': 'prod'},
            project_id=TEST_PROJECT)

        operator.execute(None)
        mock_hook.return_value.create_bucket.assert_called_once_with(
            bucket_name=TEST_BUCKET,
            storage_class='MULTI_REGIONAL',
            location='EU',
            labels={
                'airflow-version':
                'v' + version.replace('.', '-').replace('+', '-'),
                'env':
                'prod'
            },
            project_id=TEST_PROJECT)
 def test_deploy_execute(self, mock_hook):
     mock_hook.return_value.get_function.side_effect = mock.Mock(
         side_effect=HttpError(resp=MOCK_RESP_404, content=b'not found'))
     mock_hook.return_value.create_new_function.return_value = True
     op = GcfFunctionDeployOperator(project_id=GCP_PROJECT_ID,
                                    location=GCP_LOCATION,
                                    body=deepcopy(VALID_BODY),
                                    task_id="id")
     op.execute(None)
     mock_hook.assert_called_once_with(api_version='v1',
                                       gcp_conn_id='google_cloud_default')
     mock_hook.return_value.get_function.assert_called_once_with(
         'projects/test_project_id/locations/test_region/functions/helloWorld'
     )
     expected_body = deepcopy(VALID_BODY)
     expected_body['labels'] = {
         'airflow-version':
         'v' + version.replace('.', '-').replace('+', '-')
     }
     mock_hook.return_value.create_new_function.assert_called_once_with(
         'projects/test_project_id/locations/test_region', expected_body)
 def test_update_function_if_exists(self, mock_hook):
     mock_hook.return_value.get_function.return_value = True
     mock_hook.return_value.update_function.return_value = True
     op = GcfFunctionDeployOperator(project_id=GCP_PROJECT_ID,
                                    location=GCP_LOCATION,
                                    body=deepcopy(VALID_BODY),
                                    task_id="id")
     op.execute(None)
     mock_hook.assert_called_once_with(api_version='v1',
                                       gcp_conn_id='google_cloud_default')
     mock_hook.return_value.get_function.assert_called_once_with(
         'projects/test_project_id/locations/test_region/functions/helloWorld'
     )
     expected_body = deepcopy(VALID_BODY)
     expected_body['labels'] = {
         'airflow-version':
         'v' + version.replace('.', '-').replace('+', '-')
     }
     mock_hook.return_value.update_function.assert_called_once_with(
         'projects/test_project_id/locations/test_region/functions/helloWorld',
         expected_body, expected_body.keys())
     mock_hook.return_value.create_new_function.assert_not_called()
Example #29
 def __init__(self, project_id, task_id, cluster_name, job_type,
              properties):
     name = task_id + "_" + str(uuid.uuid4())[:8]
     self.job_type = job_type
     self.job = {
         "job": {
             "reference": {
                 "projectId": project_id,
                 "jobId": name,
             },
             "placement": {
                 "clusterName": cluster_name
             },
             "labels": {
                 'airflow-version':
                 'v' + version.replace('.', '-').replace('+', '-')
             },
             job_type: {}
         }
     }
     if properties is not None:
         self.job["job"][job_type]["properties"] = properties
Example #30
    def __init__(
        self,
        *,
        jar: str,
        job_name: str = '{{task.task_id}}',
        dataflow_default_options: Optional[dict] = None,
        options: Optional[dict] = None,
        project_id: Optional[str] = None,
        location: str = DEFAULT_DATAFLOW_LOCATION,
        gcp_conn_id: str = 'google_cloud_default',
        delegate_to: Optional[str] = None,
        poll_sleep: int = 10,
        job_class: Optional[str] = None,
        check_if_running: CheckJobRunning = CheckJobRunning.WaitForRun,
        multiple_jobs: Optional[bool] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)

        dataflow_default_options = dataflow_default_options or {}
        options = options or {}
        options.setdefault('labels', {}).update({
            'airflow-version':
            'v' + version.replace('.', '-').replace('+', '-')
        })
        self.project_id = project_id
        self.location = location
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.jar = jar
        self.multiple_jobs = multiple_jobs
        self.job_name = job_name
        self.dataflow_default_options = dataflow_default_options
        self.options = options
        self.poll_sleep = poll_sleep
        self.job_class = job_class
        self.check_if_running = check_if_running
        self.job_id = None
        self.hook = None
Example #31
    def construct_pod(
        dag_id: str,
        task_id: str,
        pod_id: str,
        try_number: int,
        date: str,
        command: List[str],
        kube_executor_config: Optional[k8s.V1Pod],
        worker_config: k8s.V1Pod,
        namespace: str,
        worker_uuid: str
    ) -> k8s.V1Pod:
        """
        Construct a pod by gathering and consolidating the configuration from 3 places:
            - airflow.cfg
            - executor_config
            - dynamic arguments
        """
        dynamic_pod = PodGenerator(
            namespace=namespace,
            labels={
                'airflow-worker': worker_uuid,
                'dag_id': dag_id,
                'task_id': task_id,
                'execution_date': date,
                'try_number': str(try_number),
                'airflow_version': airflow_version.replace('+', '-'),
                'kubernetes_executor': 'True',
            },
            cmds=command,
            name=pod_id
        ).gen_pod()

        # Reconcile the pods starting with the first chronologically,
        # Pod from the airflow.cfg -> Pod from executor_config arg -> Pod from the K8s executor
        pod_list = [worker_config, kube_executor_config, dynamic_pod]

        return reduce(PodGenerator.reconcile_pods, pod_list)
Example #32
    def construct_pod(dag_id: str, task_id: str, pod_id: str, try_number: int,
                      date: str, command: List[str],
                      kube_executor_config: Optional[k8s.V1Pod],
                      worker_config: k8s.V1Pod, namespace: str,
                      worker_uuid: str) -> k8s.V1Pod:
        """
        Construct a pod by gathering and consolidating the configuration from 3 places:
            - airflow.cfg
            - executor_config
            - dynamic arguments
        """
        dynamic_pod = PodGenerator(namespace=namespace,
                                   labels={
                                       'airflow-worker':
                                       worker_uuid,
                                       'dag_id':
                                       dag_id,
                                       'task_id':
                                       task_id,
                                       'execution_date':
                                       date,
                                       'try_number':
                                       str(try_number),
                                       'airflow_version':
                                       airflow_version.replace('+', '-'),
                                       'kubernetes_executor':
                                       'True',
                                   },
                                   cmds=command,
                                   name=pod_id).gen_pod()

        # Reconcile the pod generated by the Operator and the Pod
        # generated by the .cfg file
        pod_with_executor_config = PodGenerator.reconcile_pods(
            worker_config, kube_executor_config)
        # Reconcile that pod with the dynamic fields.
        return PodGenerator.reconcile_pods(pod_with_executor_config,
                                           dynamic_pod)
 def test_update_function_if_exists(self, mock_hook):
     mock_hook.return_value.get_function.return_value = True
     mock_hook.return_value.update_function.return_value = True
     op = GcfFunctionDeployOperator(
         project_id=GCP_PROJECT_ID,
         location=GCP_LOCATION,
         body=deepcopy(VALID_BODY),
         task_id="id"
     )
     op.execute(None)
     mock_hook.assert_called_once_with(api_version='v1',
                                       gcp_conn_id='google_cloud_default')
     mock_hook.return_value.get_function.assert_called_once_with(
         'projects/test_project_id/locations/test_region/functions/helloWorld'
     )
     expected_body = deepcopy(VALID_BODY)
     expected_body['labels'] = {
         'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')
     }
     mock_hook.return_value.update_function.assert_called_once_with(
         'projects/test_project_id/locations/test_region/functions/helloWorld',
         expected_body, expected_body.keys())
     mock_hook.return_value.create_new_function.assert_not_called()
Example #34
    def __init__(
            self,
            py_file,
            py_options=None,
            dataflow_default_options=None,
            options=None,
            gcp_conn_id='google_cloud_default',
            delegate_to=None,
            poll_sleep=10,
            *args,
            **kwargs):

        super(DataFlowPythonOperator, self).__init__(*args, **kwargs)

        self.py_file = py_file
        self.py_options = py_options or []
        self.dataflow_default_options = dataflow_default_options or {}
        self.options = options or {}
        self.options.setdefault('labels', {}).update(
            {'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')})
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.poll_sleep = poll_sleep
Example #35
    def create_new_pod_for_operator(self, labels, launcher):
        """
        Creates a new pod and monitors it for the duration of the task.

        @param labels: labels used to track pod
        @param launcher: pod launcher that will manage launching and monitoring pods
        @return:
        """
        if not (self.full_pod_spec or self.pod_template_file):
            # Add Airflow Version to the label
            # And a label to identify that pod is launched by KubernetesPodOperator
            self.labels.update({
                'airflow_version':
                airflow_version.replace('+', '-'),
                'kubernetes_pod_operator':
                'True',
            })
            self.labels.update(labels)
            self.pod.metadata.labels = self.labels
        self.log.debug("Starting pod:\n%s", yaml.safe_dump(self.pod.to_dict()))

        try:
            launcher.start_pod(self.pod,
                               startup_timeout=self.startup_timeout_seconds)
            final_state, result = launcher.monitor_pod(pod=self.pod,
                                                       get_logs=self.get_logs)
        except AirflowException as ex:
            if self.log_events_on_failure:
                for event in launcher.read_pod_events(self.pod).items:
                    self.log.error("Pod Event: %s - %s", event.reason,
                                   event.message)
            raise AirflowException(
                'Pod Launching failed: {error}'.format(error=ex))
        finally:
            if self.is_delete_operator_pod:
                launcher.delete_pod(self.pod)
        return final_state, self.pod, result
Example #36
    def make_pod(self, namespace, worker_uuid, pod_id, dag_id, task_id,
                 execution_date, try_number, airflow_command) -> k8s.V1Pod:
        """Creates POD."""
        pod_generator = PodGenerator(
            namespace=namespace,
            name=pod_id,
            image=self.kube_config.kube_image,
            image_pull_policy=self.kube_config.kube_image_pull_policy,
            image_pull_secrets=self.kube_config.image_pull_secrets,
            labels={
                'airflow-worker': worker_uuid,
                'dag_id': dag_id,
                'task_id': task_id,
                'execution_date': execution_date,
                'try_number': str(try_number),
                'airflow_version': airflow_version.replace('+', '-'),
                'kubernetes_executor': 'True',
            },
            cmds=airflow_command,
            volumes=self._get_volumes(),
            volume_mounts=self._get_volume_mounts(),
            init_containers=self._get_init_containers(),
            annotations=self.kube_config.kube_annotations,
            affinity=self.kube_config.kube_affinity,
            tolerations=self.kube_config.kube_tolerations,
            envs=self._get_environment(),
            node_selectors=self.kube_config.kube_node_selectors,
            service_account_name=self.kube_config.worker_service_account_name,
        )

        pod = pod_generator.gen_pod()
        pod.spec.containers[0].env_from = pod.spec.containers[0].env_from or []
        pod.spec.containers[0].env_from.extend(self._get_env_from())
        pod.spec.security_context = self._get_security_context()

        return append_to_pod(pod, self._get_secrets())
Example #37
    def create_subscription(
        self,
        topic: str,
        project_id: str,
        subscription: Optional[str] = None,
        subscription_project_id: Optional[str] = None,
        ack_deadline_secs: int = 10,
        fail_if_exists: bool = False,
        push_config: Optional[Union[dict, PushConfig]] = None,
        retain_acked_messages: Optional[bool] = None,
        message_retention_duration: Optional[Union[dict, Duration]] = None,
        labels: Optional[Dict[str, str]] = None,
        enable_message_ordering: bool = False,
        expiration_policy: Optional[Union[dict, ExpirationPolicy]] = None,
        filter_: Optional[str] = None,
        dead_letter_policy: Optional[Union[dict, DeadLetterPolicy]] = None,
        retry_policy: Optional[Union[dict, RetryPolicy]] = None,
        retry: Optional[Retry] = None,
        timeout: Optional[float] = None,
        metadata: Optional[Sequence[Tuple[str, str]]] = None,
    ) -> str:
        """
        Creates a Pub/Sub subscription, if it does not already exist.

        :param topic: the Pub/Sub topic name that the subscription will be bound
            to; do not include the ``projects/{project}/topics/`` prefix.
        :type topic: str
        :param project_id: Optional, the Google Cloud project ID of the topic that the subscription will be
            bound to. If set to None or missing, the default project_id from the Google Cloud connection
            is used.
        :type project_id: str
        :param subscription: the Pub/Sub subscription name. If empty, a random
            name will be generated using the uuid module
        :type subscription: str
        :param subscription_project_id: the Google Cloud project ID where the subscription
            will be created. If unspecified, ``project_id`` will be used.
        :type subscription_project_id: str
        :param ack_deadline_secs: Number of seconds that a subscriber has to
            acknowledge each message pulled from the subscription
        :type ack_deadline_secs: int
        :param fail_if_exists: if set, raise an exception if the topic
            already exists
        :type fail_if_exists: bool
        :param push_config: If push delivery is used with this subscription,
            this field is used to configure it. An empty ``pushConfig`` signifies
            that the subscriber will pull and ack messages using API methods.
        :type push_config: Union[Dict, google.cloud.pubsub_v1.types.PushConfig]
        :param retain_acked_messages: Indicates whether to retain acknowledged
            messages. If true, then messages are not expunged from the subscription's
            backlog, even if they are acknowledged, until they fall out of the
            ``message_retention_duration`` window. This must be true if you would
            like to Seek to a timestamp.
        :type retain_acked_messages: bool
        :param message_retention_duration: How long to retain unacknowledged messages
            in the subscription's backlog, from the moment a message is published. If
            ``retain_acked_messages`` is true, then this also configures the
            retention of acknowledged messages, and thus configures how far back in
            time a ``Seek`` can be done. Defaults to 7 days. Cannot be more than 7
            days or less than 10 minutes.
        :type message_retention_duration: Union[Dict, google.cloud.pubsub_v1.types.Duration]
        :param labels: Client-assigned labels; see
            https://cloud.google.com/pubsub/docs/labels
        :type labels: Dict[str, str]
        :param enable_message_ordering: If true, messages published with the same
            ordering_key in PubsubMessage will be delivered to the subscribers in the order
            in which they are received by the Pub/Sub system. Otherwise, they may be
            delivered in any order.
        :type enable_message_ordering: bool
        :param expiration_policy: A policy that specifies the conditions for this
            subscription’s expiration. A subscription is considered active as long as any
            connected subscriber is successfully consuming messages from the subscription or
            is issuing operations on the subscription. If expiration_policy is not set,
            a default policy with ttl of 31 days will be used. The minimum allowed value for
            expiration_policy.ttl is 1 day.
        :type expiration_policy: Union[Dict, google.cloud.pubsub_v1.types.ExpirationPolicy]
        :param filter_: An expression written in the Cloud Pub/Sub filter language. If
            non-empty, then only PubsubMessages whose attributes field matches the filter are
            delivered on this subscription. If empty, then no messages are filtered out.
        :type filter_: str
        :param dead_letter_policy: A policy that specifies the conditions for dead lettering
            messages in this subscription. If dead_letter_policy is not set, dead lettering is
            disabled.
        :type dead_letter_policy: Union[Dict, google.cloud.pubsub_v1.types.DeadLetterPolicy]
        :param retry_policy: A policy that specifies how Pub/Sub retries message delivery
            for this subscription. If not set, the default retry policy is applied. This
            generally implies that messages will be retried as soon as possible for healthy
            subscribers. RetryPolicy will be triggered on NACKs or acknowledgement deadline
            exceeded events for a given message.
        :type retry_policy: Union[Dict, google.cloud.pubsub_v1.types.RetryPolicy]
        :param retry: (Optional) A retry object used to retry requests.
            If None is specified, requests will not be retried.
        :type retry: google.api_core.retry.Retry
        :param timeout: (Optional) The amount of time, in seconds, to wait for the request
            to complete. Note that if retry is specified, the timeout applies to each
            individual attempt.
        :type timeout: float
        :param metadata: (Optional) Additional metadata that is provided to the method.
        :type metadata: Sequence[Tuple[str, str]]
        :return: subscription name which will be the system-generated value if
            the ``subscription`` parameter is not supplied
        :rtype: str
        """
        subscriber = self.subscriber_client

        if not subscription:
            subscription = 'sub-{}'.format(uuid4())
        if not subscription_project_id:
            subscription_project_id = project_id

        # Add airflow-version label to the subscription
        labels = labels or {}
        labels['airflow-version'] = 'v' + version.replace('.', '-').replace('+', '-')

        # pylint: disable=no-member
        subscription_path = SubscriberClient.subscription_path(subscription_project_id, subscription)
        topic_path = SubscriberClient.topic_path(project_id, topic)

        self.log.info("Creating subscription (path) %s for topic (path) %s", subscription_path, topic_path)
        try:
            subscriber.create_subscription(
                name=subscription_path,
                topic=topic_path,
                push_config=push_config,
                ack_deadline_seconds=ack_deadline_secs,
                retain_acked_messages=retain_acked_messages,
                message_retention_duration=message_retention_duration,
                labels=labels,
                enable_message_ordering=enable_message_ordering,
                expiration_policy=expiration_policy,
                filter_=filter_,
                dead_letter_policy=dead_letter_policy,
                retry_policy=retry_policy,
                retry=retry,
                timeout=timeout,
                metadata=metadata,
            )
        except AlreadyExists:
            self.log.warning('Subscription already exists: %s', subscription_path)
            if fail_if_exists:
                raise PubSubException('Subscription already exists: {}'.format(subscription_path))
        except GoogleAPICallError as e:
            raise PubSubException('Error creating subscription {}'.format(subscription_path), e)

        self.log.info("Created subscription (path) %s for topic (path) %s", subscription_path, topic_path)
        return subscription
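
A usage sketch for the hook method above; the import path is assumed to match recent google provider releases, and the project, topic and subscription names are placeholders. User-supplied labels are kept, with airflow-version appended:

from airflow.providers.google.cloud.hooks.pubsub import PubSubHook  # assumed import path

hook = PubSubHook(gcp_conn_id='google_cloud_default')
subscription = hook.create_subscription(
    topic='example-topic',
    project_id='example-project',
    subscription='example-subscription',
    ack_deadline_secs=30,
    labels={'team': 'data-eng'},  # the hook adds 'airflow-version' to this dict
)
# Returns 'example-subscription'; with subscription=None a 'sub-<uuid4>' name is generated.
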
Example #38
    def create_topic(
        self,
        topic: str,
        project_id: str,
        fail_if_exists: bool = False,
        labels: Optional[Dict[str, str]] = None,
        message_storage_policy: Union[Dict, MessageStoragePolicy] = None,
        kms_key_name: Optional[str] = None,
        retry: Optional[Retry] = None,
        timeout: Optional[float] = None,
        metadata: Optional[Sequence[Tuple[str, str]]] = None,
    ) -> None:
        """
        Creates a Pub/Sub topic, if it does not already exist.

        :param topic: the Pub/Sub topic name to create; do not
            include the ``projects/{project}/topics/`` prefix.
        :type topic: str
        :param project_id: Optional, the Google Cloud project ID in which to create the topic.
            If set to None or missing, the default project_id from the Google Cloud connection is used.
        :type project_id: str
        :param fail_if_exists: if set, raise an exception if the topic
            already exists
        :type fail_if_exists: bool
        :param labels: Client-assigned labels; see
            https://cloud.google.com/pubsub/docs/labels
        :type labels: Dict[str, str]
        :param message_storage_policy: Policy constraining the set
            of Google Cloud regions where messages published to
            the topic may be stored. If not present, then no constraints
            are in effect.
        :type message_storage_policy:
            Union[Dict, google.cloud.pubsub_v1.types.MessageStoragePolicy]
        :param kms_key_name: The resource name of the Cloud KMS CryptoKey
            to be used to protect access to messages published on this topic.
            The expected format is
            ``projects/*/locations/*/keyRings/*/cryptoKeys/*``.
        :type kms_key_name: str
        :param retry: (Optional) A retry object used to retry requests.
            If None is specified, requests will not be retried.
        :type retry: google.api_core.retry.Retry
        :param timeout: (Optional) The amount of time, in seconds, to wait for the request
            to complete. Note that if retry is specified, the timeout applies to each
            individual attempt.
        :type timeout: float
        :param metadata: (Optional) Additional metadata that is provided to the method.
        :type metadata: Sequence[Tuple[str, str]]
        """
        publisher = self.get_conn()
        topic_path = PublisherClient.topic_path(project_id, topic)  # pylint: disable=no-member

        # Add airflow-version label to the topic
        labels = labels or {}
        labels['airflow-version'] = 'v' + version.replace('.', '-').replace('+', '-')

        self.log.info("Creating topic (path) %s", topic_path)
        try:
            # pylint: disable=no-member
            publisher.create_topic(
                name=topic_path,
                labels=labels,
                message_storage_policy=message_storage_policy,
                kms_key_name=kms_key_name,
                retry=retry,
                timeout=timeout,
                metadata=metadata,
            )
        except AlreadyExists:
            self.log.warning('Topic already exists: %s', topic)
            if fail_if_exists:
                raise PubSubException('Topic already exists: {}'.format(topic))
        except GoogleAPICallError as e:
            raise PubSubException('Error creating topic {}'.format(topic), e)

        self.log.info("Created topic (path) %s", topic_path)
Example #39
    def create_new_pod_for_operator(
            self, labels, launcher) -> Tuple[State, k8s.V1Pod, Optional[str]]:
        """
        Creates a new pod and monitors for duration of task

        :param labels: labels used to track pod
        :param launcher: pod launcher that will manage launching and monitoring pods
        :return: tuple of the final pod state, the launched ``V1Pod``, and the xcom result (if any)
        """
        if not (self.full_pod_spec or self.pod_template_file):
            # Add Airflow Version to the label
            # And a label to identify that pod is launched by KubernetesPodOperator
            self.labels.update({
                'airflow_version': airflow_version.replace('+', '-'),
                'kubernetes_pod_operator': 'True',
            })
            self.labels.update(labels)
        pod = pod_generator.PodGenerator(
            image=self.image,
            namespace=self.namespace,
            cmds=self.cmds,
            args=self.arguments,
            labels=self.labels,
            name=self.name,
            envs=self.env_vars,
            extract_xcom=self.do_xcom_push,
            image_pull_policy=self.image_pull_policy,
            node_selectors=self.node_selectors,
            annotations=self.annotations,
            affinity=self.affinity,
            image_pull_secrets=self.image_pull_secrets,
            service_account_name=self.service_account_name,
            hostnetwork=self.hostnetwork,
            tolerations=self.tolerations,
            configmaps=self.configmaps,
            security_context=self.security_context,
            dnspolicy=self.dnspolicy,
            schedulername=self.schedulername,
            init_containers=self.init_containers,
            restart_policy='Never',
            priority_class_name=self.priority_class_name,
            pod_template_file=self.pod_template_file,
            pod=self.full_pod_spec,
        ).gen_pod()

        # noinspection PyTypeChecker
        pod = append_to_pod(
            pod,
            self.pod_runtime_info_envs + self.ports +  # type: ignore
            self.resources + self.secrets +  # type: ignore
            self.volumes +  # type: ignore
            self.volume_mounts  # type: ignore
        )

        self.pod = pod
        self.log.debug("Starting pod:\n%s", yaml.safe_dump(pod.to_dict()))
        try:
            launcher.start_pod(pod,
                               startup_timeout=self.startup_timeout_seconds)
            final_state, result = launcher.monitor_pod(pod=pod,
                                                       get_logs=self.get_logs)
        except AirflowException:
            if self.log_events_on_failure:
                for event in launcher.read_pod_events(pod).items:
                    self.log.error("Pod Event: %s - %s", event.reason,
                                   event.message)
            raise
        finally:
            if self.is_delete_operator_pod:
                launcher.delete_pod(pod)
        return final_state, pod, result
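Usage note: a minimal sketch of the operator that drives this method, assuming the cncf.kubernetes provider import path; it is shown outside a DAG for brevity, and the image, namespace and labels are illustrative.

from airflow.providers.cncf.kubernetes.operators.kubernetes_pod import (  # assumed import path
    KubernetesPodOperator,
)

echo_task = KubernetesPodOperator(
    task_id='echo',
    name='echo-pod',
    namespace='default',
    image='python:3.8-slim',
    cmds=['python', '-c'],
    arguments=['print("hello from the pod")'],
    labels={'team': 'data-eng'},  # merged with the airflow_version / kubernetes_pod_operator labels set above
    get_logs=True,
    is_delete_operator_pod=True,
)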
Example #40
    def create_bucket(
        self,
        bucket_name: str,
        resource: Optional[dict] = None,
        storage_class: str = 'MULTI_REGIONAL',
        location: str = 'US',
        project_id: Optional[str] = None,
        labels: Optional[dict] = None,
    ) -> str:
        """
        Creates a new bucket. Google Cloud Storage uses a flat namespace, so
        you can't create a bucket with a name that is already in use.

        .. seealso::
            For more information, see Bucket Naming Guidelines:
            https://cloud.google.com/storage/docs/bucketnaming.html#requirements

        :param bucket_name: The name of the bucket.
        :type bucket_name: str
        :param resource: An optional dict with parameters for creating the bucket.
            For information on available parameters, see Cloud Storage API doc:
            https://cloud.google.com/storage/docs/json_api/v1/buckets/insert
        :type resource: dict
        :param storage_class: This defines how objects in the bucket are stored
            and determines the SLA and the cost of storage. Values include

            - ``MULTI_REGIONAL``
            - ``REGIONAL``
            - ``STANDARD``
            - ``NEARLINE``
            - ``COLDLINE``.

            If this value is not specified when the bucket is
            created, it will default to STANDARD.
        :type storage_class: str
        :param location: The location of the bucket.
            Object data for objects in the bucket resides in physical storage
            within this region. Defaults to US.

            .. seealso::
                https://developers.google.com/storage/docs/bucket-locations

        :type location: str
        :param project_id: The ID of the Google Cloud Project.
        :type project_id: str
        :param labels: User-provided labels, in key/value pairs.
        :type labels: dict
        :return: If successful, it returns the ``id`` of the bucket.
        """
        self.log.info('Creating Bucket: %s; Location: %s; Storage Class: %s',
                      bucket_name, location, storage_class)

        # Add airflow-version label to the bucket
        labels = labels or {}
        labels['airflow-version'] = 'v' + version.replace('.', '-').replace(
            '+', '-')

        client = self.get_conn()
        bucket = client.bucket(bucket_name=bucket_name)
        bucket_resource = resource or {}

        for item in bucket_resource:
            if item != "name":
                bucket._patch_property(  # pylint: disable=protected-access
                    name=item,
                    value=resource[item]  # type: ignore[index]
                )

        bucket.storage_class = storage_class
        bucket.labels = labels
        bucket.create(project=project_id, location=location)
        return bucket.id
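Usage note: a minimal sketch of calling create_bucket through the GCS hook, assuming the GCSHook import path; the bucket and project names are illustrative.

from airflow.providers.google.cloud.hooks.gcs import GCSHook  # assumed import path

gcs = GCSHook()
bucket_id = gcs.create_bucket(
    bucket_name='my-unique-bucket-name',
    storage_class='STANDARD',
    location='US',
    project_id='my-project',
    labels={'env': 'dev'},  # 'airflow-version' is added by the method
)
print(bucket_id)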
PY_OPTIONS = ['-m']
DEFAULT_OPTIONS_PYTHON = DEFAULT_OPTIONS_JAVA = {
    'project': 'test',
    'stagingLocation': 'gs://test/staging',
}
DEFAULT_OPTIONS_TEMPLATE = {
    'project': 'test',
    'stagingLocation': 'gs://test/staging',
    'tempLocation': 'gs://test/temp',
    'zone': 'us-central1-f'
}
ADDITIONAL_OPTIONS = {
    'output': 'gs://test/output',
    'labels': {'foo': 'bar'}
}
TEST_VERSION = 'v{}'.format(version.replace('.', '-').replace('+', '-'))
EXPECTED_ADDITIONAL_OPTIONS = {
    'output': 'gs://test/output',
    'labels': {'foo': 'bar', 'airflow-version': TEST_VERSION}
}
POLL_SLEEP = 30
GCS_HOOK_STRING = 'airflow.contrib.operators.dataflow_operator.{}'


class DataFlowPythonOperatorTest(unittest.TestCase):

    def setUp(self):
        self.dataflow = DataFlowPythonOperator(
            task_id=TASK_ID,
            py_file=PY_FILE,
            job_name=JOB_NAME,
Example #42
    def _build_cluster_data(self):
        zone_uri = \
            'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format(
                self.project_id, self.zone
            )
        master_type_uri = \
            "https://www.googleapis.com/compute/v1/projects/{}/zones/{}/machineTypes/{}".format(
                self.project_id, self.zone, self.master_machine_type
            )
        worker_type_uri = \
            "https://www.googleapis.com/compute/v1/projects/{}/zones/{}/machineTypes/{}".format(
                self.project_id, self.zone, self.worker_machine_type
            )
        cluster_data = {
            'projectId': self.project_id,
            'clusterName': self.cluster_name,
            'config': {
                'gceClusterConfig': {
                    'zoneUri': zone_uri
                },
                'masterConfig': {
                    'numInstances': 1,
                    'machineTypeUri': master_type_uri,
                    'diskConfig': {
                        'bootDiskSizeGb': self.master_disk_size
                    }
                },
                'workerConfig': {
                    'numInstances': self.num_workers,
                    'machineTypeUri': worker_type_uri,
                    'diskConfig': {
                        'bootDiskSizeGb': self.worker_disk_size
                    }
                },
                'secondaryWorkerConfig': {},
                'softwareConfig': {}
            }
        }
        if self.num_preemptible_workers > 0:
            cluster_data['config']['secondaryWorkerConfig'] = {
                'numInstances': self.num_preemptible_workers,
                'machineTypeUri': worker_type_uri,
                'diskConfig': {
                    'bootDiskSizeGb': self.worker_disk_size
                },
                'isPreemptible': True
            }

        cluster_data['labels'] = self.labels if self.labels else {}
        # Dataproc labels must conform to the following regex:
        # [a-z]([-a-z0-9]*[a-z0-9])? (current airflow version string follows
        # semantic versioning spec: x.y.z).
        cluster_data['labels'].update(
            {'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')})
        if self.storage_bucket:
            cluster_data['config']['configBucket'] = self.storage_bucket
        if self.metadata:
            cluster_data['config']['gceClusterConfig']['metadata'] = self.metadata
        if self.network_uri:
            cluster_data['config']['gceClusterConfig']['networkUri'] = self.network_uri
        if self.subnetwork_uri:
            cluster_data['config']['gceClusterConfig']['subnetworkUri'] = self.subnetwork_uri
        if self.tags:
            cluster_data['config']['gceClusterConfig']['tags'] = self.tags
        if self.image_version:
            cluster_data['config']['softwareConfig']['imageVersion'] = self.image_version
        if self.properties:
            cluster_data['config']['softwareConfig']['properties'] = self.properties
        if self.init_actions_uris:
            init_actions_dict = [
                {'executableFile': uri} for uri in self.init_actions_uris
            ]
            cluster_data['config']['initializationActions'] = init_actions_dict
        if self.service_account:
            cluster_data['config']['gceClusterConfig']['serviceAccount'] =\
                    self.service_account
        if self.service_account_scopes:
            cluster_data['config']['gceClusterConfig']['serviceAccountScopes'] =\
                    self.service_account_scopes
        return cluster_data
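A quick standalone illustration of the label value built above: dots and '+' in the version string are replaced with '-', so the result satisfies the Dataproc label regex. The version string used here is illustrative, not the installed one.

import re

version = '1.10.12+composer'  # illustrative version string
label = 'v' + version.replace('.', '-').replace('+', '-')
assert label == 'v1-10-12-composer'
assert re.match(r'[a-z]([-a-z0-9]*[a-z0-9])?$', label)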
    def _set_airflow_version_label(self):
        if 'labels' not in self.body.keys():
            self.body['labels'] = {}
        self.body['labels'].update(
            {'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')})
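A standalone sketch of the same label update (the real method mutates self.body in place); the helper name and sample body are illustrative.

from copy import deepcopy

def with_airflow_version_label(body, version):
    # Same effect as _set_airflow_version_label, but returns a copy instead of
    # mutating the body in place.
    body = deepcopy(body)
    body.setdefault('labels', {})['airflow-version'] = (
        'v' + version.replace('.', '-').replace('+', '-'))
    return body

print(with_airflow_version_label({'entryPoint': 'helloWorld'}, '1.10.12'))
# {'entryPoint': 'helloWorld', 'labels': {'airflow-version': 'v1-10-12'}}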