Esempio n. 1
0
        def my_pipeline(msg1, json, kind, msg2='value2'):
            """Build a container op and a resource op, then compare the
            templates returned by ``compiler._op_to_template`` with the
            expected dictionaries.

            ``msg1``/``msg2`` are interpolated into the echo command, ``json``
            is used as the ``out`` attribute output of the resource op, and
            ``kind`` is used as the ``kind`` of the Kubernetes resource.
            """
            # Container step: echo both messages into a file that is exposed
            # as the 'merged' file output.
            echo_op = dsl.ContainerOp(
                name='echo',
                image='image',
                command=['sh', '-c'],
                arguments=['echo %s %s | tee /tmp/message.txt' % (msg1, msg2)],
                file_outputs={'merged': '/tmp/message.txt'})
            echo_op = echo_op.add_volume_mount(
                k8s_client.V1VolumeMount(
                    mount_path='/secret/gcp-credentials',
                    name='gcp-credentials'))
            echo_op = echo_op.add_env_variable(
                k8s_client.V1EnvVar(
                    name='GOOGLE_APPLICATION_CREDENTIALS',
                    value='/secret/gcp-credentials/user-gcp-sa.json'))

            # Resource step: a PersistentVolumeClaim manifest whose kind comes
            # from the ``kind`` argument.
            claim = k8s_client.V1PersistentVolumeClaim(
                api_version="v1",
                kind=kind,
                metadata=k8s_client.V1ObjectMeta(name="resource"))
            resource_op = dsl.ResourceOp(
                name="test-resource",
                k8s_resource=claim,
                attribute_outputs={"out": json})

            expected_container_template = {
                'name': 'echo',
                'container': {
                    'image': 'image',
                    'command': ['sh', '-c'],
                    'args': [
                        'echo {{inputs.parameters.msg1}} {{inputs.parameters.msg2}} | tee /tmp/message.txt'
                    ],
                    'env': [{
                        'name': 'GOOGLE_APPLICATION_CREDENTIALS',
                        'value': '/secret/gcp-credentials/user-gcp-sa.json',
                    }],
                    'volumeMounts': [{
                        'mountPath': '/secret/gcp-credentials',
                        'name': 'gcp-credentials',
                    }],
                },
                'inputs': {
                    'parameters': [{'name': 'msg1'}, {'name': 'msg2'}],
                },
                'outputs': {
                    'artifacts': [{
                        'name': 'echo-merged',
                        'path': '/tmp/message.txt',
                    }],
                    'parameters': [{
                        'name': 'echo-merged',
                        'valueFrom': {'path': '/tmp/message.txt'},
                    }],
                },
            }

            expected_resource_template = {
                'name': 'test-resource',
                'inputs': {
                    'parameters': [{'name': 'json'}, {'name': 'kind'}],
                },
                'outputs': {
                    'parameters': [
                        {
                            'name': 'test-resource-manifest',
                            'valueFrom': {'jsonPath': '{}'},
                        },
                        {
                            'name': 'test-resource-name',
                            'valueFrom': {'jsonPath': '{.metadata.name}'},
                        },
                        {
                            'name': 'test-resource-out',
                            'valueFrom': {
                                'jsonPath': '{{inputs.parameters.json}}'
                            },
                        },
                    ],
                },
                'resource': {
                    'action': 'create',
                    'manifest': ("apiVersion: v1\n"
                                 "kind: '{{inputs.parameters.kind}}'\n"
                                 "metadata:\n"
                                 "  name: resource\n"),
                },
            }

            # Show the full diff when one of the comparisons below fails.
            self.maxDiff = None
            to_template = compiler._op_to_template._op_to_template
            self.assertEqual(expected_container_template,
                             to_template(echo_op))
            self.assertEqual(expected_resource_template,
                             to_template(resource_op))
Esempio n. 2
0
    def _create_pod(self,
                    image,
                    pod_name,
                    job_name,
                    port=80,
                    cmd_string=None,
                    volumes=None):
        """ Create a kubernetes pod for the job.
        Args:
              - image (string) : Docker image to launch
              - pod_name (string) : Name of the pod (also used as the
                container name)
              - job_name (string) : App label
        KWargs:
              - port (integer) : Container port
              - cmd_string (string) : Command passed to ``/bin/bash -c``
              - volumes (list) : (pvc name, mount path) tuples; ``None`` or an
                empty list means no volumes are mounted
        Returns:
              - None
        """
        # A mutable default ([]) would be shared between calls, so the default
        # is None and normalised here.
        if volumes is None:
            volumes = []

        security_context = None
        if self.user_id and self.group_id:
            security_context = client.V1SecurityContext(
                run_as_group=self.group_id,
                run_as_user=self.user_id,
                run_as_non_root=self.run_as_non_root)

        # Environment variable and command handed to the container.
        environment_vars = client.V1EnvVar(name="TEST", value="SOME DATA")

        # NOTE(review): a None cmd_string is formatted literally as "None;" --
        # callers are presumably expected to always pass a command.
        launch_args = ["-c", "{0};".format(cmd_string)]

        # Create mount paths for the volumes
        volume_mounts = [
            client.V1VolumeMount(mount_path=mount_path, name=pvc_name)
            for pvc_name, mount_path, *_ in volumes
        ]

        resources = client.V1ResourceRequirements(
            limits={
                'cpu': str(self.max_cpu),
                'memory': self.max_mem,
            },
            requests={
                'cpu': str(self.init_cpu),
                'memory': self.init_mem,
            })

        # Configure Pod template container
        container = client.V1Container(
            name=pod_name,
            image=image,
            resources=resources,
            ports=[client.V1ContainerPort(container_port=port)],
            volume_mounts=volume_mounts,
            command=['/bin/bash'],
            args=launch_args,
            env=[environment_vars],
            security_context=security_context)

        # Reference the secret that enables pulling images from secure
        # repositories. When no secret is configured the field is left unset
        # instead of sending a list that contains None.
        image_pull_secrets = None
        if self.secret:
            image_pull_secrets = [
                client.V1LocalObjectReference(name=self.secret)
            ]

        # Create list of volumes from (pvc, mount) tuples
        volume_defs = [
            client.V1Volume(
                name=pvc_name,
                persistent_volume_claim=client.
                V1PersistentVolumeClaimVolumeSource(claim_name=pvc_name))
            for pvc_name, *_ in volumes
        ]

        metadata = client.V1ObjectMeta(name=pod_name, labels={"app": job_name})
        spec = client.V1PodSpec(containers=[container],
                                image_pull_secrets=image_pull_secrets,
                                volumes=volume_defs)

        pod = client.V1Pod(spec=spec, metadata=metadata)
        api_response = self.kube_client.create_namespaced_pod(
            namespace=self.namespace, body=pod)
        logger.debug("Pod created. status='{0}'".format(
            str(api_response.status)))
Esempio n. 3
0
def got_image_pipeline(
    trainingsteps=4000,
    learningrate=0.01,
    trainbatchsize=100,
):
    """Define the pipeline steps and attach shared storage to each of them.

    Steps and their ordering, as wired with ``.after()``:
    ``preprocess`` -> ``train`` -> ``score`` -> (``onnx``, ``convert-tflite``)
    -> ``export-to-cloud``.

    Args:
        trainingsteps: Value passed to ``--how_many_training_steps``.
        learningrate: Value passed to ``--learning_rate``.
        trainbatchsize: Value passed to ``--train_batch_size``.

    Every step gets the same Azure Files volume mounted at ``/tf-output`` and
    the ``MSG`` and ``KUBE_POD_NAME`` environment variables.
    """
    persistent_volume_name = 'azure-files'
    persistent_volume_path = '/tf-output'
    azure_file_secret_name = 'azure-file-secret'
    azure_file_share_name = 'pipelines'
    field_path = 'metadata.name'

    operations = {}

    # preprocess images
    operations['preprocess'] = dsl.ContainerOp(
        name='preprocess',
        image='gotcr.azurecr.io/chzbrgr71/got-image-preprocess:2.00',
        arguments=[
            '--bottleneck_dir', "/tf-output/bottlenecks",
            '--image_dir', '/images'
        ]
    )

    # train
    operations['train'] = dsl.ContainerOp(
        name='train',
        image='gotcr.azurecr.io/chzbrgr71/got-image-training:2.00',
        arguments=[
            '--bottleneck_dir', "/tmp/tensorflow/bottlenecks",
            '--model_dir', "/tmp/tensorflow/inception",
            '--summaries_dir', '/tf-output',
            '--output_graph', '/tf-output',
            '--output_labels', '/tf-output',
            '--image_dir', "/images",
            '--saved_model_dir', '/tf-output',
            '--how_many_training_steps', trainingsteps,
            '--learning_rate', learningrate,
            '--train_batch_size', trainbatchsize
        ]
    )
    operations['train'].after(operations['preprocess'])

    # score model
    operations['score'] = dsl.ContainerOp(
        name='score',
        image='gotcr.azurecr.io/chzbrgr71/got-model-scoring:2.01',
        arguments=[
            '/tf-output/latest_model'
        ]
    )
    operations['score'].after(operations['train'])

    # convert onnx
    operations['onnx'] = dsl.ContainerOp(
        name='onnx',
        image='gotcr.azurecr.io/chzbrgr71/onnx-convert:2.00',
        arguments=[
            'show',
            '--dir', '/tf-output/latest_model/exported_model/1/',
            '--tag_set', 'serve',
            '--signature_def', 'serving_default'
        ]
    )
    operations['onnx'].after(operations['score'])

    # convert tflite
    operations['convert-tflite'] = dsl.ContainerOp(
        name='convert-tflite',
        image='gotcr.azurecr.io/chzbrgr71/tflite-convert:2.00',
        arguments=[
            '--graph_def_file', '/tf-output/latest_model/got_retrained_graph.pb',
            '--output_file', '/tf-output/latest_model/optimized_graph.lite',
            '--input_format', 'TENSORFLOW_GRAPHDEF',
            '--output_format', 'TFLITE',
            # Flag name was misspelled; the singular form matches the
            # --input_array / --output_array flags used alongside it.
            '--input_shape', '1,299,299,3',
            '--input_array', 'Mul',
            '--output_array', 'final_result',
            '--inference_type', 'FLOAT',
            '--input_data_type', 'FLOAT'
        ]
    )
    operations['convert-tflite'].after(operations['score'])

    # copy models to external storage
    operations['export-to-cloud'] = dsl.ContainerOp(
        name='export-to-cloud',
        image='alpine',
        command=['cp'],
        arguments=[
            '/tf-output/latest_model/got_retrained_graph.pb',
            '/tf-output/latest_model/got_retrained_graph-latest.pb'
        ]
    )
    # The export waits for both conversion steps.
    operations['export-to-cloud'].after(operations['onnx']).after(operations['convert-tflite'])

    # Attach the shared volume, its mount and the common environment
    # variables to every step.
    for op in operations.values():
        op.add_volume(
                k8s_client.V1Volume(
                    azure_file=k8s_client.V1AzureFileVolumeSource(
                        secret_name=azure_file_secret_name,
                        share_name=azure_file_share_name,
                        read_only=False),
                    name=persistent_volume_name)
                ) \
            .add_volume_mount(k8s_client.V1VolumeMount(
                mount_path=persistent_volume_path,
                name=persistent_volume_name)
                ) \
            .add_env_variable(k8s_client.V1EnvVar(name='MSG', value='HELLO!')
                ) \
            .add_env_variable(k8s_client.V1EnvVar(
                name='KUBE_POD_NAME',
                # Resolved from the pod's own metadata.name field.
                value_from=k8s_client.V1EnvVarSource(
                    field_ref=k8s_client.V1ObjectFieldSelector(field_path=field_path)
                )))
Esempio n. 4
0
    def construct_job(self, run):
        """Return the ``V1Job`` that executes the given pipeline run.

        The job runs a single container that invokes ``dagster-graphql`` with
        the ``startPipelineExecution`` query and the run's execution
        parameters serialized as JSON.

        Args:
            run: The ``PipelineRun`` to launch (type-checked on entry).

        Returns:
            A ``client.V1Job`` with ``restart_policy='Never'``; the job is
            only built here, not submitted.
        """
        check.inst_param(run, 'run', PipelineRun)

        run_id = run.run_id
        job_name = 'dagster-job-%s' % run_id

        # The same label set is applied to the pod template and to the job.
        labels = {
            'app.kubernetes.io/name': 'dagster',
            'app.kubernetes.io/instance': 'dagster',
            'app.kubernetes.io/version': dagster_version,
        }

        # Variables for the startPipelineExecution GraphQL call.
        graphql_variables = json.dumps({
            'executionParams': {
                'selector': run.selector.to_graphql_input(),
                'environmentConfigData': run.environment_dict,
                'executionMetadata': {
                    'runId': run_id
                },
                'mode': run.mode,
            },
        })

        # Environment taken from the 'dagster-job-env' config map, followed by
        # any additional sources configured on this instance.
        base_env_source = client.V1EnvFromSource(
            config_map_ref=client.V1ConfigMapEnvSource(name='dagster-job-env'))

        # dagster.yaml is mounted as a single file out of the instance volume.
        instance_mount = client.V1VolumeMount(
            name='dagster-instance',
            mount_path='/opt/dagster/dagster_home/dagster.yaml',
            sub_path='dagster.yaml',
        )

        container = client.V1Container(
            name=job_name,
            image=self.job_image,
            command=['dagster-graphql'],
            args=['-p', 'startPipelineExecution', '-v', graphql_variables],
            image_pull_policy=self.image_pull_policy,
            env=[
                client.V1EnvVar(name='DAGSTER_HOME',
                                value='/opt/dagster/dagster_home')
            ],
            env_from=[base_env_source] + self.env_froms,
            volume_mounts=[instance_mount],
        )

        instance_volume = client.V1Volume(
            name='dagster-instance',
            config_map=client.V1ConfigMapVolumeSource(
                name=self.instance_config_map),
        )

        pod_spec = client.V1PodSpec(
            image_pull_secrets=self.image_pull_secrets,
            service_account_name=self.service_account_name,
            restart_policy='Never',
            containers=[container],
            volumes=[instance_volume],
        )
        pod_template = client.V1PodTemplateSpec(
            metadata=client.V1ObjectMeta(
                name='dagster-job-pod-%s' % run_id,
                labels=labels,
            ),
            spec=pod_spec,
        )

        return client.V1Job(
            api_version='batch/v1',
            kind='Job',
            metadata=client.V1ObjectMeta(name=job_name, labels=labels),
            spec=client.V1JobSpec(
                template=pod_template,
                backoff_limit=BACKOFF_LIMIT,
                ttl_seconds_after_finished=TTL_SECONDS_AFTER_FINISHED,
            ),
        )
Esempio n. 5
0
def get_volume_mount(volume, volume_mount=None):
    """Return a ``V1VolumeMount`` named ``volume`` mounted at ``volume_mount``.

    ``volume_mount`` is forwarded unchanged as the mount path, including the
    default ``None``.
    """
    mount_kwargs = {'name': volume, 'mount_path': volume_mount}
    return client.V1VolumeMount(**mount_kwargs)
Esempio n. 6
0
def create_job(MODEL):
    """Create a worker Job for ``MODEL`` in ``NAMESPACE``.

    Args:
        MODEL: Model name. It is lower-cased with underscores turned into
            dashes to form part of the job name, and passed unchanged to the
            worker as ``MODEL_DIR``.

    Returns:
        ``True`` when the API call succeeds. When the API raises
        ``ApiException`` the error is logged and printed and ``None`` is
        returned.
    """
    assert MODEL is not None, "model name is None, cannot spawn a new worker"

    batch_api = client.BatchV1Api()

    job_name = 'speechlab-worker-job-{}-{}'.format(
        MODEL.lower().replace("_", "-"), id_generator())

    # Read-only Azure Files share that holds the models.
    models_volume = client.V1Volume(
        name="models-azurefiles",
        azure_file=client.V1AzureFileVolumeSource(
            read_only=True,
            secret_name=MODELS_FILESHARE_SECRET,
            share_name=MODELS_SHARE_NAME))

    worker_env = {
        "AZURE_STORAGE_ACCOUNT": AZURE_STORAGE_ACCOUNT,
        "AZURE_STORAGE_ACCESS_KEY": AZURE_STORAGE_ACCESS_KEY,
        "AZURE_CONTAINER": AZURE_CONTAINER,
        "MASTER": MASTER,
        "NAMESPACE": NAMESPACE,
        "RUN_FREQ": "ONCE",
        "MODEL_DIR": MODEL,  # important
        "MODELS_FILESHARE_SECRET": MODELS_FILESHARE_SECRET,
        "MODELS_SHARE_NAME": MODELS_SHARE_NAME
    }
    env_entries = [
        client.V1EnvVar(name=key, value=val)
        for key, val in worker_env.items()
    ]

    # Requests equal limits for both memory and CPU.
    worker_resources = client.V1ResourceRequirements(
        limits={"memory": "5G", "cpu": "1"},
        requests={"memory": "5G", "cpu": "1"})

    worker_container = client.V1Container(
        name='{}-c'.format(job_name),
        image=IMAGE,
        image_pull_policy="IfNotPresent",
        command=[
            "/home/appuser/opt/tini", "--", "/home/appuser/opt/start_worker.sh"
        ],
        env=env_entries,
        ports=[client.V1ContainerPort(container_port=8081, name="prometheus")],
        security_context=client.V1SecurityContext(
            privileged=True,
            capabilities=client.V1Capabilities(add=["SYS_ADMIN"])),
        resources=worker_resources,
        volume_mounts=[
            client.V1VolumeMount(mount_path="/home/appuser/opt/models",
                                 name="models-azurefiles",
                                 read_only=True)
        ])

    pod_template = client.V1PodTemplateSpec(
        # Annotations advertising the scrape port (8081, the container's
        # "prometheus" port).
        metadata=client.V1ObjectMeta(
            annotations={
                "prometheus.io/scrape": "true",
                "prometheus.io/port": "8081"
            }),
        spec=client.V1PodSpec(
            containers=[worker_container],
            image_pull_secrets=[{
                "name": "azure-cr-secret"
            }],
            # reason to use OnFailure https://github.com/kubernetes/kubernetes/issues/20255
            restart_policy="OnFailure",
            volumes=[models_volume]))

    # Finally, assemble the job around the pod template.
    body = client.V1Job(
        api_version="batch/v1",
        kind="Job",
        metadata=client.V1ObjectMeta(namespace=NAMESPACE, name=job_name),
        spec=client.V1JobSpec(ttl_seconds_after_finished=100,
                              template=pod_template),
        status=client.V1JobStatus())

    try:
        api_response = batch_api.create_namespaced_job(NAMESPACE, body)
    except ApiException as e:
        logging.exception('error spawning new job')
        print("Exception when creating a job: %s\n" % e)
        return None
    else:
        print("api_response=" + str(api_response))
        return True
    def createStatefulSet(
        cls, cluster_object: V1MongoClusterConfiguration
    ) -> client.V1beta1StatefulSet:
        """
        Builds the stateful set configuration for the given cluster.
        CPU and memory limits fall back to the class defaults when the cluster
        object leaves them unset, and are used for both requests and limits.
        :param cluster_object: The cluster object from the YAML file.
        :return: The stateful set object.
        """
        metadata = cluster_object.metadata
        mongo_spec = cluster_object.spec.mongodb

        name = metadata.name
        cpu_limit = mongo_spec.cpu_limit or cls.DEFAULT_CPU_LIMIT
        memory_limit = mongo_spec.memory_limit or cls.DEFAULT_MEMORY_LIMIT
        compute = {"cpu": cpu_limit, "memory": memory_limit}

        # The pod's own IP is exposed to the container as POD_IP.
        pod_ip_env = client.V1EnvVar(
            name="POD_IP",
            value_from=client.V1EnvVarSource(
                field_ref=client.V1ObjectFieldSelector(
                    api_version="v1", field_path="status.podIP")))

        mongo_port = client.V1ContainerPort(name=cls.MONGO_NAME,
                                            container_port=cls.MONGO_PORT,
                                            protocol="TCP")

        storage_mount = client.V1VolumeMount(name=cls.MONGO_STORAGE_NAME,
                                             read_only=False,
                                             mount_path=cls.STORAGE_MOUNT_PATH)

        # Container definition.
        mongo_container = client.V1Container(
            name=cls.MONGO_NAME,
            env=[pod_ip_env],
            command=cls.MONGO_COMMAND.format(name=name).split(),
            image=cls.MONGO_IMAGE,
            ports=[mongo_port],
            volume_mounts=[storage_mount],
            resources=client.V1ResourceRequirements(
                limits=dict(compute),
                requests=dict(compute)))

        # Claim template backing the storage mount above.
        storage_claim = client.V1PersistentVolumeClaim(
            metadata=client.V1ObjectMeta(name=cls.MONGO_STORAGE_NAME),
            spec=client.V1PersistentVolumeClaimSpec(
                access_modes=["ReadWriteOnce"],
                resources=client.V1ResourceRequirements(
                    requests={"storage": cls.STORAGE_SIZE}),
            ),
        )

        set_metadata = client.V1ObjectMeta(
            name=name,
            namespace=metadata.namespace,
            labels=cls.createDefaultLabels(name))

        pod_template = client.V1PodTemplateSpec(
            metadata=client.V1ObjectMeta(
                labels=cls.createDefaultLabels(name)),
            spec=client.V1PodSpec(containers=[mongo_container]),
        )

        # Stateful set wrapping the pod template and the claim template.
        set_spec = client.V1beta1StatefulSetSpec(
            replicas=mongo_spec.replicas,
            service_name=name,
            template=pod_template,
            volume_claim_templates=[storage_claim],
        )
        return client.V1beta1StatefulSet(metadata=set_metadata, spec=set_spec)
    def create_job_object(request_id, image, chunk_size, rabbitmq_uri, workers,
                          result_destination, result_format, x509_secret,
                          kafka_broker, generated_code_cm):
        """Build the ``V1Deployment`` that runs the transformer for a request.

        Despite the name, the returned object is a Deployment (``apps/v1``),
        not a Job. Nothing is submitted to the cluster here.

        Args:
            request_id: Request identifier; used in the resource names, the
                ``app`` label and the ``--request-id`` argument.
            image: Container image for the transformer.
            chunk_size: Value passed as ``--chunks``.
            rabbitmq_uri: Value passed as ``--rabbit-uri``.
            workers: Replica count, used unless
                ``TRANSFORMER_AUTOSCALE_ENABLED`` is set in the app config.
            result_destination: Value passed as ``--result-destination``;
                ``'object-store'`` additionally injects the MinIO settings.
            result_format: Value passed as ``--result-format``.
            x509_secret: Name of the secret mounted at
                ``/etc/grid-security-ro``.
            kafka_broker: Optional value passed as ``--brokerlist``.
            generated_code_cm: Optional config map name mounted at
                ``/generated``.

        Returns:
            The configured ``client.V1Deployment``.
        """
        # Local import so this function does not depend on the module's
        # import block.
        import shlex

        app_name = "transformer-" + request_id
        config = current_app.config

        volume_mounts = [
            client.V1VolumeMount(name='x509-secret',
                                 mount_path='/etc/grid-security-ro')
        ]

        volumes = [
            client.V1Volume(
                name='x509-secret',
                secret=client.V1SecretVolumeSource(secret_name=x509_secret))
        ]

        if generated_code_cm:
            volumes.append(
                client.V1Volume(name='generated-code',
                                config_map=client.V1ConfigMapVolumeSource(
                                    name=generated_code_cm)))
            volume_mounts.append(
                client.V1VolumeMount(mount_path="/generated",
                                     name='generated-code'))

        if "TRANSFORMER_LOCAL_PATH" in config:
            path = config['TRANSFORMER_LOCAL_PATH']
            volumes.append(
                client.V1Volume(
                    name='rootfiles',
                    host_path=client.V1HostPathVolumeSource(path=path)))
            volume_mounts.append(
                client.V1VolumeMount(mount_path="/data", name='rootfiles'))

        # Compute Environment Vars
        env = [
            client.V1EnvVar(name="BASH_ENV", value="/servicex/.bashrc"),
            # Provide each pod with an environment var holding that pod's name
            client.V1EnvVar(
                name="POD_NAME",
                value_from=client.V1EnvVarSource(
                    field_ref=client.V1ObjectFieldSelector(
                        field_path="metadata.name"))),
        ]

        if result_destination == 'object-store':
            env.extend([
                client.V1EnvVar(name='MINIO_URL',
                                value=config['MINIO_URL_TRANSFORMER']),
                client.V1EnvVar(name='MINIO_ACCESS_KEY',
                                value=config['MINIO_ACCESS_KEY']),
                client.V1EnvVar(name='MINIO_SECRET_KEY',
                                value=config['MINIO_SECRET_KEY']),
            ])

        # The whole command line is a single string handed to `bash -c`, so
        # every interpolated value is shell-quoted: metacharacters in them can
        # neither inject extra commands nor be glob-expanded. Values made only
        # of shell-safe characters are emitted unchanged.
        transformer_command = (
            "/servicex/proxy-exporter.sh & sleep 5 && "
            "PYTHONPATH=/generated:$PYTHONPATH "
            "python /servicex/transformer.py "
            " --request-id " + shlex.quote(request_id) +
            " --rabbit-uri " + shlex.quote(rabbitmq_uri) +
            " --chunks " + shlex.quote(str(chunk_size)) +
            " --result-destination " + shlex.quote(result_destination) +
            " --result-format " + shlex.quote(result_format)
        )

        if kafka_broker:
            transformer_command += " --brokerlist " + shlex.quote(kafka_broker)

        python_args = [transformer_command]

        resources = client.V1ResourceRequirements(
            limits={"cpu": config['TRANSFORMER_CPU_LIMIT']})
        # Configure Pod template container
        container = client.V1Container(
            name=app_name,
            image=image,
            image_pull_policy=config['TRANSFORMER_PULL_POLICY'],
            volume_mounts=volume_mounts,
            command=["bash", "-c"],  # Can't get bash to pick up my .bashrc!
            env=env,
            args=python_args,
            resources=resources)

        # Create and Configure a spec section
        template = client.V1PodTemplateSpec(
            metadata=client.V1ObjectMeta(labels={'app': app_name}),
            spec=client.V1PodSpec(restart_policy="Always",
                                  containers=[container],
                                  volumes=volumes))

        # Create the specification of deployment
        selector = client.V1LabelSelector(match_labels={"app": app_name})

        # If we are using Autoscaler then always start with one replica
        replicas = 1 if config['TRANSFORMER_AUTOSCALE_ENABLED'] else workers
        spec = client.V1DeploymentSpec(template=template,
                                       selector=selector,
                                       replicas=replicas)

        deployment = client.V1Deployment(
            api_version="apps/v1",
            kind="Deployment",
            metadata=client.V1ObjectMeta(name=app_name),
            spec=spec)

        return deployment
Esempio n. 9
0
    def submit(self):
        """Submit a build pod to create the image for the repository.

        Builds the pod definition, asks the API to create it, and then
        watches the pod, reporting every phase change through
        ``self.progress`` until the pod is deleted or ``self.stop_event``
        is set.

        Raises:
            client.rest.ApiException: if pod creation fails with any status
                other than 409.
            Exception: any error raised by the watch stream is logged and
                re-raised.
        """
        # The docker socket is always mounted into the builder container.
        volume_mounts = [
            client.V1VolumeMount(mount_path="/var/run/docker.sock",
                                 name="docker-socket")
        ]
        # Host path of the socket is the path component of self.docker_host.
        docker_socket_path = urlparse(self.docker_host).path
        volumes = [
            client.V1Volume(name="docker-socket",
                            host_path=client.V1HostPathVolumeSource(
                                path=docker_socket_path, type='Socket'))
        ]

        # Optionally mount the push secret at /root/.docker.
        if self.push_secret:
            volume_mounts.append(
                client.V1VolumeMount(mount_path="/root/.docker",
                                     name='docker-push-secret'))
            volumes.append(
                client.V1Volume(name='docker-push-secret',
                                secret=client.V1SecretVolumeSource(
                                    secret_name=self.push_secret)))

        # Git credentials, when set, are passed through the environment.
        env = []
        if self.git_credentials:
            env.append(
                client.V1EnvVar(name='GIT_CREDENTIAL_ENV',
                                value=self.git_credentials))

        self.pod = client.V1Pod(
            metadata=client.V1ObjectMeta(
                name=self.name,
                labels={
                    # The "name" label is what the watch below selects on.
                    "name": self.name,
                    "component": self._component_label,
                },
                annotations={
                    "binder-repo": self.repo_url,
                },
            ),
            spec=client.V1PodSpec(
                containers=[
                    client.V1Container(
                        image=self.build_image,
                        name="builder",
                        args=self.get_cmd(),
                        volume_mounts=volume_mounts,
                        resources=client.V1ResourceRequirements(
                            limits={'memory': self.memory_limit},
                            requests={'memory': self.memory_request},
                        ),
                        env=env)
                ],
                tolerations=[
                    client.V1Toleration(
                        key='hub.jupyter.org/dedicated',
                        operator='Equal',
                        value='user',
                        effect='NoSchedule',
                    ),
                    # GKE currently does not permit creating taints on a node pool
                    # with a `/` in the key field
                    client.V1Toleration(
                        key='hub.jupyter.org_dedicated',
                        operator='Equal',
                        value='user',
                        effect='NoSchedule',
                    ),
                ],
                node_selector=self.node_selector,
                volumes=volumes,
                restart_policy="Never",
                affinity=self.get_affinity()))

        try:
            ret = self.api.create_namespaced_pod(self.namespace, self.pod)
        except client.rest.ApiException as e:
            # A 409 response is treated as "the build is already running";
            # every other API error propagates to the caller.
            if e.status == 409:
                # Someone else created it!
                app_log.info("Build %s already running", self.name)
                pass
            else:
                raise
        else:
            app_log.info("Started build %s", self.name)

        app_log.info("Watching build pod %s", self.name)
        # Each watch is limited to 30 seconds; the outer loop starts a new
        # one until the stop event is set or the pod is reported as deleted.
        while not self.stop_event.is_set():
            w = watch.Watch()
            try:
                for f in w.stream(
                        self.api.list_namespaced_pod,
                        self.namespace,
                        label_selector="name={}".format(self.name),
                        timeout_seconds=30,
                ):
                    if f['type'] == 'DELETED':
                        self.progress('pod.phasechange', 'Deleted')
                        return
                    # Keep the latest pod object seen by the watch.
                    self.pod = f['object']
                    if not self.stop_event.is_set():
                        self.progress('pod.phasechange', self.pod.status.phase)
                    # Both terminal phases trigger cleanup().
                    # NOTE(review): cleanup() presumably deletes the pod and
                    # sets stop_event, which would end this loop -- confirm.
                    if self.pod.status.phase == 'Succeeded':
                        self.cleanup()
                    elif self.pod.status.phase == 'Failed':
                        self.cleanup()
            except Exception as e:
                app_log.exception("Error in watch stream for %s", self.name)
                raise
            finally:
                # Always stop the watch, whether the stream ended or raised.
                w.stop()
            if self.stop_event.is_set():
                app_log.info("Stopping watch of %s", self.name)
                return