def my_pipeline(msg1, json, kind, msg2='value2'):
    # Exercises compiler._op_to_template: builds one ContainerOp and one
    # ResourceOp, then compares the compiled Argo templates against golden
    # dicts.
    #
    # NOTE(review): this function has no `self` parameter yet calls
    # self.maxDiff / self.assertEqual below — it looks like a pipeline
    # function that was nested inside a unittest method and lifted out;
    # confirm against the original test file.
    #
    # `json` and `kind` are pipeline parameters here (the name `json`
    # shadows the stdlib module inside this scope).

    # ContainerOp that echoes both messages into /tmp/message.txt, with a
    # GCP-credentials volume mount and matching env variable chained on.
    op = dsl.ContainerOp(
        name='echo',
        image='image',
        command=['sh', '-c'],
        arguments=['echo %s %s | tee /tmp/message.txt' % (msg1, msg2)],
        file_outputs={'merged': '/tmp/message.txt'}) \
        .add_volume_mount(k8s_client.V1VolumeMount(
            mount_path='/secret/gcp-credentials',
            name='gcp-credentials')) \
        .add_env_variable(k8s_client.V1EnvVar(
            name='GOOGLE_APPLICATION_CREDENTIALS',
            value='/secret/gcp-credentials/user-gcp-sa.json'))

    # ResourceOp creating a PVC; `kind` and the attribute output `out` are
    # parameterized so the compiled template must contain Argo placeholders.
    res = dsl.ResourceOp(
        name="test-resource",
        k8s_resource=k8s_client.V1PersistentVolumeClaim(
            api_version="v1",
            kind=kind,
            metadata=k8s_client.V1ObjectMeta(name="resource")),
        attribute_outputs={"out": json})

    # Expected Argo template for the ContainerOp: msg1/msg2 become input
    # parameters, file_outputs become both an artifact and an output
    # parameter named '<op>-<output>'.
    golden_output = {
        'container': {
            'image': 'image',
            'args': [
                'echo {{inputs.parameters.msg1}} {{inputs.parameters.msg2}} | tee /tmp/message.txt'
            ],
            'command': ['sh', '-c'],
            'env': [{
                'name': 'GOOGLE_APPLICATION_CREDENTIALS',
                'value': '/secret/gcp-credentials/user-gcp-sa.json'
            }],
            'volumeMounts': [{
                'mountPath': '/secret/gcp-credentials',
                'name': 'gcp-credentials',
            }]
        },
        'inputs': {
            'parameters': [
                {'name': 'msg1'},
                {'name': 'msg2'},
            ]
        },
        'name': 'echo',
        'outputs': {
            'artifacts': [
                {
                    'name': 'echo-merged',
                    'path': '/tmp/message.txt',
                },
            ],
            'parameters': [{
                'name': 'echo-merged',
                'valueFrom': {'path': '/tmp/message.txt'}
            }],
        }
    }

    # Expected Argo template for the ResourceOp: standard manifest/name
    # outputs plus the custom 'out' attribute output as a jsonPath.
    res_output = {
        'inputs': {
            'parameters': [{'name': 'json'}, {'name': 'kind'}]
        },
        'name': 'test-resource',
        'outputs': {
            'parameters': [{
                'name': 'test-resource-manifest',
                'valueFrom': {'jsonPath': '{}'}
            }, {
                'name': 'test-resource-name',
                'valueFrom': {'jsonPath': '{.metadata.name}'}
            }, {
                'name': 'test-resource-out',
                'valueFrom': {'jsonPath': '{{inputs.parameters.json}}'}
            }]
        },
        'resource': {
            'action': 'create',
            'manifest': ("apiVersion: v1\n"
                         "kind: '{{inputs.parameters.kind}}'\n"
                         "metadata:\n"
                         " name: resource\n")
        }
    }

    self.maxDiff = None
    self.assertEqual(golden_output,
                     compiler._op_to_template._op_to_template(op))
    self.assertEqual(res_output,
                     compiler._op_to_template._op_to_template(res))
def _create_pod(self,
                image,
                pod_name,
                job_name,
                port=80,
                cmd_string=None,
                volumes=None):
    """ Create a kubernetes pod for the job.

    Args:
        - image (string) : Docker image to launch
        - pod_name (string) : Name of the pod
        - job_name (string) : App label

    KWargs:
        - port (integer) : Container port
        - cmd_string (string) : Shell command run inside the container
        - volumes (list) : (volume_name, mount_path) tuples mounted as
              persistent volume claims

    Returns:
        - None
    """
    # BUG FIX: the signature previously used the mutable default
    # `volumes=[]`; use None and normalize here instead.
    if volumes is None:
        volumes = []

    security_context = None
    if self.user_id and self.group_id:
        security_context = client.V1SecurityContext(
            run_as_group=self.group_id,
            run_as_user=self.user_id,
            run_as_non_root=self.run_as_non_root)

    # Create the environment variables and command to initiate IPP
    environment_vars = client.V1EnvVar(name="TEST", value="SOME DATA")
    launch_args = ["-c", "{0};".format(cmd_string)]

    # Create mount paths for the volumes
    volume_mounts = [
        client.V1VolumeMount(mount_path=volume[1], name=volume[0])
        for volume in volumes
    ]

    resources = client.V1ResourceRequirements(
        limits={
            'cpu': str(self.max_cpu),
            'memory': self.max_mem
        },
        requests={
            'cpu': str(self.init_cpu),
            'memory': self.init_mem
        })

    # Configure Pod template container
    container = client.V1Container(
        name=pod_name,
        image=image,
        resources=resources,
        ports=[client.V1ContainerPort(container_port=port)],
        volume_mounts=volume_mounts,
        command=['/bin/bash'],
        args=launch_args,
        env=[environment_vars],
        security_context=security_context)

    # Create a secret to enable pulling images from secure repositories
    secret = None
    if self.secret:
        secret = client.V1LocalObjectReference(name=self.secret)

    # Create list of volumes from (name, mount_path) tuples
    volume_defs = [
        client.V1Volume(
            name=volume[0],
            persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
                claim_name=volume[0]))
        for volume in volumes
    ]

    metadata = client.V1ObjectMeta(name=pod_name, labels={"app": job_name})
    # BUG FIX: only reference a pull secret when one is configured; the
    # previous code passed image_pull_secrets=[None] when self.secret was
    # unset.
    spec = client.V1PodSpec(containers=[container],
                            image_pull_secrets=[secret] if secret else None,
                            volumes=volume_defs)

    pod = client.V1Pod(spec=spec, metadata=metadata)
    api_response = self.kube_client.create_namespaced_pod(
        namespace=self.namespace, body=pod)
    logger.debug("Pod created. status='{0}'".format(
        str(api_response.status)))
def got_image_pipeline(
        trainingsteps=4000,
        learningrate=0.01,
        trainbatchsize=100,
):
    """Image pipeline: preprocess -> train -> score -> {onnx, tflite} -> export.

    Args:
        trainingsteps: number of training steps passed to the training step.
        learningrate: learning rate passed to the training step.
        trainbatchsize: batch size passed to the training step.

    Every step shares a single Azure Files volume mounted at /tf-output.
    """
    persistent_volume_name = 'azure-files'
    persistent_volume_path = '/tf-output'
    azure_file_secret_name = 'azure-file-secret'
    azure_file_share_name = 'pipelines'
    field_path = 'metadata.name'

    operations = {}

    # preprocess images
    operations['preprocess'] = dsl.ContainerOp(
        name='preprocess',
        image='gotcr.azurecr.io/chzbrgr71/got-image-preprocess:2.00',
        arguments=[
            '--bottleneck_dir', "/tf-output/bottlenecks",
            '--image_dir', '/images'
        ]
    )

    # train
    operations['train'] = dsl.ContainerOp(
        name='train',
        image='gotcr.azurecr.io/chzbrgr71/got-image-training:2.00',
        arguments=[
            '--bottleneck_dir', "/tmp/tensorflow/bottlenecks",
            '--model_dir', "/tmp/tensorflow/inception",
            '--summaries_dir', '/tf-output',
            '--output_graph', '/tf-output',
            '--output_labels', '/tf-output',
            '--image_dir', "/images",
            '--saved_model_dir', '/tf-output',
            '--how_many_training_steps', trainingsteps,
            '--learning_rate', learningrate,
            '--train_batch_size', trainbatchsize
        ]
    )
    operations['train'].after(operations['preprocess'])

    # score model
    operations['score'] = dsl.ContainerOp(
        name='score',
        image='gotcr.azurecr.io/chzbrgr71/got-model-scoring:2.01',
        arguments=[
            '/tf-output/latest_model'
        ]
    )
    operations['score'].after(operations['train'])

    # convert onnx
    operations['onnx'] = dsl.ContainerOp(
        name='onnx',
        image='gotcr.azurecr.io/chzbrgr71/onnx-convert:2.00',
        arguments=[
            'show',
            '--dir', '/tf-output/latest_model/exported_model/1/',
            '--tag_set', 'serve',
            '--signature_def', 'serving_default'
        ]
    )
    operations['onnx'].after(operations['score'])

    # convert tflite
    operations['convert-tflite'] = dsl.ContainerOp(
        name='convert-tflite',
        image='gotcr.azurecr.io/chzbrgr71/tflite-convert:2.00',
        arguments=[
            '--graph_def_file',
            '/tf-output/latest_model/got_retrained_graph.pb',
            '--output_file',
            '/tf-output/latest_model/optimized_graph.lite',
            '--input_format', 'TENSORFLOW_GRAPHDEF',
            '--output_format', 'TFLITE',
            # BUG FIX: was '--input_sedhape', which tflite_convert does not
            # recognize; the intended flag is '--input_shape'.
            '--input_shape', '1,299,299,3',
            '--input_array', 'Mul',
            '--output_array', 'final_result',
            '--inference_type', 'FLOAT',
            '--input_data_type', 'FLOAT'
        ]
    )
    operations['convert-tflite'].after(operations['score'])

    # copy models to external storage
    operations['export-to-cloud'] = dsl.ContainerOp(
        name='export-to-cloud',
        image='alpine',
        command=['cp'],
        arguments=[
            '/tf-output/latest_model/got_retrained_graph.pb',
            '/tf-output/latest_model/got_retrained_graph-latest.pb'
        ]
    )
    operations['export-to-cloud'].after(
        operations['onnx']).after(operations['convert-tflite'])

    # Attach the shared Azure Files volume, its mount and the common env
    # variables (including the downward-API pod name) to every step.
    for _, op in operations.items():
        op.add_volume(
            k8s_client.V1Volume(
                azure_file=k8s_client.V1AzureFileVolumeSource(
                    secret_name=azure_file_secret_name,
                    share_name=azure_file_share_name,
                    read_only=False),
                name=persistent_volume_name)
        ) \
        .add_volume_mount(k8s_client.V1VolumeMount(
            mount_path=persistent_volume_path,
            name=persistent_volume_name)
        ) \
        .add_env_variable(k8s_client.V1EnvVar(name='MSG', value='HELLO!')
        ) \
        .add_env_variable(k8s_client.V1EnvVar(
            name='KUBE_POD_NAME',
            value_from=k8s_client.V1EnvVarSource(
                field_ref=k8s_client.V1ObjectFieldSelector(
                    field_path=field_path)
            )))
def construct_job(self, run):
    """Build the V1Job that executes *run* through `dagster-graphql`.

    The job wraps a single pod whose container invokes the
    startPipelineExecution mutation with the run's execution parameters;
    the dagster instance config is mounted from a config map.
    """
    check.inst_param(run, 'run', PipelineRun)

    job_name = 'dagster-job-%s' % run.run_id

    labels = {
        'app.kubernetes.io/name': 'dagster',
        'app.kubernetes.io/instance': 'dagster',
        'app.kubernetes.io/version': dagster_version,
    }

    # Variables for the startPipelineExecution GraphQL mutation.
    graphql_variables = {
        'executionParams': {
            'selector': run.selector.to_graphql_input(),
            'environmentConfigData': run.environment_dict,
            'executionMetadata': {'runId': run.run_id},
            'mode': run.mode,
        },
    }

    container = client.V1Container(
        name=job_name,
        image=self.job_image,
        command=['dagster-graphql'],
        args=[
            '-p', 'startPipelineExecution',
            '-v', json.dumps(graphql_variables),
        ],
        image_pull_policy=self.image_pull_policy,
        env=[
            client.V1EnvVar(name='DAGSTER_HOME',
                            value='/opt/dagster/dagster_home')
        ],
        # Shared job environment from a config map, plus any extra sources.
        env_from=[
            client.V1EnvFromSource(
                config_map_ref=client.V1ConfigMapEnvSource(
                    name='dagster-job-env'))
        ] + self.env_froms,
        volume_mounts=[
            client.V1VolumeMount(
                name='dagster-instance',
                mount_path='/opt/dagster/dagster_home/dagster.yaml',
                sub_path='dagster.yaml',
            )
        ],
    )

    # dagster.yaml comes from the instance config map.
    instance_volume = client.V1Volume(
        name='dagster-instance',
        config_map=client.V1ConfigMapVolumeSource(
            name=self.instance_config_map),
    )

    pod_template = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(
            name='dagster-job-pod-%s' % run.run_id,
            labels=labels,
        ),
        spec=client.V1PodSpec(
            image_pull_secrets=self.image_pull_secrets,
            service_account_name=self.service_account_name,
            restart_policy='Never',
            containers=[container],
            volumes=[instance_volume],
        ),
    )

    return client.V1Job(
        api_version='batch/v1',
        kind='Job',
        metadata=client.V1ObjectMeta(name=job_name, labels=labels),
        spec=client.V1JobSpec(
            template=pod_template,
            backoff_limit=BACKOFF_LIMIT,
            ttl_seconds_after_finished=TTL_SECONDS_AFTER_FINISHED,
        ),
    )
def get_volume_mount(volume, volume_mount=None):
    """Build a kubernetes V1VolumeMount.

    Args:
        volume: name of the volume to mount.
        volume_mount: mount path inside the container (may be None).

    Returns:
        A client.V1VolumeMount for *volume* at *volume_mount*.
    """
    return client.V1VolumeMount(mount_path=volume_mount, name=volume)
def create_job(MODEL):
    """Spawn a one-shot worker Job for the given model.

    Args:
        MODEL: model directory name; must not be None.

    Returns:
        True when the Job was created, False when the API call failed.

    Raises:
        AssertionError: if MODEL is None.
    """
    assert MODEL is not None, "model name is None, cannot spawn a new worker"

    api = client.BatchV1Api()
    body = client.V1Job(api_version="batch/v1", kind="Job")
    name = 'speechlab-worker-job-{}-{}'.format(
        MODEL.lower().replace("_", "-"), id_generator())
    body.metadata = client.V1ObjectMeta(namespace=NAMESPACE, name=name)
    body.status = client.V1JobStatus()

    template = client.V1PodTemplate()
    template.template = client.V1PodTemplateSpec()
    # Annotations let Prometheus scrape the worker's metrics port.
    template.template.metadata = client.V1ObjectMeta(
        annotations={
            "prometheus.io/scrape": "true",
            "prometheus.io/port": "8081"
        })

    # Models are served read-only from an Azure Files share.
    azure_file_volume = client.V1AzureFileVolumeSource(
        read_only=True,
        secret_name=MODELS_FILESHARE_SECRET,
        share_name=MODELS_SHARE_NAME)
    volume = client.V1Volume(name="models-azurefiles",
                             azure_file=azure_file_volume)

    env_vars = {
        "AZURE_STORAGE_ACCOUNT": AZURE_STORAGE_ACCOUNT,
        "AZURE_STORAGE_ACCESS_KEY": AZURE_STORAGE_ACCESS_KEY,
        "AZURE_CONTAINER": AZURE_CONTAINER,
        "MASTER": MASTER,
        "NAMESPACE": NAMESPACE,
        "RUN_FREQ": "ONCE",
        "MODEL_DIR": MODEL,  # important
        "MODELS_FILESHARE_SECRET": MODELS_FILESHARE_SECRET,
        "MODELS_SHARE_NAME": MODELS_SHARE_NAME
    }
    # Idiom: comprehension instead of a guarded append loop (the dict is
    # never empty here).
    env_list = [
        client.V1EnvVar(name=env_name, value=env_value)
        for env_name, env_value in env_vars.items()
    ]

    container = client.V1Container(
        name='{}-c'.format(name),
        image=IMAGE,
        image_pull_policy="IfNotPresent",
        command=[
            "/home/appuser/opt/tini", "--",
            "/home/appuser/opt/start_worker.sh"
        ],
        env=env_list,
        ports=[client.V1ContainerPort(container_port=8081,
                                      name="prometheus")],
        security_context=client.V1SecurityContext(
            privileged=True,
            capabilities=client.V1Capabilities(add=["SYS_ADMIN"])),
        resources=client.V1ResourceRequirements(limits={
            "memory": "5G",
            "cpu": "1"
        },
                                                requests={
                                                    "memory": "5G",
                                                    "cpu": "1"
                                                }),
        volume_mounts=[
            client.V1VolumeMount(mount_path="/home/appuser/opt/models",
                                 name="models-azurefiles",
                                 read_only=True)
        ])

    template.template.spec = client.V1PodSpec(
        containers=[container],
        image_pull_secrets=[{
            "name": "azure-cr-secret"
        }],
        # reason to use OnFailure https://github.com/kubernetes/kubernetes/issues/20255
        restart_policy="OnFailure",
        volumes=[volume])

    # And finally we can create our V1JobSpec!
    body.spec = client.V1JobSpec(ttl_seconds_after_finished=100,
                                 template=template.template)

    try:
        api_response = api.create_namespaced_job(NAMESPACE, body)
        print("api_response=" + str(api_response))
        return True
    except ApiException as e:
        logging.exception('error spawning new job')
        print("Exception when creating a job: %s\n" % e)
        # BUG FIX: previously fell off the end and returned None on failure;
        # return False explicitly (still falsy, so callers are unaffected).
        return False
def createStatefulSet(
        cls, cluster_object: V1MongoClusterConfiguration
) -> client.V1beta1StatefulSet:
    """ Creates the stateful set configuration for the given cluster.
    :param cluster_object: The cluster object from the YAML file.
    :return: The stateful set object.
    """
    # Pull what we need out of the cluster object, falling back to the
    # class-level defaults for resource limits.
    name = cluster_object.metadata.name
    namespace = cluster_object.metadata.namespace
    mongodb_spec = cluster_object.spec.mongodb
    replicas = mongodb_spec.replicas
    cpu_limit = mongodb_spec.cpu_limit or cls.DEFAULT_CPU_LIMIT
    memory_limit = mongodb_spec.memory_limit or cls.DEFAULT_MEMORY_LIMIT

    # Limits and requests are identical; use copies so each argument gets
    # its own dict, as before.
    resource_spec = {"cpu": cpu_limit, "memory": memory_limit}

    # Expose the pod IP to the mongo process via the downward API.
    pod_ip_env = client.V1EnvVar(
        name="POD_IP",
        value_from=client.V1EnvVarSource(
            field_ref=client.V1ObjectFieldSelector(
                api_version="v1", field_path="status.podIP")))

    mongo_container = client.V1Container(
        name=cls.MONGO_NAME,
        env=[pod_ip_env],
        command=cls.MONGO_COMMAND.format(name=name).split(),
        image=cls.MONGO_IMAGE,
        ports=[
            client.V1ContainerPort(name=cls.MONGO_NAME,
                                   container_port=cls.MONGO_PORT,
                                   protocol="TCP")
        ],
        volume_mounts=[
            client.V1VolumeMount(name=cls.MONGO_STORAGE_NAME,
                                 read_only=False,
                                 mount_path=cls.STORAGE_MOUNT_PATH)
        ],
        resources=client.V1ResourceRequirements(
            limits=dict(resource_spec),
            requests=dict(resource_spec)))

    pod_template = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(labels=cls.createDefaultLabels(name)),
        spec=client.V1PodSpec(containers=[mongo_container]),
    )

    # Each replica claims its own persistent volume for mongo storage.
    storage_claim = client.V1PersistentVolumeClaim(
        metadata=client.V1ObjectMeta(name=cls.MONGO_STORAGE_NAME),
        spec=client.V1PersistentVolumeClaimSpec(
            access_modes=["ReadWriteOnce"],
            resources=client.V1ResourceRequirements(
                requests={"storage": cls.STORAGE_SIZE}),
        ),
    )

    return client.V1beta1StatefulSet(
        metadata=client.V1ObjectMeta(name=name,
                                     namespace=namespace,
                                     labels=cls.createDefaultLabels(name)),
        spec=client.V1beta1StatefulSetSpec(
            replicas=replicas,
            service_name=name,
            template=pod_template,
            volume_claim_templates=[storage_claim],
        ),
    )
def create_job_object(request_id, image, chunk_size, rabbitmq_uri, workers,
                      result_destination, result_format, x509_secret,
                      kafka_broker, generated_code_cm):
    """Assemble the V1Deployment that runs the transformer pods for a request.

    Args:
        request_id: id of the transformation request (used in object names).
        image: transformer container image.
        chunk_size: number of events per chunk.
        rabbitmq_uri: broker URI handed to the transformer.
        workers: replica count when autoscaling is disabled.
        result_destination: where results go (e.g. 'object-store').
        result_format: output format flag for the transformer.
        x509_secret: name of the secret holding grid certificates.
        kafka_broker: optional kafka broker list.
        generated_code_cm: optional config map with generated code.

    Returns:
        A configured client.V1Deployment (not yet submitted).
    """
    app_name = "transformer-" + request_id

    # Grid certificates are always mounted.
    mounts = [
        client.V1VolumeMount(name='x509-secret',
                             mount_path='/etc/grid-security-ro')
    ]
    pod_volumes = [
        client.V1Volume(
            name='x509-secret',
            secret=client.V1SecretVolumeSource(secret_name=x509_secret))
    ]

    # Optionally mount generated code from a config map.
    if generated_code_cm:
        pod_volumes.append(
            client.V1Volume(
                name='generated-code',
                config_map=client.V1ConfigMapVolumeSource(
                    name=generated_code_cm)))
        mounts.append(
            client.V1VolumeMount(mount_path="/generated",
                                 name='generated-code'))

    # Optionally mount local root files from the host.
    if "TRANSFORMER_LOCAL_PATH" in current_app.config:
        local_path = current_app.config['TRANSFORMER_LOCAL_PATH']
        pod_volumes.append(
            client.V1Volume(
                name='rootfiles',
                host_path=client.V1HostPathVolumeSource(path=local_path)))
        mounts.append(
            client.V1VolumeMount(mount_path="/data", name='rootfiles'))

    # Environment: .bashrc hook plus each pod's own name via downward API.
    env = [
        client.V1EnvVar(name="BASH_ENV", value="/servicex/.bashrc"),
        client.V1EnvVar(
            "POD_NAME",
            value_from=client.V1EnvVarSource(
                field_ref=client.V1ObjectFieldSelector(
                    field_path="metadata.name"))),
    ]
    if result_destination == 'object-store':
        env += [
            client.V1EnvVar(name='MINIO_URL',
                            value=current_app.config['MINIO_URL_TRANSFORMER']),
            client.V1EnvVar(name='MINIO_ACCESS_KEY',
                            value=current_app.config['MINIO_ACCESS_KEY']),
            client.V1EnvVar(name='MINIO_SECRET_KEY',
                            value=current_app.config['MINIO_SECRET_KEY']),
        ]

    # Single shell command handed to `bash -c`.
    transform_cmd = (
        "/servicex/proxy-exporter.sh & sleep 5 && "
        "PYTHONPATH=/generated:$PYTHONPATH "
        "python /servicex/transformer.py "
        " --request-id {} --rabbit-uri {} --chunks {}"
        " --result-destination {} --result-format {}").format(
            request_id, rabbitmq_uri, chunk_size, result_destination,
            result_format)
    if kafka_broker:
        transform_cmd += " --brokerlist " + kafka_broker

    resources = client.V1ResourceRequirements(
        limits={"cpu": current_app.config['TRANSFORMER_CPU_LIMIT']})

    container = client.V1Container(
        name=app_name,
        image=image,
        image_pull_policy=current_app.config['TRANSFORMER_PULL_POLICY'],
        volume_mounts=mounts,
        command=["bash", "-c"],  # Can't get bash to pick up my .bashrc!
        env=env,
        args=[transform_cmd],
        resources=resources)

    template = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(labels={'app': app_name}),
        spec=client.V1PodSpec(restart_policy="Always",
                              containers=[container],
                              volumes=pod_volumes))

    selector = client.V1LabelSelector(match_labels={"app": app_name})

    # If we are using the autoscaler then always start with one replica.
    if current_app.config['TRANSFORMER_AUTOSCALE_ENABLED']:
        replicas = 1
    else:
        replicas = workers

    return client.V1Deployment(
        api_version="apps/v1",
        kind="Deployment",
        metadata=client.V1ObjectMeta(name=app_name),
        spec=client.V1DeploymentSpec(template=template,
                                     selector=selector,
                                     replicas=replicas))
def submit(self):
    """Submit a build pod to create the image for the repository."""
    # The builder talks to the host docker daemon through its socket.
    volume_mounts = [
        client.V1VolumeMount(mount_path="/var/run/docker.sock",
                             name="docker-socket")
    ]
    docker_socket_path = urlparse(self.docker_host).path
    volumes = [
        client.V1Volume(name="docker-socket",
                        host_path=client.V1HostPathVolumeSource(
                            path=docker_socket_path, type='Socket'))
    ]

    # Optional registry credentials for pushing the built image.
    if self.push_secret:
        volume_mounts.append(
            client.V1VolumeMount(mount_path="/root/.docker",
                                 name='docker-push-secret'))
        volumes.append(
            client.V1Volume(name='docker-push-secret',
                            secret=client.V1SecretVolumeSource(
                                secret_name=self.push_secret)))

    env = []
    if self.git_credentials:
        env.append(
            client.V1EnvVar(name='GIT_CREDENTIAL_ENV',
                            value=self.git_credentials))

    # Build the pod spec; kept on self so the watch loop below can update it.
    self.pod = client.V1Pod(
        metadata=client.V1ObjectMeta(
            name=self.name,
            labels={
                "name": self.name,
                "component": self._component_label,
            },
            annotations={
                "binder-repo": self.repo_url,
            },
        ),
        spec=client.V1PodSpec(
            containers=[
                client.V1Container(
                    image=self.build_image,
                    name="builder",
                    args=self.get_cmd(),
                    volume_mounts=volume_mounts,
                    resources=client.V1ResourceRequirements(
                        limits={'memory': self.memory_limit},
                        requests={'memory': self.memory_request},
                    ),
                    env=env)
            ],
            tolerations=[
                client.V1Toleration(
                    key='hub.jupyter.org/dedicated',
                    operator='Equal',
                    value='user',
                    effect='NoSchedule',
                ),
                # GKE currently does not permit creating taints on a node pool
                # with a `/` in the key field
                client.V1Toleration(
                    key='hub.jupyter.org_dedicated',
                    operator='Equal',
                    value='user',
                    effect='NoSchedule',
                ),
            ],
            node_selector=self.node_selector,
            volumes=volumes,
            restart_policy="Never",
            affinity=self.get_affinity()))

    try:
        ret = self.api.create_namespaced_pod(self.namespace, self.pod)
    except client.rest.ApiException as e:
        if e.status == 409:
            # Someone else created it!
            # 409 Conflict: the pod already exists — fall through and watch it.
            app_log.info("Build %s already running", self.name)
            pass
        else:
            raise
    else:
        app_log.info("Started build %s", self.name)

    # Watch the pod until it is deleted or the caller sets stop_event.
    # Each stream times out after 30s; the outer while re-opens it.
    app_log.info("Watching build pod %s", self.name)
    while not self.stop_event.is_set():
        w = watch.Watch()
        try:
            for f in w.stream(
                    self.api.list_namespaced_pod,
                    self.namespace,
                    label_selector="name={}".format(self.name),
                    timeout_seconds=30,
            ):
                if f['type'] == 'DELETED':
                    self.progress('pod.phasechange', 'Deleted')
                    return
                self.pod = f['object']
                if not self.stop_event.is_set():
                    self.progress('pod.phasechange', self.pod.status.phase)
                # Terminal phases: clean up either way; the loop keeps
                # streaming until the pod's DELETED event arrives.
                if self.pod.status.phase == 'Succeeded':
                    self.cleanup()
                elif self.pod.status.phase == 'Failed':
                    self.cleanup()
        except Exception as e:
            app_log.exception("Error in watch stream for %s", self.name)
            raise
        finally:
            # Always release the watch before re-opening or returning.
            w.stop()
        if self.stop_event.is_set():
            app_log.info("Stopping watch of %s", self.name)
            return