# Pod mutator: mounts an Azure Files share whose credentials secret is derived
# from the build-context hash. Assumes a project-level `constants` module that
# defines AZURE_STORAGE_CREDS_SECRET_NAME_PREFIX and AZURE_FILES_SHARED_FOLDER.
from kubernetes import client


def add_azure_files(kube_manager, pod_spec, namespace):
    """Mount the Azure Files share into the first container of `pod_spec`."""
    # The build-context hash is the suffix of the second container argument,
    # e.g. 'gs://bucket/context:abc123' -> 'abc123'.
    context_hash = pod_spec.containers[0].args[1].split(':')[-1]
    secret_name = (constants.AZURE_STORAGE_CREDS_SECRET_NAME_PREFIX
                   + context_hash.lower())
    if not kube_manager.secret_exists(secret_name, namespace):
        raise ValueError("Secret '{}' not found in namespace '{}'".format(
            secret_name, namespace))

    volume_mount = client.V1VolumeMount(name='azure-files',
                                        mount_path='/mnt/azure/',
                                        read_only=True)
    if pod_spec.containers[0].volume_mounts:
        pod_spec.containers[0].volume_mounts.append(volume_mount)
    else:
        pod_spec.containers[0].volume_mounts = [volume_mount]

    volume = client.V1Volume(
        name='azure-files',
        azure_file=client.V1AzureFileVolumeSource(
            secret_name=secret_name,
            share_name=constants.AZURE_FILES_SHARED_FOLDER))
    if pod_spec.volumes:
        pod_spec.volumes.append(volume)
    else:
        pod_spec.volumes = [volume]
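# Usage sketch for add_azure_files, a hedged example only: the stub manager
# and the args layout below are assumptions for illustration, and the call
# still needs the real `constants` module on the import path.
from kubernetes import client


class _StubKubeManager:
    """Stand-in for the project's Kubernetes manager; always finds the secret."""

    def secret_exists(self, name, namespace):
        return True


pod_spec = client.V1PodSpec(containers=[
    client.V1Container(name='main', args=['build', 'context:ABC123'])
])
add_azure_files(_StubKubeManager(), pod_spec, namespace='kubeflow')
assert pod_spec.volumes[0].name == 'azure-files'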
from kubernetes import client


def prepare_azure_volumes(self, volume_sub_path: str, afs_volume_name: str,
                          azure_mount_path: str):
    """Build the Azure Files volume and mount pair for a pod spec.

    Returns (volumes, volume_mounts) ready to attach to a V1PodSpec and its
    container. `self.azure_secret` and `self.afs_share` hold the credentials
    secret name and the file-share name.
    """
    assert afs_volume_name, \
        f"afs_volume_name must be non-empty, got {afs_volume_name!r}"
    assert azure_mount_path, \
        f"azure_mount_path must be non-empty, got {azure_mount_path!r}"
    volume_mounts = [
        client.V1VolumeMount(name=afs_volume_name,
                             mount_path=azure_mount_path,
                             sub_path=volume_sub_path)
    ]
    azure_volume = client.V1AzureFileVolumeSource(
        secret_name=self.azure_secret, share_name=self.afs_share)
    volumes = [
        client.V1Volume(name=afs_volume_name, azure_file=azure_volume)
    ]
    return volumes, volume_mounts
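# Usage sketch: wiring the returned pair into a pod spec. `builder` is a
# hypothetical instance of the class that owns prepare_azure_volumes and
# exposes `azure_secret` / `afs_share`; the paths are illustrative.
from kubernetes import client

volumes, volume_mounts = builder.prepare_azure_volumes(
    volume_sub_path='run-42/output',
    afs_volume_name='azure-files',
    azure_mount_path='/mnt/azure')
pod_spec = client.V1PodSpec(
    containers=[client.V1Container(name='main', volume_mounts=volume_mounts)],
    volumes=volumes)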
import kfp.dsl as dsl
from kubernetes import client as k8s_client


# The @dsl.pipeline decorator is required before the function can be compiled;
# the name/description strings here are illustrative.
@dsl.pipeline(
    name='got-image-pipeline',
    description='Train and export the GoT image classifier.')
def got_image_pipeline(
        trainingsteps=4000,
        learningrate=0.01,
        trainbatchsize=100,
):
    persistent_volume_name = 'azure-files'
    persistent_volume_path = '/tf-output'
    azure_file_secret_name = 'azure-file-secret'
    azure_file_share_name = 'aksshare'
    field_path = 'metadata.name'

    operations = {}

    # preprocess images
    operations['preprocess'] = dsl.ContainerOp(
        name='preprocess',
        image='briaracr.azurecr.io/chzbrgr71/got-image-preprocess:1.63',
        arguments=[
            '--bottleneck_dir', '/tf-output/bottlenecks',
            '--image_dir', '/images'
        ])

    # train
    operations['train'] = dsl.ContainerOp(
        name='train',
        image='briaracr.azurecr.io/chzbrgr71/got-image-training:1.63',
        arguments=[
            '--bottleneck_dir', '/tmp/tensorflow/bottlenecks',
            '--model_dir', '/tmp/tensorflow/inception',
            '--summaries_dir', '/tf-output',
            '--output_graph', '/tf-output',
            '--output_labels', '/tf-output',
            '--image_dir', '/images',
            '--saved_model_dir', '/tf-output',
            '--how_many_training_steps', trainingsteps,
            '--learning_rate', learningrate,
            '--train_batch_size', trainbatchsize
        ])
    operations['train'].after(operations['preprocess'])

    # score model
    operations['score'] = dsl.ContainerOp(
        name='score',
        image='briaracr.azurecr.io/chzbrgr71/got-model-scoring:1.0',
        arguments=['/tf-output/latest_model'])
    operations['score'].after(operations['train'])

    # convert onnx
    operations['onnx'] = dsl.ContainerOp(
        name='onnx',
        image='briaracr.azurecr.io/chzbrgr71/onnx-convert:1.1',
        arguments=[
            'show',
            '--dir', '/tf-output/latest_model/exported_model/1/',
            '--tag_set', 'serve',
            '--signature_def', 'serving_default'
        ])
    operations['onnx'].after(operations['score'])

    # convert tflite
    operations['convert-tflite'] = dsl.ContainerOp(
        name='convert-tflite',
        image='briaracr.azurecr.io/chzbrgr71/tflite-convert:1.0',
        arguments=[
            '--graph_def_file', '/tf-output/latest_model/got_retrained_graph.pb',
            '--output_file', '/tf-output/latest_model/optimized_graph.lite',
            '--input_format', 'TENSORFLOW_GRAPHDEF',
            '--output_format', 'TFLITE',
            '--input_shape', '1,299,299,3',
            '--input_array', 'Mul',
            '--output_array', 'final_result',
            '--inference_type', 'FLOAT',
            '--input_data_type', 'FLOAT'
        ])
    operations['convert-tflite'].after(operations['score'])

    # copy models to external storage
    operations['export-to-cloud'] = dsl.ContainerOp(
        name='export-to-cloud',
        image='alpine',
        command=['cp'],
        arguments=[
            '/tf-output/latest_model/got_retrained_graph.pb',
            '/tf-output/latest_model/got_retrained_graph-latest.pb'
        ])
    operations['export-to-cloud'].after(operations['onnx']).after(
        operations['convert-tflite'])

    # Every step shares the same Azure Files volume, so artifacts written
    # under /tf-output persist across container boundaries.
    for _, op in operations.items():
        op.add_volume(
            k8s_client.V1Volume(
                azure_file=k8s_client.V1AzureFileVolumeSource(
                    secret_name=azure_file_secret_name,
                    share_name=azure_file_share_name,
                    read_only=False),
                name=persistent_volume_name)) \
          .add_volume_mount(k8s_client.V1VolumeMount(
              mount_path=persistent_volume_path,
              name=persistent_volume_name)) \
          .add_env_variable(k8s_client.V1EnvVar(name='MSG', value='HELLO!')) \
          .add_env_variable(k8s_client.V1EnvVar(
              name='KUBE_POD_NAME',
              value_from=k8s_client.V1EnvVarSource(
                  field_ref=k8s_client.V1ObjectFieldSelector(
                      field_path=field_path))))
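# Compiling the pipeline into an archive that can be uploaded to the Kubeflow
# Pipelines UI; this is the standard kfp v1 SDK call, nothing project-specific.
import kfp.compiler as compiler

if __name__ == '__main__':
    compiler.Compiler().compile(got_image_pipeline, 'got_image_pipeline.tar.gz')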
import logging

from kubernetes import client
from kubernetes.client.rest import ApiException

# NAMESPACE, IMAGE, MASTER, MODELS_FILESHARE_SECRET, MODELS_SHARE_NAME, the
# AZURE_* settings and id_generator() are module-level configuration assumed
# to be defined elsewhere in this project.


def create_job(MODEL):
    """Spawn a one-shot worker Job that serves the given model."""
    assert MODEL is not None, "model name is None, cannot spawn a new worker"
    api = client.BatchV1Api()
    body = client.V1Job(api_version="batch/v1", kind="Job")
    name = 'speechlab-worker-job-{}-{}'.format(
        MODEL.lower().replace("_", "-"), id_generator())
    body.metadata = client.V1ObjectMeta(namespace=NAMESPACE, name=name)
    body.status = client.V1JobStatus()

    template = client.V1PodTemplate()
    template.template = client.V1PodTemplateSpec()
    template.template.metadata = client.V1ObjectMeta(
        annotations={
            "prometheus.io/scrape": "true",
            "prometheus.io/port": "8081"
        })

    azure_file_volume = client.V1AzureFileVolumeSource(
        read_only=True,
        secret_name=MODELS_FILESHARE_SECRET,
        share_name=MODELS_SHARE_NAME)
    volume = client.V1Volume(name="models-azurefiles",
                             azure_file=azure_file_volume)

    env_vars = {
        "AZURE_STORAGE_ACCOUNT": AZURE_STORAGE_ACCOUNT,
        "AZURE_STORAGE_ACCESS_KEY": AZURE_STORAGE_ACCESS_KEY,
        "AZURE_CONTAINER": AZURE_CONTAINER,
        "MASTER": MASTER,
        "NAMESPACE": NAMESPACE,
        "RUN_FREQ": "ONCE",
        "MODEL_DIR": MODEL,  # tells the worker which model to load
        "MODELS_FILESHARE_SECRET": MODELS_FILESHARE_SECRET,
        "MODELS_SHARE_NAME": MODELS_SHARE_NAME
    }
    env_list = [
        client.V1EnvVar(name=env_name, value=env_value)
        for env_name, env_value in env_vars.items()
    ]

    container = client.V1Container(
        name='{}-c'.format(name),
        image=IMAGE,
        image_pull_policy="IfNotPresent",
        command=[
            "/home/appuser/opt/tini", "--",
            "/home/appuser/opt/start_worker.sh"
        ],
        env=env_list,
        ports=[client.V1ContainerPort(container_port=8081, name="prometheus")],
        security_context=client.V1SecurityContext(
            privileged=True,
            capabilities=client.V1Capabilities(add=["SYS_ADMIN"])),
        resources=client.V1ResourceRequirements(
            limits={"memory": "5G", "cpu": "1"},
            requests={"memory": "5G", "cpu": "1"}),
        volume_mounts=[
            client.V1VolumeMount(mount_path="/home/appuser/opt/models",
                                 name="models-azurefiles",
                                 read_only=True)
        ])

    template.template.spec = client.V1PodSpec(
        containers=[container],
        image_pull_secrets=[client.V1LocalObjectReference(name="azure-cr-secret")],
        # reason to use OnFailure: https://github.com/kubernetes/kubernetes/issues/20255
        restart_policy="OnFailure",
        volumes=[volume])

    # And finally we can create our V1JobSpec!
    body.spec = client.V1JobSpec(ttl_seconds_after_finished=100,
                                 template=template.template)
    try:
        api_response = api.create_namespaced_job(NAMESPACE, body)
        print("api_response=" + str(api_response))
        return True
    except ApiException as e:
        logging.exception('error spawning new job')
        print("Exception when creating a job: %s\n" % e)
        return False
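# Driver sketch: load cluster credentials before calling create_job. The
# in-cluster loader is tried first; outside a cluster this falls back to the
# local kubeconfig. `spawn_worker` is a hypothetical wrapper name.
from kubernetes import config


def spawn_worker(model_name):
    try:
        config.load_incluster_config()
    except config.ConfigException:
        config.load_kube_config()
    return create_job(model_name)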