def demo_pipeline(
        fastqs=["/mnt/data/file1.fastq.gz", "/mnt/data/file2.fastq.gz"],
        leading: int = 5,
        trailing: int = 5,
        minlen: int = 80,
        sliding_window: str = "4:25",
        bar_color: str = "white",
        flier_color: str = "grey",
        plot_color: str = "darkgrid",
):
    """
    func_to_container_op simply converts the function into a factory that
    produces ops when called. add_pvolumes is a method of the op itself, so we
    need to apply it here when the op is actually generated, NOT above where
    the trim_op factory is created.
    """
    with dsl.ParallelFor(fastqs) as fastq:
        trim_task = trim_op(
            fastq=fastq,
            leading=leading,
            trailing=trailing,
            minlen=minlen,
            sliding_window=sliding_window,
        ).add_pvolumes(
            {"/mnt/data": dsl.PipelineVolume(pvc="test-data-pv-claim")})

        _ = plot_op(
            fastq=fastq,
            trimmed_fastq=trim_task.outputs["trimmed_fastq"],
            bar_color=bar_color,
            flier_color=flier_color,
            plot_color=plot_color,
        ).add_pvolumes(
            {"/mnt/data": dsl.PipelineVolume(pvc="test-data-pv-claim")})
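# For context, the trim_op factory referenced above could be produced roughly
# as sketched below. This is a minimal, assumed example: the function body,
# base image, and NamedTuple output name are illustrative and not taken from
# the original source.
from typing import NamedTuple
import kfp.components as comp

def trim_fastq(fastq: str, leading: int, trailing: int, minlen: int,
               sliding_window: str) -> NamedTuple('Outputs', [('trimmed_fastq', str)]):
    # Placeholder body: a real component would run the trimming tool here and
    # write the trimmed file next to the input on the mounted volume.
    trimmed = fastq.replace('.fastq.gz', '.trimmed.fastq.gz')
    return (trimmed,)

# func_to_container_op turns the function into an op factory; calling
# trim_op(...) inside the pipeline yields a ContainerOp, which is where
# add_pvolumes() is applied.
trim_op = comp.func_to_container_op(trim_fastq, base_image='python:3.8')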
def demo_model():
    unZipFile = dsl.ContainerOp(
        name="Unzip file",
        image=config["depend_image"],
        command=["sh", "-c"],
        arguments=["python3 " + config["unzipPath"]],
        pvolumes={config["pvolumePath"]: dsl.PipelineVolume(pvc="downloads-pvc")}
    )
def demo_op(input_notebook: str, output_notebook: str):
    return dsl.ContainerOp(
        name='papermill',
        image=image,
        command=['sh', '-c'],
        # Replace "Mount" with the desired mount path and "xyz" with the name
        # of an existing PVC.
        pvolumes={"Mount": dsl.PipelineVolume(pvc="xyz", name='xyz')},
        arguments=['papermill $0 $1', input_notebook, output_notebook])
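# A hedged usage sketch of the template above with the placeholders filled in:
# the "/mnt/notebooks" mount path and the "notebooks-pvc" claim name are
# illustrative stand-ins for "Mount"/"xyz", not values from the original.
def papermill_op_example(input_notebook: str, output_notebook: str):
    return dsl.ContainerOp(
        name='papermill',
        image=image,
        command=['sh', '-c'],
        pvolumes={"/mnt/notebooks": dsl.PipelineVolume(pvc="notebooks-pvc")},
        arguments=['papermill $0 $1', input_notebook, output_notebook])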
def binding_to_volumes(
        self,
        params: DictConfig,              # inputs
        args: t.List[t.Text],
        volume_mounts: t.Dict) -> None:  # outputs
    pvc_name = self.mlcube.runner.pvc
    vol_mount_prefix = '/mnt/mlcube'
    for param_name, param_def in params.items():
        args.append(f"--{param_name}=" + vol_mount_prefix + pvc_name +
                    "/" + param_def.default)
        volume_mounts[vol_mount_prefix + pvc_name] = dsl.PipelineVolume(pvc=pvc_name)
def demo_model():
    createModel = dsl.ContainerOp(
        name="create model",
        image=config["model_depend_image"],
        command=["sh", "-c"],
        arguments=["python3 " + config["modelPath"]],
        pvolumes={config["pvolumePath"]: dsl.PipelineVolume(pvc="downloads-pvc")}
    )
    moveModel = dsl.ContainerOp(
        name="Move_Model_to_IPC",
        image=config["move_depend_image"],
        pvolumes={config["pvolumePath"]: createModel.pvolume}
    )
def sl_segmentation_pipeline(
        python_train_path='/deephealth/use_case_pipeline/python/skin_lesion_segmentation_training.py',
        input_dataset_yaml='/deephealth/dataset/isic_segmentation/isic_segmentation.yml',
        output_path='/deephealth/outputs',
        num_epochs: Integer() = 1,
        num_batch_size: Integer() = 10,
        output_dataset_folder='/deephealth/dataset/isic_segmentation',
        split_partition_number: Integer() = 3,
        python_inference_path='/deephealth/use_case_pipeline/python/skin_lesion_segmentation_inference.py',
        model_file_folder='/deephealth/outputs',
        is_gpu_used='no'):

    dhealth_vop = dsl.PipelineVolume(pvc='dhealth-efs-claim')
    dhealth_vop_param = {'/deephealth': dhealth_vop}

    # gpu = ''
    # if is_gpu_used == 'yes':
    #     gpu = 'yes'

    _gpu = _get_gpu_op(is_gpu_used) \
        .set_display_name("GPU input parameter")

    _train_op = dhealth_train_sl_segmentation_op(
        python_train_path, input_dataset_yaml, output_path,
        num_epochs, num_batch_size, _gpu.outputs) \
        .after(_gpu) \
        .add_pvolumes(dhealth_vop_param) \
        .set_display_name('Training Model')

    _split_yaml_op = dhealth_splityaml_op(
        input_dataset_yaml, output_dataset_folder, split_partition_number) \
        .after(_train_op) \
        .add_pvolumes(dhealth_vop_param) \
        .set_display_name('Split Dataset YAML')

    model: ContainerOp = _get_latest_model_op(model_file_folder) \
        .after(_split_yaml_op) \
        .add_pvolumes({"/deephealth": dhealth_vop}) \
        .set_display_name('Load Model')

    subyamls = _get_yaml_op(output_dataset_folder) \
        .after(_split_yaml_op) \
        .add_pvolumes(dhealth_vop_param) \
        .set_display_name('Load sub-YAMLs')

    with dsl.ParallelFor(subyamls.outputs['yamlfile']) as sub_yaml:
        dhealth_inference_sl_segmentation_op(
            python_inference_path, sub_yaml, model.output, output_path,
            2, is_gpu_used) \
            .add_pvolumes(dhealth_vop_param) \
            .set_display_name('Inference')
def volumeop_sequential():
    step1 = dsl.ContainerOp(
        name="step1",
        image="library/bash:4.4.23",
        command=["sh", "-c"],
        arguments=["echo 1|tee /data/file1"],
        pvolumes={"/data": dsl.PipelineVolume(pvc="hostpath-pvc")})

    step2 = dsl.ContainerOp(
        name="step2",
        image="library/bash:4.4.23",
        command=["sh", "-c"],
        arguments=["cp /data/file1 /data/file2"],
        pvolumes={"/data": step1.pvolume})

    step3 = dsl.ContainerOp(
        name="step3",
        image="library/bash:4.4.23",
        command=["cat", "/mnt/file1", "/mnt/file2"],
        pvolumes={"/mnt": step2.pvolume})
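# Note on volumeop_sequential above: a PipelineVolume obtained from a prior
# step (step1.pvolume, step2.pvolume) carries that step's dependency with it,
# so step2 and step3 are ordered after their producers even though no explicit
# .after() is called. If a fresh dsl.PipelineVolume(pvc="hostpath-pvc") were
# passed to step2 instead, the ordering would have to be stated explicitly,
# e.g. step2.after(step1).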
def volumeop_sequential():
    createModel = dsl.ContainerOp(
        name="create_model",
        image="tensorflow/tensorflow:2.3.0",
        command=["sh", "-c"],
        arguments=["python3 /data/models/dslModel.py"],
        file_outputs={'output': '/data/serving_status'},
        pvolumes={"/data": dsl.PipelineVolume(pvc="hostpath-pvc")})

    with dsl.Condition(createModel.output != '1'):
        create_serving = dsl.ResourceOp(
            name='create_serving',
            k8s_resource=json.loads(_CONTAINER_MANIFEST),
            action='apply')
        test_serving = dsl.ContainerOp(
            name="test_serving",
            image="yanqin/fabric:v1",
        ).after(create_serving)

    with dsl.Condition(createModel.output == '1'):
        test_serving = dsl.ContainerOp(
            name="test_serving",
            image="yanqin/fabric:v1",
        )
def auto_generated_pipeline(CANDIES='20', vol_shared_volume='efs'):
    pvolumes_dict = OrderedDict()
    annotations = {}

    volume = dsl.PipelineVolume(pvc=vol_shared_volume)
    pvolumes_dict['/shared_volume/'] = volume

    sack_task = sack_op(CANDIES, vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after()
    sack_task.container.working_dir = "/shared_volume/notebooks/kale-base-example"
    sack_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    kid1_task = kid1_op(CANDIES, vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(sack_task)
    kid1_task.container.working_dir = "/shared_volume/notebooks/kale-base-example"
    kid1_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    kid2_task = kid2_op(CANDIES, vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(kid1_task)
    kid2_task.container.working_dir = "/shared_volume/notebooks/kale-base-example"
    kid2_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    kid3_task = kid3_op(CANDIES, vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(kid2_task)
    kid3_task.container.working_dir = "/shared_volume/notebooks/kale-base-example"
    kid3_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
def auto_generated_pipeline(TRAIN_STEPS='2', vol_shared_volume='efs'):
    pvolumes_dict = OrderedDict()
    annotations = {}

    volume = dsl.PipelineVolume(pvc=vol_shared_volume)
    pvolumes_dict['/shared_volume/'] = volume

    dataprocessing_task = dataprocessing_op(TRAIN_STEPS, vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after()
    dataprocessing_task.container.working_dir = "/shared_volume/notebooks/pytorch-classif"
    dataprocessing_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    train_task = train_op(TRAIN_STEPS, vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(dataprocessing_task)
    train_task.container.working_dir = "/shared_volume/notebooks/pytorch-classif"
    train_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    testontest_task = testontest_op(TRAIN_STEPS, vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(train_task)
    testontest_task.container.working_dir = "/shared_volume/notebooks/pytorch-classif"
    testontest_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    testwhole_task = testwhole_op(TRAIN_STEPS, vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(testontest_task)
    testwhole_task.container.working_dir = "/shared_volume/notebooks/pytorch-classif"
    testwhole_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
def flipcoin_pipeline():
    createModel = dsl.ContainerOp(
        name="create_model",
        image="yanqin/request:v1",
        file_outputs={'output': '/data/file1'},
        pvolumes={"/data": dsl.PipelineVolume(pvc="hostpath-pvc")})
def build_pipeline(
        bucket='pysearchml',
        es_host='elasticsearch.elastic-system.svc.cluster.local:9200',
        force_restart=False,
        train_init_date='20160801',
        train_end_date='20160801',
        validation_init_date='20160802',
        validation_end_date='20160802',
        test_init_date='20160803',
        test_end_date='20160803',
        model_name='lambdamart0',
        ranker='lambdamart',
        index='pysearchml'
):
    pvc = dsl.PipelineVolume(pvc='pysearchml-nfs')

    prepare_op = dsl.ContainerOp(
        name='prepare env',
        image=f'gcr.io/{PROJECT_ID}/prepare_env',
        arguments=[
            f'--force_restart={force_restart}',
            f'--es_host={es_host}',
            f'--bucket={bucket}',
            f'--model_name={model_name}'
        ],
        pvolumes={'/data': pvc}
    )

    val_reg_dataset_op = dsl.ContainerOp(
        name='validation regular dataset',
        image=f'gcr.io/{PROJECT_ID}/data_validation',
        arguments=[
            f'--bucket={bucket}/validation/regular',
            f'--validation_init_date={validation_init_date}',
            f'--validation_end_date={validation_end_date}',
            f'--destination=/data/pysearchml/{model_name}/validation_regular'
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Build Regular Validation Dataset').after(prepare_op)

    val_train_dataset_op = dsl.ContainerOp(
        name='validation train dataset',
        image=f'gcr.io/{PROJECT_ID}/data_validation',
        arguments=[
            f'--bucket={bucket}/validation/train',
            f'--validation_init_date={train_init_date}',
            f'--validation_end_date={train_end_date}',
            f'--destination=/data/pysearchml/{model_name}/validation_train'
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Build Train Validation Dataset').after(prepare_op)

    val_test_dataset_op = dsl.ContainerOp(
        name='validation test dataset',
        image=f'gcr.io/{PROJECT_ID}/data_validation',
        arguments=[
            f'--bucket={bucket}/validation/test',
            f'--validation_init_date={test_init_date}',
            f'--validation_end_date={test_end_date}',
            f'--destination=/data/pysearchml/{model_name}/validation_test'
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Build Test Validation Dataset').after(prepare_op)

    train_dataset_op = dsl.ContainerOp(
        name='train dataset',
        image=f'gcr.io/{PROJECT_ID}/data_train',
        command=['python', '/train/run.py'],
        arguments=[
            f'--bucket={bucket}',
            f'--train_init_date={train_init_date}',
            f'--train_end_date={train_end_date}',
            f'--es_host={es_host}',
            f'--model_name={model_name}',
            f'--index={index}',
            f'--destination=/data/pysearchml/{model_name}/train'
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Build Training Dataset').after(prepare_op)

    katib_op = dsl.ContainerOp(
        name='pySearchML Bayesian Optimization Model',
        image=f'gcr.io/{PROJECT_ID}/model',
        command=['python', '/model/launch_katib.py'],
        arguments=[
            f'--es_host={es_host}',
            f'--model_name={model_name}',
            f'--ranker={ranker}',
            '--name=pysearchml',
            f'--train_file_path=/data/pysearchml/{model_name}/train/train_dataset.txt',
            f'--validation_files_path=/data/pysearchml/{model_name}/validation_regular',
            (f'--validation_train_files_path=/data/pysearchml/{model_name}/'
             'validation_train'),
            f'--destination=/data/pysearchml/{model_name}/'
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Katib Optimization Process').after(
        val_reg_dataset_op, val_train_dataset_op, val_test_dataset_op,
        train_dataset_op
    )

    post_model_op = dsl.ContainerOp(
        name='Post Best RankLib Model to ES',
        image=f'gcr.io/{PROJECT_ID}/model',
        command=['python', '/model/post_model.py'],
        arguments=[
            f'--es_host={es_host}',
            f'--model_name={model_name}',
            f'--destination=/data/pysearchml/{model_name}/best_model.txt'
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Post RankLib Model to ES').after(katib_op)

    _ = dsl.ContainerOp(
        name='Test Model',
        image=f'gcr.io/{PROJECT_ID}/model',
        command=['python', '/model/test.py'],
        arguments=[
            f'--files_path=/data/pysearchml/{model_name}/validation_test',
            f'--index={index}',
            f'--es_host={es_host}',
            f'--model_name={model_name}',
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Run Test Step').after(post_model_op)
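# A brief, hedged usage sketch for compiling and submitting a pipeline such as
# build_pipeline above; the package filename, run arguments, and experiment
# name are illustrative only, not taken from the original source.
import kfp
from kfp import compiler

compiler.Compiler().compile(build_pipeline, 'pysearchml_pipeline.tar.gz')

client = kfp.Client()  # assumes a reachable Kubeflow Pipelines endpoint
client.create_run_from_pipeline_package(
    'pysearchml_pipeline.tar.gz',
    arguments={'bucket': 'pysearchml', 'model_name': 'lambdamart0'},
    experiment_name='pysearchml')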
def auto_generated_pipeline(density_p='1000000000', vol_shared_volume='hostpath-pvc'):
    _kale_pvolumes_dict = OrderedDict()
    _kale_volume_step_names = []
    _kale_volume_name_parameters = []
    _kale_annotations = {}

    _kale_volume = _kfp_dsl.PipelineVolume(pvc=vol_shared_volume)
    _kale_pvolumes_dict['/shared_volume'] = _kale_volume

    _kale_volume_step_names.sort()
    _kale_volume_name_parameters.sort()

    _kale_integral1_task = _kale_integral1_op(density_p)\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after()
    # see: https://kubeflow-pipelines.readthedocs.io/en/latest/source/kfp.dsl.html#kfp.dsl.Sidecar.add_resource_limit
    # can also be: _kale_step_limits = {'nvidia.com/gpu': '1'}
    _kale_step_limits = {'cpu': '1', 'memory': '40Gi'}
    for _kale_k, _kale_v in _kale_step_limits.items():
        _kale_integral1_task.container.add_resource_limit(_kale_k, _kale_v)
    _kale_integral1_task.container.working_dir = "//shared_volume"
    _kale_integral1_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'integral1': '/integral1.html'})
    _kale_integral1_task.output_artifact_paths.update(_kale_output_artifacts)
    _kale_integral1_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_integral1_task.dependent_names +
                       _kale_volume_step_names)
    _kale_integral1_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_integral1_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))

    _kale_integral2_task = _kale_integral2_op(density_p)\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after()
    _kale_step_limits = {'cpu': '1', 'memory': '40Gi'}
    for _kale_k, _kale_v in _kale_step_limits.items():
        _kale_integral2_task.container.add_resource_limit(_kale_k, _kale_v)
    _kale_integral2_task.container.working_dir = "//shared_volume"
    _kale_integral2_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'integral2': '/integral2.html'})
    _kale_integral2_task.output_artifact_paths.update(_kale_output_artifacts)
    _kale_integral2_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_integral2_task.dependent_names +
                       _kale_volume_step_names)
    _kale_integral2_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_integral2_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))

    _kale_integral3_task = _kale_integral3_op(density_p)\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after()
    _kale_step_limits = {'cpu': '1', 'memory': '40Gi'}
    for _kale_k, _kale_v in _kale_step_limits.items():
        _kale_integral3_task.container.add_resource_limit(_kale_k, _kale_v)
    _kale_integral3_task.container.working_dir = "//shared_volume"
    _kale_integral3_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'integral3': '/integral3.html'})
    _kale_integral3_task.output_artifact_paths.update(_kale_output_artifacts)
    _kale_integral3_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_integral3_task.dependent_names +
                       _kale_volume_step_names)
    _kale_integral3_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_integral3_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))

    _kale_integral4_task = _kale_integral4_op(density_p)\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after()
    _kale_step_limits = {'cpu': '1', 'memory': '40Gi'}
    for _kale_k, _kale_v in _kale_step_limits.items():
        _kale_integral4_task.container.add_resource_limit(_kale_k, _kale_v)
    _kale_integral4_task.container.working_dir = "//shared_volume"
    _kale_integral4_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'integral4': '/integral4.html'})
    _kale_integral4_task.output_artifact_paths.update(_kale_output_artifacts)
    _kale_integral4_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_integral4_task.dependent_names +
                       _kale_volume_step_names)
    _kale_integral4_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_integral4_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))

    _kale_integral5_task = _kale_integral5_op(density_p)\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after()
    _kale_step_limits = {'cpu': '1', 'memory': '40Gi'}
    for _kale_k, _kale_v in _kale_step_limits.items():
        _kale_integral5_task.container.add_resource_limit(_kale_k, _kale_v)
    _kale_integral5_task.container.working_dir = "//shared_volume"
    _kale_integral5_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'integral5': '/integral5.html'})
    _kale_integral5_task.output_artifact_paths.update(_kale_output_artifacts)
    _kale_integral5_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_integral5_task.dependent_names +
                       _kale_volume_step_names)
    _kale_integral5_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_integral5_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))

    _kale_integral6_task = _kale_integral6_op(density_p)\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after()
    _kale_step_limits = {'cpu': '1', 'memory': '40Gi'}
    for _kale_k, _kale_v in _kale_step_limits.items():
        _kale_integral6_task.container.add_resource_limit(_kale_k, _kale_v)
    _kale_integral6_task.container.working_dir = "//shared_volume"
    _kale_integral6_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'integral6': '/integral6.html'})
    _kale_integral6_task.output_artifact_paths.update(_kale_output_artifacts)
    _kale_integral6_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_integral6_task.dependent_names +
                       _kale_volume_step_names)
    _kale_integral6_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_integral6_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))
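# The commented-out alternative in the generated pipeline above requests a GPU
# instead of CPU/memory limits. A minimal hedged sketch of that variant, using
# the standard NVIDIA device-plugin resource name and applied (inside the
# pipeline body) to the first step only for illustration:
#     _kale_step_limits = {'nvidia.com/gpu': '1'}
#     for _kale_k, _kale_v in _kale_step_limits.items():
#         _kale_integral1_task.container.add_resource_limit(_kale_k, _kale_v)
# Equivalent shorthand offered by the KFP v1 DSL:
#     _kale_integral1_task.container.set_gpu_limit(1)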
def auto_generated_pipeline(date_of_processing='02_06_2021',
                            dir_mask_specie='Ponca_DV',
                            dir_specie='Ponca_DV_loc',
                            dir_years='forest_jEquihua_mar',
                            file_mask_specie='poncamask.tif',
                            file_specie='poncadav2',
                            specie='pan_onca',
                            vol_shared_volume='hostpath-pvc'):
    _kale_pvolumes_dict = OrderedDict()
    _kale_volume_step_names = []
    _kale_volume_name_parameters = []
    _kale_annotations = {}

    _kale_volume = _kfp_dsl.PipelineVolume(pvc=vol_shared_volume)
    _kale_pvolumes_dict['/shared_volume'] = _kale_volume

    _kale_volume_step_names.sort()
    _kale_volume_name_parameters.sort()

    _kale_downloadfroms3_task = _kale_downloadfroms3_op()\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after()
    _kale_downloadfroms3_task.container.working_dir = "//shared_volume/kube_sipecam_playground/hsi/notebooks"
    _kale_downloadfroms3_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'downloadfroms3': '/downloadfroms3.html'})
    _kale_downloadfroms3_task.output_artifact_paths.update(
        _kale_output_artifacts)
    _kale_downloadfroms3_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_downloadfroms3_task.dependent_names +
                       _kale_volume_step_names)
    _kale_downloadfroms3_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_downloadfroms3_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))

    _kale_readdatainput_task = _kale_readdatainput_op(dir_mask_specie,
                                                      dir_specie,
                                                      file_mask_specie,
                                                      file_specie)\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after(_kale_downloadfroms3_task)
    _kale_readdatainput_task.container.working_dir = "//shared_volume/kube_sipecam_playground/hsi/notebooks"
    _kale_readdatainput_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'readdatainput': '/readdatainput.html'})
    _kale_readdatainput_task.output_artifact_paths.update(
        _kale_output_artifacts)
    _kale_readdatainput_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_readdatainput_task.dependent_names +
                       _kale_volume_step_names)
    _kale_readdatainput_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_readdatainput_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))

    _kale_reproject_task = _kale_reproject_op()\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after(_kale_readdatainput_task)
    _kale_reproject_task.container.working_dir = "//shared_volume/kube_sipecam_playground/hsi/notebooks"
    _kale_reproject_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'reproject': '/reproject.html'})
    _kale_reproject_task.output_artifact_paths.update(_kale_output_artifacts)
    _kale_reproject_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_reproject_task.dependent_names +
                       _kale_volume_step_names)
    _kale_reproject_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_reproject_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))

    _kale_createtestdata_task = _kale_createtestdata_op(dir_years)\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after(_kale_reproject_task)
    _kale_createtestdata_task.container.working_dir = "//shared_volume/kube_sipecam_playground/hsi/notebooks"
    _kale_createtestdata_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'createtestdata': '/createtestdata.html'})
    _kale_createtestdata_task.output_artifact_paths.update(
        _kale_output_artifacts)
    _kale_createtestdata_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_createtestdata_task.dependent_names +
                       _kale_volume_step_names)
    _kale_createtestdata_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_createtestdata_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))

    _kale_maskandextract_task = _kale_maskandextract_op()\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after(_kale_createtestdata_task)
    _kale_maskandextract_task.container.working_dir = "//shared_volume/kube_sipecam_playground/hsi/notebooks"
    _kale_maskandextract_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'maskandextract': '/maskandextract.html'})
    _kale_maskandextract_task.output_artifact_paths.update(
        _kale_output_artifacts)
    _kale_maskandextract_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_maskandextract_task.dependent_names +
                       _kale_volume_step_names)
    _kale_maskandextract_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_maskandextract_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))

    _kale_bestmodel_task = _kale_bestmodel_op()\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after(_kale_maskandextract_task)
    _kale_bestmodel_task.container.working_dir = "//shared_volume/kube_sipecam_playground/hsi/notebooks"
    _kale_bestmodel_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'bestmodel': '/bestmodel.html'})
    _kale_bestmodel_task.output_artifact_paths.update(_kale_output_artifacts)
    _kale_bestmodel_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_bestmodel_task.dependent_names +
                       _kale_volume_step_names)
    _kale_bestmodel_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_bestmodel_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))

    _kale_temporalprojection_task = _kale_temporalprojection_op(date_of_processing, specie)\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after(_kale_bestmodel_task)
    _kale_temporalprojection_task.container.working_dir = "//shared_volume/kube_sipecam_playground/hsi/notebooks"
    _kale_temporalprojection_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update(
        {'temporalprojection': '/temporalprojection.html'})
    _kale_temporalprojection_task.output_artifact_paths.update(
        _kale_output_artifacts)
    _kale_temporalprojection_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_temporalprojection_task.dependent_names +
                       _kale_volume_step_names)
    _kale_temporalprojection_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_temporalprojection_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))

    _kale_uploadtos3_task = _kale_uploadtos3_op(date_of_processing)\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after(_kale_temporalprojection_task)
    _kale_uploadtos3_task.container.working_dir = "//shared_volume/kube_sipecam_playground/hsi/notebooks"
    _kale_uploadtos3_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'uploadtos3': '/uploadtos3.html'})
    _kale_uploadtos3_task.output_artifact_paths.update(_kale_output_artifacts)
    _kale_uploadtos3_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_uploadtos3_task.dependent_names +
                       _kale_volume_step_names)
    _kale_uploadtos3_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_uploadtos3_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))
import kfp.dsl as dsl
import kfp.gcp as gcp
import kfp.components as comp
import os
from kubernetes import client as k8s_client

feature_extraction_op = comp.load_component_from_file(
    os.path.join("./components/feature-extractor/",
                 'feature_extractor_component.yaml'))

ipc_shared_mem_volume = dsl.PipelineVolume(name='shm-vol',
                                           empty_dir={'medium': 'Memory'})

train_op = comp.load_component_from_file(
    os.path.join("./components/train/", 'train_component.yaml'))


@dsl.pipeline(name='VoxCeleb Baseline Reproduction Pipeline',
              description='Train baseline models')
# Define a pipeline and create a task from a component
# @TODO abstract code shared with test_full_pipeline.py
def baseline_repro_pipeline(
        data_bucket: str = 'voxsrc-2020-voxceleb-v4',
        test_list: str = 'vox1_full.txt',
        # @note test_utterances_list is in the same format as train_list, but
        # for the test data. Whereas test_list contains utterance pairs for
        # evaluation
        test_utterances_list: str = 'vox1_full_utterances.txt',
        train_list: str = 'vox2_full.txt',
        test_path: str = 'vox1_full.tar.gz',
        train_path: str = 'vox2_full.tar.gz',
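# A hedged sketch, not from the original source: the Memory-backed emptyDir
# defined above could be attached to a task inside a pipeline body so the
# training container gets a larger /dev/shm (a common need for PyTorch
# DataLoader workers). The mount point and the generic ContainerOp used here
# are assumptions for illustration only.
@dsl.pipeline(name='shm-volume-sketch',
              description='Hedged example of mounting the emptyDir volume')
def shm_volume_sketch():
    train_task = dsl.ContainerOp(
        name='train-with-shm',
        image='library/bash:4.4.23',
        command=['sh', '-c'],
        arguments=['df -h /dev/shm'])
    # Mount the shared-memory volume at an assumed /dev/shm mount point.
    train_task.add_pvolumes({'/dev/shm': ipc_shared_mem_volume})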
def auto_generated_pipeline(vol_shared_volume='efs'):
    pvolumes_dict = OrderedDict()
    annotations = {}

    volume = dsl.PipelineVolume(pvc=vol_shared_volume)
    pvolumes_dict['/shared_volume/'] = volume

    loaddata_task = loaddata_op(vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after()
    loaddata_task.container.working_dir = "/shared_volume/notebooks/titanic"
    loaddata_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    datapreprocessing_task = datapreprocessing_op(vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(loaddata_task)
    datapreprocessing_task.container.working_dir = "/shared_volume/notebooks/titanic"
    datapreprocessing_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    featureengineering_task = featureengineering_op(vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(datapreprocessing_task)
    featureengineering_task.container.working_dir = "/shared_volume/notebooks/titanic"
    featureengineering_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    decisiontree_task = decisiontree_op(vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(featureengineering_task)
    decisiontree_task.container.working_dir = "/shared_volume/notebooks/titanic"
    decisiontree_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    svm_task = svm_op(vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(featureengineering_task)
    svm_task.container.working_dir = "/shared_volume/notebooks/titanic"
    svm_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    naivebayes_task = naivebayes_op(vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(featureengineering_task)
    naivebayes_task.container.working_dir = "/shared_volume/notebooks/titanic"
    naivebayes_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    logisticregression_task = logisticregression_op(vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(featureengineering_task)
    logisticregression_task.container.working_dir = "/shared_volume/notebooks/titanic"
    logisticregression_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    randomforest_task = randomforest_op(vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(featureengineering_task)
    randomforest_task.container.working_dir = "/shared_volume/notebooks/titanic"
    randomforest_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    results_task = results_op(vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(randomforest_task, logisticregression_task, naivebayes_task,
               svm_task, decisiontree_task)
    results_task.container.working_dir = "/shared_volume/notebooks/titanic"
    results_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
def cnn_pipeline(dist_training=params.dist_training,
                 gpu_support=params.gpu_support):
    now = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
    workspace_name = 'cnn_txt_classifier' + now
    run_name = workspace_name + now

    prepare_data_task = prepare_data_operation(
        train_data_path, test_data_path, column_target_value,
        column_text_value, val_data_pct, gcp_bucket)

    dataset_input_values_train = {
        "description": "Train data",
        "name": train_data_path,
        "owner": "*****@*****.**",
        "data_uri": 'gs://' + train_data_path,
        "version": "v0.1",
        "query": "autogen",
        "labels": None
    }
    dataset_log_train_data_task = metadata_logger_operation(
        log_type='dataset',
        workspace_name=workspace_name,
        run_name=run_name,
        input_values=dataset_input_values_train)

    dataset_input_values_test = {
        "description": "Test data",
        "name": test_data_path,
        "owner": "*****@*****.**",
        "data_uri": 'gs://' + test_data_path,
        "version": "v0.1",
        "query": "autogen",
        "labels": None
    }
    log_test_data_task = metadata_logger_operation(
        log_type='dataset',
        workspace_name=workspace_name,
        run_name=run_name,
        input_values=dataset_input_values_test)

    prepare_emb_task = prepare_emb_operation(
        gcp_bucket, num_words, w2v_model_path, embbeding_dim,
        prepare_data_task.outputs['json_tokenizer_path'])

    generate_model_task = generate_model_operation(
        num_conv_layers, maxpool_strides, dropout, filter_sizes, num_filters,
        embbeding_dim,
        prepare_emb_task.outputs['output_emb_matrix_path'],
        prepare_emb_task.outputs['vocabulary_size_path'],
        prepare_data_task.outputs['max_sequence_lenght_path'],
        prepare_data_task.outputs['num_classes_path'])

    model_input_values_cnn = {
        "description": "CNN Keras model",
        "name": "CNN 100d - 3 convolutions - 100 filters - softmax",
        "owner": "*****@*****.**",
        "model_uri": workdir,
        "version": "v0.1",
        "hyperparameters": "epochs:" + str(epochs) +
                           " batch_size:" + str(batch_size) +
                           " dropout:" + str(dropout) +
                           " num filters:" + str(num_filters),
        "learning_rate": None,
        "layers": filter_sizes,
        "early_stop": True,
        "labels": None
    }
    log_test_data_task = metadata_logger_operation(
        log_type='model',
        workspace_name=workspace_name,
        run_name=run_name,
        input_values=model_input_values_cnn).after(generate_model_task)

    with dsl.Condition(dist_training == 'yes'):
        move_data_pvc = move_data_pvc_operation(
            generate_model_task.outputs['output_keras_model_path'],
            prepare_data_task.outputs['x_train_data_path'],
            prepare_data_task.outputs['x_val_data_path'],
            prepare_data_task.outputs['y_train_data_path'],
            prepare_data_task.outputs['y_val_data_path'],
            workdir).add_pvolumes(
                {workdir: dsl.PipelineVolume(pvc="kfpipeline-data-pvc")})

        train_time_start = datetime.datetime.utcnow()
        train_model_task_dist = train_model_operation_dist(
            epochs, batch_size, namespace, workdir).after(move_data_pvc)
        train_time_finish = datetime.datetime.utcnow()

        metrics_cnn_input_values = {
            "description": "Training metrics",
            "name": "training_metrics",
            "owner": "*****@*****.**",
            "metric_uri": workdir,
            "data_set_id": None,
            "model_id": None,
            "metrics_type": metadata.Metrics.TESTING,
            "values": {
                "train_start_time": train_time_start.strftime("%Y%m%d%H%M%S"),
                "train_finish_time": train_time_finish.strftime("%Y%m%d%H%M%S")
            },
            "early_stop": True,
            "labels": None
        }
        log_cnn_metric_data_task = metadata_logger_operation(
            log_type='metrics',
            workspace_name=workspace_name,
            run_name=run_name,
            input_values=metrics_cnn_input_values).after(
                train_model_task_dist)

        deploy_model_task = deploy_model_operation_pvc(
            namespace, workdir).after(train_model_task_dist)

    with dsl.Condition(dist_training == 'no'):
        train_time_start = datetime.datetime.utcnow()

        with dsl.Condition(gpu_support == 'no'):
            train_model_task = train_model_operation(
                generate_model_task.outputs['output_keras_model_path'],
                prepare_data_task.outputs['x_train_data_path'],
                prepare_data_task.outputs['x_val_data_path'],
                prepare_data_task.outputs['y_train_data_path'],
                prepare_data_task.outputs['y_val_data_path'],
                batch_size, epochs)
            train_time_finish = datetime.datetime.utcnow()

            metrics_cnn_input_values = {
                "description": "Training metrics",
                "name": "training_metrics",
                "owner": "*****@*****.**",
                "metric_uri": workdir,
                "data_set_id": None,
                "model_id": None,
                "metrics_type": metadata.Metrics.TESTING,
                "values": {
                    "train_start_time": train_time_start.strftime("%Y%m%d%H%M%S"),
                    "train_finish_time": train_time_finish.strftime("%Y%m%d%H%M%S")
                },
                "early_stop": True,
                "labels": None
            }
            log_cnn_metric_data_task = metadata_logger_operation(
                log_type='metrics',
                workspace_name=workspace_name,
                run_name=run_name,
                input_values=metrics_cnn_input_values).after(
                    train_model_task)

            deploy_model_task = deploy_model_operation_par(
                namespace,
                train_model_task.outputs['output_trained_model_path'])

        with dsl.Condition(gpu_support == 'yes'):
            train_model_task = train_model_operation(
                generate_model_task.outputs['output_keras_model_path'],
                prepare_data_task.outputs['x_train_data_path'],
                prepare_data_task.outputs['x_val_data_path'],
                prepare_data_task.outputs['y_train_data_path'],
                prepare_data_task.outputs['y_val_data_path'],
                batch_size, epochs).set_gpu_limit(1)
            train_time_finish = datetime.datetime.utcnow()

            metrics_cnn_input_values = {
                "description": "Training metrics",
                "name": "training_metrics",
                "owner": "*****@*****.**",
                "metric_uri": workdir,
                "data_set_id": None,
                "model_id": None,
                "metrics_type": metadata.Metrics.TESTING,
                "values": {
                    "train_start_time": train_time_start.strftime("%Y%m%d%H%M%S"),
                    "train_finish_time": train_time_finish.strftime("%Y%m%d%H%M%S")
                },
                "early_stop": True,
                "labels": None
            }
            log_cnn_metric_data_task = metadata_logger_operation(
                log_type='metrics',
                workspace_name=workspace_name,
                run_name=run_name,
                input_values=metrics_cnn_input_values).after(
                    train_model_task)

            deploy_model_task = deploy_model_operation_par(
                namespace,
                train_model_task.outputs['output_trained_model_path'])