Example No. 1
def demo_pipeline(
    fastqs=["/mnt/data/file1.fastq.gz", "/mnt/data/file2.fastq.gz"],
    leading: int = 5,
    trailing: int = 5,
    minlen: int = 80,
    sliding_window: str = "4:25",
    bar_color: str = "white",
    flier_color: str = "grey",
    plot_color: str = "darkgrid",
):
    """
    func_to_container_op simply converts the function into a factory that produces ops
    when called. add_pvolumes is a method of the op itself, so it must be applied here,
    when the op instance is actually generated, NOT above where the trim_op factory is created.
    """
    with dsl.ParallelFor(fastqs) as fastq:
        trim_task = trim_op(
            fastq=fastq,
            leading=leading,
            trailing=trailing,
            minlen=minlen,
            sliding_window=sliding_window,
        ).add_pvolumes(
            {"/mnt/data": dsl.PipelineVolume(pvc="test-data-pv-claim")})

        _ = plot_op(fastq=fastq,
                    trimmed_fastq=trim_task.outputs["trimmed_fastq"],
                    bar_color=bar_color,
                    flier_color=flier_color,
                    plot_color=plot_color).add_pvolumes({
                        "/mnt/data":
                        dsl.PipelineVolume(pvc="test-data-pv-claim")
                    })
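
A minimal sketch (not part of the original listing) of how a factory such as trim_op could be created with kfp.components.func_to_container_op, as the docstring above describes; the trim() body, its base image, and the output name are hypothetical placeholders.

from typing import NamedTuple

import kfp.components as comp


def trim(fastq: str, leading: int, trailing: int, minlen: int,
         sliding_window: str) -> NamedTuple('Outputs', [('trimmed_fastq', str)]):
    """Placeholder trimming step: derive the (assumed) trimmed FASTQ path."""
    from collections import namedtuple
    trimmed = fastq.replace('.fastq.gz', '.trimmed.fastq.gz')
    return namedtuple('Outputs', ['trimmed_fastq'])(trimmed)


# func_to_container_op returns a factory; calling trim_op(...) inside the pipeline
# creates the op instance, and only that instance exposes add_pvolumes().
trim_op = comp.func_to_container_op(trim, base_image='python:3.8')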
Example No. 2
def demo_model():
    unZipFile = dsl.ContainerOp(
        name="Upzip file",
        image=config["depend_image"],
        command=["sh", "-c"],
        arguments=["python3 " + config["unzipPath"]],
        pvolumes={config["pvolumePath"]: dsl.PipelineVolume(pvc="downloads-pvc")}
    )
def demo_op(input_notebook: str, output_notebook: str):
    return dsl.ContainerOp(
        name='papermill',
        image=image,
        command=['sh', '-c'],
        pvolumes={"Mount": dsl.PipelineVolume(pvc="xyz", name='xyz')
                  },  # set the mount path here and replace "xyz" with your PVC name
        arguments=['papermill $0 $1', input_notebook, output_notebook])
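
The comment above asks for the mount key and PVC to be replaced; a hedged sketch of the same op with those placeholders filled in (the mount path, PVC name, volume name, and image are hypothetical):

import kfp.dsl as dsl


def demo_op_filled(input_notebook: str, output_notebook: str):
    return dsl.ContainerOp(
        name='papermill',
        image='my-papermill-image:latest',  # hypothetical image
        command=['sh', '-c'],
        # concrete mount path and PVC in place of the "Mount"/"xyz" placeholders
        pvolumes={'/mnt/notebooks': dsl.PipelineVolume(pvc='notebooks-pvc',
                                                       name='notebooks-vol')},
        arguments=['papermill $0 $1', input_notebook, output_notebook])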
Example No. 4
def binding_to_volumes(
        self,
        params: DictConfig,  # inputs
        args: t.List[t.Text],
        volume_mounts: t.Dict) -> None:  # outputs
    pvc_name = self.mlcube.runner.pvc
    vol_mount_prefix = '/mnt/mlcube'
    for param_name, param_def in params.items():
        args.append(f"--{param_name}=" + vol_mount_prefix + pvc_name +
                    "/" + param_def.default)
    volume_mounts[vol_mount_prefix +
                  pvc_name] = dsl.PipelineVolume(pvc=pvc_name)
Example No. 5
def demo_model():
    createModel = dsl.ContainerOp(
        name="create model",
        image=config["model_depend_image"],
        command=["sh", "-c"],
        arguments=["python3 " + config["modelPath"]],
        pvolumes={config["pvolumePath"]: dsl.PipelineVolume(pvc="downloads-pvc")}
    )

    moveModel = dsl.ContainerOp(
        name="Move_Model_to_IPC",
        image=config["move_depend_image"],
        pvolumes={config["pvolumePath"]: createModel.pvolume}
    )
def sl_segmentation_pipeline(
        python_train_path='/deephealth/use_case_pipeline/python/skin_lesion_segmentation_training.py',
        input_dataset_yaml='/deephealth/dataset/isic_segmentation/isic_segmentation.yml',
        output_path='/deephealth/outputs',
        num_epochs: Integer() = 1,
        num_batch_size: Integer() = 10,
        output_dataset_folder='/deephealth/dataset/isic_segmentation',
        split_partition_number: Integer() = 3,
        python_inference_path='/deephealth/use_case_pipeline/python/skin_lesion_segmentation_inference.py',
        model_file_folder='/deephealth/outputs',
        is_gpu_used='no'):
    dhealth_vop = dsl.PipelineVolume(pvc='dhealth-efs-claim')
    dhealth_vop_param = {'/deephealth': dhealth_vop}

    # gpu = ''
    # if is_gpu_used == 'yes':
    #     gpu = 'yes'

    _gpu = _get_gpu_op(is_gpu_used) \
        .set_display_name("GPU input parameter")

    _train_op = dhealth_train_sl_segmentation_op(python_train_path, input_dataset_yaml, output_path, num_epochs,
                                                 num_batch_size, _gpu.outputs) \
        .after(_gpu) \
        .add_pvolumes(dhealth_vop_param) \
        .set_display_name('Training Model')

    _split_yaml_op = dhealth_splityaml_op(input_dataset_yaml, output_dataset_folder, split_partition_number) \
        .after(_train_op) \
        .add_pvolumes(dhealth_vop_param) \
        .set_display_name('Split Dataset YAML')

    model: ContainerOp = _get_latest_model_op(model_file_folder) \
        .after(_split_yaml_op) \
        .add_pvolumes({"/deephealth": dhealth_vop}) \
        .set_display_name('Load Model')

    subyamls = _get_yaml_op(output_dataset_folder) \
        .after(_split_yaml_op) \
        .add_pvolumes(dhealth_vop_param) \
        .set_display_name('Load sub-YAMLs')

    with dsl.ParallelFor(subyamls.outputs['yamlfile']) as sub_yaml:
        dhealth_inference_sl_segmentation_op(
            python_inference_path, sub_yaml, model.output, output_path, 2, is_gpu_used) \
            .add_pvolumes(dhealth_vop_param) \
            .set_display_name('Inference')
Example No. 7
def volumeop_sequential():

    step1 = dsl.ContainerOp(
        name="step1",
        image="library/bash:4.4.23",
        command=["sh", "-c"],
        arguments=["echo 1|tee /data/file1"],
        pvolumes={"/data": dsl.PipelineVolume(pvc="hostpath-pvc")})

    step2 = dsl.ContainerOp(name="step2",
                            image="library/bash:4.4.23",
                            command=["sh", "-c"],
                            arguments=["cp /data/file1 /data/file2"],
                            pvolumes={"/data": step1.pvolume})

    step3 = dsl.ContainerOp(name="step3",
                            image="library/bash:4.4.23",
                            command=["cat", "/mnt/file1", "/mnt/file2"],
                            pvolumes={
                                "/mnt": step2.pvolume,
                            })
Example No. 8
def volumeop_sequential():
    createModel = dsl.ContainerOp(
        name="create_model",
        image="tensorflow/tensorflow:2.3.0",
        command=["sh", "-c"],
        arguments=["python3 /data/models/dslModel.py"],
        file_outputs={'output': '/data/serving_status'},
        pvolumes={"/data": dsl.PipelineVolume(pvc="hostpath-pvc")})
    with dsl.Condition(createModel.output != '1'):
        create_serving = dsl.ResourceOp(
            name='create_serving',
            k8s_resource=json.loads(_CONTAINER_MANIFEST),
            action='apply')
        test_serving = dsl.ContainerOp(
            name="test_serving",
            image="yanqin/fabric:v1",
        ).after(create_serving)
    with dsl.Condition(createModel.output == '1'):
        test_serving = dsl.ContainerOp(
            name="test_serving",
            image="yanqin/fabric:v1",
        )
Example No. 9
def auto_generated_pipeline(CANDIES='20', vol_shared_volume='efs'):
    pvolumes_dict = OrderedDict()

    annotations = {}

    volume = dsl.PipelineVolume(pvc=vol_shared_volume)

    pvolumes_dict['/shared_volume/'] = volume

    sack_task = sack_op(CANDIES, vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after()
    sack_task.container.working_dir = "/shared_volume/notebooks/kale-base-example"
    sack_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    kid1_task = kid1_op(CANDIES, vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(sack_task)
    kid1_task.container.working_dir = "/shared_volume/notebooks/kale-base-example"
    kid1_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    kid2_task = kid2_op(CANDIES, vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(kid1_task)
    kid2_task.container.working_dir = "/shared_volume/notebooks/kale-base-example"
    kid2_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    kid3_task = kid3_op(CANDIES, vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(kid2_task)
    kid3_task.container.working_dir = "/shared_volume/notebooks/kale-base-example"
    kid3_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
Example No. 10
def auto_generated_pipeline(TRAIN_STEPS='2', vol_shared_volume='efs'):
    pvolumes_dict = OrderedDict()

    annotations = {}

    volume = dsl.PipelineVolume(pvc=vol_shared_volume)

    pvolumes_dict['/shared_volume/'] = volume

    dataprocessing_task = dataprocessing_op(TRAIN_STEPS, vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after()
    dataprocessing_task.container.working_dir = "/shared_volume/notebooks/pytorch-classif"
    dataprocessing_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    train_task = train_op(TRAIN_STEPS, vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(dataprocessing_task)
    train_task.container.working_dir = "/shared_volume/notebooks/pytorch-classif"
    train_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    testontest_task = testontest_op(TRAIN_STEPS, vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(train_task)
    testontest_task.container.working_dir = "/shared_volume/notebooks/pytorch-classif"
    testontest_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    testwhole_task = testwhole_op(TRAIN_STEPS, vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(testontest_task)
    testwhole_task.container.working_dir = "/shared_volume/notebooks/pytorch-classif"
    testwhole_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
Example No. 11
def flipcoin_pipeline():
    createModel = dsl.ContainerOp(
        name="create_model",
        image="yanqin/request:v1",
        file_outputs={'output': '/data/file1'},
        pvolumes={"/data": dsl.PipelineVolume(pvc="hostpath-pvc")})
Example No. 12
def build_pipeline(
    bucket='pysearchml',
    es_host='elasticsearch.elastic-system.svc.cluster.local:9200',
    force_restart=False,
    train_init_date='20160801',
    train_end_date='20160801',
    validation_init_date='20160802',
    validation_end_date='20160802',
    test_init_date='20160803',
    test_end_date='20160803',
    model_name='lambdamart0',
    ranker='lambdamart',
    index='pysearchml'
):
    pvc = dsl.PipelineVolume(pvc='pysearchml-nfs')

    prepare_op = dsl.ContainerOp(
        name='prepare env',
        image=f'gcr.io/{PROJECT_ID}/prepare_env',
        arguments=[
            f'--force_restart={force_restart}',
            f'--es_host={es_host}',
            f'--bucket={bucket}',
            f'--model_name={model_name}'
        ],
        pvolumes={'/data': pvc}
    )

    val_reg_dataset_op = dsl.ContainerOp(
        name='validation regular dataset',
        image=f'gcr.io/{PROJECT_ID}/data_validation',
        arguments=[
            f'--bucket={bucket}/validation/regular',
            f'--validation_init_date={validation_init_date}',
            f'--validation_end_date={validation_end_date}',
            f'--destination=/data/pysearchml/{model_name}/validation_regular'
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Build Regular Validation Dataset').after(prepare_op)

    val_train_dataset_op = dsl.ContainerOp(
        name='validation train dataset',
        image=f'gcr.io/{PROJECT_ID}/data_validation',
        arguments=[
            f'--bucket={bucket}/validation/train',
            f'--validation_init_date={train_init_date}',
            f'--validation_end_date={train_end_date}',
            f'--destination=/data/pysearchml/{model_name}/validation_train'
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Build Train Validation Dataset').after(prepare_op)

    val_test_dataset_op = dsl.ContainerOp(
        name='validation test dataset',
        image=f'gcr.io/{PROJECT_ID}/data_validation',
        arguments=[
            f'--bucket={bucket}/validation/test',
            f'--validation_init_date={test_init_date}',
            f'--validation_end_date={test_end_date}',
            f'--destination=/data/pysearchml/{model_name}/validation_test'
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Build Test Validation Dataset').after(prepare_op)

    train_dataset_op = dsl.ContainerOp(
        name='train dataset',
        image=f'gcr.io/{PROJECT_ID}/data_train',
        command=['python', '/train/run.py'],
        arguments=[
            f'--bucket={bucket}',
            f'--train_init_date={train_init_date}',
            f'--train_end_date={train_end_date}',
            f'--es_host={es_host}',
            f'--model_name={model_name}',
            f'--index={index}',
            f'--destination=/data/pysearchml/{model_name}/train'
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Build Training Dataset').after(prepare_op)

    katib_op = dsl.ContainerOp(
        name='pySearchML Bayesian Optimization Model',
        image=f'gcr.io/{PROJECT_ID}/model',
        command=['python', '/model/launch_katib.py'],
        arguments=[
            f'--es_host={es_host}',
            f'--model_name={model_name}',
            f'--ranker={ranker}',
            '--name=pysearchml',
            f'--train_file_path=/data/pysearchml/{model_name}/train/train_dataset.txt',
            f'--validation_files_path=/data/pysearchml/{model_name}/validation_regular',
            (f'--validation_train_files_path=/data/pysearchml/{model_name}/'
             'validation_train'),
            f'--destination=/data/pysearchml/{model_name}/'
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Katib Optimization Process').after(
        val_reg_dataset_op, val_train_dataset_op, val_test_dataset_op, train_dataset_op
    )

    post_model_op = dsl.ContainerOp(
        name='Post Best RankLib Model to ES',
        image=f'gcr.io/{PROJECT_ID}/model',
        command=['python', '/model/post_model.py'],
        arguments=[
            f'--es_host={es_host}',
            f'--model_name={model_name}',
            f'--destination=/data/pysearchml/{model_name}/best_model.txt'
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Post RankLib Model to ES').after(katib_op)

    _ = dsl.ContainerOp(
        name='Test Model',
        image=f'gcr.io/{PROJECT_ID}/model',
        command=['python', '/model/test.py'],
        arguments=[
            f'--files_path=/data/pysearchml/{model_name}/validation_test',
            f'--index={index}',
            f'--es_host={es_host}',
            f'--model_name={model_name}',
        ],
        pvolumes={'/data': pvc}
    ).set_display_name('Run Test Step').after(post_model_op)
def auto_generated_pipeline(density_p='1000000000',
                            vol_shared_volume='hostpath-pvc'):
    _kale_pvolumes_dict = OrderedDict()
    _kale_volume_step_names = []
    _kale_volume_name_parameters = []

    _kale_annotations = {}

    _kale_volume = _kfp_dsl.PipelineVolume(pvc=vol_shared_volume)

    _kale_pvolumes_dict['/shared_volume'] = _kale_volume

    _kale_volume_step_names.sort()
    _kale_volume_name_parameters.sort()

    _kale_integral1_task = _kale_integral1_op(density_p)\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after()

    #see: https://kubeflow-pipelines.readthedocs.io/en/latest/source/kfp.dsl.html#kfp.dsl.Sidecar.add_resource_limit
    #can also be: _kale_step_limits = {'nvidia.com/gpu': '1'}
    _kale_step_limits = {'cpu': '1', 'memory': '40Gi'}
    for _kale_k, _kale_v in _kale_step_limits.items():
        _kale_integral1_task.container.add_resource_limit(_kale_k, _kale_v)

    _kale_integral1_task.container.working_dir = "//shared_volume"
    _kale_integral1_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'integral1': '/integral1.html'})
    _kale_integral1_task.output_artifact_paths.update(_kale_output_artifacts)
    _kale_integral1_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_integral1_task.dependent_names +
                       _kale_volume_step_names)
    _kale_integral1_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_integral1_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))

    _kale_integral2_task = _kale_integral2_op(density_p)\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after()

    _kale_step_limits = {'cpu': '1', 'memory': '40Gi'}
    for _kale_k, _kale_v in _kale_step_limits.items():
        _kale_integral2_task.container.add_resource_limit(_kale_k, _kale_v)

    _kale_integral2_task.container.working_dir = "//shared_volume"
    _kale_integral2_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'integral2': '/integral2.html'})
    _kale_integral2_task.output_artifact_paths.update(_kale_output_artifacts)
    _kale_integral2_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_integral2_task.dependent_names +
                       _kale_volume_step_names)
    _kale_integral2_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_integral2_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))

    _kale_integral3_task = _kale_integral3_op(density_p)\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after()

    _kale_step_limits = {'cpu': '1', 'memory': '40Gi'}
    for _kale_k, _kale_v in _kale_step_limits.items():
        _kale_integral3_task.container.add_resource_limit(_kale_k, _kale_v)

    _kale_integral3_task.container.working_dir = "//shared_volume"
    _kale_integral3_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'integral3': '/integral3.html'})
    _kale_integral3_task.output_artifact_paths.update(_kale_output_artifacts)
    _kale_integral3_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_integral3_task.dependent_names +
                       _kale_volume_step_names)
    _kale_integral3_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_integral3_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))

    _kale_integral4_task = _kale_integral4_op(density_p)\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after()

    _kale_step_limits = {'cpu': '1', 'memory': '40Gi'}
    for _kale_k, _kale_v in _kale_step_limits.items():
        _kale_integral4_task.container.add_resource_limit(_kale_k, _kale_v)

    _kale_integral4_task.container.working_dir = "//shared_volume"
    _kale_integral4_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'integral4': '/integral4.html'})
    _kale_integral4_task.output_artifact_paths.update(_kale_output_artifacts)
    _kale_integral4_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_integral4_task.dependent_names +
                       _kale_volume_step_names)
    _kale_integral4_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_integral4_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))

    _kale_integral5_task = _kale_integral5_op(density_p)\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after()

    _kale_step_limits = {'cpu': '1', 'memory': '40Gi'}
    for _kale_k, _kale_v in _kale_step_limits.items():
        _kale_integral5_task.container.add_resource_limit(_kale_k, _kale_v)

    _kale_integral5_task.container.working_dir = "//shared_volume"
    _kale_integral5_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'integral5': '/integral5.html'})
    _kale_integral5_task.output_artifact_paths.update(_kale_output_artifacts)
    _kale_integral5_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_integral5_task.dependent_names +
                       _kale_volume_step_names)
    _kale_integral5_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_integral5_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))

    _kale_integral6_task = _kale_integral6_op(density_p)\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after()

    _kale_step_limits = {'cpu': '1', 'memory': '40Gi'}
    for _kale_k, _kale_v in _kale_step_limits.items():
        _kale_integral6_task.container.add_resource_limit(_kale_k, _kale_v)

    _kale_integral6_task.container.working_dir = "//shared_volume"
    _kale_integral6_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'integral6': '/integral6.html'})
    _kale_integral6_task.output_artifact_paths.update(_kale_output_artifacts)
    _kale_integral6_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_integral6_task.dependent_names +
                       _kale_volume_step_names)
    _kale_integral6_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_integral6_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))
Example No. 14
def auto_generated_pipeline(date_of_processing='02_06_2021',
                            dir_mask_specie='Ponca_DV',
                            dir_specie='Ponca_DV_loc',
                            dir_years='forest_jEquihua_mar',
                            file_mask_specie='poncamask.tif',
                            file_specie='poncadav2',
                            specie='pan_onca',
                            vol_shared_volume='hostpath-pvc'):
    _kale_pvolumes_dict = OrderedDict()
    _kale_volume_step_names = []
    _kale_volume_name_parameters = []

    _kale_annotations = {}

    _kale_volume = _kfp_dsl.PipelineVolume(pvc=vol_shared_volume)

    _kale_pvolumes_dict['/shared_volume'] = _kale_volume

    _kale_volume_step_names.sort()
    _kale_volume_name_parameters.sort()

    _kale_downloadfroms3_task = _kale_downloadfroms3_op()\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after()
    _kale_downloadfroms3_task.container.working_dir = "//shared_volume/kube_sipecam_playground/hsi/notebooks"
    _kale_downloadfroms3_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'downloadfroms3': '/downloadfroms3.html'})
    _kale_downloadfroms3_task.output_artifact_paths.update(
        _kale_output_artifacts)
    _kale_downloadfroms3_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_downloadfroms3_task.dependent_names +
                       _kale_volume_step_names)
    _kale_downloadfroms3_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_downloadfroms3_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))

    _kale_readdatainput_task = _kale_readdatainput_op(dir_mask_specie, dir_specie, file_mask_specie, file_specie)\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after(_kale_downloadfroms3_task)
    _kale_readdatainput_task.container.working_dir = "//shared_volume/kube_sipecam_playground/hsi/notebooks"
    _kale_readdatainput_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'readdatainput': '/readdatainput.html'})
    _kale_readdatainput_task.output_artifact_paths.update(
        _kale_output_artifacts)
    _kale_readdatainput_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_readdatainput_task.dependent_names +
                       _kale_volume_step_names)
    _kale_readdatainput_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_readdatainput_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))

    _kale_reproject_task = _kale_reproject_op()\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after(_kale_readdatainput_task)
    _kale_reproject_task.container.working_dir = "//shared_volume/kube_sipecam_playground/hsi/notebooks"
    _kale_reproject_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'reproject': '/reproject.html'})
    _kale_reproject_task.output_artifact_paths.update(_kale_output_artifacts)
    _kale_reproject_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_reproject_task.dependent_names +
                       _kale_volume_step_names)
    _kale_reproject_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_reproject_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))

    _kale_createtestdata_task = _kale_createtestdata_op(dir_years)\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after(_kale_reproject_task)
    _kale_createtestdata_task.container.working_dir = "//shared_volume/kube_sipecam_playground/hsi/notebooks"
    _kale_createtestdata_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'createtestdata': '/createtestdata.html'})
    _kale_createtestdata_task.output_artifact_paths.update(
        _kale_output_artifacts)
    _kale_createtestdata_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_createtestdata_task.dependent_names +
                       _kale_volume_step_names)
    _kale_createtestdata_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_createtestdata_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))

    _kale_maskandextract_task = _kale_maskandextract_op()\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after(_kale_createtestdata_task)
    _kale_maskandextract_task.container.working_dir = "//shared_volume/kube_sipecam_playground/hsi/notebooks"
    _kale_maskandextract_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'maskandextract': '/maskandextract.html'})
    _kale_maskandextract_task.output_artifact_paths.update(
        _kale_output_artifacts)
    _kale_maskandextract_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_maskandextract_task.dependent_names +
                       _kale_volume_step_names)
    _kale_maskandextract_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_maskandextract_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))

    _kale_bestmodel_task = _kale_bestmodel_op()\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after(_kale_maskandextract_task)
    _kale_bestmodel_task.container.working_dir = "//shared_volume/kube_sipecam_playground/hsi/notebooks"
    _kale_bestmodel_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'bestmodel': '/bestmodel.html'})
    _kale_bestmodel_task.output_artifact_paths.update(_kale_output_artifacts)
    _kale_bestmodel_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_bestmodel_task.dependent_names +
                       _kale_volume_step_names)
    _kale_bestmodel_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_bestmodel_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))

    _kale_temporalprojection_task = _kale_temporalprojection_op(date_of_processing, specie)\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after(_kale_bestmodel_task)
    _kale_temporalprojection_task.container.working_dir = "//shared_volume/kube_sipecam_playground/hsi/notebooks"
    _kale_temporalprojection_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update(
        {'temporalprojection': '/temporalprojection.html'})
    _kale_temporalprojection_task.output_artifact_paths.update(
        _kale_output_artifacts)
    _kale_temporalprojection_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_temporalprojection_task.dependent_names +
                       _kale_volume_step_names)
    _kale_temporalprojection_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_temporalprojection_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))

    _kale_uploadtos3_task = _kale_uploadtos3_op(date_of_processing)\
        .add_pvolumes(_kale_pvolumes_dict)\
        .after(_kale_temporalprojection_task)
    _kale_uploadtos3_task.container.working_dir = "//shared_volume/kube_sipecam_playground/hsi/notebooks"
    _kale_uploadtos3_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
    _kale_output_artifacts = {}
    _kale_output_artifacts.update(
        {'mlpipeline-ui-metadata': '/tmp/mlpipeline-ui-metadata.json'})
    _kale_output_artifacts.update({'uploadtos3': '/uploadtos3.html'})
    _kale_uploadtos3_task.output_artifact_paths.update(_kale_output_artifacts)
    _kale_uploadtos3_task.add_pod_label(
        "pipelines.kubeflow.org/metadata_written", "true")
    _kale_dep_names = (_kale_uploadtos3_task.dependent_names +
                       _kale_volume_step_names)
    _kale_uploadtos3_task.add_pod_annotation(
        "kubeflow-kale.org/dependent-templates", json.dumps(_kale_dep_names))
    if _kale_volume_name_parameters:
        _kale_uploadtos3_task.add_pod_annotation(
            "kubeflow-kale.org/volume-name-parameters",
            json.dumps(_kale_volume_name_parameters))
Example No. 15
import kfp.dsl as dsl
import kfp.gcp as gcp
import kfp.components as comp
import os
from kubernetes import client as k8s_client

feature_extraction_op = comp.load_component_from_file(
    os.path.join("./components/feature-extractor/",
                 'feature_extractor_component.yaml'))

ipc_shared_mem_volume = dsl.PipelineVolume(name='shm-vol',
                                           empty_dir={'medium': 'Memory'})

train_op = comp.load_component_from_file(
    os.path.join("./components/train/", 'train_component.yaml'))


@dsl.pipeline(name='VoxCeleb Baseline Reproduction Pipeline',
              description='Train baseline models')
# Define a pipeline and create a task from a component
# @TODO abstract code shared with test_full_pipeline.py
def baseline_repro_pipeline(
    data_bucket: str = 'voxsrc-2020-voxceleb-v4',
    test_list: str = 'vox1_full.txt',
    # @note test_utterances_list is in the same format as train_list, but for
    #       the test data. Whereas test_list contains utterance pairs for
    #       evaluation
    test_utterances_list: str = 'vox1_full_utterances.txt',
    train_list: str = 'vox2_full.txt',
    test_path: str = 'vox1_full.tar.gz',
    train_path: str = 'vox2_full.tar.gz',
Example No. 16
def auto_generated_pipeline(vol_shared_volume='efs'):
    pvolumes_dict = OrderedDict()

    annotations = {}

    volume = dsl.PipelineVolume(pvc=vol_shared_volume)

    pvolumes_dict['/shared_volume/'] = volume

    loaddata_task = loaddata_op(vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after()
    loaddata_task.container.working_dir = "/shared_volume/notebooks/titanic"
    loaddata_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    datapreprocessing_task = datapreprocessing_op(vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(loaddata_task)
    datapreprocessing_task.container.working_dir = "/shared_volume/notebooks/titanic"
    datapreprocessing_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    featureengineering_task = featureengineering_op(vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(datapreprocessing_task)
    featureengineering_task.container.working_dir = "/shared_volume/notebooks/titanic"
    featureengineering_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    decisiontree_task = decisiontree_op(vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(featureengineering_task)
    decisiontree_task.container.working_dir = "/shared_volume/notebooks/titanic"
    decisiontree_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    svm_task = svm_op(vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(featureengineering_task)
    svm_task.container.working_dir = "/shared_volume/notebooks/titanic"
    svm_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    naivebayes_task = naivebayes_op(vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(featureengineering_task)
    naivebayes_task.container.working_dir = "/shared_volume/notebooks/titanic"
    naivebayes_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    logisticregression_task = logisticregression_op(vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(featureengineering_task)
    logisticregression_task.container.working_dir = "/shared_volume/notebooks/titanic"
    logisticregression_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    randomforest_task = randomforest_op(vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(featureengineering_task)
    randomforest_task.container.working_dir = "/shared_volume/notebooks/titanic"
    randomforest_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))

    results_task = results_op(vol_shared_volume)\
        .add_pvolumes(pvolumes_dict)\
        .after(randomforest_task, logisticregression_task, naivebayes_task, svm_task, decisiontree_task)
    results_task.container.working_dir = "/shared_volume/notebooks/titanic"
    results_task.container.set_security_context(
        k8s_client.V1SecurityContext(run_as_user=0))
Example No. 17
    def cnn_pipeline(dist_training=params.dist_training,
                     gpu_support=params.gpu_support):
        now = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
        workspace_name = 'cnn_txt_classifier' + now
        run_name = workspace_name + now

        prepare_data_task = prepare_data_operation(train_data_path,
                                                   test_data_path,
                                                   column_target_value,
                                                   column_text_value,
                                                   val_data_pct, gcp_bucket)
        dataset_input_values_train = {
            "description": "Train data",
            "name": train_data_path,
            "owner": "*****@*****.**",
            "data_uri": 'gs://' + "train_data_path",
            "version": "v0.1",
            "query": "autogen",
            "labels": None
        }
        dataset_log_train_data_task = metadata_logger_operation(
            log_type='dataset',
            workspace_name=workspace_name,
            run_name=run_name,
            input_values=dataset_input_values_train)

        dataset_input_values_test = {
            "description": "Test data",
            "name": test_data_path,
            "owner": "*****@*****.**",
            "data_uri": 'gs://' + "test_data_path",
            "version": "v0.1",
            "query": "autogen",
            "labels": None
        }
        log_test_data_task = metadata_logger_operation(
            log_type='dataset',
            workspace_name=workspace_name,
            run_name=run_name,
            input_values=dataset_input_values_test)

        prepare_emb_task = prepare_emb_operation(
            gcp_bucket, num_words, w2v_model_path, embbeding_dim,
            prepare_data_task.outputs['json_tokenizer_path'])

        generate_model_task = generate_model_operation(
            num_conv_layers, maxpool_strides, dropout, filter_sizes,
            num_filters, embbeding_dim,
            prepare_emb_task.outputs['output_emb_matrix_path'],
            prepare_emb_task.outputs['vocabulary_size_path'],
            prepare_data_task.outputs['max_sequence_lenght_path'],
            prepare_data_task.outputs['num_classes_path'])

        model_input_values_cnn = {
            "description":
            "CNN Keras model",
            "name":
            "CNN 100d - 3 convolutions - 100 filters - softmax",
            "owner":
            "*****@*****.**",
            "model_uri":
            workdir,
            "version":
            "v0.1",
            "hyperparameters":
            "epochs:" + str(epochs) + " batch_size:" + str(batch_size) +
            " dropout:" + str(dropout) + "num filters:" + str(num_filters),
            "learning_rate":
            None,
            "layers":
            filter_sizes,
            "early_stop":
            True,
            "labels":
            None
        }

        log_test_data_task = metadata_logger_operation(
            log_type='model',
            workspace_name=workspace_name,
            run_name=run_name,
            input_values=model_input_values_cnn).after(generate_model_task)

        with dsl.Condition(dist_training == 'yes'):
            move_data_pvc = move_data_pvc_operation(
                generate_model_task.outputs['output_keras_model_path'],
                prepare_data_task.outputs['x_train_data_path'],
                prepare_data_task.outputs['x_val_data_path'],
                prepare_data_task.outputs['y_train_data_path'],
                prepare_data_task.outputs['y_val_data_path'],
                workdir).add_pvolumes(
                    {workdir: dsl.PipelineVolume(pvc="kfpipeline-data-pvc")})
            train_time_start = datetime.datetime.utcnow()
            train_model_task_dist = train_model_operation_dist(
                epochs, batch_size, namespace, workdir).after(move_data_pvc)
            train_time_finish = datetime.datetime.utcnow()
            metrics_cnn_input_values = {
                "description": "Training metrics",
                "name": "training_metrics",
                "owner": "*****@*****.**",
                "metric_uri": workdir,
                "data_set_id": None,
                "model_id": None,
                "metrics_type": metadata.Metrics.TESTING,
                "values": {
                    "train_start_time":
                    train_time_start.strftime("%Y%m%d%H%M%S"),
                    "train_finish_time":
                    train_time_finish.strftime("%Y%m%d%H%M%S")
                },
                "early_stop": True,
                "labels": None
            }
            log_cnn_metric_data_task = metadata_logger_operation(
                log_type='metrics',
                workspace_name=workspace_name,
                run_name=run_name,
                input_values=metrics_cnn_input_values).after(
                    train_model_task_dist)
            deploy_model_task = deploy_model_operation_pvc(
                namespace, workdir).after(train_model_task_dist)

        with dsl.Condition(dist_training == 'no'):
            train_time_start = datetime.datetime.utcnow()
            with dsl.Condition(gpu_support == 'no'):
                train_model_task = train_model_operation(
                    generate_model_task.outputs['output_keras_model_path'],
                    prepare_data_task.outputs['x_train_data_path'],
                    prepare_data_task.outputs['x_val_data_path'],
                    prepare_data_task.outputs['y_train_data_path'],
                    prepare_data_task.outputs['y_val_data_path'], batch_size,
                    epochs)
                train_time_finish = datetime.datetime.utcnow()
                metrics_cnn_input_values = {
                    "description": "Training metrics",
                    "name": "training_metrics",
                    "owner": "*****@*****.**",
                    "metric_uri": workdir,
                    "data_set_id": None,
                    "model_id": None,
                    "metrics_type": metadata.Metrics.TESTING,
                    "values": {
                        "train_start_time":
                        train_time_start.strftime("%Y%m%d%H%M%S"),
                        "train_finish_time":
                        train_time_finish.strftime("%Y%m%d%H%M%S")
                    },
                    "early_stop": True,
                    "labels": None
                }
                log_cnn_metric_data_task = metadata_logger_operation(
                    log_type='metrics',
                    workspace_name=workspace_name,
                    run_name=run_name,
                    input_values=metrics_cnn_input_values).after(
                        train_model_task)
                deploy_model_task = deploy_model_operation_par(
                    namespace,
                    train_model_task.outputs['output_trained_model_path'])
            with dsl.Condition(gpu_support == 'yes'):
                train_model_task = train_model_operation(
                    generate_model_task.outputs['output_keras_model_path'],
                    prepare_data_task.outputs['x_train_data_path'],
                    prepare_data_task.outputs['x_val_data_path'],
                    prepare_data_task.outputs['y_train_data_path'],
                    prepare_data_task.outputs['y_val_data_path'], batch_size,
                    epochs).set_gpu_limit(1)
                train_time_finish = datetime.datetime.utcnow()
                metrics_cnn_input_values = {
                    "description": "Training metrics",
                    "name": "training_metrics",
                    "owner": "*****@*****.**",
                    "metric_uri": workdir,
                    "data_set_id": None,
                    "model_id": None,
                    "metrics_type": metadata.Metrics.TESTING,
                    "values": {
                        "train_start_time":
                        train_time_start.strftime("%Y%m%d%H%M%S"),
                        "train_finish_time":
                        train_time_finish.strftime("%Y%m%d%H%M%S")
                    },
                    "early_stop": True,
                    "labels": None
                }
                log_cnn_metric_data_task = metadata_logger_operation(
                    log_type='metrics',
                    workspace_name=workspace_name,
                    run_name=run_name,
                    input_values=metrics_cnn_input_values).after(
                        train_model_task)
                deploy_model_task = deploy_model_operation_par(
                    namespace,
                    train_model_task.outputs['output_trained_model_path'])