def mnist_pipeline(model_export_dir='gs://your-bucket/export',
                   train_steps='200',
                   learning_rate='0.01',
                   batch_size='100',
                   pvc_name=''):
    """Pipeline with three stages:
    1. train an MNIST classifier
    2. deploy a tf-serving instance to the cluster
    3. deploy a web-ui to interact with it
    """
    train = dsl.ContainerOp(
        name='train',
        image='gcr.io/kubeflow-examples/mnist/model:v20190304-v0.2-176-g15d997b',
        arguments=[
            "/opt/model.py",
            "--tf-export-dir", model_export_dir,
            "--tf-train-steps", train_steps,
            "--tf-batch-size", batch_size,
            "--tf-learning-rate", learning_rate
        ]
    )

    serve_args = [
        '--model-export-path', model_export_dir,
        '--server-name', "mnist-service"
    ]
    if platform != 'GCP':
        serve_args.extend([
            '--cluster-name', "mnist-pipeline",
            '--pvc-name', pvc_name
        ])

    serve = dsl.ContainerOp(
        name='serve',
        image='gcr.io/ml-pipeline/ml-pipeline-kubeflow-deployer:'
              '7775692adf28d6f79098e76e839986c9ee55dd61',
        arguments=serve_args
    )
    serve.after(train)

    webui_args = [
        '--image', 'gcr.io/kubeflow-examples/mnist/web-ui:'
                   'v20190304-v0.2-176-g15d997b-pipelines',
        '--name', 'web-ui',
        '--container-port', '5000',
        '--service-port', '80',
        '--service-type', "LoadBalancer"
    ]
    if platform != 'GCP':
        webui_args.extend([
            '--cluster-name', "mnist-pipeline"
        ])

    web_ui = dsl.ContainerOp(
        name='web-ui',
        image='gcr.io/kubeflow-examples/mnist/deploy-service:latest',
        arguments=webui_args
    )
    web_ui.after(serve)

    steps = [train, serve, web_ui]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(onprem.mount_pvc(pvc_name, 'local-storage', '/mnt'))
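# Added usage sketch (not from the original sources): pipeline functions like
# mnist_pipeline above are normally wrapped in @dsl.pipeline and compiled to an
# Argo workflow package with the KFP v1 SDK. The pipeline name, description,
# body and output filename below are illustrative assumptions only.
import kfp
from kfp import dsl


@dsl.pipeline(name='mnist-example', description='Illustrative wrapper only')
def _example_pipeline(model_export_dir: str = 'gs://your-bucket/export'):
    # A single placeholder step; a real pipeline would declare all of its ops here.
    dsl.ContainerOp(
        name='train',
        image='gcr.io/kubeflow-examples/mnist/model:v20190304-v0.2-176-g15d997b',
        arguments=['/opt/model.py', '--tf-export-dir', model_export_dir],
    )


if __name__ == '__main__':
    # Compile the decorated function into a workflow spec that can be uploaded
    # to a Kubeflow Pipelines instance.
    kfp.compiler.Compiler().compile(_example_pipeline, '_example_pipeline.yaml')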
def __new__(
    cls,
    component_name: Text,
    input_dict: Dict[Text, Any],
    output_dict: Dict[Text, List[types.Artifact]],
    exec_properties: Dict[Text, Any],
    executor_class_path: Text,
    pipeline_properties: PipelineProperties,
):
    """Creates a new component.

    Args:
      component_name: TFX component name.
      input_dict: Dictionary of input names to TFX types, or
        kfp.dsl.PipelineParam representing input parameters.
      output_dict: Dictionary of output names to List of TFX types.
      exec_properties: Execution properties.
      executor_class_path: <module>.<class> for Python class of executor.
      pipeline_properties: Pipeline level properties shared by all components.

    Returns:
      Newly constructed TFX Kubeflow component instance.
    """
    outputs = output_dict.keys()
    file_outputs = {
        output: '/output/ml_metadata/{}'.format(output) for output in outputs
    }

    for k, v in pipeline_properties.exec_properties.items():
        exec_properties[k] = v

    arguments = [
        '--exec_properties',
        json.dumps(exec_properties),
        '--outputs',
        artifact_utils.jsonify_artifact_dict(output_dict),
        '--executor_class_path',
        executor_class_path,
        component_name,
    ]

    for k, v in input_dict.items():
        if isinstance(v, float) or isinstance(v, int):
            v = str(v)
        arguments.append('--{}'.format(k))
        arguments.append(v)

    container_op = dsl.ContainerOp(
        name=component_name,
        command=_COMMAND,
        image=pipeline_properties.tfx_image,
        arguments=arguments,
        file_outputs=file_outputs,
    )

    # Add the Argo workflow ID to the container's environment variable so it
    # can be used to uniquely place pipeline outputs under the pipeline_root.
    field_path = "metadata.labels['workflows.argoproj.io/workflow']"
    container_op.add_env_variable(
        k8s_client.V1EnvVar(
            name='WORKFLOW_ID',
            value_from=k8s_client.V1EnvVarSource(
                field_ref=k8s_client.V1ObjectFieldSelector(
                    field_path=field_path))))

    named_outputs = {output: container_op.outputs[output] for output in outputs}

    # This allows user code to refer to the ContainerOp 'op' output named 'x'
    # as op.outputs.x
    component_outputs = type('Output', (), named_outputs)

    return type(component_name, (BaseComponent,), {
        'container_op': container_op,
        'outputs': component_outputs
    })
def some_op():
    return dsl.ContainerOp(
        name='sleep',
        image='busybox',
        command=['sleep 1'],
    )
def op():
    return dsl.ContainerOp(name='Some component name', image='image')
def init_container_pipeline():
    dsl.ContainerOp(
        name='hello',
        image='alpine:latest',
        command=['echo', 'hello'],
        init_containers=[echo])
def mlflow_pipeline():
    ml = dsl.ContainerOp(
        name="training pipeline",
        image="lego0142/pytorch_classifier:1.1",
    )
def taxi_cab_classification(
        output,
        project,
        column_names='gs://ml-pipeline-playground/tfx/taxi-cab-classification/column-names.json',
        key_columns='trip_start_timestamp',
        train='gs://ml-pipeline-playground/tfx/taxi-cab-classification/train.csv',
        evaluation='gs://ml-pipeline-playground/tfx/taxi-cab-classification/eval.csv',
        mode='local',
        preprocess_module='gs://ml-pipeline-playground/tfx/taxi-cab-classification/preprocessing.py',
        learning_rate=0.1,
        hidden_layer_size='1500',
        steps=3000,
        analyze_slice_column='trip_start_hour'):
    output_template = str(output) + '/{{workflow.uid}}/{{pod.name}}/data'
    target_lambda = """lambda x: (x['target'] > x['fare'] * 0.2)"""
    target_class_lambda = """lambda x: 1 if (x['target'] > x['fare'] * 0.2) else 0"""
    tf_server_name = 'taxi-cab-classification-model-{{workflow.uid}}'

    if platform != 'GCP':
        vop = dsl.VolumeOp(name="create_pvc",
                           resource_name="pipeline-pvc",
                           modes=dsl.VOLUME_MODE_RWM,
                           size="1Gi")
        checkout = dsl.ContainerOp(
            name="checkout",
            image="alpine/git:latest",
            command=["git", "clone", "https://github.com/kubeflow/pipelines.git",
                     str(output) + "/pipelines"],
        ).apply(onprem.mount_pvc(vop.outputs["name"], 'local-storage', output))
        checkout.after(vop)

    validation = dataflow_tf_data_validation_op(
        inference_data=train,
        validation_data=evaluation,
        column_names=column_names,
        key_columns=key_columns,
        gcp_project=project,
        run_mode=mode,
        validation_output=output_template,
    )
    if platform != 'GCP':
        validation.after(checkout)

    preprocess = dataflow_tf_transform_op(
        training_data_file_pattern=train,
        evaluation_data_file_pattern=evaluation,
        schema=validation.outputs['schema'],
        gcp_project=project,
        run_mode=mode,
        preprocessing_module=preprocess_module,
        transformed_data_dir=output_template)

    training = tf_train_op(
        transformed_data_dir=preprocess.output,
        schema=validation.outputs['schema'],
        learning_rate=learning_rate,
        hidden_layer_size=hidden_layer_size,
        steps=steps,
        target='tips',
        preprocessing_module=preprocess_module,
        training_output_dir=output_template)

    analysis = dataflow_tf_model_analyze_op(
        model=training.output,
        evaluation_data=evaluation,
        schema=validation.outputs['schema'],
        gcp_project=project,
        run_mode=mode,
        slice_columns=analyze_slice_column,
        analysis_results_dir=output_template)

    prediction = dataflow_tf_predict_op(
        data_file_pattern=evaluation,
        schema=validation.outputs['schema'],
        target_column='tips',
        model=training.output,
        run_mode=mode,
        gcp_project=project,
        predictions_dir=output_template)

    cm = confusion_matrix_op(
        predictions=prediction.output,
        target_lambda=target_lambda,
        output_dir=output_template)

    roc = roc_op(
        predictions_dir=prediction.output,
        target_lambda=target_class_lambda,
        output_dir=output_template)

    if platform == 'GCP':
        deploy = kubeflow_deploy_op(
            model_dir=str(training.output) + '/export/export',
            server_name=tf_server_name)
    else:
        deploy = kubeflow_deploy_op(
            cluster_name=project,
            model_dir=str(training.output) + '/export/export',
            pvc_name=vop.outputs["name"],
            server_name=tf_server_name)

    steps = [validation, preprocess, training, analysis, prediction, cm, roc, deploy]
    for step in steps:
        if platform == 'GCP':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.apply(onprem.mount_pvc(vop.outputs["name"], 'local-storage', output))
def petcharts_pipeline_reptile(
        ACCESSKEY,
        SECRETKEY,
        BUCKET: str = "petcharts",
        CORPUSDATA: str = "corpus.txt",
        TRAINDATA: str = "unlabel_train1.csv",
        TESTDATA: str = "unlabel_test1.csv",
        TOKENIZER: str = "tokenizer.zip",
        PRETRAINED: str = "roberta.zip",
        TRANSFER: str = "roberta.transfer.zip",
        DOWNSTREAM: str = "reptile.zip",
        VOCABSIZE: int = 32000,
        CLASSES: int = 20,
        EPOCHS0: int = 24,
        EPOCHS1: int = 24,
        CONTRA_EPOCHS: int = 40,
        BATCHSIZE: int = 32,
        LOGDIR: str = "s3://petcharts/logs",
        LOGSTEPS: int = 500,
        SAVESTEPS: int = 10000,
        WEIGHTDECAY0: float = 0.1,
        WEIGHTDECAY1: float = 0.01,
        SCHEDULER0: str = "linear",
        SCHEDULER1: str = "linear",
        REGISTRYURL: str = "192.168.6.32:5000",
        HOSTURL: str = "http://minio-service.default.svc.cluster.local:9000",
):
    downstream = (dsl.ContainerOp(
        name="training transfer learning",
        image="{}/petclassify:reptile".format(REGISTRYURL),
        arguments=[
            "--host", HOSTURL,
            "--accesskey", ACCESSKEY,
            "--secretkey", SECRETKEY,
            "--bucket", BUCKET,
            "--pretrained", PRETRAINED,
            "--transfer", TRANSFER,
            "--downstream", DOWNSTREAM,
            "--classes", CLASSES,
            "--epochs", EPOCHS1,
            "--batchsize", BATCHSIZE,
            "--weightdecay", WEIGHTDECAY1,
            "--scheduler", SCHEDULER1,
            "--logdir", "{}.{}".format(LOGDIR, "downstream"),
        ],
        output_artifact_paths={
            "mlpipeline-ui-metadata": "/opt/mlpipeline-ui-metadata.json"
        },
    ).add_env_variable(V1EnvVar(name="S3_ENDPOINT", value=HOSTURL))
     .add_env_variable(V1EnvVar(name="S3_USE_HTTPS", value="0"))
     .add_env_variable(V1EnvVar(name="S3_VERIFY_SSL", value="0"))
     .add_env_variable(V1EnvVar(name="AWS_ACCESS_KEY_ID", value=ACCESSKEY))
     .add_env_variable(V1EnvVar(name="AWS_SECRET_ACCESS_KEY", value=SECRETKEY)))

    downstream.set_gpu_limit(1)
    downstream.add_node_selector_constraint("gpu-accelerator", "nvidia-highend")
    downstream.container.set_image_pull_policy("Always")
def workflow1(
        input_handle_eval: dsl.PipelineParam = dsl.PipelineParam(
            name='input-handle-eval',
            value='gs://aju-dev-demos-codelabs/KF/taxidata/eval/data.csv'),
        input_handle_train: dsl.PipelineParam = dsl.PipelineParam(
            name='input-handle-train',
            value='gs://aju-dev-demos-codelabs/KF/taxidata/train/data.csv'),
        outfile_prefix_eval: dsl.PipelineParam = dsl.PipelineParam(
            name='outfile-prefix-eval', value='eval_transformed'),
        outfile_prefix_train: dsl.PipelineParam = dsl.PipelineParam(
            name='outfile-prefix-train', value='train_transformed'),
        train_steps: dsl.PipelineParam = dsl.PipelineParam(name='train-steps', value=10000),
        project: dsl.PipelineParam = dsl.PipelineParam(name='project', value='YOUR_PROJECT_HERE'),
        working_dir: dsl.PipelineParam = dsl.PipelineParam(name='working-dir', value='YOUR_GCS_DIR_HERE'),
        tft_setup_file: dsl.PipelineParam = dsl.PipelineParam(name='tft-setup-file', value='/ml/transform/setup.py'),
        tfma_setup_file: dsl.PipelineParam = dsl.PipelineParam(name='tfma-setup-file', value='/ml/analysis/setup.py'),
        workers: dsl.PipelineParam = dsl.PipelineParam(name='workers', value=1),
        pss: dsl.PipelineParam = dsl.PipelineParam(name='pss', value=1),
        max_rows: dsl.PipelineParam = dsl.PipelineParam(name='max-rows', value=10000),
        ts1: dsl.PipelineParam = dsl.PipelineParam(name='ts1', value=''),
        ts2: dsl.PipelineParam = dsl.PipelineParam(name='ts2', value=''),
        preprocessing_module1: dsl.PipelineParam = dsl.PipelineParam(
            name='preprocessing-module1',
            value='gs://aju-dev-demos-codelabs/KF/taxi-preproc/preprocessing.py'),
        preprocessing_module2: dsl.PipelineParam = dsl.PipelineParam(
            name='preprocessing-module2',
            value='gs://aju-dev-demos-codelabs/KF/taxi-preproc/preprocessing2.py'),
        preprocess_mode: dsl.PipelineParam = dsl.PipelineParam(name='preprocess-mode', value='local'),
        tfma_mode: dsl.PipelineParam = dsl.PipelineParam(name='tfma-mode', value='local')):

    tfteval = dsl.ContainerOp(
        name='tft-eval',
        image='gcr.io/google-samples/ml-pipeline-dataflow-tftbq-taxi',
        arguments=[
            "--input_handle", input_handle_eval,
            "--outfile_prefix", outfile_prefix_eval,
            "--working_dir", '%s/%s/tft-eval' % (working_dir, '{{workflow.name}}'),
            "--project", project,
            "--mode", preprocess_mode,
            "--setup_file", tft_setup_file,
            "--max-rows", '5000',
            "--ts1", ts1,
            "--ts2", ts2,
            "--stage", "eval",
            "--preprocessing-module", preprocessing_module1
        ]
        # file_outputs = {'transformed': '/output.txt'}
    ).apply(gcp.use_gcp_secret('user-gcp-sa'))

    tfttrain = dsl.ContainerOp(
        name='tft-train',
        image='gcr.io/google-samples/ml-pipeline-dataflow-tftbq-taxi',
        arguments=[
            "--input_handle", input_handle_train,
            "--outfile_prefix", outfile_prefix_train,
            "--working_dir", '%s/%s/tft-train' % (working_dir, '{{workflow.name}}'),
            "--project", project,
            "--mode", preprocess_mode,
            "--setup_file", tft_setup_file,
            "--max_rows", max_rows,
            "--ts1", ts1,
            "--ts2", ts2,
            "--stage", "train",
            "--preprocessing_module", preprocessing_module1
        ]).apply(gcp.use_gcp_secret('user-gcp-sa'))

    tfteval2 = dsl.ContainerOp(
        name='tft-eval2',
        image='gcr.io/google-samples/ml-pipeline-dataflow-tftbq-taxi',
        arguments=[
            "--input_handle", input_handle_eval,
            "--outfile_prefix", outfile_prefix_eval,
            "--working_dir", '%s/%s/tft-eval2' % (working_dir, '{{workflow.name}}'),
            "--project", project,
            "--mode", preprocess_mode,
            "--setup_file", tft_setup_file,
            "--max_rows", '5000',
            "--ts1", ts1,
            "--ts2", ts2,
            "--stage", "eval",
            "--preprocessing_module", preprocessing_module2
        ]).apply(gcp.use_gcp_secret('user-gcp-sa'))

    tfttrain2 = dsl.ContainerOp(
        name='tft-train2',
        image='gcr.io/google-samples/ml-pipeline-dataflow-tftbq-taxi',
        arguments=[
            "--input_handle", input_handle_train,
            "--outfile_prefix", outfile_prefix_train,
            "--working_dir", '%s/%s/tft-train2' % (working_dir, '{{workflow.name}}'),
            "--project", project,
            "--mode", preprocess_mode,
            "--setup_file", tft_setup_file,
            "--max_rows", max_rows,
            "--ts1", ts1,
            "--ts2", ts2,
            "--stage", "train",
            "--preprocessing_module", preprocessing_module2
        ]).apply(gcp.use_gcp_secret('user-gcp-sa'))

    train = dsl.ContainerOp(
        name='train',
        image='gcr.io/google-samples/ml-pipeline-kubeflow-tf-taxi',
        arguments=[
            "--tf-transform-dir", '%s/%s/tft-train' % (working_dir, '{{workflow.name}}'),
            "--output-dir", '%s/%s/tf' % (working_dir, '{{workflow.name}}'),
            "--working-dir", '%s/%s/tf/serving_model_dir' % (working_dir, '{{workflow.name}}'),
            "--job-dir", '%s/%s/tf' % (working_dir, '{{workflow.name}}'),
            "--train-files-dir", '%s/%s/tft-train' % (working_dir, '{{workflow.name}}'),
            "--eval-files-dir", '%s/%s/tft-eval' % (working_dir, '{{workflow.name}}'),
            "--train-files-prefix", outfile_prefix_train,
            "--eval-files-prefix", outfile_prefix_eval,
            "--train-steps", train_steps,
            "--workers", workers,
            "--pss", pss
        ])
    train.after(tfteval)
    train.after(tfttrain)

    train2 = dsl.ContainerOp(
        name='train2',
        image='gcr.io/google-samples/ml-pipeline-kubeflow-tf-taxi',
        arguments=[
            "--tf-transform-dir", '%s/%s/tft-train2' % (working_dir, '{{workflow.name}}'),
            "--output-dir", '%s/%s/tf2' % (working_dir, '{{workflow.name}}'),
            "--working-dir", '%s/%s/tf2/serving_model_dir' % (working_dir, '{{workflow.name}}'),
            "--job-dir", '%s/%s/tf2' % (working_dir, '{{workflow.name}}'),
            "--train-files-dir", '%s/%s/tft-train2' % (working_dir, '{{workflow.name}}'),
            "--eval-files-dir", '%s/%s/tft-eval2' % (working_dir, '{{workflow.name}}'),
            "--train-files-prefix", outfile_prefix_train,
            "--eval-files-prefix", outfile_prefix_eval,
            "--train-steps", train_steps,
            "--workers", '1',
            "--pss", '1'
        ])
    train2.after(tfteval2)
    train2.after(tfttrain2)

    analyze = dsl.ContainerOp(
        name='analyze',
        image='gcr.io/google-samples/ml-pipeline-dataflow-tfma-taxi',
        arguments=[
            "--input_csv", input_handle_eval,
            "--tfma_run_dir", '%s/%s/tfma/output' % (working_dir, '{{workflow.name}}'),
            "--eval_model_dir", '%s/%s/tf/eval_model_dir' % (working_dir, '{{workflow.name}}'),
            "--mode", tfma_mode,
            "--setup_file", tfma_setup_file,
            "--project", project
        ]).apply(gcp.use_gcp_secret('user-gcp-sa'))

    analyze2 = dsl.ContainerOp(
        name='analyze2',
        image='gcr.io/google-samples/ml-pipeline-dataflow-tfma-taxi',
        arguments=[
            "--input_csv", input_handle_eval,
            "--tfma_run_dir", '%s/%s/tfma2/output' % (working_dir, '{{workflow.name}}'),
            "--eval_model_dir", '%s/%s/tf2/eval_model_dir' % (working_dir, '{{workflow.name}}'),
            "--mode", tfma_mode,
            "--setup_file", tfma_setup_file,
            "--project", project
        ]).apply(gcp.use_gcp_secret('user-gcp-sa'))

    cmleop = dsl.ContainerOp(
        name='cmleop',
        image='gcr.io/google-samples/ml-pipeline-cmle-op',
        arguments=[
            "--gcs-path", '%s/%s/tf/serving_model_dir/export/chicago-taxi' % (working_dir, '{{workflow.name}}'),
            "--version-name", '{{workflow.name}}',
            "--project", project
        ]).apply(gcp.use_gcp_secret('user-gcp-sa'))

    cmleop2 = dsl.ContainerOp(
        name='cmleop2',
        image='gcr.io/google-samples/ml-pipeline-cmle-op',
        arguments=[
            "--gcs-path", '%s/%s/tf2/serving_model_dir/export/chicago-taxi' % (working_dir, '{{workflow.name}}'),
            "--version-name", '{{workflow.name}}_2',
            "--project", project
        ]).apply(gcp.use_gcp_secret('user-gcp-sa'))

    tfserving = dsl.ContainerOp(
        name='tfserving',
        image='gcr.io/google-samples/ml-pipeline-kubeflow-tfserve-taxi',
        arguments=[
            "--model_name", '{{workflow.name}}',
            "--model_path", '%s/%s/tf/serving_model_dir/export/chicago-taxi' % (working_dir, '{{workflow.name}}')
        ])

    tfserving2 = dsl.ContainerOp(
        name='tfserving2',
        image='gcr.io/google-samples/ml-pipeline-kubeflow-tfserve-taxi',
        arguments=[
            "--model_name", '{{workflow.name}}-2',
            "--model_path", '%s/%s/tf2/serving_model_dir/export/chicago-taxi' % (working_dir, '{{workflow.name}}')
        ])

    analyze.after(train)
    analyze2.after(train2)
    cmleop.after(train)
    cmleop2.after(train2)
    tfserving.after(train)
    tfserving2.after(train2)
def test_operator_to_template(self):
    """Test converting operator to template"""
    from kubernetes import client as k8s_client

    with dsl.Pipeline('somename') as p:
        msg1 = dsl.PipelineParam('msg1')
        msg2 = dsl.PipelineParam('msg2', value='value2')
        op = dsl.ContainerOp(
            name='echo',
            image='image',
            command=['sh', '-c'],
            arguments=['echo %s %s | tee /tmp/message.txt' % (msg1, msg2)],
            file_outputs={'merged': '/tmp/message.txt'}) \
          .add_volume_mount(k8s_client.V1VolumeMount(
              mount_path='/secret/gcp-credentials',
              name='gcp-credentials')) \
          .add_env_variable(k8s_client.V1EnvVar(
              name='GOOGLE_APPLICATION_CREDENTIALS',
              value='/secret/gcp-credentials/user-gcp-sa.json'))

    golden_output = {
        'container': {
            'image': 'image',
            'args': [
                'echo {{inputs.parameters.msg1}} {{inputs.parameters.msg2}} | tee /tmp/message.txt'
            ],
            'command': ['sh', '-c'],
            'env': [{
                'name': 'GOOGLE_APPLICATION_CREDENTIALS',
                'value': '/secret/gcp-credentials/user-gcp-sa.json'
            }],
            'volumeMounts': [{
                'mountPath': '/secret/gcp-credentials',
                'name': 'gcp-credentials',
            }]
        },
        'inputs': {
            'parameters': [
                {'name': 'msg1'},
                {'name': 'msg2', 'value': 'value2'},
            ]
        },
        'name': 'echo',
        'outputs': {
            'parameters': [{
                'name': 'echo-merged',
                'valueFrom': {'path': '/tmp/message.txt'}
            }],
            'artifacts': [{
                'name': 'mlpipeline-ui-metadata',
                'path': '/mlpipeline-ui-metadata.json',
                's3': {
                    'accessKeySecret': {
                        'key': 'accesskey',
                        'name': 'mlpipeline-minio-artifact',
                    },
                    'bucket': 'mlpipeline',
                    'endpoint': 'minio-service.kubeflow:9000',
                    'insecure': True,
                    'key': 'runs/{{workflow.uid}}/{{pod.name}}/mlpipeline-ui-metadata.tgz',
                    'secretKeySecret': {
                        'key': 'secretkey',
                        'name': 'mlpipeline-minio-artifact',
                    }
                }
            }, {
                'name': 'mlpipeline-metrics',
                'path': '/mlpipeline-metrics.json',
                's3': {
                    'accessKeySecret': {
                        'key': 'accesskey',
                        'name': 'mlpipeline-minio-artifact',
                    },
                    'bucket': 'mlpipeline',
                    'endpoint': 'minio-service.kubeflow:9000',
                    'insecure': True,
                    'key': 'runs/{{workflow.uid}}/{{pod.name}}/mlpipeline-metrics.tgz',
                    'secretKeySecret': {
                        'key': 'secretkey',
                        'name': 'mlpipeline-minio-artifact',
                    }
                }
            }]
        }
    }

    self.maxDiff = None
    self.assertEqual(golden_output, compiler.Compiler()._op_to_template(op))
def train_and_deploy(
        project=dsl.PipelineParam(name='project', value='cloud-training-demos'),
        bucket=dsl.PipelineParam(name='bucket', value='cloud-training-demos-ml'),
        startYear=dsl.PipelineParam(name='startYear', value='2000')):
    """Pipeline to train babyweight model"""
    start_step = 3

    # Step 1: create training dataset using Apache Beam on Cloud Dataflow
    if start_step <= 1:
        preprocess = dsl.ContainerOp(
            name='preprocess',
            # image needs to be a compile-time string
            image='gcr.io/cloud-training-demos/babyweight-pipeline-bqtocsv:latest',
            arguments=[
                '--project', project,
                '--mode', 'cloud',
                '--bucket', bucket,
                '--start_year', startYear
            ],
            file_outputs={'bucket': '/output.txt'})
    else:
        preprocess = ObjectDict({'outputs': {'bucket': bucket}})

    # Step 2: Do hyperparameter tuning of the model on Cloud ML Engine
    if start_step <= 2:
        hparam_train = dsl.ContainerOp(
            name='hypertrain',
            # image needs to be a compile-time string
            image='gcr.io/cloud-training-demos/babyweight-pipeline-hypertrain:latest',
            arguments=[preprocess.outputs['bucket']],
            file_outputs={'jobname': '/output.txt'})
    else:
        hparam_train = ObjectDict(
            {'outputs': {'jobname': 'babyweight_181008_210829'}})

    # Step 3: Train the model some more, but on the pipelines cluster itself
    if start_step <= 3:
        # train: /output.txt is the model directory
        train_tuned = kubeflow_tfjob_launcher_op(
            container_image='gcr.io/cloud-training-demos/babyweight-pipeline-traintuned-trainer:latest',
            command=[hparam_train.outputs['jobname'], bucket],
            number_of_workers=10,
            number_of_parameter_servers=3,
            tfjob_timeout_minutes=5,
            step_name='traintuned')
    else:
        train_tuned = ObjectDict({
            'outputs': {
                'train': 'gs://cloud-training-demos-ml/babyweight/hyperparam/15'
            }
        })

    # Step 4: Deploy the trained model to Cloud ML Engine
    if start_step <= 4:
        deploy_cmle = dsl.ContainerOp(
            name='deploycmle',
            # image needs to be a compile-time string
            image='gcr.io/cloud-training-demos/babyweight-pipeline-deploycmle:latest',
            arguments=[
                train_tuned.outputs['train'],  # modeldir
                'babyweight',
                'mlp'
            ],
            file_outputs={
                'model': '/model.txt',
                'version': '/version.txt'
            })
    else:
        deploy_cmle = ObjectDict(
            {'outputs': {'model': 'babyweight', 'version': 'mlp'}})

    # Step 5: Deploy the trained model to AppEngine
    if start_step <= 5:
        deploy_cmle = dsl.ContainerOp(
            name='deployapp',
            # image needs to be a compile-time string
            image='gcr.io/cloud-training-demos/babyweight-pipeline-deployapp:latest',
            arguments=[
                deploy_cmle.outputs['model'],
                deploy_cmle.outputs['version']
            ],
            file_outputs={'appurl': '/appurl.txt'})
    else:
        deploy_cmle = ObjectDict({
            'outputs': {
                'appurl': 'https://cloud-training-demos.appspot.com/'
            }
        })
def nlp_pipeline(
        csv_url="https://raw.githubusercontent.com/axsauze/reddit-classification-exploration/master/data/reddit_train.csv",
        csv_encoding="ISO-8859-1",
        features_column="BODY",
        labels_column="REMOVED",
        raw_text_path='/mnt/text.data',
        labels_path='/mnt/labels.data',
        clean_text_path='/mnt/clean.data',
        spacy_tokens_path='/mnt/tokens.data',
        tfidf_vectors_path='/mnt/tfidf.data',
        lr_prediction_path='/mnt/prediction.data',
        tfidf_model_path='/mnt/tfidf.model',
        lr_model_path='/mnt/lr.model',
        lr_c_param=0.1,
        tfidf_max_features=10000,
        tfidf_ngram_range=3,
        batch_size='100',
        github_branch='master'):
    """
    Pipeline
    """
    pvc_metadata = V1ObjectMeta(
        name="{{workflow.name}}-my-pvc",
        labels={
            "branch": "{{workflow.parameters.github-branch}}",
            "app": "nlp"
        })
    requested_resources = V1ResourceRequirements(requests={"storage": "1Gi"})
    pvc_spec = V1PersistentVolumeClaimSpec(
        access_modes=["ReadWriteOnce"], resources=requested_resources)
    pvc = V1PersistentVolumeClaim(
        api_version="v1",
        kind="PersistentVolumeClaim",
        metadata=pvc_metadata,
        spec=pvc_spec)

    vop = dsl.VolumeOp(name="create-pvc", k8s_resource=pvc, modes=None)

    download_step = dsl.ContainerOp(
        name='data_downloader',
        image='maximmold/data_downloader:0.1',
        command="python",
        arguments=[
            "/microservice/pipeline_step.py",
            "--labels-path", labels_path,
            "--features-path", raw_text_path,
            "--csv-url", csv_url,
            "--csv-encoding", csv_encoding,
            "--features-column", features_column,
            "--labels-column", labels_column
        ],
        pvolumes={"/mnt": vop.volume})

    clean_step = dsl.ContainerOp(
        name='clean_text',
        image='maximmold/clean_text_transformer:0.1',
        command="python",
        arguments=[
            "/microservice/pipeline_step.py",
            "--in-path", raw_text_path,
            "--out-path", clean_text_path,
        ],
        pvolumes={"/mnt": download_step.pvolume})

    tokenize_step = dsl.ContainerOp(
        name='tokenize',
        image='maximmold/spacy_tokenizer:0.1',
        command="python",
        arguments=[
            "/microservice/pipeline_step.py",
            "--in-path", clean_text_path,
            "--out-path", spacy_tokens_path,
        ],
        pvolumes={"/mnt": clean_step.pvolume})

    vectorize_step = dsl.ContainerOp(
        name='vectorize',
        image='maximmold/tfidf_vectorizer:0.1',
        command="python",
        arguments=[
            "/microservice/pipeline_step.py",
            "--in-path", spacy_tokens_path,
            "--out-path", tfidf_vectors_path,
            "--max-features", tfidf_max_features,
            "--ngram-range", tfidf_ngram_range,
            "--action", "train",
            "--model-path", tfidf_model_path,
        ],
        pvolumes={"/mnt": tokenize_step.pvolume})

    predict_step = dsl.ContainerOp(
        name='predictor',
        image='maximmold/lr_text_classifier:0.1',
        command="python",
        arguments=[
            "/microservice/pipeline_step.py",
            "--in-path", tfidf_vectors_path,
            "--labels-path", labels_path,
            "--out-path", lr_prediction_path,
            "--c-param", lr_c_param,
            "--action", "train",
            "--model-path", lr_model_path,
        ],
        pvolumes={"/mnt": vectorize_step.pvolume})

    try:
        seldon_config = yaml.load(
            open("../deploy_pipeline/seldon_production_pipeline.yaml"))
    except:
        # If this file is run from the project core directory
        seldon_config = yaml.load(
            open("deploy_pipeline/seldon_production_pipeline.yaml"))

    deploy_step = dsl.ResourceOp(
        action="apply",
        name="seldondeploy",
        k8s_resource=seldon_config,
        attribute_outputs={"name": "{.metadata.name}"})
    deploy_step.after(predict_step)

    delete_previous_pvc = dsl.ContainerOp(
        name="deletepreviouspvc",
        image="bitnami/kubectl",
        command="kubectl",
        arguments=[
            "delete", "pvc",
            "-l", "app=nlp,branch={{workflow.parameters.github-branch}}",
            "--field-selector", "metadata.name!={{workflow.name}}-my-pvc",
            "--grace-period=0", "--force", "--wait=false"
        ])
    delete_previous_pvc.after(deploy_step)
    patch_pvc_finalizer = dsl.ContainerOp(
        name="patchpvcfinalizer",
        image="bitnami/kubectl",
        command=["bash"],
        arguments=[
            "-c",
            'for j in $(kubectl get pvc -o name -l app=nlp,branch={{workflow.parameters.github-branch}} --field-selector metadata.name!={{workflow.name}}-my-pvc -n kubeflow); do kubectl patch $j -p '
            "'"
            '{"metadata":{"finalizers": []}}'
            "'"
            ' -n kubeflow --type=merge; done'
        ])
    patch_pvc_finalizer.after(delete_previous_pvc)
def _build_kfp_ops(
    self,
    node_dependencies: Dict[Node, Set[Node]],
    image,
    image_pull_policy,
) -> Dict[str, dsl.ContainerOp]:
    """Build kfp container graph from Kedro node dependencies."""
    kfp_ops = {}

    node_volumes = (
        self._setup_volumes(image, image_pull_policy)
        if self.run_config.volume is not None
        else {}
    )

    iap_env_var = k8s.V1EnvVar(
        name=IAP_CLIENT_ID, value=os.environ.get(IAP_CLIENT_ID, "")
    )
    nodes_env = [iap_env_var]

    if is_mlflow_enabled():
        kfp_ops["mlflow-start-run"] = self._customize_op(
            dsl.ContainerOp(
                name="mlflow-start-run",
                image=image,
                command=["kedro"],
                arguments=[
                    "kubeflow",
                    "mlflow-start",
                    dsl.RUN_ID_PLACEHOLDER,
                ],
                container_kwargs={"env": [iap_env_var]},
                file_outputs={"mlflow_run_id": "/tmp/mlflow_run_id"},
            ),
            image_pull_policy,
        )

        nodes_env.append(
            k8s.V1EnvVar(
                name="MLFLOW_RUN_ID",
                value=kfp_ops["mlflow-start-run"].output,
            )
        )

    for node in node_dependencies:
        name = clean_name(node.name)
        params = ",".join(
            [
                f"{param}:{dsl.PipelineParam(param)}"
                for param in self.context.params.keys()
            ]
        )

        kwargs = {"env": nodes_env}
        if self.run_config.resources.is_set_for(node.name):
            kwargs["resources"] = k8s.V1ResourceRequirements(
                limits=self.run_config.resources.get_for(node.name),
                requests=self.run_config.resources.get_for(node.name),
            )

        kfp_ops[node.name] = self._customize_op(
            dsl.ContainerOp(
                name=name,
                image=image,
                command=["kedro"],
                arguments=[
                    "run",
                    "--params", params,
                    "--node", node.name,
                ],
                pvolumes=node_volumes,
                container_kwargs=kwargs,
                file_outputs={
                    output: "/home/kedro/" + self.catalog[output]["filepath"]
                    for output in node.outputs
                    if output in self.catalog
                },
            ),
            image_pull_policy,
        )

    return kfp_ops
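# Added sketch (an assumption, not part of the original _build_kfp_ops): the
# ops returned above still have to be sequenced by the caller. A Kedro node
# dependency map is typically folded into KFP's `.after()` relation roughly as
# follows; the helper name `_sequence_kfp_ops` is hypothetical, while the dict
# keys mirror the method above (ops keyed by node name).
def _sequence_kfp_ops(kfp_ops, node_dependencies):
    """Make each node's op run after the ops of every node it depends on."""
    for node, parents in node_dependencies.items():
        for parent in parents:
            kfp_ops[node.name].after(kfp_ops[parent.name])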
def mlrun_op(name: str = '', project: str = '', image: str = 'v3io/mlrun',
             runtime: str = '', command: str = '', secrets: list = [],
             params: dict = {}, hyperparams: dict = {}, param_file: str = '',
             inputs: dict = {}, outputs: dict = {}, in_path: str = '',
             out_path: str = '', rundb: str = '', mode: str = ''):
    """mlrun KubeFlow pipelines operator, use to form pipeline steps

    when using kubeflow pipelines, each step is wrapped in an mlrun_op
    one step can pass state and data to the next step, see example below.

    :param name:    name used for the step
    :param project: optional, project name
    :param image:   optional, run container image (will be executing the step)
                    the container should host all required packages + code
                    for the run, alternatively user can mount packages/code via
                    shared file volumes like v3io (see example below)
    :param runtime: optional, runtime specification
    :param command: exec command (or URL for functions)
    :param secrets: extra secrets specs, will be injected into the runtime
                    e.g. ['file=<filename>', 'env=ENV_KEY1,ENV_KEY2']
    :param params:  dictionary of run parameters and values
    :param hyperparams: dictionary of hyper parameters and list values, each
                    hyperparam holds a list of values, the run will be
                    executed for every parameter combination (GridSearch)
    :param param_file: a csv file with parameter combinations, first row hold
                    the parameter names, following rows hold param values
    :param inputs:  dictionary of input objects + optional paths (if path is
                    omitted the path will be the in_path/key)
    :param outputs: dictionary of output objects + optional paths (if path is
                    omitted the path will be the out_path/key)
    :param in_path:  default input path/url (prefix) for inputs
    :param out_path: default output path/url (prefix) for artifacts
    :param rundb:    path for rundb (or use 'MLRUN_META_DBPATH' env instead)
    :param mode:     run mode, e.g. 'noctx' for pushing params as args

    :return: KFP step operation

    Example:
        from kfp import dsl
        from mlrun import mlrun_op
        from mlrun.platforms import mount_v3io

        def mlrun_train(p1, p2):
            return mlrun_op('training',
                            command='/User/kubeflow/training.py',
                            params={'p1': p1, 'p2': p2},
                            outputs={'model.txt': '', 'dataset.csv': ''},
                            out_path='v3io:///bigdata/mlrun/{{workflow.uid}}/',
                            rundb='/User/kubeflow')

        # use data from the first step
        def mlrun_validate(modelfile):
            return mlrun_op('validation',
                            command='/User/kubeflow/validation.py',
                            inputs={'model.txt': modelfile},
                            out_path='v3io:///bigdata/mlrun/{{workflow.uid}}/',
                            rundb='/User/kubeflow')

        @dsl.pipeline(
            name='My MLRUN pipeline',
            description='Shows how to use mlrun.'
        )
        def mlrun_pipeline(
            p1=5, p2='"text"'
        ):
            # run training, mount_v3io will mount "/User" into the pipeline step
            train = mlrun_train(p1, p2).apply(mount_v3io())

            # feed 1st step results into the second step
            validate = mlrun_validate(train.outputs['model-txt']).apply(mount_v3io())
    """
    from kfp import dsl
    from os import environ

    rundb = rundb or environ.get('MLRUN_META_DBPATH')
    cmd = [
        'python', '-m', 'mlrun', 'run', '--kfp', '--workflow',
        '{{workflow.uid}}', '--name', name
    ]
    file_outputs = {}
    for s in secrets:
        cmd += ['-s', '{}'.format(s)]
    for p, val in params.items():
        cmd += ['-p', '{}={}'.format(p, val)]
    for x, val in hyperparams.items():
        cmd += ['-x', '{}={}'.format(x, val)]
    for i, val in inputs.items():
        cmd += ['-i', '{}={}'.format(i, val)]
    for o, val in outputs.items():
        cmd += ['-o', '{}={}'.format(o, val)]
        file_outputs[o.replace('.', '-')] = '/tmp/{}'.format(o)
    if project:
        cmd += ['--project', project]
    if runtime:
        cmd += ['--runtime', runtime]
    if in_path:
        cmd += ['--in-path', in_path]
    if out_path:
        cmd += ['--out-path', out_path]
    if rundb:
        cmd += ['--rundb', rundb]
    if param_file:
        cmd += ['--param-file', param_file]
    if mode:
        cmd += ['--mode', mode]
    if hyperparams or param_file:
        file_outputs['iterations'] = '/tmp/iteration_results.csv'

    cop = dsl.ContainerOp(
        name=name,
        image=image,
        command=cmd + [command],
        file_outputs=file_outputs,
    )
    return cop
def __init__(self,
             component: tfx_base_component.BaseComponent,
             component_launcher_class: Type[
                 base_component_launcher.BaseComponentLauncher],
             depends_on: Set[dsl.ContainerOp],
             pipeline: tfx_pipeline.Pipeline,
             pipeline_name: Text,
             pipeline_root: dsl.PipelineParam,
             tfx_image: Text,
             kubeflow_metadata_config: Optional[
                 kubeflow_pb2.KubeflowMetadataConfig],
             component_config: base_component_config.BaseComponentConfig,
             pod_labels_to_attach: Optional[Dict[Text, Text]] = None):
    """Creates a new Kubeflow-based component.

    This class essentially wraps a dsl.ContainerOp construct in Kubeflow
    Pipelines.

    Args:
      component: The logical TFX component to wrap.
      component_launcher_class: the class of the launcher to launch the
        component.
      depends_on: The set of upstream KFP ContainerOp components that this
        component will depend on.
      pipeline: The logical TFX pipeline to which this component belongs.
      pipeline_name: The name of the TFX pipeline.
      pipeline_root: The pipeline root specified, as a dsl.PipelineParam
      tfx_image: The container image to use for this component.
      kubeflow_metadata_config: Configuration settings for connecting to the
        MLMD store in a Kubeflow cluster.
      component_config: Component config to launch the component.
      pod_labels_to_attach: Optional dict of pod labels to attach to the GKE
        pod.
    """
    component_launcher_class_path = '.'.join([
        component_launcher_class.__module__, component_launcher_class.__name__
    ])

    serialized_component = utils.replace_placeholder(
        json_utils.dumps(node_wrapper.NodeWrapper(component)))

    arguments = [
        '--pipeline_name',
        pipeline_name,
        '--pipeline_root',
        pipeline_root,
        '--kubeflow_metadata_config',
        json_format.MessageToJson(
            message=kubeflow_metadata_config, preserving_proto_field_name=True),
        '--beam_pipeline_args',
        json.dumps(pipeline.beam_pipeline_args),
        '--additional_pipeline_args',
        json.dumps(pipeline.additional_pipeline_args),
        '--component_launcher_class_path',
        component_launcher_class_path,
        '--serialized_component',
        serialized_component,
        '--component_config',
        json_utils.dumps(component_config),
    ]

    if component.enable_cache or (component.enable_cache is None and
                                  pipeline.enable_cache):
        arguments.append('--enable_cache')

    self.container_op = dsl.ContainerOp(
        name=component.id.replace('.', '_'),
        command=_COMMAND,
        image=tfx_image,
        arguments=arguments,
        output_artifact_paths={
            'mlpipeline-ui-metadata': '/mlpipeline-ui-metadata.json',
        },
    )

    absl.logging.info('Adding upstream dependencies for component {}'.format(
        self.container_op.name))
    for op in depends_on:
        absl.logging.info('   -> Component: {}'.format(op.name))
        self.container_op.after(op)

    # TODO(b/140172100): Document the use of additional_pipeline_args.
    if _WORKFLOW_ID_KEY in pipeline.additional_pipeline_args:
        # Allow overriding pipeline's run_id externally, primarily for testing.
        self.container_op.container.add_env_variable(
            k8s_client.V1EnvVar(
                name=_WORKFLOW_ID_KEY,
                value=pipeline.additional_pipeline_args[_WORKFLOW_ID_KEY]))
    else:
        # Add the Argo workflow ID to the container's environment variable so
        # it can be used to uniquely place pipeline outputs under the
        # pipeline_root.
        field_path = "metadata.labels['workflows.argoproj.io/workflow']"
        self.container_op.container.add_env_variable(
            k8s_client.V1EnvVar(
                name=_WORKFLOW_ID_KEY,
                value_from=k8s_client.V1EnvVarSource(
                    field_ref=k8s_client.V1ObjectFieldSelector(
                        field_path=field_path))))

    if pod_labels_to_attach:
        for k, v in pod_labels_to_attach.items():
            self.container_op.add_pod_label(k, v)
def deploy_model_op(model):
    return dsl.ContainerOp(
        name='Deploy Model',
        image='gcr.io/kube01/kubeflow/presidentialelections/deploy:latest',
        arguments=['--model', model])
def nlp_pipeline(
        namespace="kubeflow",
        seldon_server="SKLEARN_SERVER",
        model_path="gs://seldon-models/v1.13.0-dev/sklearn/iris",
        gateway_endpoint="istio-ingressgateway.istio-system.svc.cluster.local",
        retries=3,
        replicas=10,
        workers=100,
        input_path="data/input-data.txt",
        output_path="data/output-data.txt",
):
    """
    Pipeline
    """
    vop = dsl.VolumeOp(
        name="seldon-batch-pvc",
        resource_name="seldon-batch-pvc",
        modes=dsl.VOLUME_MODE_RWO,
        size="2Mi",
    )

    seldon_deployment_yaml = f"""
apiVersion: machinelearning.seldon.io/v1
kind: SeldonDeployment
metadata:
  name: "{{{{workflow.name}}}}"
  namespace: "{namespace}"
spec:
  name: "{{{{workflow.name}}}}"
  predictors:
    - graph:
        children: []
        implementation: "{seldon_server}"
        modelUri: "{model_path}"
        name: classifier
      name: default
"""

    deploy_step = dsl.ResourceOp(
        name="deploy_seldon",
        action="create",
        k8s_resource=yaml.safe_load(seldon_deployment_yaml),
    )

    scale_and_wait = dsl.ContainerOp(
        name="scale_and_wait_seldon",
        image="bitnami/kubectl:1.17",
        command="bash",
        arguments=[
            "-c",
            f"sleep 10 && kubectl scale --namespace {namespace} --replicas={replicas} sdep/{{{{workflow.name}}}} && sleep 2 && kubectl rollout status deploy/$(kubectl get deploy -l seldon-deployment-id={{{{workflow.name}}}} -o jsonpath='{{.items[0].metadata.name'}})",
        ],
    )

    download_from_object_store = dsl.ContainerOp(
        name="download-from-object-store",
        image="minio/mc:RELEASE.2020-04-17T08-55-48Z",
        command="sh",
        arguments=[
            "-c",
            f"mc config host add minio-local http://minio.default.svc.cluster.local:9000 minioadmin minioadmin && mc cp minio-local/{input_path} /assets/input-data.txt",
        ],
        pvolumes={"/assets": vop.volume},
    )

    batch_process_step = dsl.ContainerOp(
        name="data_downloader",
        image="seldonio/seldon-core-s2i-python37:1.1.1-rc",
        command="seldon-batch-processor",
        arguments=[
            "--deployment-name", "{{workflow.name}}",
            "--namespace", namespace,
            "--host", gateway_endpoint,
            "--retries", retries,
            "--input-data-path", "/assets/input-data.txt",
            "--output-data-path", "/assets/output-data.txt",
            "--benchmark",
        ],
        pvolumes={"/assets": vop.volume},
    )

    upload_to_object_store = dsl.ContainerOp(
        name="upload-to-object-store",
        image="minio/mc:RELEASE.2020-04-17T08-55-48Z",
        command="sh",
        arguments=[
            "-c",
            f"mc config host add minio-local http://minio.default.svc.cluster.local:9000 minioadmin minioadmin && mc cp /assets/output-data.txt minio-local/{output_path}",
        ],
        pvolumes={"/assets": vop.volume},
    )

    delete_step = dsl.ResourceOp(
        name="delete_seldon",
        action="delete",
        k8s_resource=yaml.safe_load(seldon_deployment_yaml),
    )

    scale_and_wait.after(deploy_step)
    download_from_object_store.after(scale_and_wait)
    batch_process_step.after(download_from_object_store)
    upload_to_object_store.after(batch_process_step)
    delete_step.after(upload_to_object_store)
def train_and_deploy(project='cloud-training-demos',
                     bucket='cloud-training-demos-ml',
                     startYear='2000'):
    """Pipeline to train babyweight model"""
    start_step = 1

    # Step 1: create training dataset using Apache Beam on Cloud Dataflow
    if start_step <= 1:
        preprocess = dsl.ContainerOp(
            name='preprocess',
            # image needs to be a compile-time string
            image='gcr.io/cloud-training-demos/babyweight-pipeline-bqtocsv:latest',
            arguments=[
                '--project', project,
                '--mode', 'cloud',
                '--bucket', bucket,
                '--start_year', startYear
            ],
            file_outputs={'bucket': '/output.txt'})
    else:
        preprocess = ObjectDict({'outputs': {'bucket': bucket}})

    # Step 2: Do hyperparameter tuning of the model on Cloud ML Engine
    if start_step <= 2:
        hparam_train = dsl.ContainerOp(
            name='hypertrain',
            # image needs to be a compile-time string
            image='gcr.io/cloud-training-demos/babyweight-pipeline-hypertrain:latest',
            arguments=[preprocess.outputs['bucket']],
            file_outputs={'jobname': '/output.txt'})
    else:
        hparam_train = ObjectDict(
            {'outputs': {'jobname': 'babyweight_181008_210829'}})

    # Step 3: Train the model some more, but on the pipelines cluster itself
    if start_step <= 3:
        train_tuned = dsl.ContainerOp(
            name='traintuned',
            # image needs to be a compile-time string
            image='gcr.io/cloud-training-demos/babyweight-pipeline-traintuned-trainer:latest',
            #image='gcr.io/cloud-training-demos/babyweight-pipeline-traintuned-trainer@sha256:3d73c805430a16d0675aeafa9819d6d2cfbad0f0f34cff5fb9ed4e24493bc9a8',
            arguments=[hparam_train.outputs['jobname'], bucket],
            file_outputs={'train': '/output.txt'})
        train_tuned.set_memory_request('2G')
        train_tuned.set_cpu_request('1')
    else:
        train_tuned = ObjectDict({
            'outputs': {
                'train': 'gs://cloud-training-demos-ml/babyweight/hyperparam/15'
            }
        })

    # Step 4: Deploy the trained model to Cloud ML Engine
    if start_step <= 4:
        deploy_cmle = dsl.ContainerOp(
            name='deploycmle',
            # image needs to be a compile-time string
            image='gcr.io/cloud-training-demos/babyweight-pipeline-deploycmle:latest',
            arguments=[
                train_tuned.outputs['train'],  # modeldir
                'babyweight',
                'mlp'
            ],
            file_outputs={
                'model': '/model.txt',
                'version': '/version.txt'
            })
    else:
        deploy_cmle = ObjectDict(
            {'outputs': {'model': 'babyweight', 'version': 'mlp'}})

    # Step 5: Deploy the trained model to AppEngine
    if start_step <= 5:
        deploy_cmle = dsl.ContainerOp(
            name='deployapp',
            # image needs to be a compile-time string
            image='gcr.io/cloud-training-demos/babyweight-pipeline-deployapp:latest',
            arguments=[
                deploy_cmle.outputs['model'],
                deploy_cmle.outputs['version']
            ],
            file_outputs={'appurl': '/appurl.txt'})
    else:
        deploy_cmle = ObjectDict({
            'outputs': {
                'appurl': 'https://cloud-training-demos.appspot.com/'
            }
        })
def echo2_op(text2):
    return dsl.ContainerOp(
        name='echo2',
        image='library/bash:4.4.23',
        command=['sh', '-c'],
        arguments=['echo "$0"', text2])
def echo_op():
    return dsl.ContainerOp(
        name='echo',
        image='busybox',
        command=['sh', '-c'],
        arguments=['echo "Found my node"'])
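# Added sketch (an assumption, not from the original sample): an op factory
# like echo_op above is usually called inside a @dsl.pipeline function, where
# pod placement such as a node selector can be attached to the returned op.
# The node-pool label key and value below are illustrative only.
from kfp import dsl


@dsl.pipeline(name='node-selector-demo', description='Illustrative wrapper only')
def node_selector_pipeline():
    # Pin the echo step to nodes carrying the (hypothetical) pool label.
    echo_task = echo_op()
    echo_task.add_node_selector_constraint('cloud.google.com/gke-nodepool', 'demo-pool')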
def some_name():
    dsl.ContainerOp(
        name='some_name',
        image='alpine:latest',
    )
def query_op(n):
    return dsl.ContainerOp(
        name=n,
        image="hanjoo8821/jdbc-tibero:basic",
        container_kwargs={'env': [V1EnvVar('id', 'hanjoo'), V1EnvVar('pw', '1010')]}
    )
def no_outputs_pipeline():
    no_outputs_op = dsl.ContainerOp(name='dummy', image='dummy')
    dsl.ContainerOp(name='dummy', image='dummy', arguments=[no_outputs_op.output])
def TrainingOp(name: str, input_dir: str, output_dir: str, epochs: int,
               model_name: str, model_version: int, batch_size: int,
               learning_rate: float, momentum: float, lr_patience: int,
               resize: int, scale_img: int, dropout: int, crop_pct: float,
               growth_rate: int, num_classes: int, input_size: int,
               prefetch_size: int, shuffle_buffer: int, volume: str):
    """Start model training within Kubeflow pipeline

    Arguments:
        name {str} -- operation name for Kubeflow UID (eg Training)
        output_dir {str} -- Output directory containing artifacts from training
            (eg. "/directory/on/local/filesystem")
        epochs {int} -- Number of epochs for model training (eg. 10)
        model_name {str} -- Name of the model (eg "peleenet")
        model_version {int} -- Version of the model (eg 1)
        batch_size {int} -- Batch size to use for mini-batch training (eg. 64)
        learning_rate {float} -- Learning rate for training model (eg. 1e-3 or 0.0001)
        momentum {float} -- Momentum factor for use with SGD optimizer
        lr_patience {int} -- Patience interval to wait for
        dropout {float} -- Percentage of dropout to add to the network (eg .5 == 50%)
        resume_training {bool} -- Resume training of a saved model (eg. True or False)
        resize {int} -- Resize training data (eg. 32 (where original image size is
            (224,224) this would resize the image to (256, 256)))
        scale_img {int} -- Factor by which to scale the input image (eg. 7 (if the
            input image is 32x32x3 (HWC) the output would be (224,224,3)))
        crop_pct {float} -- Percentage to center crop training images (eg. 0.5 will
            center crop to the middle 50% of pixels in the image)
        dataset_split {list} -- Splits to use for Training, Validation, and Test sets
            (if applicable)
        growth_rate {int} -- Growth rate to use (see DenseNet and PeleeNet paper :
            https://arxiv.org/abs/1804.06882)
        bottle_neck_width {List[int]} -- Bottle neck widths to use for the Dense layers
        num_classes {int} -- Number of classes the model is being used for
        input_size {int} -- Input size of the images used for training
        prefetch_size {int} -- Number of batches to prefetch for training
        shuffle_buffer {int} -- Number of examples to store in buffer for shuffling
            datasets too large to fit in memory
        volume {str} -- Name of volume to map into container for access to data
    """
    return dsl.ContainerOp(
        name=name,
        # TODO enter container image name
        image='edhenry/peleenet-train:latest',
        arguments=[
            '--input_dir', input_dir,
            '--output_dir', output_dir,
            '--epochs', epochs,
            '--model_name', model_name,
            '--model_version', model_version,
            '--batch_size', batch_size,
            '--learning_rate', learning_rate,
            '--momentum', momentum,
            '--lr_patience', lr_patience,
            '--dropout', dropout,
            #'--resume_training', resume_training,
            '--resize', resize,
            '--scale_img', scale_img,
            '--crop_pct', crop_pct,
            #'--dataset_split', dataset_split,
            '--growth_rate', growth_rate,
            #'--bottle_neck_width', bottle_neck_width,
            '--num_classes', num_classes,
            '--input_size', input_size,
            '--prefetch_size', prefetch_size,
            '--shuffle_buffer', shuffle_buffer
        ],
        pvolumes=volume,
        file_outputs={}).set_gpu_limit(1)
def query_op():
    return dsl.ContainerOp(
        name="JDBC Agent",
        image="hanjoo8821/jdbc-tibero:menu"
    )
def mnist_hpo(name="mnist",
              namespace="anonymous",
              goal: float = 0.99,
              parallelTrialCount: int = 3,
              maxTrialCount: int = 12,
              experimentTimeoutMinutes: int = 60,
              deleteAfterDone: bool = True):
    objectiveConfig = {
        "type": "maximize",
        "goal": goal,
        "objectiveMetricName": "Validation-accuracy",
        "additionalMetricNames": ["accuracy"]
    }
    algorithmConfig = {"algorithmName": "random"}
    parameters = [
        {
            "name": "--lr",
            "parameterType": "double",
            "feasibleSpace": {"min": "0.01", "max": "0.03"}
        },
        {
            "name": "--num-layers",
            "parameterType": "int",
            "feasibleSpace": {"min": "2", "max": "5"}
        },
        {
            "name": "--optimizer",
            "parameterType": "categorical",
            "feasibleSpace": {"list": ["sgd", "adam", "ftrl"]}
        }
    ]
    rawTemplate = {
        "apiVersion": "batch/v1",
        "kind": "Job",
        "metadata": {
            "name": "{{.Trial}}",
            "namespace": "{{.NameSpace}}"
        },
        "spec": {
            "template": {
                "spec": {
                    "restartPolicy": "Never",
                    "containers": [{
                        "name": "{{.Trial}}",
                        "image": "docker.io/katib/mxnet-mnist-example",
                        "command": [
                            "python /mxnet/example/image-classification/train_mnist.py --batch-size=64 {{- with .HyperParameters}} {{- range .}} {{.Name}}={{.Value}} {{- end}} {{- end}}"  # noqa E501
                        ]
                    }]
                }
            }
        }
    }
    trialTemplate = {"goTemplate": {"rawTemplate": json.dumps(rawTemplate)}}

    op1 = katib_experiment_launcher_op(
        name,
        namespace,
        parallelTrialCount=parallelTrialCount,
        maxTrialCount=maxTrialCount,
        objectiveConfig=str(objectiveConfig),
        algorithmConfig=str(algorithmConfig),
        trialTemplate=str(trialTemplate),
        parameters=str(parameters),
        experimentTimeoutMinutes=experimentTimeoutMinutes,
        deleteAfterDone=deleteAfterDone)

    op_out = dsl.ContainerOp(
        name="my-out-cop",
        image="library/bash:4.4.23",
        command=["sh", "-c"],
        arguments=["echo hyperparameter: %s" % op1.output],
    )
def __new__(cls, component_name, input_dict, output_dict, exec_properties):
    """Creates a new component.

    Args:
      component_name: TFX component name.
      input_dict: Dictionary of input names to TFX types, or
        kfp.dsl.PipelineParam representing input parameters.
      output_dict: Dictionary of output names to List of TFX types.
      exec_properties: Execution properties.

    Returns:
      Newly constructed TFX Kubeflow component instance.
    """
    outputs = output_dict.keys()
    file_outputs = {
        output: '/output/ml_metadata/{}'.format(output) for output in outputs
    }

    for k, v in ExecutionProperties.exec_properties.items():
        exec_properties[k] = v

    arguments = [
        '--exec_properties',
        json.dumps(exec_properties),
        '--outputs',
        types.jsonify_tfx_type_dict(output_dict),
        component_name,
    ]

    for k, v in input_dict.items():
        if isinstance(v, float) or isinstance(v, int):
            v = str(v)
        arguments.append('--{}'.format(k))
        arguments.append(v)

    container_op = dsl.ContainerOp(
        name=component_name,
        command=_COMMAND,
        image=_KUBEFLOW_TFX_IMAGE,
        arguments=arguments,
        file_outputs=file_outputs,
    ).apply(gcp.use_gcp_secret('user-gcp-sa'))  # Adds GCP authentication.

    # Add the Argo workflow ID to the container's environment variable so it
    # can be used to uniquely place pipeline outputs under the pipeline_root.
    field_path = "metadata.labels['workflows.argoproj.io/workflow']"
    container_op.add_env_variable(
        k8s_client.V1EnvVar(
            name='WORKFLOW_ID',
            value_from=k8s_client.V1EnvVarSource(
                field_ref=k8s_client.V1ObjectFieldSelector(
                    field_path=field_path))))

    named_outputs = {output: container_op.outputs[output] for output in outputs}

    # This allows user code to refer to the ContainerOp 'op' output named 'x'
    # as op.outputs.x
    component_outputs = type('Output', (), named_outputs)

    return type(component_name, (BaseComponent,), {
        'container_op': container_op,
        'outputs': component_outputs
    })
def kubeflow_training(
        output='',
        project='',
        evaluation='gs://dataset-image-train/TFRecords/images/test_labels.csv',
        train='gs://dataset-image-train/TFRecords/images/train_labels.csv',
        schema='gs://ml-pipeline-playground/flower/schema.json',
        learning_rate=0.1,
        hidden_layer_size='100,50',
        steps=2000,
        target='label',
        workers=0,
        pss=0,
        preprocess_mode='local',
        predict_mode='local',
        optimizer_choice='SGD',
        batch_size_predict='',
        lambda_target=''):
    output_template = str(output) + '/{{workflow.uid}}/{{pod.name}}/data'
    start_step = 1
    use_gpu = False

    if start_step <= 1:
        preprocess = dsl.ContainerOp(
            name='preprocess',
            image='gcr.io/celerates-playground/dock-img:latest',
            arguments=[
                '--training_data_file_pattern', train,
                '--evaluation_data_file_pattern', evaluation,
                '--schema', schema,
                '--gcp_project', project,
                '--run_mode', preprocess_mode,
                '--preprocessing_module', '',
                '--transformed_data_dir', output_template
            ],
            file_outputs={'transformed_data_dir': '/output.txt'}
        ).apply(gcp.use_gcp_secret('user-gcp-sa'))
    else:
        preprocess = ObjectDict({
            'outputs': {'transformed_data_dir': output_template}
        }).apply(gcp.use_gcp_secret('user-gcp-sa'))

    if start_step <= 2:
        training = dsl.ContainerOp(
            name='training',
            image='gcr.io/celerates-playground/ml-pipeline-kubeflow-tf-trainer:latest',
            arguments=[
                '--transformed_data_dir', preprocess.output,
                '--schema', schema,
                '--learning_rate', learning_rate,
                '--hidden_layer_size', hidden_layer_size,
                '--steps', steps,
                '--target', target,
                '--preprocessing_module', '',
                '--optimizer', optimizer_choice,
                '--training_output_dir', output_template
            ],
            file_outputs={'training_output_dir': '/output.txt'}
        ).apply(gcp.use_gcp_secret('user-gcp-sa'))
    else:
        training = ObjectDict({
            'outputs': {'training_output_dir': output_template}
        }).apply(gcp.use_gcp_secret('user-gcp-sa'))

    if use_gpu:
        training.image = 'gcr.io/ml-pipeline/ml-pipeline-kubeflow-tf-trainer-gpu:fe639f41661d8e17fcda64ff8242127620b80ba0'
        training.set_gpu_limit(1)

    if start_step <= 3:
        prediction = dsl.ContainerOp(
            name='prediction',
            image='gcr.io/celerates-playground/ml-pipeline-dataflow-tf-predict:latest',
            arguments=[
                '--data_file_pattern', evaluation,
                '--schema', schema,
                '--target_column', target,
                '--model', training.output,
                '--run_mode', predict_mode,
                '--gcp_project', project,
                '--batchsize', batch_size_predict,
                '--predictions_dir', output_template
            ],
            file_outputs={'predictions_dir': '/output.txt'}
        ).apply(gcp.use_gcp_secret('user-gcp-sa'))
    else:
        prediction = ObjectDict({
            'outputs': {'predictions_dir': output_template}
        }).apply(gcp.use_gcp_secret('user-gcp-sa'))

    if start_step <= 4:
        confusion_matrix = dsl.ContainerOp(
            name='confusion_matrix',
            image='gcr.io/celerates-playground/ml-pipeline-local-confusion-matrix:latest',
            arguments=[
                '--predictions', prediction.output,
                '--target_lambda', lambda_target,
                '--output_dir', output_template
            ],
            file_outputs={'output_dir': '/mlpipeline-metrics.json'}
        ).apply(gcp.use_gcp_secret('user-gcp-sa'))
    else:
        confusion_matrix = ObjectDict({
            'outputs': {'output_dir': output_template}
        }).apply(gcp.use_gcp_secret('user-gcp-sa'))
def my_pipeline(msg1, json, kind, msg2='value2'):
    op = dsl.ContainerOp(
        name='echo',
        image='image',
        command=['sh', '-c'],
        arguments=['echo %s %s | tee /tmp/message.txt' % (msg1, msg2)],
        file_outputs={'merged': '/tmp/message.txt'}) \
      .add_volume_mount(k8s_client.V1VolumeMount(
          mount_path='/secret/gcp-credentials',
          name='gcp-credentials')) \
      .add_env_variable(k8s_client.V1EnvVar(
          name='GOOGLE_APPLICATION_CREDENTIALS',
          value='/secret/gcp-credentials/user-gcp-sa.json'))

    res = dsl.ResourceOp(
        name="test-resource",
        k8s_resource=k8s_client.V1PersistentVolumeClaim(
            api_version="v1",
            kind=kind,
            metadata=k8s_client.V1ObjectMeta(name="resource")),
        attribute_outputs={"out": json})

    golden_output = {
        'container': {
            'image': 'image',
            'args': [
                'echo {{inputs.parameters.msg1}} {{inputs.parameters.msg2}} | tee /tmp/message.txt'
            ],
            'command': ['sh', '-c'],
            'env': [{
                'name': 'GOOGLE_APPLICATION_CREDENTIALS',
                'value': '/secret/gcp-credentials/user-gcp-sa.json'
            }],
            'volumeMounts': [{
                'mountPath': '/secret/gcp-credentials',
                'name': 'gcp-credentials',
            }]
        },
        'inputs': {
            'parameters': [
                {'name': 'msg1'},
                {'name': 'msg2'},
            ]
        },
        'name': 'echo',
        'outputs': {
            'artifacts': [
                {
                    'name': 'echo-merged',
                    'path': '/tmp/message.txt',
                },
            ],
            'parameters': [{
                'name': 'echo-merged',
                'valueFrom': {'path': '/tmp/message.txt'}
            }],
        }
    }

    res_output = {
        'inputs': {
            'parameters': [{'name': 'json'}, {'name': 'kind'}]
        },
        'name': 'test-resource',
        'outputs': {
            'parameters': [{
                'name': 'test-resource-manifest',
                'valueFrom': {'jsonPath': '{}'}
            }, {
                'name': 'test-resource-name',
                'valueFrom': {'jsonPath': '{.metadata.name}'}
            }, {
                'name': 'test-resource-out',
                'valueFrom': {'jsonPath': '{{inputs.parameters.json}}'}
            }]
        },
        'resource': {
            'action': 'create',
            'manifest': ("apiVersion: v1\n"
                         "kind: '{{inputs.parameters.kind}}'\n"
                         "metadata:\n"
                         "  name: resource\n")
        }
    }

    self.maxDiff = None
    self.assertEqual(golden_output, compiler._op_to_template._op_to_template(op))
    self.assertEqual(res_output, compiler._op_to_template._op_to_template(res))
def tacosandburritos_train(tenant_id,
                           service_principal_id,
                           service_principal_password,
                           subscription_id,
                           resource_group,
                           workspace,
                           persistent_volume_name='azure',
                           persistent_volume_path='/mnt/azure',
                           data_download='https://aiadvocate.blob.core.windows.net/public/tacodata.zip',
                           epochs=5,
                           batch=32,
                           learning_rate=0.0001,
                           imagetag='latest',
                           model_name='tacosandburritos',
                           profile_name='tacoprofile'):
    operations = {}
    image_size = 160
    training_folder = 'train'
    training_dataset = 'train.txt'
    model_folder = 'model'

    # preprocess data
    operations['preprocess'] = dsl.ContainerOp(
        name='preprocess',
        image='insert your image here',
        command=['python'],
        arguments=[
            '/scripts/data.py',
            '--base_path', persistent_volume_path,
            '--data', training_folder,
            '--target', training_dataset,
            '--img_size', image_size,
            '--zipfile', data_download
        ])

    # train
    operations['training'] = dsl.ContainerOp(
        name='training',
        image='insert your image here',
        command=['python'],
        arguments=[
            '/scripts/train.py',
            '--base_path', persistent_volume_path,
            '--data', training_folder,
            '--epochs', epochs,
            '--batch', batch,
            '--image_size', image_size,
            '--lr', learning_rate,
            '--outputs', model_folder,
            '--dataset', training_dataset
        ])
    operations['training'].after(operations['preprocess'])

    # register model
    operations['register'] = dsl.ContainerOp(
        name='register',
        image='insert your image here',
        command=['python'],
        arguments=[
            '/scripts/register.py',
            '--base_path', persistent_volume_path,
            '--model', 'latest.h5',
            '--model_name', model_name,
            '--tenant_id', tenant_id,
            '--service_principal_id', service_principal_id,
            '--service_principal_password', service_principal_password,
            '--subscription_id', subscription_id,
            '--resource_group', resource_group,
            '--workspace', workspace
        ])
    operations['register'].after(operations['training'])

    operations['profile'] = dsl.ContainerOp(
        name='profile',
        image='insert your image here',
        command=['sh'],
        arguments=[
            '/scripts/profile.sh',
            '-n', profile_name,
            '-m', model_name,
            '-i', '/scripts/inferenceconfig.json',
            '-d', '{"image":"https://www.exploreveg.org/files/2015/05/sofritas-burrito.jpeg"}',
            '-t', tenant_id,
            '-r', resource_group,
            '-w', workspace,
            '-s', service_principal_id,
            '-p', service_principal_password,
            '-u', subscription_id,
            '-b', persistent_volume_path
        ])
    operations['profile'].after(operations['register'])

    operations['deploy'] = dsl.ContainerOp(
        name='deploy',
        image='insert your image here',
        command=['sh'],
        arguments=[
            '/scripts/deploy.sh',
            '-n', model_name,
            '-m', model_name,
            '-i', '/scripts/inferenceconfig.json',
            '-d', '/scripts/deploymentconfig.json',
            '-t', tenant_id,
            '-r', resource_group,
            '-w', workspace,
            '-s', service_principal_id,
            '-p', service_principal_password,
            '-u', subscription_id,
            '-b', persistent_volume_path
        ])
    operations['deploy'].after(operations['profile'])

    for _, op in operations.items():
        op.container.set_image_pull_policy("Always")
        op.add_volume(
            k8s_client.V1Volume(
                name='azure',
                persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource(
                    claim_name='azure-managed-disk'))
        ).add_volume_mount(
            k8s_client.V1VolumeMount(mount_path='/mnt/azure', name='azure'))