def run(): """Define a beam pipeline.""" BeamDagRunner().run( pipeline.create_pipeline( pipeline_name=configs.PIPELINE_NAME, pipeline_root=PIPELINE_ROOT, gcp_project=configs.GOOGLE_CLOUD_PROJECT, gcs_bucket=configs.GCS_BUCKET_NAME, tcga_betas_query=configs.TCGA_BETAS_QUERY, tcga_betas_output_schema=configs.TCGA_BETAS_OUTPUT_SCHEMA, tcga_betas_output_table_name=configs.TCGA_BETAS_OUTPUT_TABLE, cpg_sites_list_query=configs.CPG_SITES_LIST_QUERY, cpg_sites_list_output_schema=configs.CPG_SITES_OUTPUT_SCHEMA, cpg_sites_list_output_table_name=configs.CPG_SITES_OUTPUT_TABLE, pivot_query=configs.PIVOT_DATASET_QUERY, pivot_output_table=configs.PIVOT_OUTPUT_TABLE, final_dataset_query=configs.TRAIN_QUERY, preprocessing_fn=configs.PREPROCESSING_FN, run_fn=configs.RUN_FN, train_args=trainer_pb2.TrainArgs(num_steps=configs.TRAIN_NUM_STEPS), eval_args=trainer_pb2.EvalArgs(num_steps=configs.EVAL_NUM_STEPS), eval_accuracy_threshold=configs.EVAL_ACCURACY_THRESHOLD, serving_model_dir=SERVING_MODEL_DIR, # TODO(step 7): (Optional) Uncomment here to use provide GCP related # config for BigQuery with Beam DirectRunner. beam_pipeline_args=configs.BIG_QUERY_WITH_DIRECT_RUNNER_BEAM_PIPELINE_ARGS, metadata_connection_config=metadata.sqlite_metadata_connection_config( METADATA_PATH)))
def run():
    metadata_config = kubeflow_dag_runner.get_default_kubeflow_metadata_config()
    tfx_image = os.environ.get('KUBEFLOW_TFX_IMAGE', None)
    runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig(
        kubeflow_metadata_config=metadata_config, tfx_image=tfx_image)
    os.environ[kubeflow_dag_runner.SDK_ENV_LABEL] = 'tfx-template'

    kubeflow_dag_runner.KubeflowDagRunner(config=runner_config).run(
        pipeline.create_pipeline(
            pipeline_name=config.PIPELINE_NAME,
            pipeline_root=pipeline_config.PIPELINE_ROOT_GCS,
            data_path=pipeline_config.DATA_PATH_KUBEFLOW,
            preprocessing_fn=config.PREPROCESSING_FN,
            run_fn=config.RUN_FN,
            train_args=trainer_pb2.TrainArgs(num_steps=config.TRAIN_NUM_STEPS),
            eval_args=trainer_pb2.EvalArgs(num_steps=config.EVAL_NUM_STEPS),
            eval_accuracy_threshold=config.EVAL_ACCURACY_THRESHOLD,
            serving_model_dir=pipeline_config.SERVING_MODEL_DIR_GCS,
            query=config.BIG_QUERY_QUERY,
            beam_pipeline_args=config.BIG_QUERY_WITH_DIRECT_RUNNER_BEAM_PIPELINE_ARGS,
            # beam_pipeline_args=config.DATAFLOW_BEAM_PIPELINE_ARGS,
            # ai_platform_training_args=config.GCP_AI_PLATFORM_TRAINING_ARGS,
            # ai_platform_serving_args=config.GCP_AI_PLATFORM_SERVING_ARGS,
        ))
def run(metadata_file: Optional[Text] = None):
    """Define a kubeflow pipeline."""
    # Metadata config. The defaults work with the installation of KF Pipelines
    # using Kubeflow. If installing KF Pipelines using the lightweight
    # deployment option, you may need to override the defaults. If you use
    # Kubeflow, metadata will be written to the MySQL database inside the
    # Kubeflow cluster.
    metadata_config = kubeflow_dag_runner.get_default_kubeflow_metadata_config()

    # This pipeline automatically injects the Kubeflow TFX image if the
    # environment variable 'KUBEFLOW_TFX_IMAGE' is defined. Currently, the tfx
    # cli tool exports the environment variable to pass to the pipelines.
    # TODO(b/157598477) Find a better way to pass parameters from CLI handler
    # to pipeline DSL file, instead of using environment vars.
    metadata = get_metadata(metadata_file)
    system_config = get_config(metadata, "system_configurations")
    model_config = get_config(metadata, "model_configurations")
    # tfx_image = system_config.get("TFX_IMAGE", None)
    tfx_image = os.environ.get("KUBEFLOW_TFX_IMAGE", None)
    logging.info(f"Current tfx image used: {tfx_image}")

    runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig(
        kubeflow_metadata_config=metadata_config,
        tfx_image=tfx_image,
        # pipeline_operator_funcs=([set_memory_request_and_limits(
        #     system_config["memory_request"], system_config["memory_limit"])]),
    )

    pod_labels = kubeflow_dag_runner.get_default_pod_labels()
    pod_labels.update({
        telemetry_utils.LABEL_KFP_SDK_ENV:
            metadata["pipeline_name"] + "_" + metadata["pipeline_version"]
    })

    kubeflow_dag_runner.KubeflowDagRunner(
        config=runner_config, pod_labels_to_attach=pod_labels
    ).run(
        pipeline.create_pipeline(
            pipeline_name=metadata["pipeline_name"] + "_" +
            metadata["pipeline_version"],
            pipeline_root=system_config["PIPELINE_ROOT"],
            query=model_config["query_script_path"],
            preprocessing_fn=system_config["preprocessing_fn"],
            run_fn=system_config["run_fn"],
            train_args=trainer_pb2.TrainArgs(splits=["train"], num_steps=100),
            eval_args=trainer_pb2.EvalArgs(splits=["train"], num_steps=50),
            model_serve_dir=system_config["MODEL_SERVE_DIR"],
            beam_pipeline_args=system_config["DATAFLOW_BEAM_PIPELINE_ARGS"],
            ai_platform_training_args=(
                system_config["GCP_AI_PLATFORM_TRAINING_ARGS"]
                if system_config["enable_gpc_ai_platform_training"] else None),
            # (Optional) Uncomment below to use Cloud AI Platform.
            # ai_platform_serving_args=system_config["GCP_AI_PLATFORM_SERVING_ARGS"],
            enable_cache=system_config["enable_cache"],
            system_config=system_config,  # passing config parameters downstream
            model_config=model_config,  # passing model parameters downstream
        ))
def create_pipeline_api(name):
    args = reqparse.RequestParser(). \
        add_argument("description", type=str, required=True). \
        add_argument("processors", type=dict, required=True, action="append"). \
        add_argument("encoder", type=dict, required=True). \
        parse_args()
    args = from_view_dict(args)
    args['name'] = name
    return create_pipeline(**args)
def run(): """Define a kubeflow pipeline.""" # Metadata config. The defaults works work with the installation of # KF Pipelines using Kubeflow. If installing KF Pipelines using the # lightweight deployment option, you may need to override the defaults. # If you use Kubeflow, metadata will be written to MySQL database inside # Kubeflow cluster. metadata_config = kubeflow_dag_runner.get_default_kubeflow_metadata_config( ) # This pipeline automatically injects the Kubeflow TFX image if the # environment variable 'KUBEFLOW_TFX_IMAGE' is defined. Currently, the tfx # cli tool exports the environment variable to pass to the pipelines. # TODO(b/157598477) Find a better way to pass parameters from CLI handler to # pipeline DSL file, instead of using environment vars. # tfx_image = os.environ.get('KUBEFLOW_TFX_IMAGE', None) tfx_image = 'gcr.io/gcp-nyc/tfx-pipeline' runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig( kubeflow_metadata_config=metadata_config, tfx_image=tfx_image) pod_labels = kubeflow_dag_runner.get_default_pod_labels() pod_labels.update({telemetry_utils.LABEL_KFP_SDK_ENV: 'tfx-template'}) kubeflow_dag_runner.KubeflowDagRunner( config=runner_config, pod_labels_to_attach=pod_labels ).run( pipeline.create_pipeline( pipeline_name=configs.PIPELINE_NAME, pipeline_root=PIPELINE_ROOT, gcp_project=configs.GOOGLE_CLOUD_PROJECT, gcs_bucket=configs.GCS_BUCKET_NAME, tcga_betas_query=configs.TCGA_BETAS_QUERY, tcga_betas_output_schema=configs.TCGA_BETAS_OUTPUT_SCHEMA, tcga_betas_output_table_name=configs.TCGA_BETAS_OUTPUT_TABLE, cpg_sites_list_query=configs.CPG_SITES_LIST_QUERY, cpg_sites_list_output_schema=configs.CPG_SITES_OUTPUT_SCHEMA, cpg_sites_list_output_table_name=configs.CPG_SITES_OUTPUT_TABLE, pivot_query=configs.PIVOT_DATASET_QUERY, pivot_output_table=configs.PIVOT_OUTPUT_TABLE, final_dataset_query=configs.TRAIN_QUERY, preprocessing_fn=configs.PREPROCESSING_FN, run_fn=configs.RUN_FN, train_args=trainer_pb2.TrainArgs( num_steps=configs.TRAIN_NUM_STEPS), eval_args=trainer_pb2.EvalArgs(num_steps=configs.EVAL_NUM_STEPS), eval_accuracy_threshold=configs.EVAL_ACCURACY_THRESHOLD, serving_model_dir=SERVING_MODEL_DIR, beam_pipeline_args=configs. BIG_QUERY_WITH_DIRECT_RUNNER_BEAM_PIPELINE_ARGS, # TODO(step 8): (Optional) Uncomment below to use Dataflow. # beam_pipeline_args=configs.DATAFLOW_BEAM_PIPELINE_ARGS, # TODO(step 9): (Optional) Uncomment below to use Cloud AI Platform. # ai_platform_training_args=configs.GCP_AI_PLATFORM_TRAINING_ARGS, # TODO(step 9): (Optional) Uncomment below to use Cloud AI Platform. # ai_platform_serving_args=configs.GCP_AI_PLATFORM_SERVING_ARGS, ))
def run():
    LocalDagRunner().run(
        pipeline.create_pipeline(
            pipeline_name="fishing-classifier",
            data_path="data",
            outputs_path="outputs",
            output_model_path="outputs/model",
            train_args=trainer_pb2.TrainArgs(num_steps=100),
            eval_args=trainer_pb2.EvalArgs(num_steps=15),
            eval_accuracy_threshold=0.6,
            metadata_connection_config=sqlite_metadata_connection_config(
                "outputs/metadata.db"),
        ))
def run(): """Define a kubeflow pipeline.""" # Metadata config. The defaults works work with the installation of # KF Pipelines using Kubeflow. If installing KF Pipelines using the # lightweight deployment option, you may need to override the defaults. # If you use Kubeflow, metadata will be written to MySQL database inside # Kubeflow cluster. metadata_config = kubeflow_dag_runner.get_default_kubeflow_metadata_config( ) # This pipeline automatically injects the Kubeflow TFX image if the # environment variable 'KUBEFLOW_TFX_IMAGE' is defined. Currently, the tfx # cli tool exports the environment variable to pass to the pipelines. # TODO(b/157598477) Find a better way to pass parameters from CLI handler to # pipeline DSL file, instead of using environment vars. tfx_image = os.environ.get('KUBEFLOW_TFX_IMAGE', None) runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig( kubeflow_metadata_config=metadata_config, tfx_image=tfx_image) pod_labels = kubeflow_dag_runner.get_default_pod_labels().update( {telemetry_utils.LABEL_KFP_SDK_ENV: 'tfx-template'}) kubeflow_dag_runner.KubeflowDagRunner( config=runner_config, pod_labels_to_attach=pod_labels ).run( pipeline.create_pipeline( pipeline_name=conf['kfp']['pipeline_name'], pipeline_root=conf['pipeline_root_dir'], data_path=conf['train_data'], # TODO(step 7): (Optional) Uncomment below to use BigQueryExampleGen. # query=configs.BIG_QUERY_QUERY, module_file='pjm_trainer.py', # preprocessing_fn=configs.PREPROCESSING_FN, # run_fn=configs.RUN_FN, train_args=trainer_pb2.TrainArgs( num_steps=configs.TRAIN_NUM_STEPS), eval_args=trainer_pb2.EvalArgs(num_steps=configs.EVAL_NUM_STEPS), eval_accuracy_threshold=configs.EVAL_ACCURACY_THRESHOLD, serving_model_dir=conf['serving_model_dir'], # TODO(step 7): (Optional) Uncomment below to use provide GCP related # config for BigQuery with Beam DirectRunner. # beam_pipeline_args=configs # .BIG_QUERY_WITH_DIRECT_RUNNER_BEAM_PIPELINE_ARGS, # TODO(step 8): (Optional) Uncomment below to use Dataflow. # beam_pipeline_args=configs.DATAFLOW_BEAM_PIPELINE_ARGS, # TODO(step 9): (Optional) Uncomment below to use Cloud AI Platform. # ai_platform_training_args=configs.GCP_AI_PLATFORM_TRAINING_ARGS, # TODO(step 9): (Optional) Uncomment below to use Cloud AI Platform. # ai_platform_serving_args=configs.GCP_AI_PLATFORM_SERVING_ARGS, ))
def run():
    BeamDagRunner().run(
        pipeline.create_pipeline(
            pipeline_name=config.PIPELINE_NAME,
            pipeline_root=pipeline_config.PIPELINE_ROOT,
            data_path=pipeline_config.DATA_PATH,
            preprocessing_fn=config.PREPROCESSING_FN,
            run_fn=config.RUN_FN,
            train_args=trainer_pb2.TrainArgs(num_steps=config.TRAIN_NUM_STEPS),
            eval_args=trainer_pb2.EvalArgs(num_steps=config.EVAL_NUM_STEPS),
            eval_accuracy_threshold=config.EVAL_ACCURACY_THRESHOLD,
            serving_model_dir=pipeline_config.SERVING_MODEL_DIR,
            # query=config.BIG_QUERY_QUERY,
            # beam_pipeline_args=config.BIG_QUERY_WITH_DIRECT_RUNNER_BEAM_PIPELINE_ARGS,
            metadata_connection_config=metadata.sqlite_metadata_connection_config(
                pipeline_config.METADATA_PATH)))
def create_pipeline_api(name):
    args = reqparse.RequestParser(). \
        add_argument("description", type=str, required=True). \
        add_argument("processors", type=dict, action="append"). \
        add_argument("encoder", type=dict, required=True). \
        parse_args()
    args = from_view_dict(args)
    args['name'] = name
    if not args["processors"]:
        args["processors"] = []
    if "name" not in args['encoder'] or "instance" not in args["encoder"]:
        raise RequestError("name or instance not in encoder", "")
    for processor in args['processors']:
        if "name" not in processor or "instance" not in processor:
            raise RequestError(
                f"name or instance not in processor <{processor}>", "")
    return create_pipeline(**args)
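For reference, a request body that satisfies the checks in create_pipeline_api above might look like the following; the concrete field values are illustrative placeholders, not identifiers defined by this project.

# Hypothetical payload for create_pipeline_api("my-pipeline"). Only the shape
# is taken from the validation above: "description" and "encoder" are required,
# "processors" is an optional list, and each processor/encoder dict must carry
# "name" and "instance" keys. All values here are placeholders.
example_request_body = {
    "description": "extract image embeddings",
    "processors": [
        {"name": "image-resize", "instance": "default"},
    ],
    "encoder": {"name": "image-embedding", "instance": "default"},
}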
def run(): """Define a pipeline to be executed using Kubeflow V2 runner.""" # TODO(b/157598477) Find a better way to pass parameters from CLI handler to # pipeline DSL file, instead of using environment vars. tfx_image = os.environ.get(labels.TFX_IMAGE_ENV) project_id = os.environ.get(labels.GCP_PROJECT_ID_ENV) api_key = os.environ.get(labels.API_KEY_ENV) runner_config = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig( project_id=project_id, display_name="tfx-kubeflow-v2-pipeline-{}".format( configs.PIPELINE_NAME), default_image=tfx_image, ) dsl_pipeline = pipeline.create_pipeline( pipeline_name=configs.PIPELINE_NAME, pipeline_root=_PIPELINE_ROOT, data_path=_DATA_PATH, # TODO(step 7): (Optional) Uncomment here to use BigQueryExampleGen. # query=configs.BIG_QUERY_QUERY, preprocessing_fn=configs.PREPROCESSING_FN, run_fn=configs.RUN_FN, train_args=trainer_pb2.TrainArgs(num_steps=configs.TRAIN_NUM_STEPS), eval_args=trainer_pb2.EvalArgs(num_steps=configs.EVAL_NUM_STEPS), eval_accuracy_threshold=configs.EVAL_ACCURACY_THRESHOLD, serving_model_dir=_SERVING_MODEL_DIR, # TODO(step 7): (Optional) Uncomment here to use provide GCP related # config for BigQuery with Beam DirectRunner. # beam_pipeline_args=configs. # BIG_QUERY_WITH_DIRECT_RUNNER_BEAM_PIPELINE_ARGS, # TODO(step 8): (Optional) Uncomment below to use Dataflow. # beam_pipeline_args=configs.DATAFLOW_BEAM_PIPELINE_ARGS, # TODO(step 9): (Optional) Uncomment below to use Cloud AI Platform. # ai_platform_training_args=configs.GCP_AI_PLATFORM_TRAINING_ARGS, # TODO(step 9): (Optional) Uncomment below to use Cloud AI Platform. # ai_platform_serving_args=configs.GCP_AI_PLATFORM_SERVING_ARGS, ) runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(config=runner_config) if os.environ.get(labels.RUN_FLAG_ENV, False): # Only trigger the execution when invoked by 'run' command. runner.run(pipeline=dsl_pipeline, api_key=api_key) else: runner.compile(pipeline=dsl_pipeline, write_out=True)
def run(): """Define a beam pipeline.""" BeamDagRunner().run( pipeline.create_pipeline( pipeline_name=configs.PIPELINE_NAME, pipeline_root=PIPELINE_ROOT, data_path=DATA_PATH, preprocessing_fn=configs.PREPROCESSING_FN, run_fn=configs.RUN_FN, train_args=trainer_pb2.TrainArgs( num_steps=configs.TRAIN_NUM_STEPS), eval_args=trainer_pb2.EvalArgs(num_steps=configs.EVAL_NUM_STEPS), eval_accuracy_threshold=configs.EVAL_ACCURACY_THRESHOLD, serving_model_dir=SERVING_MODEL_DIR, metadata_connection_config=metadata. sqlite_metadata_connection_config( # noqa METADATA_PATH), ))
def run(): """Define a kubeflow pipeline.""" # Metadata config. The defaults works work with the installation of # KF Pipelines using Kubeflow. If installing KF Pipelines using the # lightweight deployment option, you may need to override the defaults. # If you use Kubeflow, metadata will be written to MySQL database inside # Kubeflow cluster. metadata_config = kubeflow_dag_runner.get_default_kubeflow_metadata_config( ) # This pipeline automatically injects the Kubeflow TFX image if the # environment variable 'KUBEFLOW_TFX_IMAGE' is defined. Currently, the tfx # cli tool exports the environment variable to pass to the pipelines. # TODO(b/157598477) Find a better way to pass parameters from CLI handler to # pipeline DSL file, instead of using environment vars. tfx_image = os.environ.get('KUBEFLOW_TFX_IMAGE', None) runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig( kubeflow_metadata_config=metadata_config, tfx_image=tfx_image) pod_labels = kubeflow_dag_runner.get_default_pod_labels() pod_labels.update({telemetry_utils.LABEL_KFP_SDK_ENV: 'tfx-template'}) kubeflow_dag_runner.KubeflowDagRunner( config=runner_config, pod_labels_to_attach=pod_labels ).run( pipeline.create_pipeline( pipeline_name=configs.PIPELINE_NAME, pipeline_root=PIPELINE_ROOT, data_path=DATA_PATH, # NOTE: Use `query` instead of `data_path` to use BigQueryExampleGen. # query=configs.BIG_QUERY_QUERY, preprocessing_fn=configs.PREPROCESSING_FN, run_fn=configs.RUN_FN, train_args=trainer_pb2.TrainArgs( num_steps=configs.TRAIN_NUM_STEPS), eval_args=trainer_pb2.EvalArgs(num_steps=configs.EVAL_NUM_STEPS), eval_accuracy_threshold=configs.EVAL_ACCURACY_THRESHOLD, serving_model_dir=SERVING_MODEL_DIR, # NOTE: Provide GCP configs to use BigQuery with Beam DirectRunner. # beam_pipeline_args=configs. # BIG_QUERY_WITH_DIRECT_RUNNER_BEAM_PIPELINE_ARGS, ))
def run(): """Define a beam pipeline.""" BeamDagRunner().run( pipeline.create_pipeline( pipeline_name=configs.PIPELINE_NAME, pipeline_root=PIPELINE_ROOT, data_path=DATA_PATH, # TODO(step 7): (Optional) Uncomment here to use BigQueryExampleGen. # query=configs.BIG_QUERY_QUERY, preprocessing_fn=configs.PREPROCESSING_FN, run_fn=configs.RUN_FN, train_args=trainer_pb2.TrainArgs( num_steps=configs.TRAIN_NUM_STEPS), eval_args=trainer_pb2.EvalArgs(num_steps=configs.EVAL_NUM_STEPS), eval_accuracy_threshold=configs.EVAL_ACCURACY_THRESHOLD, serving_model_dir=SERVING_MODEL_DIR, # TODO(step 7): (Optional) Uncomment here to use provide GCP related # config for BigQuery with Beam DirectRunner. # beam_pipeline_args=configs. # BIG_QUERY_WITH_DIRECT_RUNNER_BEAM_PIPELINE_ARGS, metadata_connection_config=metadata. sqlite_metadata_connection_config(METADATA_PATH)))
def run(): """Define a kubeflow pipeline.""" # Metadata config. The defaults works work with the installation of # KF Pipelines using Kubeflow. If installing KF Pipelines using the # lightweight deployment option, you may need to override the defaults. # If you use Kubeflow, metadata will be written to MySQL database inside # Kubeflow cluster. metadata_config = kubeflow_dag_runner.get_default_kubeflow_metadata_config( ) # This pipeline automatically injects the Kubeflow TFX image if the # environment variable 'KUBEFLOW_TFX_IMAGE' is defined. tfx_image = os.environ.get('KUBEFLOW_TFX_IMAGE', None) runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig( kubeflow_metadata_config=metadata_config, tfx_image=tfx_image) pod_labels = kubeflow_dag_runner.get_default_pod_labels().update( {telemetry_utils.LABEL_KFP_SDK_ENV: 'tfx-template'}) kubeflow_dag_runner.KubeflowDagRunner( config=runner_config, pod_labels_to_attach=pod_labels ).run( pipeline.create_pipeline( pipeline_name=configs.PIPELINE_NAME, pipeline_root=PIPELINE_ROOT, query=configs.BIG_QUERY_QUERY, run_fn=configs.RUN_FN, train_args=trainer_pb2.TrainArgs( num_steps=configs.TRAIN_NUM_STEPS), eval_args=trainer_pb2.EvalArgs(num_steps=configs.EVAL_NUM_STEPS), serving_model_dir=SERVING_MODEL_DIR, beam_pipeline_args=configs. BIG_QUERY_WITH_DIRECT_RUNNER_BEAM_PIPELINE_ARGS, ai_platform_training_args=configs.GCP_AI_PLATFORM_TRAINING_ARGS, ai_platform_serving_args=configs.GCP_AI_PLATFORM_SERVING_ARGS, ))
parser.add_argument("--log-level", type=str, default=os.environ.get("LOGLEVEL", "INFO").upper()) parser.add_argument("--sagemaker-project-id", type=str, required=True) parser.add_argument("--sagemaker-project-name", type=str, required=True) parser.add_argument("--pipeline-description", type=str, default="automated ingestion from s3 to feature store") parser.add_argument("--pipeline-name-prefix", type=str, default="s3-fs-ingest-pipeline") parser.add_argument("--dw-flow-url", type=str, required=True) parser.add_argument("--dw-flow-output-name", type=str, required=True) parser.add_argument("--s3-data-prefix", type=str, required=True) parser.add_argument("--feature-group-name", type=str, required=True) parser.add_argument("--execution-role", type=str, default="") args, _ = parser.parse_known_args() # Configure logging to output the line number and message log_format = "%(levelname)s: [%(filename)s:%(lineno)s] %(message)s" logging.basicConfig(format=log_format, level=args.log_level) pipeline = create_pipeline( pipeline_name=f"{args.pipeline_name_prefix}-{args.sagemaker_project_id}", pipeline_description=args.pipeline_description, project_id=args.sagemaker_project_id, project_name=args.sagemaker_project_name, data_wrangler_flow_s3_url=args.dw_flow_url, flow_output_name=args.dw_flow_output_name, input_data_s3_url=f"s3://{args.s3_data_prefix}", feature_group_name=args.feature_group_name, execution_role=args.execution_role, ) logger.info(f"pipeline created:") logger.info(f"{json.dumps(json.loads(pipeline.definition()), indent=2, sort_keys=True)}")
def main():
    df = get_data()
    pipeline = create_pipeline()
    # data = pipeline.fit_transform(df)
    print(df)
def wrapper(*args, **kwargs):
    create_pipeline(name=name, processors=processors, encoder=encoder)
    func(*args, **kwargs)
    delete_pipeline(name)
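The wrapper above reads like the inner function of a decorator that provisions a pipeline around a test run. A minimal sketch of the assumed enclosing factory follows; only create_pipeline and delete_pipeline come from the snippet, the factory name and its arguments are illustrative (a try/finally around func would additionally guarantee cleanup when it raises):

# Hypothetical decorator factory around the wrapper above; names other than
# create_pipeline/delete_pipeline are illustrative, not taken from the project.
import functools

def with_pipeline(name, processors, encoder):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            create_pipeline(name=name, processors=processors, encoder=encoder)
            func(*args, **kwargs)
            delete_pipeline(name)
        return wrapper
    return decorator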