def run(): """Define a kubeflow pipeline.""" metadata_config = kubeflow_dag_runner.get_default_kubeflow_metadata_config( ) tfx_image = os.environ.get('KUBEFLOW_TFX_IMAGE', None) runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig( kubeflow_metadata_config=metadata_config, tfx_image=tfx_image) pod_labels = kubeflow_dag_runner.get_default_pod_labels() pod_labels.update({telemetry_utils.LABEL_KFP_SDK_ENV: 'advert-pred'}) kubeflow_dag_runner.KubeflowDagRunner( config=runner_config, pod_labels_to_attach=pod_labels).run( pipeline.create_pipeline( pipeline_name=PIPELINE_NAME, pipeline_root=PIPELINE_ROOT, data_path=DATA_PATH, preprocessing_fn=PREPROCESSING_FN, run_fn=RUN_FN, train_args=trainer_pb2.TrainArgs(num_steps=TRAIN_NUM_STEPS), eval_args=trainer_pb2.EvalArgs(num_steps=EVAL_NUM_STEPS), eval_accuracy_threshold=EVAL_ACCURACY_THRESHOLD, serving_model_dir=SERVING_MODEL_DIR, ))
def general_dist_pipeline(conn, query, dist_label):
    return pipe.create_pipeline(
        pipe.create_fetch_data(query, conn),
        pipe.create_set_index("how_out"),
        pipe.create_rename_label("counts", dist_label, axis=1),
        clean_data_pipeline,
        pipe.create_normalise_values(dist_label),
    )
def bowler_pipeline(conn):
    return pipe.create_pipeline(
        pipe.create_fetch_data(bowler_query, conn),
        pipe.transpose_df,
        pipe.create_rename_axis("how_out"),
        pipe.create_rename_label(0, "bowler_dist", axis=1),
        clean_data_pipeline,
        pipe.create_normalise_values("bowler_dist"),
    )
def main():
    try:
        SERVICE_HOME = sys.argv[1]

        # init spark
        spark = get_spark(app_name="sample")
        # get logger
        logger = get_logger(spark, "app")

        # load data
        df = spark.read.schema(get_train_schema()).option('header', True) \
            .csv(SERVICE_HOME + '/dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv')

        # label preprocessing (only in training part)
        df = df.withColumn('label', str2num(F.col('Attrition'), {'No': 0, 'Yes': 1})) \
            .drop('Attrition')

        # separate train and valid
        (train_data, valid_data) = df.randomSplit([0.8, 0.2])

        # preprocess (pipeline / non-pipeline) / training
        logger.info('preprocessing & training')
        stages = get_stages(train_data)
        rf = RandomForestRegressor(labelCol="label", featuresCol="features", numTrees=10)
        stages.append(rf)
        mypipeline = create_pipeline(stages)
        mymodel = mypipeline.fit(train_data)

        # get validation metric
        predictions = mymodel.transform(valid_data)
        evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction",
                                        metricName="rmse")
        rmse = evaluator.evaluate(predictions)
        logger.info('valid rmse: {}'.format(rmse))

        model_path = SERVICE_HOME + '/model'
        if os.path.exists(model_path):
            shutil.rmtree(model_path)
            logger.info('model exists, removed old model')
        mymodel.save(model_path)
        logger.info('save model to {}'.format(model_path))
    except Exception:
        # log the full traceback; traceback.print_exc() returns None and would log nothing useful
        logger.error(traceback.format_exc())
    finally:
        # stop spark
        spark.stop()
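# The `str2num` helper used above is not defined in this snippet. A minimal
# sketch of what such a mapping helper might look like, assuming it converts a
# string column to integers via a lookup dict (this implementation is
# hypothetical, not the original):
from pyspark.sql import functions as F
from pyspark.sql import types as T


def str2num(col, mapping):
    """Hypothetical helper: map string values in `col` to ints via `mapping`."""
    @F.udf(returnType=T.IntegerType())
    def _lookup(value):
        # Unknown values map to None.
        return mapping.get(value)

    return _lookup(col)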
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-auth', type=str, required=False, default='credentials.json')
    parser.add_argument('-config', type=str, required=False, default='trackerConfig.json')
    args = parser.parse_args()

    try:
        test_config = Configuration(args.config)
    except ValueError as err:
        print(f'{type(err).__name__}: {err}')
        return

    db = get_db_connection(args.auth)
    update_collections(db, test_config.refresh)
    collection = db[test_config.collection]

    pipeline = create_pipeline(test_config)
    with open('pipeline.json', 'w') as f:
        print(pipeline, file=f)
    result = collection.aggregate(pipeline).next()

    page = get_header()
    for n, t in enumerate(result):
        if test_config.analysis[n]['task'].get('stats') is not None:
            test_config.analysis[n]['task'].update(
                {"aggregation": test_config.aggregation})
            q = Query(task=test_config.analysis[n]['task'],
                      output=test_config.analysis[n]['output'],
                      data={"data": result[t]})
            page += create_table(q)
        else:
            q = Query(task=test_config.analysis[n]['task'],
                      output=test_config.analysis[n]['output'],
                      data=result[t][0])
            for key in q.output:
                if ('track' in q.task or 'ratio' in q.task) and key == 'table':
                    page += get_table(q)
                if ('track' in q.task or 'ratio' in q.task) and key == 'graph':
                    create_graph(q, n)
                    page += f'<img src="graph{n}.png"></img>'

    with open(test_config.output_file, 'w') as f:
        f.write(page)
    print("done")
def node_tagged_with(context, node_name, tags):
    """Check that tagging in `pipeline_template.py` is consistent with the
    tagging descriptions in the background steps.
    """
    sys.path.append(
        str(context.root_project_dir / "src" / context.project_name.replace("-", "_"))
    )
    import pipeline  # pylint: disable=import-error

    context.project_pipeline = pipeline.create_pipeline()
    node_objs = [n for n in context.project_pipeline.nodes if n.name == node_name]
    assert node_objs
    assert set(tags) == node_objs[0].tags
def run():
    # Metadata config was missing here; without it runner_config below fails.
    metadata_config = kubeflow_dag_runner.get_default_kubeflow_metadata_config()
    tfx_image = os.environ.get('KUBEFLOW_TFX_IMAGE', None)
    runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig(
        kubeflow_metadata_config=metadata_config, tfx_image=tfx_image)

    kubeflow_dag_runner.KubeflowDagRunner(config=runner_config).run(
        pipeline.create_pipeline(
            pipeline_name=configs.PIPELINE_NAME,
            pipeline_root=PIPELINE_ROOT,
            bucket=BUCKET,
            csv_file=CSV_FILE,
            preprocessing_fn=configs.PREPROCESSING_FN,
            trainer_fn=configs.TRAINER_FN,
            train_args=configs.TRAIN_ARGS,
            eval_args=configs.EVAL_ARGS,
            serving_model_dir=SERVING_MODEL_DIR,
        ))
def run(): """Define a beam pipeline.""" BeamDagRunner().run( pipeline.create_pipeline( pipeline_name=configs.PIPELINE_NAME, pipeline_root=PIPELINE_ROOT, data_path=DATA_PATH, # TODO(step 7): (Optional) Uncomment here to use BigQueryExampleGen. # query=configs.BIG_QUERY_QUERY, preprocessing_fn=configs.PREPROCESSING_FN, trainer_fn=configs.TRAINER_FN, train_args=configs.TRAIN_ARGS, eval_args=configs.EVAL_ARGS, serving_model_dir=SERVING_MODEL_DIR, # TODO(step 7): (Optional) Uncomment here to use provide GCP related # config for BigQuery. # beam_pipeline_args=configs.BIG_QUERY_BEAM_PIPELINE_ARGS, metadata_connection_config=metadata. sqlite_metadata_connection_config(METADATA_PATH)))
def run(): """Define a kubeflow pipeline.""" # Metadata config. The defaults works work with the installation of # KF Pipelines using Kubeflow. If installing KF Pipelines using the # lightweight deployment option, you may need to override the defaults. # If you use Kubeflow, metadata will be written to MySQL database inside # Kubeflow cluster. metadata_config = kubeflow_dag_runner.get_default_kubeflow_metadata_config( ) # This pipeline automatically injects the Kubeflow TFX image if the # environment variable 'KUBEFLOW_TFX_IMAGE' is defined. Currently, the tfx # cli tool exports the environment variable to pass to the pipelines. tfx_image = os.environ.get('KUBEFLOW_TFX_IMAGE', None) runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig( kubeflow_metadata_config=metadata_config, tfx_image=tfx_image) kubeflow_dag_runner.KubeflowDagRunner(config=runner_config).run( pipeline.create_pipeline( pipeline_name=configs.PIPELINE_NAME, pipeline_root=PIPELINE_ROOT, #data_path=DATA_PATH, # TODO(step 7): (Optional) Uncomment below to use BigQueryExampleGen. query=configs.BIG_QUERY_QUERY, preprocessing_fn=configs.PREPROCESSING_FN, trainer_fn=configs.TRAINER_FN, train_args=configs.TRAIN_ARGS, eval_args=configs.EVAL_ARGS, serving_model_dir=SERVING_MODEL_DIR, # TODO(step 7): (Optional) Uncomment below to use provide GCP related # config for BigQuery. beam_pipeline_args=configs.BIG_QUERY_BEAM_PIPELINE_ARGS, # TODO(step 8): (Optional) Uncomment below to use Dataflow. # beam_pipeline_args=configs.BEAM_PIPELINE_ARGS, # TODO(step 9): (Optional) Uncomment below to use Cloud AI Platform. # ai_platform_training_args=configs.GCP_AI_PLATFORM_TRAINING_ARGS, # TODO(step 9): (Optional) Uncomment below to use Cloud AI Platform. # ai_platform_serving_args=configs.GCP_AI_PLATFORM_SERVING_ARGS, ))
import os
from pathlib import Path

from tfx.orchestration import metadata
from tfx.orchestration import pipeline as pipeline_module
from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner

from pipeline import create_pipeline

if __name__ == "__main__":
    pipeline_name = 'tfx-container-pipeline'
    tfx_root = Path(__file__).parent / 'tfx_root'
    pipeline_root = tfx_root / 'pipelines' / pipeline_name
    # Sqlite ML-metadata db path.
    metadata_path = tfx_root / 'metadata' / pipeline_name / 'metadata.db'

    components = create_pipeline()
    pipeline = pipeline_module.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root.as_posix(),
        components=components,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path.as_posix()))

    BeamDagRunner().run(pipeline)
def create_get_dist_pipeline(conn):
    return pipe.create_pipeline(create_get_all_dists(conn),
                                pipe.join_dists,
                                pipe.create_average_df(1))
def main(argv):
    del argv

    # Overwrite the use_cloud_pipelines flag if the compile_only flag is set.
    if FLAGS.compile_only:
        FLAGS.use_cloud_pipelines = True

    # Configure executors.
    if FLAGS.use_cloud_executors:
        ai_platform_training_args = {
            'project': FLAGS.project_id,
            'region': FLAGS.region,
            'masterConfig': {
                'imageUri': FLAGS.pipeline_image,
            }
        }
        trainer_custom_config = {
            ai_platform_trainer_executor.TRAINING_ARGS_KEY:
                ai_platform_training_args
        }
        trainer_custom_executor_spec = executor_spec.ExecutorClassSpec(
            ai_platform_trainer_executor.GenericExecutor)
        beam_pipeline_args = [
            '--runner=DataflowRunner',
            '--experiments=shuffle_mode=auto',
            '--project=' + FLAGS.project_id,
            '--temp_location=' + FLAGS.dataflow_temp_location,
            '--disk_size_gb=' + str(FLAGS.dataflow_disk_size),
            '--machine_type=' + FLAGS.dataflow_machine_type,
            '--region=' + FLAGS.region
        ]
    else:
        trainer_custom_config = None
        trainer_custom_executor_spec = executor_spec.ExecutorClassSpec(
            trainer_executor.GenericExecutor)
        beam_pipeline_args = [
            '--direct_running_mode=multi_processing',
            # 0 means auto-detect based on the number of CPUs available
            # during execution time.
            '--direct_num_workers=0'
        ]

    # Configure the pipeline orchestrator.
    if FLAGS.use_cloud_pipelines:
        metadata_connection_config = None
        data_root_uri = data_types.RuntimeParameter(
            name='data-root-uri', ptype=str, default=FLAGS.data_root_uri)
        schema_folder_uri = data_types.RuntimeParameter(
            name='schema-folder-uri', ptype=str, default=FLAGS.schema_folder_uri)
    else:
        metadata_connection_config = (sqlite_metadata_connection_config(
            FLAGS.sql_lite_path))
        data_root_uri = FLAGS.data_root_uri
        schema_folder_uri = FLAGS.schema_folder_uri

    # Create the pipeline.
    pipeline_def = pipeline.create_pipeline(
        pipeline_name=FLAGS.pipeline_name,
        pipeline_root=FLAGS.pipeline_root,
        serving_model_uri=FLAGS.serving_model_uri,
        data_root_uri=data_root_uri,
        schema_folder_uri=schema_folder_uri,
        eval_steps=FLAGS.eval_steps,
        train_steps=FLAGS.train_steps,
        trainer_custom_executor_spec=trainer_custom_executor_spec,
        trainer_custom_config=trainer_custom_config,
        beam_pipeline_args=beam_pipeline_args,
        metadata_connection_config=metadata_connection_config)

    # Run or compile the pipeline.
    if FLAGS.use_cloud_pipelines:
        logging.info(f'Compiling pipeline to: {FLAGS.pipeline_spec_path}')
        _compile_pipeline(pipeline_def=pipeline_def,
                          project_id=FLAGS.project_id,
                          pipeline_name=FLAGS.pipeline_name,
                          pipeline_image=FLAGS.pipeline_image,
                          pipeline_spec_path=FLAGS.pipeline_spec_path)

        if FLAGS.compile_only:
            return

        # Set runtime parameters.
        parameter_values = {
            'data-root-uri': FLAGS.data_root_uri,
            'schema-folder-uri': FLAGS.schema_folder_uri,
        }

        # Submit the run.
        logging.info('Submitting AI Platform Pipelines job ...')
        _submit_pipeline_run(project_id=FLAGS.project_id,
                             region=FLAGS.region,
                             api_key=FLAGS.api_key,
                             pipeline_spec_path=FLAGS.pipeline_spec_path,
                             pipeline_root=FLAGS.pipeline_root,
                             parameter_values=parameter_values)
    else:
        logging.info('Using local dag runner')
        LocalDagRunner().run(pipeline_def)
def main(argv):
    del argv

    beam_pipeline_args = [
        '--direct_running_mode=multi_processing',
        # 0 means auto-detect based on the number of CPUs available
        # during execution time.
        '--direct_num_workers=0'
    ]

    metadata_connection_config = None

    data_root_uri = data_types.RuntimeParameter(
        name='data-root-uri', ptype=str, default=FLAGS.data_root_uri)
    eval_split_name = data_types.RuntimeParameter(
        name='eval-split-name', ptype=str, default='eval')

    # output_config = example_gen_pb2.Output(
    #     split_config=example_gen_pb2.SplitConfig(splits=[
    #         example_gen_pb2.SplitConfig.Split(name=eval_split_name, hash_buckets=4),
    #         example_gen_pb2.SplitConfig.Split(name='test', hash_buckets=1)]))
    output_config = {
        "split_config": {
            "splits": [
                {"name": "train", "hash_buckets": 4},
                {"name": eval_split_name, "hash_buckets": 1}
            ]
        }
    }

    # Create the pipeline.
    pipeline_def = pipeline.create_pipeline(
        pipeline_name=FLAGS.pipeline_name,
        pipeline_root=FLAGS.pipeline_root,
        data_root_uri=data_root_uri,
        output_config=output_config,
        beam_pipeline_args=beam_pipeline_args,
        metadata_connection_config=metadata_connection_config)

    logging.info(f'Compiling pipeline to: {FLAGS.pipeline_spec_path}')

    metadata_config = kubeflow_dag_runner.get_default_kubeflow_metadata_config()
    runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig(
        kubeflow_metadata_config=metadata_config,
        # Specify a custom docker image to use.
        # tfx_image=tfx_image
    )
    runner = kubeflow_dag_runner.KubeflowDagRunner(
        config=runner_config, output_filename=FLAGS.pipeline_spec_path)
    runner.run(pipeline_def)
eval_steps = data_types.RuntimeParameter(name='eval-steps',
                                         default=500,
                                         ptype=int)

pipeline_root = '{}/{}/{}'.format(Config.ARTIFACT_STORE_URI,
                                  Config.PIPELINE_NAME,
                                  kfp.dsl.RUN_ID_PLACEHOLDER)

# Set KubeflowDagRunner settings.
metadata_config = kubeflow_dag_runner.get_default_kubeflow_metadata_config()

runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig(
    kubeflow_metadata_config=metadata_config,
    pipeline_operator_funcs=kubeflow_dag_runner.get_default_pipeline_operator_funcs(
        strtobool(Config.USE_KFP_SA)),
    tfx_image=Config.TFX_IMAGE)

# Compile the pipeline.
kubeflow_dag_runner.KubeflowDagRunner(config=runner_config).run(
    create_pipeline(pipeline_name=Config.PIPELINE_NAME,
                    pipeline_root=pipeline_root,
                    data_root_uri=data_root_uri,
                    train_steps=train_steps,
                    eval_steps=eval_steps,
                    enable_tuning=strtobool(Config.ENABLE_TUNING),
                    ai_platform_training_args=ai_platform_training_args,
                    ai_platform_serving_args=ai_platform_serving_args,
                    beam_pipeline_args=beam_pipeline_args))
_beam_tmp_folder = '{}/beam/tmp'.format(_artifact_store_uri)
_beam_pipeline_args = [
    '--runner=DataflowRunner',
    '--project=' + _project_id,
    '--temp_location=' + _beam_tmp_folder,
    '--region=' + _region,
]

# To run this pipeline from the python CLI:
#   $ python taxi_pipeline_hello.py
if __name__ == '__main__':
    absl.logging.set_verbosity(absl.logging.INFO)

    metadata_config = kubeflow_dag_runner.get_default_kubeflow_metadata_config()
    pipeline_operator_funcs = kubeflow_dag_runner.get_default_pipeline_operator_funcs()
    runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig(
        kubeflow_metadata_config=metadata_config,
        pipeline_operator_funcs=pipeline_operator_funcs,
        tfx_image=_tfx_image)

    kubeflow_dag_runner.KubeflowDagRunner(config=runner_config).run(
        create_pipeline(pipeline_name=_pipeline_name,
                        pipeline_root=_pipeline_root,
                        data_root=_data_root,
                        beam_pipeline_args=_beam_pipeline_args))
    inner join core_competition on core_match.competition_id=core_competition.id
    where core_batperformance.bat_how_out not in ('not out', 'no', 'DNB')
    and core_competition.league_id = {obj_id}
    group by bat_how_out;
    """

#
# Pipelines
#

clean_data_pipeline = pipe.create_pipeline(
    pipe.create_add_missing_row("lbw", 0),
    pipe.create_add_missing_row("ct", 0),
    pipe.create_add_missing_row("b", 0),
    pipe.create_add_missing_row("run out", 0),
    pipe.create_add_missing_row("st", 0),
    pipe.create_rename_label("run out", "ro", axis=0),
)


def bowler_pipeline(conn):
    return pipe.create_pipeline(
        pipe.create_fetch_data(bowler_query, conn),
        pipe.transpose_df,
        pipe.create_rename_axis("how_out"),
        pipe.create_rename_label(0, "bowler_dist", axis=1),
        clean_data_pipeline,
        pipe.create_normalise_values("bowler_dist"),
    )
eval_steps = data_types.RuntimeParameter(name='eval-steps',
                                         default=350,
                                         ptype=int)

PIPELINE_ROOT = '{}/{}/{}'.format(Config.ARTIFACT_STORE_URI,
                                  Config.PIPELINE_NAME,
                                  kfp.dsl.RUN_ID_PLACEHOLDER)

# Set KubeflowDagRunner settings.
metadata_config = kubeflow_dag_runner.get_default_kubeflow_metadata_config()

runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig(
    kubeflow_metadata_config=metadata_config,
    pipeline_operator_funcs=kubeflow_dag_runner.get_default_pipeline_operator_funcs(
        strtobool(Config.USE_KFP_SA)),
    tfx_image=Config.TFX_IMAGE)

# Compile the pipeline.
kubeflow_dag_runner.KubeflowDagRunner(config=runner_config).run(
    create_pipeline(pipeline_name=Config.PIPELINE_NAME,
                    pipeline_root=PIPELINE_ROOT,
                    data_root_uri=data_root_uri,
                    train_steps=train_steps,
                    eval_steps=eval_steps,
                    enable_tuning=strtobool(Config.ENABLE_TUNING),
                    ai_platform_training_args=AI_PLATFORM_TRAINING_ARGS,
                    ai_platform_serving_args=AI_PLATFORM_SERVING_ARGS,
                    beam_pipeline_args=BEAM_PIPELINE_ARGS))
)

pipeline_root = f'{config.ARTIFACT_STORE_URI}/{config.PIPELINE_NAME}/{kfp.dsl.RUN_ID_PLACEHOLDER}'

# Set KubeflowDagRunner settings.
metadata_config = kubeflow_dag_runner.get_default_kubeflow_metadata_config()

runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig(
    kubeflow_metadata_config=metadata_config,
    pipeline_operator_funcs=kubeflow_dag_runner.get_default_pipeline_operator_funcs(
        config.USE_KFP_SA == 'True'),
    tfx_image=config.ML_IMAGE_URI
)

# Compile the pipeline.
kubeflow_dag_runner.KubeflowDagRunner(config=runner_config).run(
    pipeline.create_pipeline(
        pipeline_name=config.PIPELINE_NAME,
        pipeline_root=pipeline_root,
        project_id=config.PROJECT_ID,
        bq_dataset_name=config.BQ_DATASET_NAME,
        min_item_frequency=min_item_frequency,
        max_group_size=max_group_size,
        dimensions=dimensions,
        num_leaves=num_leaves,
        eval_min_recall=eval_min_recall,
        eval_max_latency=eval_max_latency,
        ai_platform_training_args=ai_platform_training_args,
        beam_pipeline_args=beam_pipeline_args,
        model_regisrty_uri=config.MODEL_REGISTRY_URI)
)
                                          ptype=int)
eval_steps = data_types.RuntimeParameter(name='eval-steps',
                                         default=500,
                                         ptype=int)

pipeline_root = '{}/{}/{}'.format(Config.ARTIFACT_STORE_URI,
                                  Config.PIPELINE_NAME,
                                  kfp.dsl.RUN_ID_PLACEHOLDER)

# Set KubeflowDagRunner settings.
metadata_config = kubeflow_dag_runner.get_default_kubeflow_metadata_config()

runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig(
    kubeflow_metadata_config=metadata_config,
    pipeline_operator_funcs=kubeflow_dag_runner.get_default_pipeline_operator_funcs(
        Config.USE_KFP_SA == 'True'),
    tfx_image=Config.TFX_IMAGE)

# Compile the pipeline.
kubeflow_dag_runner.KubeflowDagRunner(config=runner_config).run(
    create_pipeline(pipeline_name=Config.PIPELINE_NAME,
                    pipeline_root=pipeline_root,
                    data_root_uri=data_root_uri,
                    train_steps=train_steps,
                    eval_steps=eval_steps,
                    ai_platform_training_args=ai_platform_training_args,
                    ai_platform_serving_args=ai_platform_serving_args,
                    beam_pipeline_args=beam_pipeline_args))
accuracy_threshold = data_types.RuntimeParameter(name='accuracy-threshold',
                                                 default=0.75,
                                                 ptype=float)

pipeline_root = '{}/{}/{}'.format(config.ARTIFACT_STORE_URI,
                                  config.PIPELINE_NAME,
                                  kfp.dsl.RUN_ID_PLACEHOLDER)

# Set KubeflowDagRunner settings.
metadata_config = kubeflow_dag_runner.get_default_kubeflow_metadata_config()

runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig(
    kubeflow_metadata_config=metadata_config,
    pipeline_operator_funcs=kubeflow_dag_runner.get_default_pipeline_operator_funcs(
        config.USE_KFP_SA == 'True'),
    tfx_image=config.TFX_IMAGE)

# Compile the pipeline.
kubeflow_dag_runner.KubeflowDagRunner(config=runner_config).run(
    pipeline.create_pipeline(
        pipeline_name=config.PIPELINE_NAME,
        pipeline_root=pipeline_root,
        dataset_name=config.DATASET_NAME,
        train_steps=train_steps,
        eval_steps=eval_steps,
        accuracy_threshold=accuracy_threshold,
        ai_platform_training_args=ai_platform_training_args,
        ai_platform_serving_args=ai_platform_serving_args,
        beam_pipeline_args=beam_pipeline_args,
        model_regisrty_uri=config.MODEL_REGISTRY_URI))
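# Once compiled, the package produced by KubeflowDagRunner can be submitted
# with the KFP SDK, passing values for the RuntimeParameters declared above.
# A minimal sketch assuming a KFP v1 client, that train-steps and eval-steps
# were declared the same way as accuracy-threshold, and that the compiled
# package is named '<PIPELINE_NAME>.tar.gz'; the host URL, experiment name,
# and parameter values are placeholders.
import kfp

client = kfp.Client(host='https://<your-kfp-endpoint>')  # placeholder host
client.create_run_from_pipeline_package(
    pipeline_file=f'{config.PIPELINE_NAME}.tar.gz',
    experiment_name='default',  # placeholder experiment
    arguments={
        'train-steps': 5000,
        'eval-steps': 500,
        'accuracy-threshold': 0.80,
    })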