def create_pl_comp():
    '''Creates pipeline components from Python functions. Returns a list of components.'''

    # Define the component code as standalone Python functions:
    def add(a: float, b: float) -> float:
        '''Calculates the sum of two arguments'''
        return a + b

    def multiply(c: float, d: float) -> float:
        '''Calculates the product'''
        return c * d

    # Convert the Python functions to task factories (functions that return a task object).
    add_op = comp.create_component_from_func(
        add,
        output_component_file='add_component.yaml',
    )
    # add_op is a factory function used to create kfp.dsl.ContainerOp instances for your pipeline.
    add_op.component_spec.save('add_component.yaml')
    #add_op.component.OutputTextFile('Output.txt')

    # product_op is a task factory that creates a task object when given arguments.
    product_op = comp.create_component_from_func(
        multiply, output_component_file='multiple_component.yaml')

    component_lst = [add_op, product_op]
    return component_lst
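A short usage sketch (not part of the original snippet): how the two task factories returned by create_pl_comp() could be wired into a KFP v1 pipeline and compiled. The pipeline name, default values, and output file name are illustrative assumptions.

import kfp
import kfp.dsl as dsl

add_op, product_op = create_pl_comp()


@dsl.pipeline(name='add-multiply-demo',
              description='Illustrative pipeline using the factories above')
def add_multiply_pipeline(a: float = 1.0, b: float = 2.0, d: float = 3.0):
    # Calling a task factory creates a pipeline task (ContainerOp) instance.
    add_task = add_op(a, b)
    # Single-output components expose their result as `.output`.
    product_task = product_op(add_task.output, d)


kfp.compiler.Compiler().compile(add_multiply_pipeline, 'add_multiply_pipeline.yaml')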
def test_handling_list_arguments_containing_serializable_python_objects(self):
    '''Checks that lists containing Python objects with .to_struct() can be properly serialized.'''

    class MyClass:
        def to_struct(self):
            return {'foo': [7, 42]}

    def assert_values_are_correct(
        list_param: list,
        dict_param: dict,
    ) -> int:
        import unittest
        unittest.TestCase().assertEqual(list_param, [1, {'foo': [7, 42]}, 3])
        unittest.TestCase().assertEqual(dict_param, {'k1': {'foo': [7, 42]}})
        return 1

    task_factory = comp.create_component_from_func(assert_values_are_correct)

    self.helper_test_component_using_local_call(
        task_factory,
        arguments=dict(
            list_param=[1, MyClass(), 3],
            dict_param={'k1': MyClass()},
        ),
        expected_output_values={'Output': '1'},
    )
def test_python_component_decorator(self):
    # Deprecated
    from kfp.dsl import python_component
    from kfp.components import create_component_from_func

    expected_name = 'Sum component name'
    expected_description = 'Sum component description'
    expected_image = 'org/image'

    @python_component(
        name=expected_name,
        description=expected_description,
        base_image=expected_image)
    def add_two_numbers_decorated(
        a: float,
        b: float,
    ) -> float:
        '''Returns sum of two arguments'''
        return a + b

    op = create_component_from_func(add_two_numbers_decorated)

    component_spec = op.component_spec
    self.assertEqual(component_spec.name, expected_name)
    self.assertEqual(component_spec.description.strip(), expected_description.strip())
    self.assertEqual(component_spec.implementation.container.image, expected_image)
def test_annotations_stripping(self):
    import typing
    import collections

    MyFuncOutputs = typing.NamedTuple('Outputs', [('sum', int), ('product', int)])

    class CustomType1:
        pass

    def my_func(
        param1: CustomType1 = None,  # This caused failure previously
        param2: collections.OrderedDict = None,  # This caused failure previously
    ) -> MyFuncOutputs:  # This caused failure previously
        assert param1 is None
        assert param2 is None
        return (8, 15)

    task_factory = comp.create_component_from_func(my_func)

    self.helper_test_component_using_local_call(
        task_factory,
        arguments={},
        expected_output_values={
            'sum': '8',
            'product': '15',
        })
def test_fail_on_handling_list_arguments_containing_python_objects(self):
    '''Checks that lists containing Python objects without .to_struct() raise an error during serialization.'''

    class MyClass:
        pass

    def consume_list(list_param: list) -> int:
        return 1

    def consume_dict(dict_param: dict) -> int:
        return 1

    list_op = comp.create_component_from_func(consume_list)
    dict_op = comp.create_component_from_func(consume_dict)

    with self.assertRaises(Exception):
        list_op([1, MyClass(), 3])

    with self.assertRaises(Exception):
        dict_op({'k1': MyClass()})
def artifact_passing_pipeline():
    producer_task = producer_op()
    processor_task = processor_op(producer_task.outputs['output_1'],
                                  producer_task.outputs['output_2'])
    consumer_task = consumer_op(processor_task.outputs['output_1'],
                                processor_task.outputs['output_2'])

    markdown_task = create_component_from_func(func=metadata_and_metrics)()

    # This line is only needed for compiling using dsl-compile to work
    kfp.dsl.get_pipeline_conf().data_passing_method = volume_based_data_passing_method
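The volume_based_data_passing_method referenced above is not defined in this excerpt. A plausible construction, assuming the KubernetesVolume data-passing method from the KFP v1 SDK (kfp.dsl.data_passing_methods) and a pre-created PVC; the volume name, claim name, and path prefix are illustrative assumptions.

from kubernetes.client.models import V1Volume, V1PersistentVolumeClaimVolumeSource
from kfp.dsl import data_passing_methods

# Hypothetical PVC name 'data-volume'; the path prefix is also an assumption.
volume_based_data_passing_method = data_passing_methods.KubernetesVolume(
    volume=V1Volume(
        name='data',
        persistent_volume_claim=V1PersistentVolumeClaimVolumeSource('data-volume'),
    ),
    path_prefix='artifact_data/',
)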
def test_handling_list_arguments_containing_pipelineparam(self):
    '''Checks that lists containing PipelineParam can be properly serialized.'''

    def consume_list(list_param: list) -> int:
        pass

    import kfp
    task_factory = create_component_from_func(consume_list)
    task = task_factory([1, 2, 3, kfp.dsl.PipelineParam('aaa'), 4, 5, 6])

    full_command_line = task.command + task.arguments
    for arg in full_command_line:
        self.assertNotIn('PipelineParam', arg)
def test_compile_pipeline_with_importer_on_inputpath_should_raise_error(self):

    # YAML component authoring
    component_op = components.load_component_from_text("""
name: component with misused placeholder
inputs:
- {name: model, type: Model}
implementation:
  container:
    image: dummy
    args:
    - {inputPath: model}
""")

    @dsl.pipeline(name='my-component')
    def my_pipeline(model):
        component_op(model=model)

    with self.assertRaisesRegex(
            TypeError,
            'Input "model" with type "Model" is not connected to any upstream '
            'output. However it is used with InputPathPlaceholder.'):
        compiler.Compiler().compile(pipeline_func=my_pipeline,
                                    pipeline_root='dummy',
                                    output_path='output.json')

    # Python function based component authoring
    def my_component(datasets: components.InputPath('Datasets')):
        pass

    component_op = components.create_component_from_func(my_component)

    @dsl.pipeline(name='my-component')
    def my_pipeline(datasets):
        component_op(datasets=datasets)

    with self.assertRaisesRegex(
            TypeError,
            'Input "datasets" with type "Datasets" is not connected to any upstream '
            'output. However it is used with InputPathPlaceholder.'):
        compiler.Compiler().compile(pipeline_func=my_pipeline,
                                    pipeline_root='dummy',
                                    output_path='output.json')
def main(args):
    OUT_COMPONENTS_DIR = args.output_component_dir
    OUT_PIPELINE_DIR = args.output_pipeline_dir

    # Write the component file for the Python function
    hello_component = cpt.create_component_from_func(
        func=hello_kubeflow,
        output_component_file=f'{OUT_COMPONENTS_DIR}/hello_kubeflow.component')

    # Read the component file back
    hello_component = cpt.load_component_from_file(
        filename=f'{OUT_COMPONENTS_DIR}/hello_kubeflow.component')

    # Write a pipeline function using the Kubeflow Pipelines DSL
    @dsl.pipeline(name='Hello Kubeflow Pipeline',
                  description='A Hello Kubeflow pipeline')
    def hello_kubeflow_pipeline(name='Ivan'):
        task = hello_component(name)

    # Compile the pipeline to generate a compressed YAML definition of the pipeline
    cmp.Compiler().compile(
        pipeline_func=hello_kubeflow_pipeline,
        package_path=f'{OUT_PIPELINE_DIR}/hello_kubeflow_pipeline.zip')
def compile():
    create_component_from_func(
        upload_model,
        output_component_file='./kf_hs/steps/upload_model/component.yaml',
        base_image='kineticcookie/demo-kf',
    )
    create_component_from_func(
        update_application,
        output_component_file='./kf_hs/steps/update_application/component.yaml',
        base_image='kineticcookie/demo-kf',
    )
    create_component_from_func(
        train,
        output_component_file='./kf_hs/steps/train_model/component.yaml',
        base_image='kineticcookie/demo-kf',
    )
    compiler.Compiler().compile(pipeline, "./output/pipeline.tar.gz")
    print("Done! Compiled to ./output/pipeline.tar.gz")
def main(args):
    OUT_COMPONENTS_DIR = args.output_component_dir
    OUT_PIPELINE_DIR = args.output_pipeline_dir

    # Because tokenization needs a non-standard library, build a containerized component
    tokenizer_component = cpt.func_to_container_op(
        tokenizer,
        packages_to_install=['nltk==3.5'],
        output_component_file=f'{OUT_COMPONENTS_DIR}/tokenizer.component')

    count_tokens_component = cpt.create_component_from_func(
        count_tokens,
        output_component_file=f'{OUT_COMPONENTS_DIR}/count_tokens.component')

    @dsl.pipeline(name='Count Kubeflow Pipeline',
                  description='Count the number of tokens in a sentence')
    def count_kubeflow_pipeline(sentence='Ciao Kubeflow, come stai oggi?'):
        tokenizer_task = tokenizer_component(sentence=sentence)
        count_tokens_task = count_tokens_component(tokens=tokenizer_task.output)

    compiler = cmp.Compiler()
    compiler.compile(
        pipeline_func=count_kubeflow_pipeline,
        package_path=f'{OUT_PIPELINE_DIR}/count_kubeflow_pipeline.zip')
    if label_column is not None:
        df = df.drop(columns=[df.columns[label_column]])

    testing_data = xgboost.DMatrix(data=df)

    model = xgboost.Booster(model_file=model_path)

    predictions = model.predict(testing_data)

    Path(predictions_path).parent.mkdir(parents=True, exist_ok=True)
    numpy.savetxt(predictions_path, predictions)


if __name__ == '__main__':
    create_component_from_func(
        xgboost_predict,
        output_component_file='component.yaml',
        base_image='python:3.7',
        packages_to_install=[
            'xgboost==1.1.1',
            'pandas==1.0.5',
        ],
        annotations={
            "author": "Alexey Volkov <*****@*****.**>",
            "canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/XGBoost/Predict/component.yaml",
        },
    )
from kfp.v2 import compiler


def flip_coin() -> str:
    """Flip a coin and output heads or tails randomly."""
    import random
    result = 'heads' if random.randint(0, 1) == 0 else 'tails'
    return result


def print_msg(msg: str):
    """Print a message."""
    print(msg)


flip_coin_op = components.create_component_from_func(flip_coin)
print_op = components.create_component_from_func(print_msg)


@dsl.pipeline(name='nested-conditions-pipeline')
def my_pipeline():
    flip1 = flip_coin_op()
    print_op(flip1.output)
    flip2 = flip_coin_op()
    print_op(flip2.output)

    with dsl.Condition(flip1.output != 'no-such-result'):  # always true
        flip3 = flip_coin_op()
        print_op(flip3.output)
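The kfp.v2 compiler imported at the top of this snippet is not exercised in the excerpt. A minimal, hedged way to compile my_pipeline with it; the output package path is an illustrative assumption.

if __name__ == '__main__':
    # Compile the pipeline into a v2 pipeline spec (JSON package).
    compiler.Compiler().compile(
        pipeline_func=my_pipeline,
        package_path='nested_conditions_pipeline.json')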
    df = df.drop([0], axis=0)
    weather_df = dfreplace(df, ',', '')
    for i in weather_df.columns:
        weather_df[i] = weather_df[i].astype(str)
        weather_df[i][weather_df[i].apply(
            lambda i: True if re.search(r'^\s*$', str(i)) else False)] = np.NaN
    print(weather_df.columns)
    print(weather_df)
    print('trying to write to GS')
    weather_df.to_parquet(raw_data_path, compression='GZIP')
    print('Done!')
    return raw_data_path


# %%
# create a KFP component
download_raw_data_op = comp.create_component_from_func(
    download_raw_data,
    output_component_file='download_raw_data.yaml',
    packages_to_install=['fastparquet', 'fsspec', 'gcsfs', 'google-cloud-storage'])

# %%
"""
#### Component - Feature processing
"""


# %%
def feature_processing(raw_data_path: str, new_feature_data_path: str) -> str:
    '''Calculates features for our machine learning model.'''
    import pandas as pd
    from datetime import datetime

    # read dataframe
    weather_df = pd.read_parquet(raw_data_path)
        predictions_path: Output path for the predictions.
        label_column: Column containing the label data.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    from pathlib import Path

    import numpy
    import xgboost

    csv_data_spec = data_path + '?format=csv'
    # Only specifying the column if it's passed.
    if label_column is not None:
        csv_data_spec += '&label_column=' + str(label_column)
    testing_data = xgboost.DMatrix(csv_data_spec)

    model = xgboost.Booster(model_file=model_path)

    predictions = model.predict(testing_data)

    Path(predictions_path).parent.mkdir(parents=True, exist_ok=True)
    numpy.savetxt(predictions_path, predictions)


if __name__ == '__main__':
    create_component_from_func(
        xgboost_predict,
        output_component_file='component.yaml',
        base_image='python:3.7',
        packages_to_install=['xgboost==1.0.2'])
    data_path: InputPath('ApacheArrowFeather'),
    output_data_path: OutputPath('ApacheParquet'),
):
    '''Converts Apache Arrow Feather to Apache Parquet.

    [Apache Arrow Feather](https://arrow.apache.org/docs/python/feather.html)
    [Apache Parquet](https://parquet.apache.org/)

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    from pyarrow import feather, parquet

    table = feather.read_table(data_path)
    parquet.write_table(table, output_data_path)


if __name__ == '__main__':
    create_component_from_func(
        convert_apache_arrow_feather_to_apache_parquet,
        output_component_file='component.yaml',
        base_image='python:3.7',
        packages_to_install=['pyarrow==0.17.1'],
        annotations={
            "author": "Alexey Volkov <*****@*****.**>",
            "canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/_converters/ApacheParquet/from_ApacheArrowFeather/component.yaml",
        },
    )
        'metrics': metric_specs,
    }
    study = {'study_config': study_config}

    create_study_request = ml_api.projects().locations().studies().create(
        parent=f'projects/{gcp_project_id}/locations/{gcp_region}',
        studyId=study_id,
        body=study,
    )
    create_study_response = create_study_request.execute()
    study_name = create_study_response['name']
    return (study_name, )


if __name__ == '__main__':
    create_study_in_gcp_ai_platform_optimizer_op = create_component_from_func(
        create_study_in_gcp_ai_platform_optimizer,
        base_image='python:3.8',
        packages_to_install=[
            'google-api-python-client==1.12.3',
            'google-cloud-storage==1.31.2',
            'google-auth==1.21.3'
        ],
        output_component_file='component.yaml',
        annotations={
            "author": "Alexey Volkov <*****@*****.**>",
            "canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Optimizer/Create_study/component.yaml",
        },
    )
    table_specs = [s for s in list_table_specs_response]
    print('table_specs=')
    print(table_specs)

    table_spec_name = table_specs[table_index].name

    list_column_specs_response = client.list_column_specs(table_spec_name)
    column_specs = [s for s in list_column_specs_response]
    print('column_specs=')
    print(column_specs)

    target_column_spec = [s for s in column_specs
                          if s.display_name == target_column_name][0]
    feature_column_specs = [s for s in column_specs
                            if s.display_name != target_column_name]
    feature_column_names = [s.name for s in feature_column_specs]

    import json
    return (target_column_spec.name, json.dumps(feature_column_names))


if __name__ == '__main__':
    from kfp.components import create_component_from_func

    automl_split_dataset_table_column_names_op = create_component_from_func(
        automl_split_dataset_table_column_names,
        output_component_file='component.yaml',
        base_image='python:3.7',
        annotations={
            "author": "Alexey Volkov <*****@*****.**>",
            "canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/split_dataset_table_column_names/component.yaml",
        },
    )
    folds = list(splitter.split(df))

    fold_paths = [
        (train_1_path, test_1_path),
        (train_2_path, test_2_path),
        (train_3_path, test_3_path),
        (train_4_path, test_4_path),
        (train_5_path, test_5_path),
    ]

    for i in range(max_number_of_folds):
        (train_path, test_path) = fold_paths[i]
        if i < len(folds):
            (train_indices, test_indices) = folds[i]
            train_fold = df.iloc[train_indices]
            test_fold = df.iloc[test_indices]
        else:
            train_fold = df.iloc[0:0]
            test_fold = df.iloc[0:0]
        train_fold.to_csv(train_path, index=False)
        test_fold.to_csv(test_path, index=False)


if __name__ == '__main__':
    split_table_into_folds_op = create_component_from_func(
        split_table_into_folds,
        base_image='python:3.7',
        packages_to_install=['scikit-learn==0.23.1', 'pandas==1.0.5'],
        output_component_file='component.yaml',
    )
    model.compile(
        loss=loss_name,
        optimizer=optimizer,
        metrics=metrics,
    )

    history = model.fit(
        x_train,
        y_train_one_hot,
        batch_size=batch_size,
        epochs=num_epochs,
        shuffle=True,
    )

    model.save(model_path)

    metrics_history = {
        name: [float(value) for value in values]
        for name, values in history.history.items()
    }
    final_metrics = {name: values[-1] for name, values in metrics_history.items()}
    final_loss = final_metrics['loss']
    return (final_loss, final_metrics, metrics_history)


if __name__ == '__main__':
    keras_train_classifier_from_csv_op = create_component_from_func(
        keras_train_classifier_from_csv,
        base_image='tensorflow/tensorflow:2.2.0',
        packages_to_install=['keras==2.3.1', 'pandas==1.0.5'],
        output_component_file='component.yaml',
    )
        number_of_items=number_of_items,
        max_absolute_error=max_absolute_error,
        mean_absolute_error=mean_absolute_error,
        mean_squared_error=mean_squared_error,
        root_mean_squared_error=root_mean_squared_error,
    )

    return (
        number_of_items,
        max_absolute_error,
        mean_absolute_error,
        mean_squared_error,
        root_mean_squared_error,
        metrics,
    )


if __name__ == '__main__':
    calculate_regression_metrics_from_csv_op = create_component_from_func(
        calculate_regression_metrics_from_csv,
        output_component_file='component.yaml',
        base_image='python:3.7',
        packages_to_install=['numpy==1.19.0'],
        annotations={
            "author": "Alexey Volkov <*****@*****.**>",
            "canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/ml_metrics/Calculate_regression_metrics/from_CSV/component.yaml",
        },
    )
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fail pipeline."""

from kfp import components, dsl


def fail():
    '''Fails'''
    import sys
    sys.exit(1)


fail_op = components.create_component_from_func(fail, base_image='alpine:latest')


@dsl.pipeline(name='fail_pipeline')
def fail_pipeline():
    fail_task = fail_op()
    if label_column:
        column_descriptions = {label_column: 'Label'}
        column_description_path = tempfile.NamedTemporaryFile(delete=False).name
        with open(column_description_path, 'w') as column_description_file:
            for idx, kind in column_descriptions.items():
                column_description_file.write('{}\t{}\n'.format(idx, kind))
    else:
        column_description_path = None

    eval_data = Pool(
        data_path,
        column_description=column_description_path,
        has_header=True,
        delimiter=',',
    )

    model = CatBoost()
    model.load_model(model_path)

    predictions = model.predict(eval_data, prediction_type='Probability')
    numpy.savetxt(predictions_path, predictions)


if __name__ == '__main__':
    catboost_predict_class_probabilities_op = create_component_from_func(
        catboost_predict_class_probabilities,
        output_component_file='component.yaml',
        base_image='python:3.7',
        packages_to_install=['catboost==0.23'])
from kfp.components import create_component_from_func


def build_list(
    item_1: dict = None,
    item_2: dict = None,
    item_3: dict = None,
    item_4: dict = None,
    item_5: dict = None,
) -> list:
    """Creates a JSON array from multiple items.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    """
    result = []
    for item in [item_1, item_2, item_3, item_4, item_5]:
        if item is not None:
            result.append(item)
    return result


if __name__ == '__main__':
    build_list_op = create_component_from_func(
        build_list,
        base_image='python:3.8',
        output_component_file='component.yaml',
    )
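A brief usage sketch (not part of the original component file): calling the build_list_op factory inside a pipeline to collect several dict arguments into a single JSON list output. The pipeline name and literal values are illustrative assumptions.

from kfp import dsl


@dsl.pipeline(name='build-list-demo')
def build_list_demo_pipeline():
    # Items left as None are skipped by build_list, so passing only two is fine.
    list_task = build_list_op(
        item_1={'model': 'model_a'},
        item_2={'model': 'model_b'},
    )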
    input_config = {
        'bigquery_source': {
            'input_uri': input_uri,
        },
    }
    response = client.import_data(
        dataset_path,
        input_config,
        retry or google.api_core.gapic_v1.method.DEFAULT,
        timeout or google.api_core.gapic_v1.method.DEFAULT,
        metadata,
    )
    result = response.result()
    print(result)
    metadata = response.metadata
    print(metadata)
    return (dataset_path)


if __name__ == '__main__':
    from kfp.components import create_component_from_func

    automl_import_data_from_bigquery_op = create_component_from_func(
        automl_import_data_from_bigquery,
        output_component_file='component.yaml',
        base_image='python:3.7',
        annotations={
            "author": "Alexey Volkov <*****@*****.**>",
            "canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/import_data_from_bigquery/component.yaml",
        },
    )
        should_stop_trial = True
        complete_response = trials_api.complete(
            name=fix_resource_name(trial_name),
        ).execute()
        return (trial_name, complete_response, should_stop_trial)
    else:
        check_early_stopping_response = trials_api.checkEarlyStoppingState(
            name=fix_resource_name(trial_name),
        ).execute()
        operation_name = check_early_stopping_response['name']
        while True:
            get_operation_response = operations_api.get(
                name=fix_resource_name(operation_name),
            ).execute()
            if get_operation_response.get('done'):
                break
            logging.info('Not finished yet: ' + str(get_operation_response))
            time.sleep(10)
        operation_response = get_operation_response['response']
        should_stop_trial = operation_response['shouldStop']

    return (trial_name, add_measurement_response, should_stop_trial)


if __name__ == '__main__':
    add_measurement_for_trial_in_gcp_ai_platform_optimizer_op = create_component_from_func(
        add_measurement_for_trial_in_gcp_ai_platform_optimizer,
        base_image='python:3.8',
        packages_to_install=[
            'google-api-python-client==1.12.3',
            'google-cloud-storage==1.31.2',
            'google-auth==1.21.3'
        ],
        output_component_file='component.yaml',
    )
from kfp.components import create_component_from_func, InputPath, OutputPath


def convert_to_tensorflow_saved_model_from_onnx_model(
    model_path: InputPath('OnnxModel'),
    converted_model_path: OutputPath('TensorflowSavedModel'),
):
    import onnx
    import onnx_tf

    onnx_model = onnx.load(model_path)
    tf_rep = onnx_tf.backend.prepare(onnx_model)
    tf_rep.export_graph(converted_model_path)


if __name__ == '__main__':
    convert_to_tensorflow_saved_model_from_onnx_model_op = create_component_from_func(
        convert_to_tensorflow_saved_model_from_onnx_model,
        output_component_file='component.yaml',
        base_image='tensorflow/tensorflow:2.4.1',
        packages_to_install=['onnx-tf==1.7.0', 'onnx==1.8.0'],  # onnx-tf==1.7.0 is not compatible with onnx==1.8.1
        annotations={
            "author": "Alexey Volkov <*****@*****.**>",
            "canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/_converters/OnnxModel/to_TensorflowSavedModel/component.yaml",
        },
    )
            model_format=model_format,
            gcs_destination=automl.GcsDestination(
                output_uri_prefix=gcs_output_uri_prefix,
            ),
        ),
    )
    print('Operation started:')
    print(response.operation)
    result = response.result()
    metadata = response.metadata
    print('Operation finished:')
    print(metadata)
    return (metadata.export_model_details.output_info.gcs_output_directory, )


if __name__ == '__main__':
    automl_export_model_to_gcs_op = create_component_from_func(
        automl_export_model_to_gcs,
        output_component_file='component.yaml',
        base_image='python:3.8',
        packages_to_install=[
            'google-cloud-automl==2.0.0',
        ],
        annotations={
            "author": "Alexey Volkov <*****@*****.**>",
            "canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/gcp/automl/export_model_to_gcs/component.yaml",
        },
    )
"""Two step v2-compatible pipeline.""" from kfp import components, dsl from kfp.components import InputPath, OutputPath def preprocess(uri: str, some_int: int, output_parameter_one: OutputPath(int), output_dataset_one: OutputPath('Dataset')): '''Dummy Preprocess Step.''' with open(output_dataset_one, 'w') as f: f.write('Output dataset') with open(output_parameter_one, 'w') as f: f.write("{}".format(1234)) preprocess_op = components.create_component_from_func(preprocess, base_image='python:3.9') @components.create_component_from_func def train_op(dataset: InputPath('Dataset'), model: OutputPath('Model'), num_steps: int = 100): '''Dummy Training Step.''' with open(dataset, 'r') as input_file: input_string = input_file.read() with open(model, 'w') as output_file: for i in range(num_steps): output_file.write("Step {}\n{}\n=====\n".format( i, input_string))
def run_component(args):
    OUT_COMPONENTS_DIR = args.out_component_dir

    get_word_component = cpt.create_component_from_func(
        get_word,
        output_component_file=f'{OUT_COMPONENTS_DIR}/get_word.component')