class MultiRegionHousePricePredictionModelTrainer(object):
    """
    This pipeline trains an XGBoost model per region: it generates synthetic data,
    fits a model for each region, and runs predictions against the test dataset.
    """

    regions = Input(Types.List(Types.String),
                    default=["SFO", "SEA", "DEN"],
                    help="Regions to train the model for.")
    seed = Input(Types.Integer, default=7, help="Seed to use for splitting.")
    num_houses_per_region = Input(
        Types.Integer,
        default=1000,
        help="Number of houses to generate data for in each region")

    # the actual algorithm
    split = generate_and_split_data_multiloc(
        locations=regions,
        number_of_houses_per_location=num_houses_per_region,
        seed=seed)
    fit_task = parallel_fit(multi_train=split.outputs.train)
    predicted = parallel_predict(multi_models=fit_task.outputs.multi_models,
                                 multi_test=split.outputs.test)

    # Outputs: joblib-serialized models per region and the accuracy of the model per region.
    # Note: this should really be a map, but for the demo we output a simple list.
    models = Output(fit_task.outputs.multi_models,
                    sdk_type=Types.List(Types.Blob))
    accuracies = Output(predicted.outputs.accuracies,
                        sdk_type=Types.List(Types.Float))

class SageMakerHPO(object):
    train_dataset = Input(Types.MultiPartCSV, default="s3://somelocation")
    validation_dataset = Input(Types.MultiPartCSV, default="s3://somelocation")
    static_hyperparameters = Input(Types.Generic, default=example_hyperparams)
    hyperparameter_tuning_job_config = Input(
        HyperparameterTuningJobConfig,
        default=_HyperparameterTuningJobConfig(
            tuning_strategy=HyperparameterTuningStrategy.BAYESIAN,
            tuning_objective=HyperparameterTuningObjective(
                objective_type=HyperparameterTuningObjectiveType.MINIMIZE,
                metric_name="validation:error",
            ),
            training_job_early_stopping_type=TrainingJobEarlyStoppingType.AUTO,
        ),
    )

    a = simple_xgboost_hpo_job_task(
        train=train_dataset,
        validation=validation_dataset,
        static_hyperparameters=static_hyperparameters,
        hyperparameter_tuning_job_config=hyperparameter_tuning_job_config,
        num_round=IntegerParameterRange(
            min_value=2, max_value=8,
            scaling_type=HyperparameterScalingType.LINEAR),
        max_depth=IntegerParameterRange(
            min_value=5, max_value=7,
            scaling_type=HyperparameterScalingType.LINEAR),
        gamma=ContinuousParameterRange(
            min_value=0.0, max_value=0.3,
            scaling_type=HyperparameterScalingType.LINEAR),
    )

class DiabetesXGBoostModelOptimizer(object):
    """
    This pipeline trains an XGBoost model for any dataset that matches the schema
    specified in
    https://github.com/jbrownlee/Datasets/blob/master/pima-indians-diabetes.names.
    """

    # Inputs: the dataset, the fraction of the dataset to split out for validation,
    # and the seed to use to perform the split.
    # dataset = Input(Types.CSV, default=Types.CSV.create_at_known_location(
    #     "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"),
    #     help="A CSV file that matches the format https://github.com/jbrownlee/Datasets/blob/master/pima-indians-diabetes.names")
    dataset_remote_location = Input(
        Types.String,
        default="https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv",
        help="Remote location of a CSV file that matches the format https://github.com/jbrownlee/Datasets/blob/master/pima-indians-diabetes.names")
    test_split_ratio = Input(Types.Float, default=0.33,
                             help="Fraction of the dataset to hold out as the test split.")
    seed = Input(Types.Integer, default=7, help="Seed to use for splitting.")

    # the actual algorithm
    split = dxgb.get_traintest_splitdatabase(
        dataset=dataset_remote_location,
        seed=seed,
        test_split_ratio=test_split_ratio)
    fit_task = sxghpo.fit_lp(train_data=split.outputs.x_train,
                             train_target=split.outputs.y_train,
                             validation_data=split.outputs.x_test,
                             validation_target=split.outputs.y_test)
    predicted = dxgb.predict(model_ser=fit_task.outputs.model,
                             x=split.outputs.x_test)
    score_task = dxgb.metrics(predictions=predicted.outputs.predictions,
                              y=split.outputs.y_test)

    # Outputs: joblib-serialized model and the accuracy of the model.
    model = Output(fit_task.outputs.model, sdk_type=Types.Blob)
    accuracy = Output(score_task.outputs.accuracy, sdk_type=Types.Float)

class InverterDynamicWorkflow(object):
    input_a = Input(Types.Integer, default=5, help="Input for inner workflow")
    inverter_input = Input(Types.Boolean, default=False,
                           help="Should invert or not")
    lp_task = workflow_builder(task_input_num=input_a, decider=inverter_input)
    wf_output = Output(lp_task.outputs.out, sdk_type=Types.Integer)

class sup(object):
    input_1 = Input(Types.Integer)
    input_2 = Input(Types.Integer, default=5, help='Not required.')

    a = my_task(a=input_1)
    b = my_task(a=input_2)
    c = my_task(a=100)

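# For reference, a task like `my_task` above can be declared with the legacy flytekit
# decorators, exactly as in the test further below. This is a minimal sketch; the
# specific task body (adding one) is taken from that test, not from the `sup` workflow
# itself, so treat the association as an assumption.
@inputs(a=Types.Integer)
@outputs(b=Types.Integer)
@python_task
def my_task(wf_params, a, b):
    # Add one to the input and bind the result to the declared output.
    b.set(a + 1)
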
class RawContainerWorkflow(object):
    val1 = Input(Types.Integer)
    val2 = Input(Types.Integer)
    sq1 = square(val=val1)
    sq2 = square(val=val2)
    sm = sum(x=sq1.outputs.out, y=sq2.outputs.out)
    sum_of_squares = Output(sm.outputs.out, sdk_type=Types.Integer)

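# `square` and `sum` above are not shown in this snippet; given the workflow name they
# are presumably raw-container tasks. If this example uses the legacy SDK's
# SdkRawContainerTask, the `square` task might look roughly like the sketch below.
# The image, data directories, and shell command are illustrative assumptions, not
# this repo's actual definitions.
from flytekit.common.tasks.raw_container import SdkRawContainerTask
from flytekit.sdk.types import Types

square = SdkRawContainerTask(
    input_data_dir="/var/inputs",
    output_data_dir="/var/outputs",
    inputs={"val": Types.Integer},
    outputs={"out": Types.Integer},
    image="alpine",
    # Square the input with shell arithmetic and write it to the output directory.
    command=["sh", "-c", "echo $(( {{.Inputs.val}} * {{.Inputs.val}} )) | tee /var/outputs/out"],
)
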
class PrestoWorkflow(object):
    length = Input(Types.Integer, required=True, help="Int between 1 and 26")
    routing_group = Input(Types.String, required=True,
                          help="Test string with no default")
    p_task = presto_task(length=length, rg=routing_group)
    output_a = Output(p_task.outputs.results, sdk_type=schema)

class ClassifierTrainWorkflow:
    available_streams_mpblobs = Input([Types.MultiPartBlob], required=True)
    available_streams_names = Input([Types.String], required=True)
    streams_metadata_path = Input(Types.String, required=True)
    training_validation_config_json = Input(
        Types.Generic,
        default=ujson.loads(
            open(DEFAULT_TRAINING_VALIDATION_CONFIG_FILE).read()))
    validation_data_ratio = Input(Types.Float,
                                  default=DEFAULT_VALIDATION_DATA_RATIO)

    rearrange_data_task = rearrange_data(
        available_streams_mpblobs=available_streams_mpblobs,
        available_streams_names=available_streams_names,
        training_validation_config_json=training_validation_config_json,
        streams_metadata_path=streams_metadata_path,
        validation_data_ratio=validation_data_ratio,
    )

    train_on_datasets_task = train_on_datasets(
        training_validation_config_json=training_validation_config_json,
        training_clean_mpblob=rearrange_data_task.outputs.training_clean_mpblob,
        training_dirty_mpblob=rearrange_data_task.outputs.training_dirty_mpblob,
        validation_clean_mpblob=rearrange_data_task.outputs.validation_clean_mpblob,
        validation_dirty_mpblob=rearrange_data_task.outputs.validation_dirty_mpblob,
    )

    trained_models = Output(train_on_datasets_task.outputs.model_blobs,
                            sdk_type=[Types.Blob])
    model_file_names = Output(train_on_datasets_task.outputs.model_files_names,
                              sdk_type=[Types.String])

def test_workflow_no_node_dependencies_or_outputs():
    @inputs(a=Types.Integer)
    @outputs(b=Types.Integer)
    @python_task
    def my_task(wf_params, a, b):
        b.set(a + 1)

    i1 = Input(Types.Integer)
    i2 = Input(Types.Integer, default=5, help='Not required.')

    input_dict = {
        'input_1': i1,
        'input_2': i2
    }

    nodes = {
        'a': my_task(a=input_dict['input_1']),
        'b': my_task(a=input_dict['input_2']),
        'c': my_task(a=100)
    }

    w = workflow(inputs=input_dict, outputs={}, nodes=nodes)

    assert w.interface.inputs['input_1'].type == Types.Integer.to_flyte_literal_type()
    assert w.interface.inputs['input_2'].type == Types.Integer.to_flyte_literal_type()
    assert _get_node_by_id(w, 'a').inputs[0].var == 'a'
    assert _get_node_by_id(w, 'a').inputs[0].binding.promise.node_id == constants.GLOBAL_INPUT_NODE_ID
    assert _get_node_by_id(w, 'a').inputs[0].binding.promise.var == 'input_1'
    assert _get_node_by_id(w, 'b').inputs[0].binding.promise.node_id == constants.GLOBAL_INPUT_NODE_ID
    assert _get_node_by_id(w, 'b').inputs[0].binding.promise.var == 'input_2'
    assert _get_node_by_id(w, 'c').inputs[0].binding.scalar.primitive.integer == 100

class WorkflowWithIO(object):
    a = Input(Types.Integer, default=10, help="Test integer input with default")
    b = Input(Types.String, required=True, help="Test string with no default")
    odd_nums_task = find_odd_numbers_with_string(list_of_nums=[2, 3, 4, 7],
                                                 demo_string=b)
    task_output = Output(odd_nums_task.outputs.are_num_odd, sdk_type=[Boolean])
    output_a = Output(a, sdk_type=Integer)  # pass through output
    output_b = Output(odd_nums_task.outputs.altered_string, sdk_type=String)

class Child(object):
    input_1 = Input(Types.Integer)
    input_2 = Input(Types.Integer, default=5, help='Not required.')

    a = add_one(a=input_1)
    b = add_one(a=input_2)
    c = add_one(a=100)

    output = Output(c.outputs.b, sdk_type=Types.Integer)

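# A plausible definition of the `add_one` task referenced above, following the same
# @inputs/@outputs/@python_task pattern used in the test earlier in this section.
# Only the output name `b` is implied by `c.outputs.b` in the Child workflow; the body
# is an assumption made for illustration.
@inputs(a=Types.Integer)
@outputs(b=Types.Integer)
@python_task
def add_one(wf_params, a, b):
    # Bind the incremented value to the declared output.
    b.set(a + 1)
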
class OptionallyCachableWorkflow(object):
    input_if_cached_enabled = Input(Types.Float, default=10.0,
                                    help="Test float input with default")
    cache_disabled = Input(Types.Boolean, default=False,
                           help="Whether to disable cache.")
    input_generator = generate_input(wf_input=input_if_cached_enabled,
                                     cache_disabled=cache_disabled)
    dynamic_task = sample_batch_task_cachable(
        caching_input=input_generator.outputs.generated)

class BackfillWorkflow(object):
    """
    If FailingWorkflow fails, we can resurrect it and backfill the missing work
    using this BackfillWorkflow. The backfill workflow has just one step.
    """

    in_image = Input(Types.Blob, required=True)
    angle = Input(Types.Float, default=180.0)

    rotate_task = rotate(image=in_image, angle=angle, fail=False)

    out_image = Output(rotate_task.outputs.out_image, sdk_type=Types.Blob)

class PrimitiveDemoWorkflow(object):
    x = Input(Types.Integer, help="Integer")
    y = Input(Types.Float, help="Float")
    s = Input(Types.String, help="String")
    b = Input(Types.Boolean, help="Boolean")

    m = multiply(x=x, y=y)
    s1 = convert_to_str(z=m.outputs.z)
    s2 = add_bool_to_str(st=s, b=b)

    mult_str = Output(s1.outputs.s, sdk_type=Types.String)
    bool_str = Output(s2.outputs.s, sdk_type=Types.String)

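# A minimal sketch of the `multiply` task assumed by PrimitiveDemoWorkflow, using the
# same legacy-flytekit decorator pattern shown in the test above. The output name `z`
# is implied by `m.outputs.z`; the types and body are assumptions for illustration.
@inputs(x=Types.Integer, y=Types.Float)
@outputs(z=Types.Float)
@python_task
def multiply(wf_params, x, y, z):
    # Multiply the integer and float inputs and bind the product to the output.
    z.set(x * y)
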
class SimpleWorkflow(object):
    train_dataset = Input(Types.Blob)
    validation_dataset = Input(Types.Blob)

    custom = custom_training_task(dummy_train_dataset=train_dataset,
                                  dummy_validation_dataset=validation_dataset,
                                  my_input="hello world")

    final_model = Output(custom.outputs.out_model, sdk_type=Types.Blob)
    final_extra_output = Output(custom.outputs.out_extra_output_file,
                                sdk_type=Types.Blob)
    final_value = Output(custom.outputs.out, sdk_type=Types.Integer)

def nested_dynamic_wf_task(wf_params, task_input_num, out):
    wf_params.logging.info(
        "Running inner task... yielding a code generated sub workflow")

    # Inner workflow
    input_a = Input(Types.Integer, help="Tell me something")
    node1 = sq_sub_task(in1=input_a)

    MyUnregisteredWorkflowInner = workflow(
        inputs={
            'a': input_a,
        },
        outputs={
            'ooo': Output(node1.outputs.out1,
                          sdk_type=Types.Integer,
                          help='This is an integer output')
        },
        nodes={
            'node_one': node1,
        })

    setattr(MyUnregisteredWorkflowInner, 'auto_assign_name',
            manual_assign_name)
    MyUnregisteredWorkflowInner._platform_valid_name = 'unregistered'

    # Outer workflow
    input_a = Input(Types.Integer, help="Tell me something")
    node1 = MyUnregisteredWorkflowInner(a=task_input_num)

    MyUnregisteredWorkflowOuter = workflow(
        inputs={
            'a': input_a,
        },
        outputs={
            'ooo': Output(node1.outputs.ooo,
                          sdk_type=Types.Integer,
                          help='This is an integer output')
        },
        nodes={
            'node_one': node1,
        })

    setattr(MyUnregisteredWorkflowOuter, 'auto_assign_name',
            manual_assign_name)
    MyUnregisteredWorkflowOuter._platform_valid_name = 'unregistered'

    unregistered_workflow_execution = MyUnregisteredWorkflowOuter(
        a=task_input_num)

    out.set(unregistered_workflow_execution.outputs.ooo)

class SimpleWorkflow(object):
    input_1 = Input(Types.Integer)
    input_2 = Input(Types.Integer, default=5, help='Not required.')

    a = add_one(a=input_1)
    b = add_one(a=input_2)
    c = subtract_one(a=input_1)

    d = write_special_types()
    e = read_special_types(
        a=d.outputs.a,
        b=d.outputs.b,
        c=d.outputs.c,
        d=d.outputs.d,
        e=d.outputs.e,
    )

class PrestoWorkflow(object):
    ds = Input(Types.String, required=True, help="Test string with no default")
    # routing_group = Input(Types.String, required=True, help="Test string with no default")

    p_task = presto_task(ds=ds, rg="etl")

    output_a = Output(p_task.outputs.results, sdk_type=schema)

class GenericDemoWorkflow(object):
    a = Input(Types.Generic, default={}, help="Input for inner workflow")
    generic_type_example = generic_type_task(custom=a)
    generic_json = generic_to_json(
        replicated=generic_type_example.outputs.replicated)
    counts = Output(generic_type_example.outputs.counts,
                    sdk_type=Types.Generic)

def workflow_builder(wf_params, task_input_num, decider, out):
    wf_params.logging.info(
        "Running inner task... yielding a code generated sub workflow")

    input_a = Input(Types.Integer, help="Tell me something")

    if decider:
        node1 = inverse_inner_task(num=input_a)
    else:
        node1 = inner_task(num=input_a)

    MyUnregisteredWorkflow = workflow(
        inputs={
            'a': input_a,
        },
        outputs={
            'ooo': Output(node1.outputs.out,
                          sdk_type=Types.Integer,
                          help='This is an integer output')
        },
        nodes={
            'node_one': node1,
        })

    # This is an unfortunate setting that will hopefully not be necessary in the future.
    setattr(MyUnregisteredWorkflow, 'auto_assign_name', manual_assign_name)
    MyUnregisteredWorkflow._platform_valid_name = 'unregistered'

    unregistered_workflow_execution = MyUnregisteredWorkflow(a=task_input_num)
    yield unregistered_workflow_execution

    out.set(unregistered_workflow_execution.outputs.ooo)

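# `workflow_builder` is invoked from InverterDynamicWorkflow with `task_input_num` and
# `decider` inputs and an `out` output, and it yields a generated sub-workflow. In the
# legacy flytekit SDK that usually means it is declared as a dynamic task; a plausible
# decorator stack, inferred from the call site and the signature (an assumption, since
# the decorators are not shown here), would be:
#
#     @inputs(task_input_num=Types.Integer, decider=Types.Boolean)
#     @outputs(out=Types.Integer)
#     @dynamic_task
#     def workflow_builder(wf_params, task_input_num, decider, out):
#         ...
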
class StaticSubWorkflowCaller(object):
    outer_a = Input(Types.Integer, default=5, help="Input for inner workflow")
    identity_wf_execution = IdentityWorkflow(a=outer_a)
    wf_output = Output(identity_wf_execution.outputs.task_output,
                       sdk_type=Types.Integer)

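# `IdentityWorkflow` is referenced above but not defined in this snippet. A hypothetical
# definition, following the same Input/Output conventions as the other workflows here,
# could pass its input through a single task; `identity` is an assumed task name, not
# one taken from this repo.
class IdentityWorkflow(object):
    a = Input(Types.Integer, default=5, help="Value to pass through")
    identity_task = identity(a=a)
    task_output = Output(identity_task.outputs.out, sdk_type=Types.Integer)
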
class LoadTestOrchestrationWorkflow(object):
    # 30 python tasks ~= 75 i3.16x nodes on AWS Batch
    python_task_count = 30
    # 30 spark tasks ~= 60 i3.16x nodes on AWS Batch
    spark_task_count = 30
    # 3 dynamic-jobs each of 1000 tasks ~= 3*20 i3.16x nodes on AWS Batch
    djo_task_count = 1000
    dj_count = 3

    p = [None] * python_task_count
    s = [None] * spark_task_count
    d = [None] * dj_count

    # python tasks
    for i in range(0, python_task_count):
        p[i] = python_loadtest_lp()

    # dynamic-job tasks
    for i in range(0, dj_count):
        d[i] = dynamic_job_loadtest_lp(tasks_count=djo_task_count)

    # hive load tests.
    # h1 = hive_loadtest_lp()

    # spark load tests
    trigger_time = Input(Types.Datetime)
    for i in range(0, spark_task_count):
        s[i] = spark_loadtest_lp(triggered_date=trigger_time, offset=i)

class StructuredSagemakerXGBoostHPO(object):
    # Input parameters
    static_hyperparameters = Input(
        Types.Generic,
        help="A list of the static hyperparameters to pass to the training jobs.",
        default=example_hyperparams,
    )
    train_data = Input(
        Types.Schema(),
        help="A Columnar schema that contains all the features used for training.",
    )
    train_target = Input(
        Types.Schema(),
        help="A Columnar schema that contains all the labeled results for train_data.",
    )
    validation_data = Input(
        Types.Schema(),
        help="A Columnar schema that contains all the features used for validation.",
    )
    validation_target = Input(
        Types.Schema(),
        help="A Columnar schema that contains all the labeled results for validation_data.",
    )

    sagemaker_transform = convert_to_sagemaker_csv(x_train=train_data,
                                                   y_train=train_target,
                                                   x_test=validation_data,
                                                   y_test=validation_target)

    # Node definitions
    train_node = xgtrainer_task(
        static_hyperparameters=static_hyperparameters,
        train=sagemaker_transform.outputs.train,
        validation=sagemaker_transform.outputs.validation,
    )

    untar = untar_xgboost(model_tar=train_node.outputs.model)

    # Outputs
    model = Output(untar.outputs.model, sdk_type=Types.Blob)

class BatchTasksWorkflow(object):
    num_subtasks = Input(Types.Integer, default=3)
    task1 = no_inputs_sample_batch_task()
    task2 = sample_batch_task_beatles_cached(in1=num_subtasks)
    t = print_every_time(ints_to_print=task1.outputs.out_ints,
                         strings_to_print=task1.outputs.out_str)
    ints_out = Output(task1.outputs.out_ints, sdk_type=[[Types.Integer]])
    str_out = Output(task2.outputs.out_str, sdk_type=[Types.String])

class SimpleWorkflow(object):
    triggered_date = Input(Types.Datetime)
    print1a = add_one_and_print(value_to_print=3)
    print1b = add_one_and_print(value_to_print=101)
    print2 = sum_non_none(
        values_to_print=[print1a.outputs.out, print1b.outputs.out])
    print3 = add_one_and_print(value_to_print=print2.outputs.out)
    print4 = add_one_and_print(value_to_print=print3.outputs.out)
    final_value = Output(print4.outputs.out, sdk_type=Types.Integer)

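# A minimal sketch of what `add_one_and_print` might look like, reusing the decorator
# pattern and `wf_params.logging` calls seen elsewhere in these examples. The body is
# an assumption inferred from the task name and the integer `out` output that
# SimpleWorkflow consumes.
@inputs(value_to_print=Types.Integer)
@outputs(out=Types.Integer)
@python_task
def add_one_and_print(wf_params, value_to_print, out):
    updated = value_to_print + 1
    # Log the incremented value, then bind it to the output.
    wf_params.logging.info("Printed value: {}".format(updated))
    out.set(updated)
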
class DemoWorkflow(object):
    # Input parameters
    static_hyperparameters = Input(
        Types.Generic,
        help="A list of the static hyperparameters to pass to the training jobs.",
    )
    train_data = Input(Types.MultiPartCSV,
                       help="S3 path to a flat directory of CSV files.")
    validation_data = Input(Types.MultiPartCSV,
                            help="S3 path to a flat directory of CSV files.")

    # Node definitions
    train_node = xgtrainer_task(
        static_hyperparameters=example_hyperparams,
        train=train_data,
        validation=validation_data,
    )

    # Outputs
    trained_model = Output(train_node.outputs.model, sdk_type=Types.Blob)

class DataPreparationWorkflow:
    streams_external_storage_prefix = Input(Types.String, required=True)
    streams_names = Input([Types.String], required=True)
    stream_extension = Input(Types.String, default="avi")
    # video_external_paths = Input([Types.String], required=True)
    sampling_random_seed = Input(Types.Integer, default=DEFAULT_RANDOM_SEED)
    sampling_n_clusters = Input(Types.Integer,
                                default=DEFAULT_LUMINANCE_N_CLUSTERS)
    sampling_sample_size = Input(Types.Integer,
                                 default=DEFAULT_LUMINANCE_SAMPLE_SIZE)

    download_video_task = download_videos(
        streams_external_storage_prefix=streams_external_storage_prefix,
        streams_names=streams_names,
        stream_extension=stream_extension,
    )

    extract_from_video_collection_task = extract_from_video_collections(
        video_blobs=download_video_task.outputs.downloaded_streams_blobs,
    )

    luminance_select_collections_task = luminance_select_collections(
        raw_frames_mpblobs=extract_from_video_collection_task.outputs.raw_frames_mpblobs,
        n_clusters=sampling_n_clusters,
        sample_size=sampling_sample_size,
        random_seed=sampling_random_seed,
    )

    selected_frames_mpblobs = Output(
        luminance_select_collections_task.outputs.selected_image_mpblobs,
        sdk_type=[Types.MultiPartBlob])
    selected_frames_mpblobs_metadata = Output(
        luminance_select_collections_task.outputs.selected_file_names,
        sdk_type=[[Types.String]])
    streams_names_out = Output(streams_names, sdk_type=[Types.String])

class DemoWorkflow(object):
    # Input parameters
    train_data = Input(Types.MultiPartCSV,
                       help="S3 path to a flat directory of CSV files.")
    validation_data = Input(Types.MultiPartCSV,
                            help="S3 path to a flat directory of CSV files.")

    # Node definitions
    train_node = xgtrainer_task(
        static_hyperparameters={
            "eval_metric": "auc",
            "num_round": "100",
            "objective": "binary:logistic",
            "rate_drop": "0.3",
            "tweedie_variance_power": "1.4",
        },
        train=train_data,
        validation=validation_data,
    )

    # Outputs
    trained_model = Output(train_node.outputs.model, sdk_type=Types.Blob)

class FailingWorkflow(object):
    """
    This is a two-step workflow:
      Step 1: scale an image
      Step 2: rotate an image

    NOTE: This is not an efficient workflow, since scaling and rotating one image
    can be done with a single OpenCV call. It exists only as a demo.
    Step 2 will always fail here because it is hard-coded with fail=True.
    """

    in_image = Input(
        Types.Blob,
        default=Types.Blob.create_at_known_location(
            "https://miro.medium.com/max/1400/1*qL8UYfaStcEo_YVPrA4cbA.png"))
    angle = Input(Types.Float, default=180.0)
    scale = Input(Types.Integer, default=2)

    scale_task = tasks.scale(image=in_image, scale_factor=scale)
    rotate_task = tasks.rotate(image=scale_task.outputs.out_image,
                               angle=angle,
                               fail=True)

    out_image = Output(rotate_task.outputs.out_image, sdk_type=Types.Blob)

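# The `rotate` task is used by both FailingWorkflow (with fail=True) and BackfillWorkflow
# (with fail=False). A plausible declaration, inferred from those call sites, is sketched
# below; the body is only a placeholder because the actual OpenCV rotation code is not
# shown in these examples.
@inputs(image=Types.Blob, angle=Types.Float, fail=Types.Boolean)
@outputs(out_image=Types.Blob)
@python_task
def rotate(wf_params, image, angle, fail, out_image):
    if fail:
        # Hard-coded failure path used by FailingWorkflow to demonstrate recovery/backfill.
        raise Exception("Forcing the task to fail as requested")
    # ... rotate `image` by `angle` degrees (e.g. with OpenCV) and bind the
    # resulting blob to `out_image` here ...
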