class MultiRegionHousePricePredictionModelTrainer(object):
    """
    This pipeline generates synthetic data, trains an XGBoost model per region, and runs predictions against the test dataset.
    """
    regions = Input(Types.List(Types.String),
                    default=["SFO", "SEA", "DEN"],
                    help="Regions for where to train the model.")
    seed = Input(Types.Integer, default=7, help="Seed to use for splitting.")
    num_houses_per_region = Input(
        Types.Integer,
        default=1000,
        help="Number of houses to generate data for in each region")

    # the actual algorithm
    split = generate_and_split_data_multiloc(
        locations=regions,
        number_of_houses_per_location=num_houses_per_region,
        seed=seed)
    fit_task = parallel_fit(multi_train=split.outputs.train)
    predicted = parallel_predict(multi_models=fit_task.outputs.multi_models,
                                 multi_test=split.outputs.test)

    # Outputs: joblib-serialized models per region and the accuracy of the model for each region
    # Note: ideally this would be a map, but for the demo we output a simple list
    models = Output(fit_task.outputs.multi_models,
                    sdk_type=Types.List(Types.Blob))
    accuracies = Output(predicted.outputs.accuracies,
                        sdk_type=Types.List(Types.Float))
Example #2
class SageMakerHPO(object):
    train_dataset = Input(Types.MultiPartCSV, default="s3://somelocation")
    validation_dataset = Input(Types.MultiPartCSV, default="s3://somelocation")
    static_hyperparameters = Input(Types.Generic, default=example_hyperparams)
    hyperparameter_tuning_job_config = Input(
        HyperparameterTuningJobConfig,
        default=_HyperparameterTuningJobConfig(
            tuning_strategy=HyperparameterTuningStrategy.BAYESIAN,
            tuning_objective=HyperparameterTuningObjective(
                objective_type=HyperparameterTuningObjectiveType.MINIMIZE,
                metric_name="validation:error",
            ),
            training_job_early_stopping_type=TrainingJobEarlyStoppingType.AUTO,
        ),
    )

    a = simple_xgboost_hpo_job_task(
        train=train_dataset,
        validation=validation_dataset,
        static_hyperparameters=static_hyperparameters,
        hyperparameter_tuning_job_config=hyperparameter_tuning_job_config,
        num_round=IntegerParameterRange(
            min_value=2,
            max_value=8,
            scaling_type=HyperparameterScalingType.LINEAR),
        max_depth=IntegerParameterRange(
            min_value=5,
            max_value=7,
            scaling_type=HyperparameterScalingType.LINEAR),
        gamma=ContinuousParameterRange(
            min_value=0.0,
            max_value=0.3,
            scaling_type=HyperparameterScalingType.LINEAR),
    )
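Several of these examples pass `example_hyperparams` as the default for a Types.Generic input without showing its definition; based on the inline dictionary in the later DemoWorkflow example, a plausible (assumed) definition is a plain dict of stringified XGBoost parameters:

# Assumed definition; not shown in the original source.
example_hyperparams = {
    "eval_metric": "auc",
    "num_round": "100",
    "objective": "binary:logistic",
    "rate_drop": "0.3",
    "tweedie_variance_power": "1.4",
}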
Example #3
class DiabetesXGBoostModelOptimizer(object):
    """
    This pipeline trains an XGBoost model for any given dataset that matches the schema specified in
    https://github.com/jbrownlee/Datasets/blob/master/pima-indians-diabetes.names.
    """

    # Inputs: the dataset, the fraction of the dataset to split out for validation, and the seed used to perform the split
    # dataset = Input(Types.CSV, default=Types.CSV.create_at_known_location(
    #     "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"),
    #                 help="A CSV File that matches the format https://github.com/jbrownlee/Datasets/blob/master/pima-indians-diabetes.names")

    dataset_remote_location = Input(
        Types.String,
        default="https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv",
        help="Remote location of a CSV file that matches the format https://github.com/jbrownlee/Datasets/blob/master/pima-indians-diabetes.names")

    test_split_ratio = Input(Types.Float, default=0.33, help="Fraction of the dataset to hold out as the test split.")
    seed = Input(Types.Integer, default=7, help="Seed to use for splitting.")

    # the actual algorithm
    split = dxgb.get_traintest_splitdatabase(dataset=dataset_remote_location,
                                             seed=seed,
                                             test_split_ratio=test_split_ratio)
    fit_task = sxghpo.fit_lp(train_data=split.outputs.x_train,
                             train_target=split.outputs.y_train,
                             validation_data=split.outputs.x_test,
                             validation_target=split.outputs.y_test)

    predicted = dxgb.predict(model_ser=fit_task.outputs.model, x=split.outputs.x_test)
    score_task = dxgb.metrics(predictions=predicted.outputs.predictions, y=split.outputs.y_test)

    # Outputs: joblib-serialized model and accuracy of the model
    model = Output(fit_task.outputs.model, sdk_type=Types.Blob)
    accuracy = Output(score_task.outputs.accuracy, sdk_type=Types.Float)
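The body of the metrics task is not shown; as a rough sketch, the reported accuracy could be computed from the predictions and held-out targets along these lines (the use of scikit-learn here is an assumption):

from sklearn.metrics import accuracy_score


def compute_accuracy(predictions, y_test):
    # Round probabilistic predictions to class labels, then score against the truth.
    return accuracy_score(y_test, [round(p) for p in predictions])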
class InverterDynamicWorkflow(object):
    input_a = Input(Types.Integer, default=5, help="Input for inner workflow")
    inverter_input = Input(Types.Boolean,
                           default=False,
                           help="Should invert or not")
    lp_task = workflow_builder(task_input_num=input_a, decider=inverter_input)
    wf_output = Output(lp_task.outputs.out, sdk_type=Types.Integer)
Example #5
class sup(object):
    input_1 = Input(Types.Integer)
    input_2 = Input(Types.Integer, default=5, help='Not required.')

    a = my_task(a=input_1)
    b = my_task(a=input_2)
    c = my_task(a=100)
Example #6
class RawContainerWorkflow(object):
    val1 = Input(Types.Integer)
    val2 = Input(Types.Integer)
    sq1 = square(val=val1)
    sq2 = square(val=val2)
    sm = sum(x=sq1.outputs.out, y=sq2.outputs.out)
    sum_of_squares = Output(sm.outputs.out, sdk_type=Types.Integer)
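The `square` and `sum` tasks above are raw container tasks whose definitions are not shown; a sketch of how `square` might be declared with the legacy SdkRawContainerTask API (the image and shell command below are assumptions):

from flytekit.common.tasks.raw_container import SdkRawContainerTask
from flytekit.sdk.types import Types

square = SdkRawContainerTask(
    input_data_dir="/var/inputs",
    output_data_dir="/var/outputs",
    inputs={"val": Types.Integer},
    outputs={"out": Types.Integer},
    image="alpine",
    # Reads the templated input and writes its square to /var/outputs/out.
    command=["sh", "-c", "echo $(( {{.Inputs.val}} * {{.Inputs.val}} )) | tee /var/outputs/out"],
)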
Example #7
class PrestoWorkflow(object):
    length = Input(Types.Integer, required=True, help="Int between 1 and 26")
    routing_group = Input(Types.String,
                          required=True,
                          help="Test string with no default")
    p_task = presto_task(length=length, rg=routing_group)
    output_a = Output(p_task.outputs.results, sdk_type=schema)
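The `schema` object used as the output sdk_type is defined elsewhere in the source; a plausible (assumed) definition with the legacy typed-schema API would be:

from flytekit.sdk.types import Types

# Column names and types here are assumptions; the real schema is not shown.
schema = Types.Schema([("col_a", Types.String), ("col_b", Types.Integer)])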
Example #8
class ClassifierTrainWorkflow:
    available_streams_mpblobs = Input([Types.MultiPartBlob], required=True)
    available_streams_names = Input([Types.String], required=True)
    streams_metadata_path = Input(Types.String, required=True)
    training_validation_config_json = Input(
        Types.Generic,
        default=ujson.loads(
            open(DEFAULT_TRAINING_VALIDATION_CONFIG_FILE).read()))
    validation_data_ratio = Input(Types.Float,
                                  default=DEFAULT_VALIDATION_DATA_RATIO)

    rearrange_data_task = rearrange_data(
        available_streams_mpblobs=available_streams_mpblobs,
        available_streams_names=available_streams_names,
        training_validation_config_json=training_validation_config_json,
        streams_metadata_path=streams_metadata_path,
        validation_data_ratio=validation_data_ratio,
    )

    train_on_datasets_task = train_on_datasets(
        training_validation_config_json=training_validation_config_json,
        training_clean_mpblob=rearrange_data_task.outputs.training_clean_mpblob,
        training_dirty_mpblob=rearrange_data_task.outputs.training_dirty_mpblob,
        validation_clean_mpblob=rearrange_data_task.outputs.validation_clean_mpblob,
        validation_dirty_mpblob=rearrange_data_task.outputs.validation_dirty_mpblob,
    )

    trained_models = Output(train_on_datasets_task.outputs.model_blobs,
                            sdk_type=[Types.Blob])
    model_file_names = Output(train_on_datasets_task.outputs.model_files_names,
                              sdk_type=[Types.String])
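A small note on the default above: the config file is read with an inline `open(...).read()`; an equivalent form that closes the file handle explicitly looks like this:

import ujson

# Equivalent to ujson.loads(open(DEFAULT_TRAINING_VALIDATION_CONFIG_FILE).read()),
# but the context manager closes the file handle deterministically.
with open(DEFAULT_TRAINING_VALIDATION_CONFIG_FILE) as f:
    default_training_validation_config = ujson.load(f)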
Example #9
def test_workflow_no_node_dependencies_or_outputs():

    @inputs(a=Types.Integer)
    @outputs(b=Types.Integer)
    @python_task
    def my_task(wf_params, a, b):
        b.set(a + 1)

    i1 = Input(Types.Integer)
    i2 = Input(Types.Integer, default=5, help='Not required.')

    input_dict = {
        'input_1': i1,
        'input_2': i2
    }

    nodes = {
        'a': my_task(a=input_dict['input_1']),
        'b': my_task(a=input_dict['input_2']),
        'c': my_task(a=100)
    }

    w = workflow(inputs=input_dict, outputs={}, nodes=nodes)

    assert w.interface.inputs['input_1'].type == Types.Integer.to_flyte_literal_type()
    assert w.interface.inputs['input_2'].type == Types.Integer.to_flyte_literal_type()
    assert _get_node_by_id(w, 'a').inputs[0].var == 'a'
    assert _get_node_by_id(w, 'a').inputs[0].binding.promise.node_id == constants.GLOBAL_INPUT_NODE_ID
    assert _get_node_by_id(w, 'a').inputs[0].binding.promise.var == 'input_1'
    assert _get_node_by_id(w, 'b').inputs[0].binding.promise.node_id == constants.GLOBAL_INPUT_NODE_ID
    assert _get_node_by_id(w, 'b').inputs[0].binding.promise.var == 'input_2'
    assert _get_node_by_id(w, 'c').inputs[0].binding.scalar.primitive.integer == 100
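The `_get_node_by_id` helper used by these assertions is not shown; a minimal version consistent with them, assuming the compiled workflow exposes a `nodes` list, would be:

def _get_node_by_id(wf, node_id):
    # Return the single node whose id matches; fail if it is missing.
    matches = [n for n in wf.nodes if n.id == node_id]
    assert len(matches) == 1
    return matches[0]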
Example #11
class WorkflowWithIO(object):
    a = Input(Types.Integer, default=10, help="Test integer input with default")
    b = Input(Types.String, required=True, help="Test string with no default")
    odd_nums_task = find_odd_numbers_with_string(list_of_nums=[2, 3, 4, 7], demo_string=b)
    task_output = Output(odd_nums_task.outputs.are_num_odd, sdk_type=[Boolean])
    output_a = Output(a, sdk_type=Integer)  # pass through output
    output_b = Output(odd_nums_task.outputs.altered_string, sdk_type=String)
Example #12
class Child(object):
    input_1 = Input(Types.Integer)
    input_2 = Input(Types.Integer, default=5, help='Not required.')
    a = add_one(a=input_1)
    b = add_one(a=input_2)
    c = add_one(a=100)
    output = Output(c.outputs.b, sdk_type=Types.Integer)
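Workflows like Child are typically embedded in a parent workflow as static sub-workflow nodes, in the same way StaticSubWorkflowCaller below calls IdentityWorkflow; a hedged sketch with a hypothetical Parent class, assuming Child is decorated with @workflow_class:

@workflow_class
class Parent(object):
    parent_input = Input(Types.Integer, default=3)
    # Embeds Child as a static sub-workflow node.
    child = Child(input_1=parent_input)
    parent_output = Output(child.outputs.output, sdk_type=Types.Integer)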
Example #13
class OptionallyCachableWorkflow(object):
    input_if_cached_enabled = Input(Types.Float,
                                    default=10.0,
                                    help="Test float input with default")
    cache_disabled = Input(Types.Boolean,
                           default=False,
                           help="Whether to disable cache.")
    input_generator = generate_input(wf_input=input_if_cached_enabled,
                                     cache_disabled=cache_disabled)
    dynamic_task = sample_batch_task_cachable(
        caching_input=input_generator.outputs.generated)
Example #14
class BackfillWorkflow(object):
    """
    If FailingWorkflow fails, we can resurrect it and backfill using this BackfillWorkflow.
    The backfill workflow has just one step.
    """
    in_image = Input(Types.Blob, required=True)
    angle = Input(Types.Float, default=180.0)

    rotate_task = rotate(image=in_image, angle=angle, fail=False)

    out_image = Output(rotate_task.outputs.out_image, sdk_type=Types.Blob)
Example #15
class PrimitiveDemoWorkflow(object):
    x = Input(Types.Integer, help="Integer")
    y = Input(Types.Float, help="Float")
    s = Input(Types.String, help="String")
    b = Input(Types.Boolean, help="Boolean")

    m = multiply(x=x, y=y)
    s1 = convert_to_str(z=m.outputs.z)
    s2 = add_bool_to_str(st=s, b=b)

    mult_str = Output(s1.outputs.s, sdk_type=Types.String)
    bool_str = Output(s2.outputs.s, sdk_type=Types.String)
Example #16
class SimpleWorkflow(object):
    train_dataset = Input(Types.Blob)
    validation_dataset = Input(Types.Blob)

    custom = custom_training_task(dummy_train_dataset=train_dataset,
                                  dummy_validation_dataset=validation_dataset,
                                  my_input="hello world")

    final_model = Output(custom.outputs.out_model, sdk_type=Types.Blob)
    final_extra_output = Output(custom.outputs.out_extra_output_file,
                                sdk_type=Types.Blob)
    final_value = Output(custom.outputs.out, sdk_type=Types.Integer)
def nested_dynamic_wf_task(wf_params, task_input_num, out):
    wf_params.logging.info(
        "Running inner task... yielding a code generated sub workflow")

    # Inner workflow
    input_a = Input(Types.Integer, help="Tell me something")
    node1 = sq_sub_task(in1=input_a)

    MyUnregisteredWorkflowInner = workflow(
        inputs={
            'a': input_a,
        },
        outputs={
            'ooo':
            Output(node1.outputs.out1,
                   sdk_type=Types.Integer,
                   help='This is an integer output')
        },
        nodes={
            'node_one': node1,
        })

    setattr(MyUnregisteredWorkflowInner, 'auto_assign_name',
            manual_assign_name)
    MyUnregisteredWorkflowInner._platform_valid_name = 'unregistered'

    # Output workflow
    input_a = Input(Types.Integer, help="Tell me something")
    node1 = MyUnregisteredWorkflowInner(a=task_input_num)

    MyUnregisteredWorkflowOuter = workflow(
        inputs={
            'a': input_a,
        },
        outputs={
            'ooo':
            Output(node1.outputs.ooo,
                   sdk_type=Types.Integer,
                   help='This is an integer output')
        },
        nodes={
            'node_one': node1,
        })

    setattr(MyUnregisteredWorkflowOuter, 'auto_assign_name',
            manual_assign_name)
    MyUnregisteredWorkflowOuter._platform_valid_name = 'unregistered'

    unregistered_workflow_execution = MyUnregisteredWorkflowOuter(
        a=task_input_num)
    out.set(unregistered_workflow_execution.outputs.ooo)
Example #18
class SimpleWorkflow(object):
    input_1 = Input(Types.Integer)
    input_2 = Input(Types.Integer, default=5, help='Not required.')
    a = add_one(a=input_1)
    b = add_one(a=input_2)
    c = subtract_one(a=input_1)

    d = write_special_types()
    e = read_special_types(
        a=d.outputs.a,
        b=d.outputs.b,
        c=d.outputs.c,
        d=d.outputs.d,
        e=d.outputs.e,
    )
Example #19
class PrestoWorkflow(object):
    ds = Input(Types.String, required=True, help="Test string with no default")
    # routing_group = Input(Types.String, required=True, help="Test string with no default")

    p_task = presto_task(ds=ds, rg="etl")

    output_a = Output(p_task.outputs.results, sdk_type=schema)
Example #20
class GenericDemoWorkflow(object):
    a = Input(Types.Generic, default={}, help="Input for inner workflow")
    generic_type_example = generic_type_task(custom=a)
    generic_json = generic_to_json(
        replicated=generic_type_example.outputs.replicated)
    counts = Output(generic_type_example.outputs.counts,
                    sdk_type=Types.Generic)
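The generic_type_task referenced above is not shown; a hedged sketch of a task that consumes a Types.Generic input (the output names follow the call sites above, but the body is an assumption):

from flytekit.sdk.tasks import inputs, outputs, python_task
from flytekit.sdk.types import Types


@inputs(custom=Types.Generic)
@outputs(counts=Types.Generic, replicated=Types.Generic)
@python_task
def generic_type_task(wf_params, custom, counts, replicated):
    # Types.Generic values arrive as plain Python dicts.
    counts.set({k: len(str(v)) for k, v in custom.items()})
    replicated.set({k: [v, v] for k, v in custom.items()})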
def workflow_builder(wf_params, task_input_num, decider, out):
    wf_params.logging.info(
        "Running inner task... yielding a code generated sub workflow")

    input_a = Input(Types.Integer, help="Tell me something")
    if decider:
        node1 = inverse_inner_task(num=input_a)
    else:
        node1 = inner_task(num=input_a)

    MyUnregisteredWorkflow = workflow(
        inputs={
            'a': input_a,
        },
        outputs={
            'ooo':
            Output(node1.outputs.out,
                   sdk_type=Types.Integer,
                   help='This is an integer output')
        },
        nodes={
            'node_one': node1,
        })

    # This is an unfortunate setting that will hopefully not be necessary in the future.
    setattr(MyUnregisteredWorkflow, 'auto_assign_name', manual_assign_name)
    MyUnregisteredWorkflow._platform_valid_name = 'unregistered'

    unregistered_workflow_execution = MyUnregisteredWorkflow(a=task_input_num)

    yield unregistered_workflow_execution
    out.set(unregistered_workflow_execution.outputs.ooo)
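Functions like nested_dynamic_wf_task earlier and workflow_builder here would normally be registered as dynamic tasks; a hedged sketch of the decorators such a function carries in the legacy SDK (the input and output types are assumptions inferred from the call sites):

from flytekit.sdk.tasks import dynamic_task, inputs, outputs
from flytekit.sdk.types import Types


@inputs(task_input_num=Types.Integer, decider=Types.Boolean)
@outputs(out=Types.Integer)
@dynamic_task
def workflow_builder(wf_params, task_input_num, decider, out):
    ...  # body as above: build and yield the unregistered sub-workflow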
class StaticSubWorkflowCaller(object):
    outer_a = Input(Types.Integer,
                    default=5,
                    help="Input for inner workflow")
    identity_wf_execution = IdentityWorkflow(a=outer_a)
    wf_output = Output(identity_wf_execution.outputs.task_output,
                       sdk_type=Types.Integer)
Example #23
class LoadTestOrchestrationWorkflow(object):

    # 30 python tasks ~=  75 i3.16x nodes on AWS Batch
    python_task_count = 30
    # 30 spark tasks ~=  60 i3.16x nodes on AWS Batch
    spark_task_count = 30
    # 3 dynamic-jobs each of 1000 tasks ~=  3*20 i3.16x nodes on AWS Batch
    djo_task_count = 1000
    dj_count = 3

    p = [None] * python_task_count
    s = [None] * spark_task_count
    d = [None] * dj_count

    # python tasks
    for i in range(0, python_task_count):
        p[i] = python_loadtest_lp()

    # dynamic-job tasks
    for i in range(0, dj_count):
        d[i] = dynamic_job_loadtest_lp(tasks_count=djo_task_count)

    # hive load tests.
    # h1 = hive_loadtest_lp()

    # spark load tests
    trigger_time = Input(Types.Datetime)
    for i in range(0, spark_task_count):
        s[i] = spark_loadtest_lp(triggered_date=trigger_time, offset=i)
class StructuredSagemakerXGBoostHPO(object):
    # Input parameters
    static_hyperparameters = Input(
        Types.Generic,
        help="A list of the static hyperparameters to pass to the training jobs.",
        default=example_hyperparams,
    )
    train_data = Input(
        Types.Schema(),
        help="A Columnar schema that contains all the features used for training.",
    )
    train_target = Input(
        Types.Schema(),
        help="A Columnar schema that contains all the labeled results for train_data.",
    )

    validation_data = Input(
        Types.Schema(),
        help="A Columnar schema that contains all the features used for validation.",
    )
    validation_target = Input(
        Types.Schema(),
        help="A Columnar schema that contains all the labeled results for validation_data.",
    )

    sagemaker_transform = convert_to_sagemaker_csv(x_train=train_data,
                                                   y_train=train_target,
                                                   x_test=validation_data,
                                                   y_test=validation_target)

    # Node definitions
    train_node = xgtrainer_task(
        static_hyperparameters=static_hyperparameters,
        train=sagemaker_transform.outputs.train,
        validation=sagemaker_transform.outputs.validation,
    )

    untar = untar_xgboost(model_tar=train_node.outputs.model)

    # Outputs
    model = Output(untar.outputs.model, sdk_type=Types.Blob)
Example #25
class BatchTasksWorkflow(object):
    num_subtasks = Input(Types.Integer, default=3)
    task1 = no_inputs_sample_batch_task()
    task2 = sample_batch_task_beatles_cached(in1=num_subtasks)
    t = print_every_time(ints_to_print=task1.outputs.out_ints,
                         strings_to_print=task1.outputs.out_str)
    ints_out = Output(task1.outputs.out_ints, sdk_type=[[Types.Integer]])
    str_out = Output(task2.outputs.out_str, sdk_type=[Types.String])
Example #26
class SimpleWorkflow(object):
    triggered_date = Input(Types.Datetime)
    print1a = add_one_and_print(value_to_print=3)
    print1b = add_one_and_print(value_to_print=101)
    print2 = sum_non_none(
        values_to_print=[print1a.outputs.out, print1b.outputs.out])
    print3 = add_one_and_print(value_to_print=print2.outputs.out)
    print4 = add_one_and_print(value_to_print=print3.outputs.out)
    final_value = Output(print4.outputs.out, sdk_type=Types.Integer)
Example #27
class DemoWorkflow(object):
    # Input parameters
    static_hyperparameters = Input(
        Types.Generic,
        help="A list of the static hyperparameters to pass to the training jobs.",
    )
    train_data = Input(Types.MultiPartCSV,
                       help="S3 path to a flat directory of CSV files.")
    validation_data = Input(Types.MultiPartCSV,
                            help="S3 path to a flat directory of CSV files.")

    # Node definitions
    train_node = xgtrainer_task(
        static_hyperparameters=example_hyperparams,
        train=train_data,
        validation=validation_data,
    )

    # Outputs
    trained_model = Output(train_node.outputs.model, sdk_type=Types.Blob)
Example #28
class DataPreparationWorkflow:
    streams_external_storage_prefix = Input(Types.String, required=True)
    streams_names = Input([Types.String], required=True)
    stream_extension = Input(Types.String, default="avi")

    # video_external_paths = Input([Types.String], required=True)
    sampling_random_seed = Input(Types.Integer, default=DEFAULT_RANDOM_SEED)
    sampling_n_clusters = Input(Types.Integer, default=DEFAULT_LUMINANCE_N_CLUSTERS)
    sampling_sample_size = Input(Types.Integer, default=DEFAULT_LUMINANCE_SAMPLE_SIZE)

    download_video_task = download_videos(
        streams_external_storage_prefix=streams_external_storage_prefix,
        streams_names=streams_names,
        stream_extension=stream_extension,
    )

    extract_from_video_collection_task = extract_from_video_collections(
        video_blobs=download_video_task.outputs.downloaded_streams_blobs,
    )

    luminance_select_collections_task = luminance_select_collections(
        raw_frames_mpblobs=extract_from_video_collection_task.outputs.raw_frames_mpblobs,
        n_clusters=sampling_n_clusters,
        sample_size=sampling_sample_size,
        random_seed=sampling_random_seed,
    )

    selected_frames_mpblobs = Output(luminance_select_collections_task.outputs.selected_image_mpblobs,
                                     sdk_type=[Types.MultiPartBlob])
    selected_frames_mpblobs_metadata = Output(luminance_select_collections_task.outputs.selected_file_names,
                                              sdk_type=[[Types.String]])
    streams_names_out = Output(streams_names, sdk_type=[Types.String])
class DemoWorkflow(object):
    # Input parameters
    train_data = Input(Types.MultiPartCSV,
                       help="S3 path to a flat directory of CSV files.")
    validation_data = Input(Types.MultiPartCSV,
                            help="S3 path to a flat directory of CSV files.")

    # Node definitions
    train_node = xgtrainer_task(
        static_hyperparameters={
            "eval_metric": "auc",
            "num_round": "100",
            "objective": "binary:logistic",
            "rate_drop": "0.3",
            "tweedie_variance_power": "1.4",
        },
        train=train_data,
        validation=validation_data,
    )

    # Outputs
    trained_model = Output(train_node.outputs.model, sdk_type=Types.Blob)
class FailingWorkflow(object):
    """
    This is a two-step workflow:
    Step 1: scale an image
    Step 2: rotate an image
    NOTE: This is not an efficient workflow, since scaling and rotating an image could be done in a single OpenCV call; it exists only as a demo.

    Step 2 will always fail in this case, as it is hard-coded with fail=True.
    """
    in_image = Input(
        Types.Blob,
        default=Types.Blob.create_at_known_location(
            "https://miro.medium.com/max/1400/1*qL8UYfaStcEo_YVPrA4cbA.png"))
    angle = Input(Types.Float, default=180.0)
    scale = Input(Types.Integer, default=2)

    scale_task = tasks.scale(image=in_image, scale_factor=scale)
    rotate_task = tasks.rotate(image=scale_task.outputs.out_image,
                               angle=angle,
                               fail=True)

    out_image = Output(rotate_task.outputs.out_image, sdk_type=Types.Blob)