Example 1
class StaticSubWorkflowCaller(object):
    outer_a = Input(Types.Integer,
                    default=5,
                    help="Input for inner workflow")
    identity_wf_execution = IdentityWorkflow(a=outer_a)
    wf_output = Output(identity_wf_execution.outputs.task_output,
                       sdk_type=Types.Integer)
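Example 1 invokes IdentityWorkflow as a statically declared sub-workflow; that workflow class itself appears later as Example 27. The listings in this section omit their imports and decorators, so the following is a minimal sketch, assuming the legacy flytekit 0.x SDK these examples appear to use, of the boilerplate each class-style workflow relies on (the inner_task body is illustrative only):

from flytekit.sdk.tasks import inputs, outputs, python_task
from flytekit.sdk.types import Types
from flytekit.sdk.workflow import workflow_class, Input, Output


@inputs(num=Types.Integer)   # task inputs are declared through decorators
@outputs(out=Types.Integer)  # outputs are set inside the task body
@python_task
def inner_task(wf_params, num, out):
    out.set(num)  # illustrative pass-through body


@workflow_class  # turns the class into a registrable Flyte workflow
class IdentityWorkflow(object):
    a = Input(Types.Integer, default=5, help="Input for inner workflow")
    odd_nums_task = inner_task(num=a)
    task_output = Output(odd_nums_task.outputs.out, sdk_type=Types.Integer)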
Example 2
def workflow_builder(wf_params, task_input_num, decider, out):
    wf_params.logging.info(
        "Running inner task... yielding a code-generated sub-workflow")

    input_a = Input(Types.Integer, help="Tell me something")
    if decider:
        node1 = inverse_inner_task(num=input_a)
    else:
        node1 = inner_task(num=input_a)

    MyUnregisteredWorkflow = workflow(
        inputs={'a': input_a},
        outputs={
            'ooo': Output(node1.outputs.out,
                          sdk_type=Types.Integer,
                          help='This is an integer output')
        },
        nodes={'node_one': node1},
    )

    # This is an unfortunate setting that will hopefully not be necessary in the future.
    setattr(MyUnregisteredWorkflow, 'auto_assign_name', manual_assign_name)
    MyUnregisteredWorkflow._platform_valid_name = 'unregistered'

    unregistered_workflow_execution = MyUnregisteredWorkflow(a=task_input_num)

    yield unregistered_workflow_execution
    out.set(unregistered_workflow_execution.outputs.ooo)
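workflow_builder yields a sub-workflow that is generated at run time, so in the legacy SDK it would be registered as a dynamic task. A hedged sketch of the decorator stack such a function would carry (input and output names mirror the signature above):

from flytekit.sdk.tasks import dynamic_task, inputs, outputs
from flytekit.sdk.types import Types


# Sketch only: the decorators that make workflow_builder a dynamic task.
@inputs(task_input_num=Types.Integer, decider=Types.Boolean)
@outputs(out=Types.Integer)
@dynamic_task
def workflow_builder(wf_params, task_input_num, decider, out):
    ...  # body as shown in the listing above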
Example 3
class PrestoWorkflow(object):
    ds = Input(Types.String, required=True, help="Test string with no default")
    # routing_group = Input(Types.String, required=True, help="Test string with no default")

    p_task = presto_task(ds=ds, rg="etl")

    output_a = Output(p_task.outputs.results, sdk_type=schema)
Example 4
class InverterDynamicWorkflow(object):
    input_a = Input(Types.Integer, default=5, help="Input for inner workflow")
    inverter_input = Input(Types.Boolean,
                           default=False,
                           help="Should invert or not")
    lp_task = workflow_builder(task_input_num=input_a, decider=inverter_input)
    wf_output = Output(lp_task.outputs.out, sdk_type=Types.Integer)
Example 5
class Child(object):
    input_1 = Input(Types.Integer)
    input_2 = Input(Types.Integer, default=5, help="Not required.")
    a = add_one(a=input_1)
    b = add_one(a=input_2)
    c = add_one(a=100)
    output = Output(c.outputs.b, sdk_type=Types.Integer)
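Child (like SimpleWorkflow in Example 26) chains calls to an add_one task whose definition is not shown. A minimal sketch of what it could look like, assuming the legacy python_task API; the body is illustrative:

from flytekit.sdk.tasks import inputs, outputs, python_task
from flytekit.sdk.types import Types


@inputs(a=Types.Integer)
@outputs(b=Types.Integer)
@python_task
def add_one(wf_params, a, b):
    # Illustrative body: increment the input and publish it as the output.
    b.set(a + 1)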
Example 6
class RawContainerWorkflow(object):
    val1 = Input(Types.Integer)
    val2 = Input(Types.Integer)
    sq1 = square(val=val1)
    sq2 = square(val=val2)
    sm = sum(x=sq1.outputs.out, y=sq2.outputs.out)
    sum_of_squares = Output(sm.outputs.out, sdk_type=Types.Integer)
Example 7
class PrestoWorkflow(object):
    length = Input(Types.Integer, required=True, help="Int between 1 and 26")
    routing_group = Input(Types.String,
                          required=True,
                          help="Test string with no default")
    p_task = presto_task(length=length, rg=routing_group)
    output_a = Output(p_task.outputs.results, sdk_type=schema)
Example 8
class GenericDemoWorkflow(object):
    a = Input(Types.Generic, default={}, help="Input for inner workflow")
    generic_type_example = generic_type_task(custom=a)
    generic_json = generic_to_json(
        replicated=generic_type_example.outputs.replicated)
    counts = Output(generic_type_example.outputs.counts,
                    sdk_type=Types.Generic)
Example 9
class ClassifierEvaluateWorkflow(object):
    available_streams_mpblobs = Input([Types.MultiPartBlob], required=True)
    available_streams_names = Input([Types.String], required=True)
    validation_data_ratio = Input(Types.Float,
                                  default=DEFAULT_VALIDATION_DATA_RATIO)
    streams_metadata_path = Input(Types.String, required=True)
    model = Input(Types.Blob, default=None)
    evaluation_config_json = Input(
        Types.Generic,
        default=ujson.loads(open(DEFAULT_EVALUATION_CONFIG_FILE).read()))

    fetch_model_task = fetch_model(model=model)

    rearrange_data_task = rearrange_data(
        available_streams_mpblobs=available_streams_mpblobs,
        available_streams_names=available_streams_names,
        training_validation_config_json=evaluation_config_json,
        streams_metadata_path=streams_metadata_path,
        validation_data_ratio=validation_data_ratio,
    )

    evaluate_on_datasets_task = evaluate_on_datasets(
        model=fetch_model_task.outputs.model_blob,
        evaluation_clean_mpblob=rearrange_data_task.outputs.validation_clean_mpblob,
        evaluation_dirty_mpblob=rearrange_data_task.outputs.validation_dirty_mpblob,
    )

    analyze_task = analyze_prediction_results(
        ground_truths=evaluate_on_datasets_task.outputs.ground_truths_out,
        predictions=evaluate_on_datasets_task.outputs.predictions_out,
    )

    predict = generate_predictions(
        ground_truths=evaluate_on_datasets_task.outputs.ground_truths_out,
        probabilities=evaluate_on_datasets_task.outputs.predictions_out)

    analyze_results_blobs = Output(analyze_task.outputs.result_blobs,
                                   sdk_type=[Types.Blob])
    analyze_results_files_names = Output(
        analyze_task.outputs.result_files_names, sdk_type=[Types.String])
    ground_truths = Output(evaluate_on_datasets_task.outputs.ground_truths_out,
                           sdk_type=[Types.Integer])
    predictions = Output(predict.outputs.predictions, sdk_type=[Types.Integer])
Example 10
def nested_dynamic_wf_task(wf_params, task_input_num, out):
    wf_params.logging.info(
        "Running inner task... yielding a code-generated sub-workflow")

    # Inner workflow
    input_a = Input(Types.Integer, help="Tell me something")
    node1 = sq_sub_task(in1=input_a)

    MyUnregisteredWorkflowInner = workflow(
        inputs={"a": input_a},
        outputs={
            "ooo":
            Output(node1.outputs.out1,
                   sdk_type=Types.Integer,
                   help="This is an integer output")
        },
        nodes={"node_one": node1},
    )

    setattr(MyUnregisteredWorkflowInner, "auto_assign_name",
            manual_assign_name)
    MyUnregisteredWorkflowInner._platform_valid_name = "unregistered"

    # Outer workflow
    input_a = Input(Types.Integer, help="Tell me something")
    node1 = MyUnregisteredWorkflowInner(a=task_input_num)

    MyUnregisteredWorkflowOuter = workflow(
        inputs={"a": input_a},
        outputs={
            "ooo":
            Output(node1.outputs.ooo,
                   sdk_type=Types.Integer,
                   help="This is an integer output")
        },
        nodes={"node_one": node1},
    )

    setattr(MyUnregisteredWorkflowOuter, "auto_assign_name",
            manual_assign_name)
    MyUnregisteredWorkflowOuter._platform_valid_name = "unregistered"

    unregistered_workflow_execution = MyUnregisteredWorkflowOuter(
        a=task_input_num)
    out.set(unregistered_workflow_execution.outputs.ooo)
Example 11
class SimpleWorkflow(object):
    triggered_date = Input(Types.Datetime)
    print1a = add_one_and_print(value_to_print=3)
    print1b = add_one_and_print(value_to_print=101)
    print2 = sum_non_none(
        values_to_print=[print1a.outputs.out, print1b.outputs.out])
    print3 = add_one_and_print(value_to_print=print2.outputs.out)
    print4 = add_one_and_print(value_to_print=print3.outputs.out)
    final_value = Output(print4.outputs.out, sdk_type=Types.Integer)
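SimpleWorkflow fans two add_one_and_print calls into sum_non_none and then chains two more. The task definitions are not shown; the following is a hedged sketch, assuming the legacy python_task API, with illustrative bodies:

from flytekit.sdk.tasks import inputs, outputs, python_task
from flytekit.sdk.types import Types


@inputs(value_to_print=Types.Integer)
@outputs(out=Types.Integer)
@python_task
def add_one_and_print(wf_params, value_to_print, out):
    added = value_to_print + 1
    wf_params.logging.info("Printing: {}".format(added))
    out.set(added)


@inputs(values_to_print=[Types.Integer])
@outputs(out=Types.Integer)
@python_task
def sum_non_none(wf_params, values_to_print, out):
    # Illustrative body: sum the values, skipping any None entries.
    out.set(sum(v for v in values_to_print if v is not None))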
Example 12
class MNISTTest(object):
    no_cuda = Input(Types.Boolean,
                    default=False,
                    help="disables CUDA training")
    batch_size = Input(Types.Integer,
                       default=64,
                       help='input batch size for training (default: 64)')
    test_batch_size = Input(
        Types.Integer,
        default=1000,
        help='input batch size for testing (default: 1000)')
    epochs = Input(Types.Integer,
                   default=1,
                   help='number of epochs to train (default: 1)')
    learning_rate = Input(Types.Float,
                          default=0.01,
                          help='learning rate (default: 0.01)')
    sgd_momentum = Input(Types.Float,
                         default=0.5,
                         help='SGD momentum (default: 0.5)')
    seed = Input(Types.Integer, default=1, help='random seed (default: 1)')
    log_interval = Input(
        Types.Integer,
        default=10,
        help='how many batches to wait before logging training status')
    dir = Input(Types.String,
                default='logs',
                help='directory where summary logs are stored')

    mnist_result = mnist_pytorch_job(no_cuda=no_cuda,
                                     batch_size=batch_size,
                                     test_batch_size=test_batch_size,
                                     epochs=epochs,
                                     learning_rate=learning_rate,
                                     sgd_momentum=sgd_momentum,
                                     seed=seed,
                                     log_interval=log_interval,
                                     dir=dir)

    accuracies = Output(mnist_result.outputs.epoch_accuracies,
                        sdk_type=[Types.Float])
    model = Output(mnist_result.outputs.model_state, sdk_type=Types.Blob)
Example 13
class BackfillWorkflow(object):
    """
    So if FailingWorkflow Fails, we can resurrect and backfill the FailingWorkflow, using the BackfillWorkflow.
    The Backfill workflow just has one step
    """
    in_image = Input(Types.Blob, required=True)
    angle = Input(Types.Float, default=180.0)

    rotate_task = rotate(image=in_image, angle=angle, fail=False)

    out_image = Output(rotate_task.outputs.out_image, sdk_type=Types.Blob)
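BackfillWorkflow (and FailingWorkflow in Example 19) call a rotate task that takes an image blob, an angle, and a fail flag. A hedged sketch of its signature, assuming the legacy python_task API; the body is a simplified stand-in rather than a real rotation:

from flytekit.sdk.tasks import inputs, outputs, python_task
from flytekit.sdk.types import Types


@inputs(image=Types.Blob, angle=Types.Float, fail=Types.Boolean)
@outputs(out_image=Types.Blob)
@python_task
def rotate(wf_params, image, angle, fail, out_image):
    # Illustrative body: a real implementation would rotate the image
    # (e.g. with OpenCV); here the blob is passed through unchanged.
    if fail:
        raise RuntimeError("rotate was configured with fail=True")
    image.download()
    out_image.set(image.local_path)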
Example 14
class HousePricePredictionModelTrainer(object):
    """
    This pipeline trains an XGBoost model, also generated synthetic data and runs predictions against test dataset
    """

    loc = Input(Types.String, help="Location for where to train the model.")
    seed = Input(Types.Integer, default=7, help="Seed to use for splitting.")
    num_houses = Input(Types.Integer,
                       default=1000,
                       help="Number of houses to generate data for")

    # the actual algorithm
    split = generate_and_split_data(loc=loc,
                                    number_of_houses=num_houses,
                                    seed=seed)
    fit_task = fit(train=split.outputs.train)
    predicted = predict(model_ser=fit_task.outputs.model,
                        test=split.outputs.test)

    # Outputs: joblib-serialized model and the model's accuracy
    model = Output(fit_task.outputs.model, sdk_type=Types.Blob)
    accuracy = Output(predicted.outputs.accuracy, sdk_type=Types.Float)
Example 15
class DiabetesXGBoostModelTrainer(object):
    """
    This pipeline trains an XGBoost mode for any given dataset that matches the schema as specified in
    https://github.com/jbrownlee/Datasets/blob/master/pima-indians-diabetes.names.
    """

    # Inputs: the dataset, the fraction of the dataset to split out for validation, and the seed used to perform the split
    dataset = Input(
        Types.CSV,
        default=Types.CSV.create_at_known_location(
            "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
        ),
        help=
        "A CSV File that matches the format https://github.com/jbrownlee/Datasets/blob/master/pima-indians-diabetes.names"
    )

    test_split_ratio = Input(Types.Float,
                             default=0.33,
                             help="Fraction of the dataset to hold out for testing")
    seed = Input(Types.Integer, default=7, help="Seed to use for splitting.")

    # the actual algorithm
    split = get_traintest_splitdatabase(dataset=dataset,
                                        seed=seed,
                                        test_split_ratio=test_split_ratio)
    fit_task = fit(x=split.outputs.x_train,
                   y=split.outputs.y_train,
                   hyperparams=XGBoostModelHyperparams(max_depth=4).to_dict())
    predicted = predict(model_ser=fit_task.outputs.model,
                        x=split.outputs.x_test)
    score_task = metrics(predictions=predicted.outputs.predictions,
                         y=split.outputs.y_test)

    # Outputs: joblib-serialized model and the model's accuracy
    model = Output(fit_task.outputs.model, sdk_type=Types.Blob)
    accuracy = Output(score_task.outputs.accuracy, sdk_type=Types.Float)
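fit is configured with XGBoostModelHyperparams(max_depth=4).to_dict(); the hyperparameter container itself is not shown. A hedged sketch of what such a helper could look like (every field other than max_depth is an assumption):

from dataclasses import asdict, dataclass


@dataclass
class XGBoostModelHyperparams(object):
    # Only max_depth is referenced above; the other fields and defaults are illustrative.
    max_depth: int = 3
    learning_rate: float = 0.1
    n_estimators: int = 100
    objective: str = "binary:logistic"

    def to_dict(self):
        return asdict(self)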
Example 16
class StructuredSagemakerXGBoostHPO(object):
    # Input parameters
    static_hyperparameters = Input(
        Types.Generic,
        help=
        "A list of the static hyperparameters to pass to the training jobs.",
        default=example_hyperparams,
    )
    train_data = Input(
        Types.Schema(),
        help=
        "A Columnar schema that contains all the features used for training.",
    )
    train_target = Input(
        Types.Schema(),
        help=
        "A Columnar schema that contains all the labeled results for train_data.",
    )

    validation_data = Input(
        Types.Schema(),
        help=
        "A Columnar schema that contains all the features used for validation.",
    )
    validation_target = Input(
        Types.Schema(),
        help=
        "A Columnar schema that contains all the labeled results for validation_data.",
    )

    sagemaker_transform = convert_to_sagemaker_csv(x_train=train_data,
                                                   y_train=train_target,
                                                   x_test=validation_data,
                                                   y_test=validation_target)

    # Node definitions
    train_node = xgtrainer_task(
        static_hyperparameters=static_hyperparameters,
        train=sagemaker_transform.outputs.train,
        validation=sagemaker_transform.outputs.validation,
    )

    untar = untar_xgboost(model_tar=train_node.outputs.model)

    # Outputs
    model = Output(untar.outputs.model, sdk_type=Types.Blob)
Example 17
    def __call__(self, module="__main__"):
        """
        Creates an SdkWorkflow from a dagster pipeline. Then, adds the nodes as attrs within the module
        that this function is invoked from. User will need to manually provide the module name.
        This is required because flytekit runs dir() on the module that the resultant container
        registers, in order  to discover the DAG structure.
        """

        self.execution_plan = create_execution_plan(self.pipeline,
                                                    run_config=self.run_config)

        self.build_flyte_sdk_workflow()
        nodes = {}
        for name, node in self.get_sdk_tasks():
            setattr(sys.modules[module], name, node)
            nodes[name] = node(**self.inputs[name],
                               **self.source_handle_inputs(
                                   name, nodes)).assign_id_and_return(name)

        _inputs = [
            _input.rename_and_return_reference(name)
            for key in self.inputs.keys()
            for name, _input in self.inputs[key].items()
        ]

        # Currently, we create an Output for every solid's output. A user may only want outputs for
        # solids at the highest topological level, or for solids whose output is not used elsewhere.
        # However, they may want to persist outputs from other levels as well, so it is simplest to
        # create an Output for every solid's output.
        _outputs = [
            Output(getattr(nodes[key].outputs, name),
                   sdk_type=flyte_type).rename_and_return_reference(
                       "{}_{}".format(key, name))
            for key in self.outputs.keys()
            for name, flyte_type in self.outputs[key].items()
        ]

        return SdkWorkflow(
            inputs=sorted(_inputs, key=lambda x: x.name),
            outputs=sorted(_outputs, key=lambda x: x.name),
            nodes=sorted(nodes.values(), key=lambda x: x.id),
        )
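Example 17 shows only the __call__ method of a Dagster-to-Flyte wrapper; the enclosing class, its constructor, and helpers such as build_flyte_sdk_workflow, get_sdk_tasks, and source_handle_inputs are not shown. A purely hypothetical usage sketch, with the class and variable names assumed from the attributes used above:

# Hypothetical names: DagsterPipelineConverter, my_pipeline and my_run_config are
# placeholders; only the __call__(module=...) signature comes from the listing above.
converter = DagsterPipelineConverter(pipeline=my_pipeline, run_config=my_run_config)

# Calling the converter from the registering module attaches each SdkTask node to that
# module (so flytekit's dir() scan can find them) and returns the assembled SdkWorkflow.
dagster_flyte_workflow = converter(module=__name__)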
Example 18
class DemoWorkflow(object):
    # Input parameters
    static_hyperparameters = Input(
        Types.Generic,
        help=
        "A list of the static hyperparameters to pass to the training jobs.",
    )
    train_data = Input(Types.MultiPartCSV,
                       help="S3 path to a flat directory of CSV files.")
    validation_data = Input(Types.MultiPartCSV,
                            help="S3 path to a flat directory of CSV files.")

    # Node definitions
    train_node = xgtrainer_task(
        static_hyperparameters=example_hyperparams,
        train=train_data,
        validation=validation_data,
    )

    # Outputs
    trained_model = Output(train_node.outputs.model, sdk_type=Types.Blob)
Example 19
class FailingWorkflow(object):
    """
    This workflow is  two step workflow,
    Step 1: scale an image
    Step 2: Rotate an image
    NOTE: This is not an efficient workflow as one image - scaling and rotation can be done with one OPEN CV call. But this example exists only for a demo

    Step 2: in this case will always fail as it is hard-coded to indicate fail=True
    """
    in_image = Input(
        Types.Blob,
        default=Types.Blob.create_at_known_location(
            "https://miro.medium.com/max/1400/1*qL8UYfaStcEo_YVPrA4cbA.png"))
    angle = Input(Types.Float, default=180.0)
    scale = Input(Types.Integer, default=2)

    scale_task = tasks.scale(image=in_image, scale_factor=scale)
    rotate_task = tasks.rotate(image=scale_task.outputs.out_image,
                               angle=angle,
                               fail=True)

    out_image = Output(rotate_task.outputs.out_image, sdk_type=Types.Blob)
Example 20
class DemoWorkflow(object):
    # Input parameters
    train_data = Input(Types.MultiPartCSV,
                       help="s3 path to a flat directory of CSV files.")
    validation_data = Input(Types.MultiPartCSV,
                            help="s3 path to a flat directory of CSV files.")

    # Node definitions
    train_node = xgtrainer_task(
        static_hyperparameters={
            "eval_metric": "auc",
            "num_round": "100",
            "objective": "binary:logistic",
            "rate_drop": "0.3",
            "tweedie_variance_power": "1.4",
        },
        train=train_data,
        validation=validation_data,
    )

    # Outputs
    trained_model = Output(train_node.outputs.model, sdk_type=Types.Blob)
Example 21
class DynamicLaunchPlanCaller(object):
    outer_a = Input(Types.Integer, default=5, help="Input for inner workflow")
    lp_task = lp_yield_task(num=outer_a)
    wf_output = Output(lp_task.outputs.out, sdk_type=Types.Integer)
Example 22
class StaticLaunchPlanCaller(object):
    outer_a = Input(Types.Integer, default=5, help="Input for inner workflow")
    identity_lp_execution = id_lp(a=outer_a)
    wf_output = Output(identity_lp_execution.outputs.task_output,
                       sdk_type=Types.Integer)
Example 23
class EdgeDetectorWf(object):
    image_input = Input(Types.String, required=True, help="Image to run for")
    run_edge_detection = edge_detection_canny(image_location=image_input)
    edges = Output(run_edge_detection.outputs.parsed_image,
                   sdk_type=Types.Blob)
Example 24
class TimeDemoWorkflow(object):
    dt = Input(Types.Datetime, help="Input time")
    duration = Input(Types.Timedelta, help="Input timedelta")
    time_example = time_task(dt=dt, duration=duration)
    new_time = Output(time_example.outputs.new_time, sdk_type=Types.Datetime)
Example 25
class DynamicSubWorkflowCaller(object):
    outer_a = Input(Types.Integer, default=5, help="Input for inner workflow")
    sub_wf_task = sub_wf_yield_task(num=outer_a)
    wf_output = Output(sub_wf_task.outputs.out, sdk_type=Types.Integer)
Example 26
class SimpleWorkflow(object):
    input_1 = Input(Types.Integer)
    input_2 = Input(Types.Integer, default=5, help='Not required.')
    a = add_one(a=input_1)
    output = Output(a.outputs.b, sdk_type=Types.Integer)
Example 27
class IdentityWorkflow(object):
    a = Input(Types.Integer, default=5, help="Input for inner workflow")
    odd_nums_task = inner_task(num=a)
    task_output = Output(odd_nums_task.outputs.out, sdk_type=Types.Integer)
Example 28
class EdgeDetector(object):
    script = Input(Types.Blob)
    image = Input(Types.Blob)
    edge_task = edges(script=script, image=image)
    out = Output(edge_task.outputs.edges, sdk_type=Types.Blob)
Example 29
class Parent(object):
    input_1 = Input(Types.Integer)
    child1 = child_lp(input_1=input_1)
    child2 = child_lp(input_1=input_1, input_2=10)
    final_sum = sum(a=child1.outputs.output, b=child2.outputs.output)
    output = Output(final_sum.outputs.c, sdk_type=Types.Integer)
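Parent calls child_lp, a launch plan for the Child workflow from Example 5, rather than the workflow class itself. A hedged sketch of how that launch plan would typically be produced in the legacy SDK, assuming Child is the @workflow_class-decorated class shown earlier:

# Sketch: class-style workflows in the legacy SDK expose create_launch_plan(),
# and the resulting launch plan can be invoked from another workflow like a task.
child_lp = Child.create_launch_plan()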
Example 30
class SimpleDynamicSubworkflow(object):
    input_a = Input(Types.Integer, default=5, help="Input for inner workflow")
    lp_task = dynamic_wf_task(task_input_num=input_a)
    wf_output = Output(lp_task.outputs.out, sdk_type=Types.Integer)