class MultiRegionHousePricePredictionModelTrainer(object):
    """
    This pipeline generates synthetic data, trains an XGBoost model per region, and runs predictions against the test dataset.
    """
    regions = Input(Types.List(Types.String),
                    default=["SFO", "SEA", "DEN"],
                    help="Regions for where to train the model.")
    seed = Input(Types.Integer, default=7, help="Seed to use for splitting.")
    num_houses_per_region = Input(
        Types.Integer,
        default=1000,
        help="Number of houses to generate data for in each region")

    # the actual algorithm
    split = generate_and_split_data_multiloc(
        locations=regions,
        number_of_houses_per_location=num_houses_per_region,
        seed=seed)
    fit_task = parallel_fit(multi_train=split.outputs.train)
    predicted = parallel_predict(multi_models=fit_task.outputs.multi_models,
                                 multi_test=split.outputs.test)

    # Outputs: a joblib-serialized model per region and the model's accuracy per region.
    # Note: this should ideally be a map keyed by region, but for the demo we output simple lists.
    models = Output(fit_task.outputs.multi_models,
                    sdk_type=Types.List(Types.Blob))
    accuracies = Output(predicted.outputs.accuracies,
                        sdk_type=Types.List(Types.Float))
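
This class appears to be missing its decorator in this listing; in the legacy flytekit SDK a pipeline class like this is wrapped in @workflow_class so that the Input and Output declarations become the workflow interface. A minimal sketch of that pattern, using illustrative task and class names rather than anything from the example above:

from flytekit.sdk.tasks import inputs, outputs, python_task
from flytekit.sdk.types import Types
from flytekit.sdk.workflow import workflow_class, Input, Output


@inputs(x=Types.Integer)
@outputs(y=Types.Integer)
@python_task
def double(wf_params, x, y):
    # Trivial task, used only to show how a node is wired inside a workflow class.
    y.set(x * 2)


@workflow_class
class MinimalWorkflow(object):
    value = Input(Types.Integer, default=1, help="Value to double.")
    doubled = double(x=value)
    result = Output(doubled.outputs.y, sdk_type=Types.Integer)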
Example #2
def write_special_types(wf_params, a, b, c, d, e):
    blob = Types.Blob()
    with blob as w:
        w.write("hello I'm a blob".encode('utf-8'))

    csv = Types.CSV()
    with csv as w:
        w.write("hello,i,iz,blob")

    mpcsv = Types.MultiPartCSV()
    with mpcsv.create_part('000000') as w:
        w.write("hello,i,iz,blob")
    with mpcsv.create_part('000001') as w:
        w.write("hello,i,iz,blob2")

    mpblob = Types.MultiPartBlob()
    with mpblob.create_part('000000') as w:
        w.write("hello I'm a mp blob".encode('utf-8'))
    with mpblob.create_part('000001') as w:
        w.write("hello I'm a mp blob too".encode('utf-8'))

    schema = Types.Schema([('a', Types.Integer), ('b', Types.Integer)])()
    with schema as w:
        w.write(_pd.DataFrame.from_dict({'a': [1, 2, 3], 'b': [4, 5, 6]}))
        w.write(_pd.DataFrame.from_dict({'a': [3, 2, 1], 'b': [6, 5, 4]}))

    a.set(blob)
    b.set(csv)
    c.set(mpcsv)
    d.set(mpblob)
    e.set(schema)
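
The decorated version of this task appears again in Example #25; assuming those @outputs/@python_task decorators (and pandas imported as _pd), it can be exercised locally with unit_test, in the same way the tests below inspect their outputs. A sketch, not part of the original example:

out = write_special_types.unit_test()
with out['a'] as r:
    # The blob output reads back as bytes, mirroring test_blob_passing below.
    assert r.read().decode('utf-8') == "hello I'm a blob"
with out['e'] as r:
    # The schema output reads back as one DataFrame per write() call.
    assert r.read()['a'].tolist() == [1, 2, 3]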
Example #3
def test_subset_of_columns():
    @outputs(a=Types.Schema([('a', Types.Integer), ('b', Types.String)]))
    @python_task()
    def source(wf_params, a):
        out = Types.Schema([('a', Types.Integer), ('b', Types.String)])()
        with out as writer:
            writer.write(
                pd.DataFrame.from_dict({
                    'a': [1, 2, 3, 4, 5],
                    'b': ['a', 'b', 'c', 'd', 'e']
                }))
        a.set(out)

    @inputs(a=Types.Schema([('a', Types.Integer)]))
    @python_task()
    def sink(wf_params, a):
        with a as reader:
            df = reader.read(concat=True)
            assert len(df.columns.values) == 1
            assert df['a'].tolist() == [1, 2, 3, 4, 5]

        with a as reader:
            df = reader.read(truncate_extra_columns=False)
            assert df.columns.values.tolist() == ['a', 'b']
            assert df['a'].tolist() == [1, 2, 3, 4, 5]
            assert df['b'].tolist() == ['a', 'b', 'c', 'd', 'e']

    o = source.unit_test()
    sink.unit_test(**o)
Example #4
def generate_queries(wf_params, hive_results):
    q1 = "SELECT 1"
    q2 = "SELECT 'two'"
    schema_1, formatted_query_1 = Types.Schema().create_from_hive_query(select_query=q1)
    schema_2, formatted_query_2 = Types.Schema().create_from_hive_query(select_query=q2)

    hive_results.set([schema_1, schema_2])
    return [formatted_query_1, formatted_query_2]
Example #5
def test_generic_schema():
    @inputs(a=Types.Schema())
    @outputs(b=Types.Schema())
    @python_task
    def copy_task(wf_params, a, b):
        out = Types.Schema()()
        with a as r:
            with out as w:
                for df in r.iter_chunks():
                    w.write(df)
        b.set(out)

    # Test generic copy and pass through
    a = Types.Schema()()
    with a as w:
        w.write(pd.DataFrame.from_dict({'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]}))
        w.write(pd.DataFrame.from_dict({'a': [3, 2, 1], 'b': [6.0, 5.0, 4.0]}))

    outs = copy_task.unit_test(a=a)

    with outs['b'] as r:
        df = r.read()
        assert list(df['a']) == [1, 2, 3]
        assert list(df['b']) == [4.0, 5.0, 6.0]

        df = r.read()
        assert list(df['a']) == [3, 2, 1]
        assert list(df['b']) == [6.0, 5.0, 4.0]

        assert r.read() is None

    # Test typed copy and pass through
    a = Types.Schema([('a', Types.Integer), ('b', Types.Float)])()
    with a as w:
        w.write(pd.DataFrame.from_dict({'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]}))
        w.write(pd.DataFrame.from_dict({'a': [3, 2, 1], 'b': [6.0, 5.0, 4.0]}))

    outs = copy_task.unit_test(a=a)

    with outs['b'] as r:
        df = r.read()
        assert list(df['a']) == [1, 2, 3]
        assert list(df['b']) == [4.0, 5.0, 6.0]

        df = r.read()
        assert list(df['a']) == [3, 2, 1]
        assert list(df['b']) == [6.0, 5.0, 4.0]

        assert r.read() is None
Example #6
def test_multipartblob_passing():
    @inputs(a=Types.MultiPartBlob)
    @outputs(b=Types.MultiPartBlob)
    @python_task
    def test_pass(wf_params, a, b):
        b.set(a)

    b = Types.MultiPartBlob()
    with b.create_part("0") as w:
        w.write("Hello world".encode("utf-8"))
    with b.create_part("1") as w:
        w.write("Hello world2".encode("utf-8"))

    out = test_pass.unit_test(a=b)
    assert len(out) == 1
    with out["b"] as r:
        assert len(r) == 2
        assert r[0].read().decode("utf-8") == "Hello world"
        assert r[1].read().decode("utf-8") == "Hello world2"

    out = test_pass.unit_test(a=out["b"])
    assert len(out) == 1
    with out["b"] as r:
        assert len(r) == 2
        assert r[0].read().decode("utf-8") == "Hello world"
        assert r[1].read().decode("utf-8") == "Hello world2"
Example #7
def test_no_output_set():
    @outputs(a=Types.Schema())
    @python_task()
    def null_set(wf_params, a):
        pass

    assert null_set.unit_test()['a'] is None
Example #8
 def copy_task(wf_params, a, b):
     out = Types.Schema()()
     with a as r:
         with out as w:
             for df in r.iter_chunks():
                 w.write(df)
     b.set(out)
Example #9
 def copy_task(wf_params, a, b):
     out = Types.Schema([('a', Types.Integer), ('b', Types.Float)])()
     with a as r:
         with out as w:
             for df in r.iter_chunks():
                 w.write(df)
     b.set(out)
Example #10
 def test_write(wf_params, a):
     b = Types.MultiPartCSV()
     with b.create_part("0") as w:
         w.write("Hello,world,1")
     with b.create_part("1") as w:
         w.write("Hello,world,2")
     a.set(b)
Example #11
 def test_write(wf_params, a):
     b = Types.MultiPartBlob()
     with b.create_part("0") as w:
         w.write("Hello world".encode("utf-8"))
     with b.create_part("1") as w:
         w.write("Hello world2".encode("utf-8"))
     a.set(b)
Example #12
class StructuredSagemakerXGBoostHPO(object):
    # Input parameters
    static_hyperparameters = Input(
        Types.Generic,
        help=
        "A list of the static hyperparameters to pass to the training jobs.",
        default=example_hyperparams,
    )
    train_data = Input(
        Types.Schema(),
        help=
        "A Columnar schema that contains all the features used for training.",
    )
    train_target = Input(
        Types.Schema(),
        help=
        "A Columnar schema that contains all the labeled results for train_data.",
    )

    validation_data = Input(
        Types.Schema(),
        help=
        "A Columnar schema that contains all the features used for validation.",
    )
    validation_target = Input(
        Types.Schema(),
        help=
        "A Columnar schema that contains all the labeled results for validation_data.",
    )

    sagemaker_transform = convert_to_sagemaker_csv(x_train=train_data,
                                                   y_train=train_target,
                                                   x_test=validation_data,
                                                   y_test=validation_target)

    # Node definitions
    train_node = xgtrainer_task(
        static_hyperparameters=static_hyperparameters,
        train=sagemaker_transform.outputs.train,
        validation=sagemaker_transform.outputs.validation,
    )

    untar = untar_xgboost(model_tar=train_node.outputs.model)

    # Outputs
    model = Output(untar.outputs.model, sdk_type=Types.Blob)
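
A sketch of how one of these columnar schema inputs could be produced locally for testing; the column names are illustrative, and pandas is assumed to be imported as pd as in the other examples:

train_data = Types.Schema()()
with train_data as w:
    # Feature columns only; the label column goes into the separate train_target schema.
    w.write(pd.DataFrame.from_dict({"sqft": [1200, 1500, 900], "beds": [2, 3, 1]}))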
Example #13
def test_create_from_hive_query():
    s, q = Types.Schema().create_from_hive_query(
        "SELECT * FROM table", known_location="s3://somewhere/")

    assert s.mode == "wb"
    assert s.local_path is None
    assert s.remote_location == "s3://somewhere/"
    assert "SELECT * FROM table" in q
    assert s.remote_location in q
Example #14
 def source(wf_params, a):
     out = Types.Schema([("a", Types.Integer), ("b", Types.String)])()
     with out as writer:
         writer.write(
             pd.DataFrame.from_dict({
                 "a": [1, 2, 3, 4, 5],
                 "b": ["a", "b", "c", "d", "e"]
             }))
     a.set(out)
Example #15
 def source(wf_params, a):
     out = Types.Schema([('a', Types.Integer), ('b', Types.String)])()
     with out as writer:
         writer.write(
             pd.DataFrame.from_dict({
                 'a': [1, 2, 3, 4, 5],
                 'b': ['a', 'b', 'c', 'd', 'e']
             }))
     a.set(out)
Example #16
def test_typed_schema():
    @inputs(a=Types.Schema([("a", Types.Integer), ("b", Types.Float)]))
    @outputs(b=Types.Schema([("a", Types.Integer), ("b", Types.Float)]))
    @python_task
    def copy_task(wf_params, a, b):
        out = Types.Schema([("a", Types.Integer), ("b", Types.Float)])()
        with a as r:
            with out as w:
                for df in r.iter_chunks():
                    w.write(df)
        b.set(out)

    # Test typed copy and pass through
    a = Types.Schema([("a", Types.Integer), ("b", Types.Float)])()
    with a as w:
        w.write(pd.DataFrame.from_dict({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]}))
        w.write(pd.DataFrame.from_dict({"a": [3, 2, 1], "b": [6.0, 5.0, 4.0]}))

    outs = copy_task.unit_test(a=a)

    with outs["b"] as r:
        df = r.read()
        assert list(df["a"]) == [1, 2, 3]
        assert list(df["b"]) == [4.0, 5.0, 6.0]

        df = r.read()
        assert list(df["a"]) == [3, 2, 1]
        assert list(df["b"]) == [6.0, 5.0, 4.0]

        assert r.read() is None

    # Test untyped failure
    a = Types.Schema()()
    with a as w:
        w.write(pd.DataFrame.from_dict({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]}))
        w.write(pd.DataFrame.from_dict({"a": [3, 2, 1], "b": [6.0, 5.0, 4.0]}))

    with pytest.raises(_user_exceptions.FlyteTypeException):
        copy_task.unit_test(a=a)
Example #17
def test_bad_column_types():
    with pytest.raises(_user_exceptions.FlyteTypeException):
        Types.Schema([("a", Types.Blob)])
    with pytest.raises(_user_exceptions.FlyteTypeException):
        Types.Schema([("a", Types.MultiPartBlob)])
    with pytest.raises(_user_exceptions.FlyteTypeException):
        Types.Schema([("a", Types.MultiPartCSV)])
    with pytest.raises(_user_exceptions.FlyteTypeException):
        Types.Schema([("a", Types.CSV)])
    with pytest.raises(_user_exceptions.FlyteTypeException):
        Types.Schema([("a", Types.Schema())])
Example #18
def collect_blobs(folder_path):
    onlyfiles = [
        join(folder_path, f) for f in sorted(listdir(folder_path))
        if isfile(join(folder_path, f))
    ]
    my_blobs = []
    file_names = []
    for local_filepath in onlyfiles:

        my_blob = Types.Blob()
        with my_blob as fileobj:
            with open(local_filepath,
                      mode="rb") as file:  # b is important -> binary
                fileobj.write(file.read())
        my_blobs.append(my_blob)
        file_names.append(basename(local_filepath))
    return my_blobs, file_names
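
A usage sketch for this helper, assuming a scratch directory containing a couple of files (paths and contents are illustrative):

import os
import tempfile

tmp_dir = tempfile.mkdtemp()
for name in ("a.txt", "b.txt"):
    with open(os.path.join(tmp_dir, name), "w") as f:
        f.write("contents of " + name)

blobs, names = collect_blobs(tmp_dir)
# listdir() output is sorted, so names come back in lexicographic order.
assert names == ["a.txt", "b.txt"]
assert len(blobs) == 2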
Example #19
def test_blob_passing():
    @inputs(a=Types.Blob)
    @outputs(b=Types.Blob)
    @python_task
    def test_pass(wf_params, a, b):
        b.set(a)

    b = Types.Blob()
    with b as w:
        w.write("Hello world".encode("utf-8"))

    out = test_pass.unit_test(a=b)
    assert len(out) == 1
    with out["b"] as r:
        assert r.read().decode("utf-8") == "Hello world"

    out = test_pass.unit_test(a=out["b"])
    assert len(out) == 1
    with out["b"] as r:
        assert r.read().decode("utf-8") == "Hello world"
Example #20
def test_bad_definition():
    with pytest.raises(_user_exceptions.FlyteValueException):
        Types.Schema([])
Example #21
def test_generic_schema():
    @inputs(a=Types.Schema())
    @outputs(b=Types.Schema())
    @python_task
    def fake_task(wf_params, a, b):
        pass
Example #22
    cache_version="1",
    cluster_label=_six.text_type("cluster_label"),
    tags=[_six.text_type("tag1")],
)
def sample_qubole_hive_task(wf_params, in1):
    return _six.text_type("select ") + _six.text_type(in1)


def test_hive_task():
    assert isinstance(sample_hive_task, _sdk_runnable.SdkRunnableTask)
    assert isinstance(sample_hive_task, _hive_task.SdkHiveTask)

    sample_hive_task.unit_test(in1=5)


@outputs(hive_results=[Types.Schema()])
@qubole_hive_task
def two_queries(wf_params, hive_results):
    q1 = "SELECT 1"
    q2 = "SELECT 'two'"
    schema_1, formatted_query_1 = Schema.create_from_hive_query(
        select_query=q1)
    schema_2, formatted_query_2 = Schema.create_from_hive_query(
        select_query=q2)

    hive_results.set([schema_1, schema_2])
    return [formatted_query_1, formatted_query_2]


def test_interface_setup():
    outs = two_queries.interface.outputs
Example #23
 def test_write(wf_params, a):
     b = Types.Blob()
     with b as w:
         w.write("Hello world".encode("utf-8"))
     a.set(b)
Example #24
from flytekit.sdk.tasks import inputs, outputs, python_task
from flytekit.sdk.types import Types
from flytekit.sdk.workflow import workflow_class, Input, Output
import json


@inputs(custom=Types.Generic)
@outputs(counts=Types.Generic, replicated=Types.List(Types.Generic))
@python_task
def generic_type_task(wf_params, custom, counts, replicated):
    """
    Go through each value of the input; if it is a str, record its length.
    Also create a list containing the Generic input replicated twice.
    """
    wf_params.logging.info("Running custom object task")
    results = {}
    for k, v in custom.items():
        if type(v) == str:
            results[k] = len(v)
        else:
            results[k] = v

    counts.set(results)
    replicated.set([custom, custom])
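
A local exercise of this task with unit_test, following the pattern used by the other tests in this listing; the input dict is illustrative:

out = generic_type_task.unit_test(custom={"name": "flyte", "city": "SEA", "year": 2020})
# String values should come back as their lengths; non-strings pass through unchanged
# (Generic round-trips through a protobuf Struct, so numbers may come back as floats).
print(out["counts"])      # expected roughly: {'name': 5, 'city': 3, 'year': 2020}
print(out["replicated"])  # expected: the input dict, twice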


@inputs(replicated=Types.List(Types.Generic))
@outputs(str_repr=Types.String)
@python_task
def generic_to_json(wf_params, replicated, str_repr):
    """
Example #25
def add_one(wf_params, a, b):
    b.set(a + 1)


@inputs(a=Types.Integer)
@outputs(b=Types.Integer)
@python_task(cache=True, cache_version='1')
def subtract_one(wf_params, a, b):
    b.set(a - 1)


@outputs(a=Types.Blob,
         b=Types.CSV,
         c=Types.MultiPartCSV,
         d=Types.MultiPartBlob,
         e=Types.Schema([('a', Types.Integer), ('b', Types.Integer)]))
@python_task
def write_special_types(wf_params, a, b, c, d, e):
    blob = Types.Blob()
    with blob as w:
        w.write("hello I'm a blob".encode('utf-8'))

    csv = Types.CSV()
    with csv as w:
        w.write("hello,i,iz,blob")

    mpcsv = Types.MultiPartCSV()
    with mpcsv.create_part('000000') as w:
        w.write("hello,i,iz,blob")
    with mpcsv.create_part('000001') as w:
        w.write("hello,i,iz,blob2")
import os

from flytekit.sdk.tasks import python_task, inputs, outputs, dynamic_task
from flytekit.sdk.types import Types
from flytekit.sdk.workflow import workflow_class, Input, Output

from demo.house_price_predictor import generate_data, save_to_file, save_to_dir, fit, predict


@inputs(locations=Types.List(Types.String),
        number_of_houses_per_location=Types.Integer,
        seed=Types.Integer)
@outputs(train=Types.List(Types.MultiPartCSV),
         val=Types.List(Types.MultiPartCSV),
         test=Types.List(Types.CSV))
@python_task(cache=True, cache_version="0.1", memory_request="200Mi")
def generate_and_split_data_multiloc(wf_params, locations,
                                     number_of_houses_per_location, seed,
                                     train, val, test):
    train_sets = []
    val_sets = []
    test_sets = []
    for loc in locations:
        _train, _val, _test = generate_data(loc, number_of_houses_per_location,
                                            seed)
        dir = "multi_data"
        os.makedirs(dir, exist_ok=True)
        train_sets.append(save_to_dir(dir, "train", _train))
        val_sets.append(save_to_dir(dir, "val", _val))
        test_sets.append(save_to_file(dir, "test", _test))
    train.set(train_sets)
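
The snippet is cut off after the first output is set; presumably the remaining outputs are set the same way (a hedged completion, not part of the original):

    val.set(val_sets)
    test.set(test_sets)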
Example #27
    # We know only one file was written, so read just that one file
    df = pd.read_csv(os.path.join(train.local_path, files[0]), header=None)
    y = df[df.columns[0]]
    x = df[df.columns[1:]]
    # fit the model on the training data
    m = XGBClassifier()
    m.fit(x, y)

    # TODO model Blob should be a file like object
    fname = "model.joblib.dat"
    joblib.dump(m, fname)
    model.set(fname)


@inputs(test=Types.CSV, model_ser=Types.Blob)  # TODO: format=".joblib.dat"))
@outputs(predictions=Types.List(Types.Float), accuracy=Types.Float)
@python_task(cache_version='1.0', cache=True, memory_request="200Mi")
def predict(ctx, test, model_ser, predictions, accuracy):
    """
    Given any trained model serialized with joblib (so this task can be shared across models) and a set of
    features, return the predictions.
    """
    # Load model
    model_ser.download()
    model = joblib.load(model_ser.local_path)
    # Load test data
    test.download()
    test_df = pd.read_csv(test.local_path, header=None)
    x_df = test_df[test_df.columns[1:]]
    y_df = test_df[test_df.columns[0]]
    y_pred = model.predict(x_df)
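
The snippet is truncated here as well; a hedged sketch of how the declared outputs might be filled in (accuracy_score from scikit-learn is an assumption, not shown in the original):

    # Hypothetical continuation, not part of the original snippet.
    from sklearn.metrics import accuracy_score

    predictions.set([float(p) for p in y_pred])
    accuracy.set(float(accuracy_score(y_df, y_pred)))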
Example #28
def test_typed_schema():
    @inputs(a=Types.Schema([("a", Types.Integer), ("b", Types.Integer)]))
    @outputs(b=Types.Schema([("a", Types.Integer), ("b", Types.Integer)]))
    @python_task
    def fake_task(wf_params, a, b):
        pass
Example #29
                "../../../common/configs/local.config",
            ),
            internal_overrides={
                "image": "myflyteimage:v123",
                "project": "myflyteproject",
                "domain": "development"
            },
    ):
        s = t.serialize()

    assert isinstance(s, _admin_task_pb2.TaskSpec)
    assert s.template.id.name == "tests.flytekit.unit.common_tests.tasks.test_task.my_task"
    assert s.template.container.image == "myflyteimage:v123"


schema = Types.Schema([("a", Types.String), ("b", Types.Integer)])


def test_task_produce_deterministic_version():
    containerless_task = SdkPrestoTask(
        task_inputs=inputs(ds=Types.String, rg=Types.String),
        statement=
        "SELECT * FROM flyte.widgets WHERE ds = '{{ .Inputs.ds}}' LIMIT 10",
        output_schema=schema,
        routing_group="{{ .Inputs.rg }}",
    )
    identical_containerless_task = SdkPrestoTask(
        task_inputs=inputs(ds=Types.String, rg=Types.String),
        statement=
        "SELECT * FROM flyte.widgets WHERE ds = '{{ .Inputs.ds}}' LIMIT 10",
        output_schema=schema,
Example #30
 def test_write(wf_params, a):
     b = Types.CSV()
     with b as w:
         w.write("Hello,world,hi")
     a.set(b)