Example #1
def test_subset_of_columns():
    @outputs(a=Types.Schema([('a', Types.Integer), ('b', Types.String)]))
    @python_task()
    def source(wf_params, a):
        out = Types.Schema([('a', Types.Integer), ('b', Types.String)])()
        with out as writer:
            writer.write(
                pd.DataFrame.from_dict({
                    'a': [1, 2, 3, 4, 5],
                    'b': ['a', 'b', 'c', 'd', 'e']
                }))
        a.set(out)

    @inputs(a=Types.Schema([('a', Types.Integer)]))
    @python_task()
    def sink(wf_params, a):
        # By default the reader truncates to the columns declared on the input schema (just 'a').
        with a as reader:
            df = reader.read(concat=True)
            assert len(df.columns.values) == 1
            assert df['a'].tolist() == [1, 2, 3, 4, 5]

        # Reading with truncate_extra_columns=False keeps the extra source column 'b'.
        with a as reader:
            df = reader.read(truncate_extra_columns=False)
            assert df.columns.values.tolist() == ['a', 'b']
            assert df['a'].tolist() == [1, 2, 3, 4, 5]
            assert df['b'].tolist() == ['a', 'b', 'c', 'd', 'e']

    o = source.unit_test()
    sink.unit_test(**o)
Example #2
def generate_queries(wf_params, hive_results):
    q1 = "SELECT 1"
    q2 = "SELECT 'two'"
    schema_1, formatted_query_1 = Types.Schema().create_from_hive_query(select_query=q1)
    schema_2, formatted_query_2 = Types.Schema().create_from_hive_query(select_query=q2)

    hive_results.set([schema_1, schema_2])
    return [formatted_query_1, formatted_query_2]
Example #3
def test_generic_schema():
    @inputs(a=Types.Schema())
    @outputs(b=Types.Schema())
    @python_task
    def copy_task(wf_params, a, b):
        out = Types.Schema()()
        with a as r:
            with out as w:
                for df in r.iter_chunks():
                    w.write(df)
        b.set(out)

    # Test generic copy and pass through
    a = Types.Schema()()
    with a as w:
        w.write(pd.DataFrame.from_dict({'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]}))
        w.write(pd.DataFrame.from_dict({'a': [3, 2, 1], 'b': [6.0, 5.0, 4.0]}))

    outs = copy_task.unit_test(a=a)

    with outs['b'] as r:
        df = r.read()
        assert list(df['a']) == [1, 2, 3]
        assert list(df['b']) == [4.0, 5.0, 6.0]

        df = r.read()
        assert list(df['a']) == [3, 2, 1]
        assert list(df['b']) == [6.0, 5.0, 4.0]

        assert r.read() is None

    # Test typed copy and pass through
    a = Types.Schema([('a', Types.Integer), ('b', Types.Float)])()
    with a as w:
        w.write(pd.DataFrame.from_dict({'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]}))
        w.write(pd.DataFrame.from_dict({'a': [3, 2, 1], 'b': [6.0, 5.0, 4.0]}))

    outs = copy_task.unit_test(a=a)

    with outs['b'] as r:
        df = r.read()
        assert list(df['a']) == [1, 2, 3]
        assert list(df['b']) == [4.0, 5.0, 6.0]

        df = r.read()
        assert list(df['a']) == [3, 2, 1]
        assert list(df['b']) == [6.0, 5.0, 4.0]

        assert r.read() is None
Example #4
 def copy_task(wf_params, a, b):
     out = Types.Schema()()
     with a as r:
         with out as w:
             for df in r.iter_chunks():
                 w.write(df)
     b.set(out)
Example #5
def write_special_types(wf_params, a, b, c, d, e):
    blob = Types.Blob()
    with blob as w:
        w.write("hello I'm a blob".encode('utf-8'))

    csv = Types.CSV()
    with csv as w:
        w.write("hello,i,iz,blob")

    mpcsv = Types.MultiPartCSV()
    with mpcsv.create_part('000000') as w:
        w.write("hello,i,iz,blob")
    with mpcsv.create_part('000001') as w:
        w.write("hello,i,iz,blob2")

    mpblob = Types.MultiPartBlob()
    with mpblob.create_part('000000') as w:
        w.write("hello I'm a mp blob".encode('utf-8'))
    with mpblob.create_part('000001') as w:
        w.write("hello I'm a mp blob too".encode('utf-8'))

    schema = Types.Schema([('a', Types.Integer), ('b', Types.Integer)])()
    with schema as w:
        w.write(_pd.DataFrame.from_dict({'a': [1, 2, 3], 'b': [4, 5, 6]}))
        w.write(_pd.DataFrame.from_dict({'a': [3, 2, 1], 'b': [6, 5, 4]}))

    a.set(blob)
    b.set(csv)
    c.set(mpcsv)
    d.set(mpblob)
    e.set(schema)
Example #6
def test_no_output_set():
    @outputs(a=Types.Schema())
    @python_task()
    def null_set(wf_params, a):
        pass

    assert null_set.unit_test()['a'] is None
Example #7
 def copy_task(wf_params, a, b):
     out = Types.Schema([('a', Types.Integer), ('b', Types.Float)])()
     with a as r:
         with out as w:
             for df in r.iter_chunks():
                 w.write(df)
     b.set(out)
Example #8
class StructuredSagemakerXGBoostHPO(object):
    # Input parameters
    static_hyperparameters = Input(
        Types.Generic,
        help="A list of the static hyperparameters to pass to the training jobs.",
        default=example_hyperparams,
    )
    train_data = Input(
        Types.Schema(),
        help="A Columnar schema that contains all the features used for training.",
    )
    train_target = Input(
        Types.Schema(),
        help="A Columnar schema that contains all the labeled results for train_data.",
    )

    validation_data = Input(
        Types.Schema(),
        help="A Columnar schema that contains all the features used for validation.",
    )
    validation_target = Input(
        Types.Schema(),
        help="A Columnar schema that contains all the labeled results for validation_data.",
    )

    sagemaker_transform = convert_to_sagemaker_csv(x_train=train_data,
                                                   y_train=train_target,
                                                   x_test=validation_data,
                                                   y_test=validation_target)

    # Node definitions
    train_node = xgtrainer_task(
        static_hyperparameters=static_hyperparameters,
        train=sagemaker_transform.outputs.train,
        validation=sagemaker_transform.outputs.validation,
    )

    untar = untar_xgboost(model_tar=train_node.outputs.model, )

    # Outputs
    model = Output(untar.outputs.model, sdk_type=Types.Blob)
Example #9
 def source(wf_params, a):
     out = Types.Schema([('a', Types.Integer), ('b', Types.String)])()
     with out as writer:
         writer.write(
             pd.DataFrame.from_dict({
                 'a': [1, 2, 3, 4, 5],
                 'b': ['a', 'b', 'c', 'd', 'e']
             }))
     a.set(out)
Example #10
 def source(wf_params, a):
     out = Types.Schema([("a", Types.Integer), ("b", Types.String)])()
     with out as writer:
         writer.write(
             pd.DataFrame.from_dict({
                 "a": [1, 2, 3, 4, 5],
                 "b": ["a", "b", "c", "d", "e"]
             }))
     a.set(out)
Example #11
def test_create_from_hive_query():
    s, q = Types.Schema().create_from_hive_query(
        "SELECT * FROM table", known_location="s3://somewhere/")

    assert s.mode == "wb"
    assert s.local_path is None
    assert s.remote_location == "s3://somewhere/"
    assert "SELECT * FROM table" in q
    assert s.remote_location in q
Example #12
def test_typed_schema():
    @inputs(a=Types.Schema([("a", Types.Integer), ("b", Types.Float)]))
    @outputs(b=Types.Schema([("a", Types.Integer), ("b", Types.Float)]))
    @python_task
    def copy_task(wf_params, a, b):
        out = Types.Schema([("a", Types.Integer), ("b", Types.Float)])()
        with a as r:
            with out as w:
                for df in r.iter_chunks():
                    w.write(df)
        b.set(out)

    # Test typed copy and pass through
    a = Types.Schema([("a", Types.Integer), ("b", Types.Float)])()
    with a as w:
        w.write(pd.DataFrame.from_dict({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]}))
        w.write(pd.DataFrame.from_dict({"a": [3, 2, 1], "b": [6.0, 5.0, 4.0]}))

    outs = copy_task.unit_test(a=a)

    with outs["b"] as r:
        df = r.read()
        assert list(df["a"]) == [1, 2, 3]
        assert list(df["b"]) == [4.0, 5.0, 6.0]

        df = r.read()
        assert list(df["a"]) == [3, 2, 1]
        assert list(df["b"]) == [6.0, 5.0, 4.0]

        assert r.read() is None

    # Test untyped failure
    a = Types.Schema()()
    with a as w:
        w.write(pd.DataFrame.from_dict({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]}))
        w.write(pd.DataFrame.from_dict({"a": [3, 2, 1], "b": [6.0, 5.0, 4.0]}))

    with pytest.raises(_user_exceptions.FlyteTypeException):
        copy_task.unit_test(a=a)
Example #13
def test_bad_column_types():
    with pytest.raises(_user_exceptions.FlyteTypeException):
        Types.Schema([("a", Types.Blob)])
    with pytest.raises(_user_exceptions.FlyteTypeException):
        Types.Schema([("a", Types.MultiPartBlob)])
    with pytest.raises(_user_exceptions.FlyteTypeException):
        Types.Schema([("a", Types.MultiPartCSV)])
    with pytest.raises(_user_exceptions.FlyteTypeException):
        Types.Schema([("a", Types.CSV)])
    with pytest.raises(_user_exceptions.FlyteTypeException):
        Types.Schema([("a", Types.Schema())])
Example #14
def add_one(wf_params, a, b):
    b.set(a + 1)


@inputs(a=Types.Integer)
@outputs(b=Types.Integer)
@python_task(cache=True, cache_version='1')
def subtract_one(wf_params, a, b):
    b.set(a - 1)


@outputs(a=Types.Blob,
         b=Types.CSV,
         c=Types.MultiPartCSV,
         d=Types.MultiPartBlob,
         e=Types.Schema([('a', Types.Integer), ('b', Types.Integer)]))
@python_task
def write_special_types(wf_params, a, b, c, d, e):
    blob = Types.Blob()
    with blob as w:
        w.write("hello I'm a blob".encode('utf-8'))

    csv = Types.CSV()
    with csv as w:
        w.write("hello,i,iz,blob")

    mpcsv = Types.MultiPartCSV()
    with mpcsv.create_part('000000') as w:
        w.write("hello,i,iz,blob")
    with mpcsv.create_part('000001') as w:
        w.write("hello,i,iz,blob2")
Example #15
#  8. Age (years)
#  9. Class variable (0 or 1)
# Example Row: 6,148,72,35,0,33.6,0.627,50,1
TYPED_COLUMNS = [
    ('#preg', Types.Integer),
    ('pgc_2h', Types.Integer),
    ('diastolic_bp', Types.Integer),
    ('tricep_skin_fold_mm', Types.Integer),
    ('serum_insulin_2h', Types.Integer),
    ('bmi', Types.Float),
    ('diabetes_pedigree', Types.Float),
    ('age', Types.Integer),
    ('class', Types.Integer),
]
# the input dataset schema
DATASET_SCHEMA = Types.Schema(TYPED_COLUMNS)
# the first 8 columns are features
FEATURES_SCHEMA = Types.Schema(TYPED_COLUMNS[:8])
# the last column is the class
CLASSES_SCHEMA = Types.Schema([TYPED_COLUMNS[-1]])


class XGBoostModelHyperparams(object):
    """
    These are the XGBoost hyperparameters available in the scikit-learn library.
    """
    def __init__(self,
                 max_depth=3,
                 learning_rate=0.1,
                 n_estimators=100,
                 objective="binary:logistic",
Example #16
    b.set(a + 1)


@inputs(a=Types.Integer)
@outputs(b=Types.Integer)
@python_task(cache=True, cache_version="1")
def subtract_one(wf_params, a, b):
    b.set(a - 1)


@outputs(
    a=Types.Blob,
    b=Types.CSV,
    c=Types.MultiPartCSV,
    d=Types.MultiPartBlob,
    e=Types.Schema([("a", Types.Integer), ("b", Types.Integer)]),
)
@python_task
def write_special_types(wf_params, a, b, c, d, e):
    blob = Types.Blob()
    with blob as w:
        w.write("hello I'm a blob".encode("utf-8"))

    csv = Types.CSV()
    with csv as w:
        w.write("hello,i,iz,blob")

    mpcsv = Types.MultiPartCSV()
    with mpcsv.create_part("000000") as w:
        w.write("hello,i,iz,blob")
    with mpcsv.create_part("000001") as w:
Example #17
    cache_version="1",
    cluster_label=_six.text_type("cluster_label"),
    tags=[_six.text_type("tag1")],
)
def sample_qubole_hive_task(wf_params, in1):
    return _six.text_type("select ") + _six.text_type(in1)


def test_hive_task():
    assert isinstance(sample_hive_task, _sdk_runnable.SdkRunnableTask)
    assert isinstance(sample_hive_task, _hive_task.SdkHiveTask)

    sample_hive_task.unit_test(in1=5)


@outputs(hive_results=[Types.Schema()])
@qubole_hive_task
def two_queries(wf_params, hive_results):
    q1 = "SELECT 1"
    q2 = "SELECT 'two'"
    schema_1, formatted_query_1 = Schema.create_from_hive_query(
        select_query=q1)
    schema_2, formatted_query_2 = Schema.create_from_hive_query(
        select_query=q2)

    hive_results.set([schema_1, schema_2])
    return [formatted_query_1, formatted_query_2]


def test_interface_setup():
    outs = two_queries.interface.outputs
Example #18
def test_typed_schema():
    @inputs(a=Types.Schema([("a", Types.Integer), ("b", Types.Integer)]))
    @outputs(b=Types.Schema([("a", Types.Integer), ("b", Types.Integer)]))
    @python_task
    def fake_task(wf_params, a, b):
        pass
Example #19
def test_generic_schema():
    @inputs(a=Types.Schema())
    @outputs(b=Types.Schema())
    @python_task
    def fake_task(wf_params, a, b):
        pass
Example #20
def test_bad_definition():
    with pytest.raises(_user_exceptions.FlyteValueException):
        Types.Schema([])
Example #21
                "../../../common/configs/local.config",
            ),
            internal_overrides={
                "image": "myflyteimage:v123",
                "project": "myflyteproject",
                "domain": "development"
            },
    ):
        s = t.serialize()

    assert isinstance(s, _admin_task_pb2.TaskSpec)
    assert s.template.id.name == "tests.flytekit.unit.common_tests.tasks.test_task.my_task"
    assert s.template.container.image == "myflyteimage:v123"


schema = Types.Schema([("a", Types.String), ("b", Types.Integer)])


def test_task_produce_deterministic_version():
    containerless_task = SdkPrestoTask(
        task_inputs=inputs(ds=Types.String, rg=Types.String),
        statement="SELECT * FROM flyte.widgets WHERE ds = '{{ .Inputs.ds}}' LIMIT 10",
        output_schema=schema,
        routing_group="{{ .Inputs.rg }}",
    )
    identical_containerless_task = SdkPrestoTask(
        task_inputs=inputs(ds=Types.String, rg=Types.String),
        statement="SELECT * FROM flyte.widgets WHERE ds = '{{ .Inputs.ds}}' LIMIT 10",
        output_schema=schema,
Example #22
def read_and_merge(first, second):
    """
    SageMaker expects the target to be in the first column. This method takes the y and the x dataframes and places
    them next to each other, yielding a single combined dataframe.
    """
    with first as r:
        first_df = r.read()
    with second as r:
        second_df = r.read()
    if len(first_df) != len(second_df):
        raise Exception(
            "trying to merge to data frames which are not equal in length")
    return pd.concat([first_df, second_df], axis=1)


@inputs(x_train=Types.Schema(),
        x_test=Types.Schema(),
        y_train=Types.Schema(),
        y_test=Types.Schema())
@outputs(train=Types.MultiPartCSV, validation=Types.MultiPartCSV)
@python_task(cache_version='3.0', cache=True, memory_limit="500Mi")
def convert_to_sagemaker_csv(ctx, x_train, y_train, x_test, y_test, train,
                             validation):
    _train = read_and_merge(y_train, x_train)
    _validate = read_and_merge(y_test, x_test)

    with utils.AutoDeletingTempDir("train") as t:
        f = t.get_named_tempfile("train.csv")
        _train.to_csv(f, header=False, index=False)
        train.set(t.name)