Beispiel #1
0
def test_pod_task_serialized():
    """Check the serialized form of a pod task.

    A task decorated with a ``Pod`` config is passed through
    ``get_serializable``; the result must report task type version 1 and carry
    the primary container name in its config.
    """
    container_name = "an undefined container"
    pod = Pod(pod_spec=get_pod_spec(), primary_container_name=container_name)

    @task(
        task_config=pod,
        requests=Resources(cpu="10"),
        limits=Resources(gpu="2"),
        environment={"FOO": "bar"},
    )
    def simple_pod_task(i: int):
        pass

    assert isinstance(simple_pod_task, PodFunctionTask)
    assert simple_pod_task.task_config == pod

    image = Image(name="default", fqn="test", tag="tag")
    settings = SerializationSettings(
        project="project",
        domain="domain",
        version="version",
        env={"FOO": "baz"},
        image_config=ImageConfig(default_image=image, images=[image]),
    )
    task_model = get_serializable(OrderedDict(), settings, simple_pod_task)

    assert task_model.task_type_version == 1
    assert task_model.config["primary_container_name"] == container_name
Beispiel #2
0
def test_pod_task():
    """Check the custom payload generated for a pod task.

    The payload must describe two containers, with the primary one carrying
    the pyflyte-execute arguments, the volume mount, the resources and the
    environment declared on the task.
    """
    pod = Pod(pod_spec=get_pod_spec(), primary_container_name="a container")

    @task(
        task_config=pod,
        requests=Resources(cpu="10"),
        limits=Resources(gpu="2"),
        environment={"FOO": "bar"},
    )
    def simple_pod_task(i: int):
        pass

    assert isinstance(simple_pod_task, PodFunctionTask)
    assert simple_pod_task.task_config == pod

    image = Image(name="default", fqn="test", tag="tag")
    settings = SerializationSettings(
        project="project",
        domain="domain",
        version="version",
        env={"FOO": "baz"},
        image_config=ImageConfig(default_image=image, images=[image]),
    )
    custom = simple_pod_task.get_custom(settings)

    assert custom["podSpec"]["restartPolicy"] == "OnFailure"
    containers = custom["podSpec"]["containers"]
    assert len(containers) == 2

    primary = containers[0]
    assert primary["name"] == "a container"

    expected_args = [
        "pyflyte-execute",
        "--task-module",
        "pod.test_pod",
        "--task-name",
        "simple_pod_task",
        "--inputs",
        "{{.input}}",
        "--output-prefix",
        "{{.outputPrefix}}",
        "--raw-output-data-prefix",
        "{{.rawOutputDataPrefix}}",
    ]
    assert primary["args"] == expected_args

    expected_mounts = [{"mountPath": "some/where", "name": "volume mount"}]
    assert primary["volumeMounts"] == expected_mounts

    expected_resources = {
        "requests": {"cpu": {"string": "10"}},
        "limits": {"gpu": {"string": "2"}},
    }
    assert primary["resources"] == expected_resources

    # The task-level environment is expected here, not the settings' env.
    assert primary["env"] == [{"name": "FOO", "value": "bar"}]
    assert custom["podSpec"]["containers"][1]["name"] == "another container"
    assert custom["primaryContainerName"] == "a container"
Beispiel #3
0
 def my_wf(a: typing.List[str]) -> typing.List[str]:
     # Map ``t1`` over every element of ``a``, then override the resources
     # requested for, and granted to, the resulting map node.
     mapped = map_task(t1)(a=a)
     return mapped.with_overrides(
         requests=Resources(cpu="1", mem="100", ephemeral_storage="500Mi"),
         limits=Resources(cpu="2", mem="200", ephemeral_storage="1Gi"),
     )
Beispiel #4
0
def test_pytorch_task():
    """Check local execution and serialization details of a PyTorch task."""

    @task(
        task_config=PyTorch(num_workers=10),
        cache=True,
        cache_version="1",
        requests=Resources(cpu="1"),
    )
    def my_pytorch_task(x: int, y: str) -> int:
        return x

    # Calling the task directly runs the plain Python body.
    assert my_pytorch_task(x=10, y="hello") == 10

    assert my_pytorch_task.task_config is not None

    image = Image(name="default", fqn="test", tag="tag")
    image_config = ImageConfig(default_image=image, images=[image])
    settings = SerializationSettings(
        project="project",
        domain="domain",
        version="version",
        env={"FOO": "baz"},
        image_config=image_config,
    )

    assert my_pytorch_task.get_custom(settings) == {"workers": 10}

    task_resources = my_pytorch_task.resources
    assert task_resources.limits == Resources()
    assert task_resources.requests == Resources(cpu="1")
    assert my_pytorch_task.task_type == "pytorch"
Beispiel #5
0
def test_tensorflow_task():
    """Check local execution and serialization details of a TfJob task."""
    tf_config = TfJob(num_workers=10, num_ps_replicas=1, num_chief_replicas=1)

    @task(
        task_config=tf_config,
        cache=True,
        requests=Resources(cpu="1"),
        cache_version="1",
    )
    def my_tensorflow_task(x: int, y: str) -> int:
        return x

    # Calling the task directly runs the plain Python body.
    assert my_tensorflow_task(x=10, y="hello") == 10

    assert my_tensorflow_task.task_config is not None

    image = Image(name="default", fqn="test", tag="tag")
    settings = SerializationSettings(
        project="project",
        domain="domain",
        version="version",
        env={"FOO": "baz"},
        image_config=ImageConfig(default_image=image, images=[image]),
    )

    expected_custom = {"workers": 10, "psReplicas": 1, "chiefReplicas": 1}
    assert my_tensorflow_task.get_custom(settings) == expected_custom

    task_resources = my_tensorflow_task.resources
    assert task_resources.limits == Resources()
    assert task_resources.requests == Resources(cpu="1")
    assert my_tensorflow_task.task_type == "tensorflow"
Beispiel #6
0
def my_map_workflow(a: typing.List[int]) -> str:
    # Map the task over ``a`` with overridden memory settings and one retry,
    # then reduce the mapped outputs to a single string with ``coalesce``.
    mapped_node = map_task(a_mappable_task)(a=a)
    mapped_out = mapped_node.with_overrides(
        requests=Resources(mem="300Mi"),
        limits=Resources(mem="500Mi"),
        retries=1,
    )
    return coalesce(b=mapped_out)
Beispiel #7
0
def test_dynamic_pod_task():
    """Check a ``@dynamic`` task configured with a ``Pod``.

    The first half inspects the custom payload; the second half compiles the
    dynamic task in a task-execution context and expects one node per call to
    ``t1``.
    """
    pod_config = Pod(pod_spec=get_pod_spec(), primary_container_name="a container")

    @task
    def t1(a: int) -> int:
        return a + 10

    @dynamic(
        task_config=pod_config,
        requests=Resources(cpu="10"),
        limits=Resources(gpu="2"),
        environment={"FOO": "bar"},
    )
    def dynamic_pod_task(a: int) -> List[int]:
        return [t1(a=i) for i in range(a)]

    assert isinstance(dynamic_pod_task, PodFunctionTask)

    image = Image(name="default", fqn="test", tag="tag")
    settings = SerializationSettings(
        project="project",
        domain="domain",
        version="version",
        env={"FOO": "baz"},
        image_config=ImageConfig(default_image=image, images=[image]),
    )
    custom = dynamic_pod_task.get_custom(settings)

    containers = custom["podSpec"]["containers"]
    assert len(containers) == 2
    primary = containers[0]
    assert isinstance(dynamic_pod_task.task_config, Pod)
    expected_resources = {
        "requests": {"cpu": {"string": "10"}},
        "limits": {"gpu": {"string": "2"}},
    }
    assert primary["resources"] == expected_resources

    compile_settings = SerializationSettings(
        project="test_proj",
        domain="test_domain",
        version="abc",
        image_config=ImageConfig(Image(name="name", fqn="image", tag="name")),
        env={},
    )
    current = context_manager.FlyteContext.current_context()
    with current.new_serialization_settings(serialization_settings=compile_settings) as serialization_ctx:
        with serialization_ctx.new_execution_context(mode=ExecutionState.Mode.TASK_EXECUTION) as execution_ctx:
            job_spec = dynamic_pod_task.compile_into_workflow(
                execution_ctx, dynamic_pod_task._task_function, a=5
            )
            # a=5 means five t1 invocations, hence five nodes.
            assert len(job_spec._nodes) == 5
Beispiel #8
0
def test_mpi_task():
    """Check local execution and serialization details of an MPIJob task."""
    mpi_config = MPIJob(num_workers=10, num_launcher_replicas=10, slots=1)

    @task(
        task_config=mpi_config,
        requests=Resources(cpu="1"),
        cache=True,
        cache_version="1",
    )
    def my_mpi_task(x: int, y: str) -> int:
        return x

    # Calling the task directly runs the plain Python body.
    assert my_mpi_task(x=10, y="hello") == 10

    assert my_mpi_task.task_config is not None

    image = Image(name="default", fqn="test", tag="tag")
    settings = SerializationSettings(
        project="project",
        domain="domain",
        version="version",
        env={"FOO": "baz"},
        image_config=ImageConfig(default_image=image, images=[image]),
    )

    expected_custom = {"numLauncherReplicas": 10, "numWorkers": 10, "slots": 1}
    assert my_mpi_task.get_custom(settings) == expected_custom
    assert my_mpi_task.task_type == "mpi"
Beispiel #9
0
def test_pod_task_undefined_primary():
    """Check a pod task whose primary container name is not in the pod spec.

    The custom payload is expected to hold three containers, the last of which
    carries the requested primary container name; the config must report the
    same name.
    """
    container_name = "an undefined container"
    pod = Pod(pod_spec=get_pod_spec(), primary_container_name=container_name)

    @task(
        task_config=pod,
        requests=Resources(cpu="10"),
        limits=Resources(gpu="2"),
        environment={"FOO": "bar"},
    )
    def simple_pod_task(i: int):
        pass

    assert isinstance(simple_pod_task, PodFunctionTask)
    assert simple_pod_task.task_config == pod

    image = Image(name="default", fqn="test", tag="tag")

    def fresh_settings():
        # Each call below gets its own settings object.
        return SerializationSettings(
            project="project",
            domain="domain",
            version="version",
            env={"FOO": "baz"},
            image_config=ImageConfig(default_image=image, images=[image]),
        )

    custom = simple_pod_task.get_custom(fresh_settings())

    containers = custom["containers"]
    assert len(containers) == 3

    primary = custom["containers"][2]
    assert primary["name"] == container_name

    config = simple_pod_task.get_config(fresh_settings())
    assert config["primary_container_name"] == container_name
Beispiel #10
0
def test_container():
    """Fast-serialize a raw ``ContainerTask`` and check its container args.

    The serialized args must not contain ``"pyflyte"``.
    """

    @task
    def t1(a: int) -> (int, str):
        return a + 2, str(a) + "-HELLO"

    raw_task = ContainerTask(
        "raw",
        image="alpine",
        inputs=kwtypes(a=int, b=str),
        input_data_dir="/tmp",
        output_data_dir="/tmp",
        command=["cat"],
        arguments=["/tmp/a"],
        requests=Resources(mem="400Mi", cpu="1"),
    )

    registry = OrderedDict()
    serialized = get_serializable(registry, serialization_settings, raw_task, fast=True)
    container_args = serialized.container.args
    assert "pyflyte" not in container_args
Beispiel #11
0
def test_container():
    """Serialize a raw ``ContainerTask`` with fast serialization enabled.

    Fast serialization is switched on through the settings builder; the
    resulting task template's container args must not contain ``"pyflyte"``.
    """

    @task
    def t1(a: int) -> (int, str):
        return a + 2, str(a) + "-HELLO"

    raw_task = ContainerTask(
        "raw",
        image="alpine",
        inputs=kwtypes(a=int, b=str),
        input_data_dir="/tmp",
        output_data_dir="/tmp",
        command=["cat"],
        arguments=["/tmp/a"],
        requests=Resources(mem="400Mi", cpu="1"),
    )

    builder = serialization_settings.new_builder()
    fast_settings = builder.with_fast_serialization_settings(
        FastSerializationSettings(enabled=True)
    ).build()

    spec = get_serializable(OrderedDict(), fast_settings, raw_task)
    container_args = spec.template.container.args
    assert "pyflyte" not in container_args
#
@task(
    task_config=Spark(
        # the below configuration is applied to the Spark cluster
        spark_conf={
            "spark.driver.memory": "2000M",
            "spark.executor.memory": "2000M",
            "spark.executor.cores": "1",
            "spark.executor.instances": "2",
            "spark.driver.cores": "1",
            "spark.sql.shuffle.partitions": "16",
            "spark.worker.timeout": "120",
        }),
    cache=True,
    cache_version="0.2",
    requests=Resources(mem="1Gi"),
    limits=Resources(mem="1Gi"),
)
def horovod_spark_task(data_dir: FlyteDirectory, hp: Hyperparameters,
                       work_dir: FlyteDirectory) -> FlyteDirectory:

    max_sales, vocab, train_df, test_df = data_preparation(data_dir, hp)

    # working directory will have the model and predictions as separate files
    working_dir = flytekit.current_context().working_directory

    keras_model = train(
        max_sales,
        vocab,
        hp,
        work_dir,
Beispiel #13
0
def test_pod_task_deserialization():
    """Round-trip a pod task's custom payload through the Kubernetes client.

    The payload is dumped to JSON and deserialized into a ``V1PodSpec`` by the
    Python API client; the resulting object must match the pod and task
    configuration.
    """
    pod = Pod(pod_spec=get_pod_spec(), primary_container_name="a container")

    @task(
        task_config=pod,
        requests=Resources(cpu="10"),
        limits=Resources(gpu="2"),
        environment={"FOO": "bar"},
    )
    def simple_pod_task(i: int):
        pass

    assert isinstance(simple_pod_task, PodFunctionTask)
    assert simple_pod_task.task_config == pod

    image = Image(name="default", fqn="test", tag="tag")

    def fresh_settings():
        # Each call below gets its own settings object.
        return SerializationSettings(
            project="project",
            domain="domain",
            version="version",
            env={"FOO": "baz"},
            image_config=ImageConfig(default_image=image, images=[image]),
        )

    custom = simple_pod_task.get_custom(fresh_settings())

    # ApiClient.deserialize reads the JSON text from ``response.data``, so a
    # mock carrying only that attribute stands in for an HTTP response.
    fake_response = MagicMock()
    fake_response.data = json.dumps(custom)
    pod_spec = ApiClient().deserialize(fake_response, V1PodSpec)

    assert pod_spec.restart_policy == "OnFailure"
    assert len(pod_spec.containers) == 2

    primary = pod_spec.containers[0]
    assert primary.name == "a container"

    expected_args = [
        "pyflyte-execute",
        "--inputs",
        "{{.input}}",
        "--output-prefix",
        "{{.outputPrefix}}",
        "--raw-output-data-prefix",
        "{{.rawOutputDataPrefix}}",
        "--resolver",
        "flytekit.core.python_auto_container.default_task_resolver",
        "--",
        "task-module",
        "plugins.tests.pod.test_pod",
        "task-name",
        "simple_pod_task",
    ]
    assert primary.args == expected_args

    first_mount = primary.volume_mounts[0]
    assert first_mount.mount_path == "some/where"
    assert first_mount.name == "volume mount"

    expected_resources = V1ResourceRequirements(limits={"gpu": "2"}, requests={"cpu": "10"})
    assert primary.resources == expected_resources
    assert primary.env == [V1EnvVar(name="FOO", value="bar")]
    assert pod_spec.containers[1].name == "another container"

    config = simple_pod_task.get_config(fresh_settings())
    assert config["primary_container_name"] == "a container"
Beispiel #14
0
# ^^^^^^^^^^^^^^^^^
#
# This example shows how a Spark task can be written simply by adding a ``@task(task_config=Spark(...)...)`` decorator.
# Refer to `Spark <https://github.com/flyteorg/flytekit/blob/9e156bb0cf3d1441c7d1727729e8f9b4bbc3f168/plugins/flytekit-spark/flytekitplugins/spark/task.py#L18-L36>`__
# class to understand the various configuration options.
@task(
    task_config=Spark(
        # this configuration is applied to the spark cluster
        spark_conf={
            "spark.driver.memory": "1000M",
            "spark.executor.memory": "1000M",
            "spark.executor.cores": "1",
            "spark.executor.instances": "2",
            "spark.driver.cores": "1",
        },
    ),
    limits=Resources(mem="2000M"),
    cache_version="1",
)
def hello_spark(partitions: int) -> float:
    # Spread 100000 samples per partition across the cluster, map each sample
    # through ``f``, sum the results with ``add`` and scale the total into an
    # estimate of pi.
    print("Starting Spark with Partitions: {}".format(partitions))

    sample_count = 100000 * partitions
    spark = flytekit.current_context().spark_session
    samples = spark.sparkContext.parallelize(range(1, sample_count + 1), partitions)
    hits = samples.map(f).reduce(add)

    estimate = 4.0 * hits / sample_count
    print("Pi val is :{}".format(estimate))
    return estimate


def f(_):
Beispiel #15
0
#
# Notice we are also generating an output variable called logs, these logs can be used to visualize the training in
# Tensorboard and are the output of the `SummaryWriter` interface
# Refer to section :ref:`pytorch_tensorboard` to visualize the outputs of this example.
# Output type of the training task: per-epoch accuracies, the pickled model
# state and the Tensorboard logs.
# The fields are passed as a list of (name, type) pairs because the
# keyword-argument form of typing.NamedTuple is deprecated since Python 3.13.
TrainingOutputs = typing.NamedTuple(
    "TrainingOutputs",
    [
        ("epoch_accuracies", typing.List[float]),
        ("model_state", PythonPickledFile),
        ("logs", TensorboardLogs),
    ],
)


@task(
    task_config=PyTorch(
        num_workers=2,
        per_replica_requests=Resources(cpu="500m", mem="4Gi", gpu="1"),
        per_replica_limits=Resources(mem="8Gi", gpu="1"),
    ),
    retries=2,
    cache=True,
    cache_version="1.0",
)
def mnist_pytorch_job(hp: Hyperparameters) -> TrainingOutputs:
    log_dir = "logs"
    writer = SummaryWriter(log_dir)

    torch.manual_seed(hp.seed)

    use_cuda = torch.cuda.is_available()
    print(f"Use cuda {use_cuda}")
    device = torch.device("cuda" if use_cuda else "cpu")
Beispiel #16
0
#
# #. Loads the MNIST data
# #. Prepares the data for training
# #. Initializes a convnet model
# #. Calls the `training_step()` function to train the model
# #. Saves the model and checkpoint history and returns the result
@task(
    task_config=MPIJob(
        num_workers=2,
        num_launcher_replicas=1,
        slots=1,
    ),
    retries=3,
    cache=True,
    cache_version="0.1",
    requests=Resources(cpu='1', mem="300Mi"),
    limits=Resources(cpu='2'),
)
def horovod_train_task(batch_size: int, buffer_size: int,
                       dataset_size: int) -> FlyteDirectory:
    """
    :param batch_size: Represents the number of consecutive elements of this dataset to combine in a single batch.
    :param buffer_size: Defines the size of the buffer used to hold elements of the dataset used for training.
    :param dataset_size: The number of elements of this dataset that should be taken to form the new dataset when
        running batched training.
    """
    hvd.init()

    (mnist_images, mnist_labels), _ = \
        tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % hvd.rank())
Beispiel #17
0
# The following attributes can be specified for a ``Resource``.
#
# #. ``cpu``
# #. ``mem``
# #. ``gpu``
#
# The ``storage`` resources option is not yet supported, but coming soon
#
# The actual values follow the `kubernetes convention <https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes>`_.

import typing

from flytekit import Resources, task, workflow


@task(
    requests=Resources(cpu="1", mem="2048"),
    limits=Resources(cpu="2", mem="4096"),
)
def count_unique_numbers(x: typing.List[int]) -> int:
    # Duplicates collapse when the list becomes a set, so the set's size is
    # the number of distinct values.
    return len(set(x))


# %%
# Now let's create a dummy task that squares the number
@task
def square(x: int) -> int:
    # Raise the input to the second power.
    return x ** 2

Beispiel #18
0
        volumes=[
            V1Volume(name="shared-data",
                     empty_dir=V1EmptyDirVolumeSource(medium="Memory"))
        ],
    )

    return pod_spec


# %%
# Although pod tasks for the most part allow us to customize Kubernetes container attributes, we can still use Flyte directives to specify resources and even the image.
# The default image built for Flyte tasks will be used unless ``container_image`` task attribute is specified.
@task(
    task_config=Pod(
        pod_spec=generate_pod_spec_for_task(),
        primary_container_name="primary",
    ),
    requests=Resources(mem="1G"),
)
def my_pod_task() -> str:
    # The code defined in this task will get injected into the primary container.
    # Poll every five seconds until the shared file exists, then return its
    # contents.
    while True:
        if os.path.isfile(_SHARED_DATA_PATH):
            break
        time.sleep(5)

    with open(_SHARED_DATA_PATH, "r") as shared_file:
        message = shared_file.read()
    return message


@workflow
def pod_workflow() -> str:
    # The workflow output is whatever the pod task returns.
    return my_pod_task()
    )


# %%
# Step 4: Task -- Generating & Splitting the Data
# ===============================================
# Call the previously defined helper functions to generate and split the data. Finally, return the DataFrame objects.
# Named output of the data-generation task: one DataFrame per split.
# The fields are passed as a list of (name, type) pairs because the
# keyword-argument form of typing.NamedTuple is deprecated since Python 3.13.
dataset = typing.NamedTuple(
    "GenerateSplitDataOutputs",
    [
        ("train_data", pd.DataFrame),
        ("val_data", pd.DataFrame),
        ("test_data", pd.DataFrame),
    ],
)


@task(
    cache=True,
    cache_version="0.1",
    limits=Resources(mem="600Mi"),
)
def generate_and_split_data(number_of_houses: int, seed: int) -> dataset:
    # Generate the requested number of houses, then split them using the
    # given seed and the module-level SPLIT_RATIOS.
    return split_data(gen_houses(number_of_houses), seed, split=SPLIT_RATIOS)


# %%
# Step 5: Task -- Training the XGBoost Model
# ==========================================
# Serialize the XGBoost model using joblib and store the model in a dat file.
# Named output holding the serialized model file.
# The field is passed as a list of (name, type) pairs because the
# keyword-argument form of typing.NamedTuple is deprecated since Python 3.13.
model_file = typing.NamedTuple(
    "Model",
    [("model", FlyteFile[typing.TypeVar("joblib.dat")])],
)


@task(cache_version="1.0", cache=True, limits=Resources(mem="600Mi"))
def fit(loc: str, train: pd.DataFrame, val: pd.DataFrame) -> model_file:
Beispiel #20
0
# Data Generation and Preprocessing
# ====================================
# We call the :ref:`data generation <Data Generation>` and :ref:`data preprocessing <Data Preprocessing and Splitting>` functions to generate train, test, and validation data.
# First, let's create a ``NamedTuple`` that maps variable names to their respective data types.
# Named output of the multi-region data generation: one list of DataFrames
# per split.
# The fields are passed as a list of (name, type) pairs because the
# keyword-argument form of typing.NamedTuple is deprecated since Python 3.13.
dataset = typing.NamedTuple(
    "GenerateSplitDataOutputs",
    [
        ("train_data", typing.List[pd.DataFrame]),
        ("val_data", typing.List[pd.DataFrame]),
        ("test_data", typing.List[pd.DataFrame]),
    ],
)

# %%
# Next, we create a :py:func:`~flytekit:flytekit.dynamic` workflow to generate and split the data for multiple regions.


@dynamic(cache=True, cache_version="0.1", limits=Resources(mem="600Mi"))
def generate_and_split_data_multiloc(
    locations: typing.List[str],
    number_of_houses_per_location: int,
    seed: int,
) -> dataset:
    train_sets = [
    ]  # create empty lists for train, validation, and test subsets
    val_sets = []
    test_sets = []
    for _ in locations:
        _train, _val, _test = generate_and_split_data(
            number_of_houses=number_of_houses_per_location, seed=seed)
        train_sets.append(_train, )
        val_sets.append(_val, )
        test_sets.append(_test, )
    gpu = "0"
    storage = "500Mi"
    ephemeral_storage = "500Mi"
else:
    mem = "30Gi"
    gpu = str(WORLD_SIZE)
    ephemeral_storage = "500Mi"
    storage = "20Gi"


@task(
    retries=2,
    cache=True,
    cache_version="1.2",
    requests=Resources(gpu=gpu,
                       mem=mem,
                       storage=storage,
                       ephemeral_storage=ephemeral_storage),
    limits=Resources(gpu=gpu,
                     mem=mem,
                     storage=storage,
                     ephemeral_storage=ephemeral_storage),
)
def pytorch_mnist_task(hp: Hyperparameters) -> TrainingOutputs:
    print("Start MNIST training:")

    world_size = torch.cuda.device_count()
    print(f"Device count: {world_size}")
    download_mnist(DATA_DIR)
    mp.spawn(
        train_mnist,
        args=(world_size, hp),
def test_diabetes():
    """Declare a mock diabetes XGBoost pipeline from Flyte tasks and a workflow.

    Everything is defined locally: the dataset schemas, a hyperparameter
    holder, four cached tasks (split, fit, predict, score) and the workflow
    that chains them. Training, prediction and scoring are faked. This
    function only declares the tasks and the workflow; it never calls the
    workflow itself.
    """
    # Since we are working with a specific dataset, we will create a strictly typed schema for the dataset.
    # If we wanted a generic data splitter we could use a Generic schema without any column type and name information
    # Example file: https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv
    # CSV Columns
    #  1. Number of times pregnant
    #  2. Plasma glucose concentration a 2 hours in an oral glucose tolerance test
    #  3. Diastolic blood pressure (mm Hg)
    #  4. Triceps skin fold thickness (mm)
    #  5. 2-Hour serum insulin (mu U/ml)
    #  6. Body mass index (weight in kg/(height in m)^2)
    #  7. Diabetes pedigree function
    #  8. Age (years)
    #  9. Class variable (0 or 1)
    # Example Row: 6,148,72,35,0,33.6,0.627,50,1
    # the input dataset schema
    DATASET_COLUMNS = OrderedDict({
        "#preg": int,
        "pgc_2h": int,
        "diastolic_bp": int,
        "tricep_skin_fold_mm": int,
        "serum_insulin_2h": int,
        "bmi": float,
        "diabetes_pedigree": float,
        "age": int,
        "class": int,
    })
    # the first 8 columns are features
    FEATURE_COLUMNS = OrderedDict(
        {k: v
         for k, v in DATASET_COLUMNS.items() if k != "class"})
    # the last column is the class
    CLASSES_COLUMNS = OrderedDict({"class": int})

    MODELSER_JOBLIB = typing.TypeVar("joblib.dat")

    class XGBoostModelHyperparams:
        """
        These are the xgboost hyper parameters available in scikit-learn library.
        """
        def __init__(self,
                     max_depth=3,
                     learning_rate=0.1,
                     n_estimators=100,
                     objective="binary:logistic",
                     booster="gbtree",
                     n_jobs=1,
                     **kwargs):
            # Unknown keyword arguments are accepted and ignored.
            self.n_jobs = int(n_jobs)
            self.booster = booster
            self.objective = objective
            self.n_estimators = int(n_estimators)
            self.learning_rate = learning_rate
            self.max_depth = int(max_depth)

        def to_dict(self):
            # Return a copy so callers cannot mutate the instance through the
            # returned mapping.
            return dict(self.__dict__)

        @classmethod
        def from_dict(cls, d):
            return cls(**d)

    # load data
    # Example file: https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv
    @task(cache_version="1.0", cache=True, limits=Resources(mem="200Mi"))
    def split_traintest_dataset(
        dataset: FlyteFile[typing.TypeVar("csv")], seed: int,
        test_split_ratio: float
    ) -> (
            FlyteSchema[FEATURE_COLUMNS],
            FlyteSchema[FEATURE_COLUMNS],
            FlyteSchema[CLASSES_COLUMNS],
            FlyteSchema[CLASSES_COLUMNS],
    ):
        """
        Retrieves the training dataset from the given blob location and then splits it using the split ratio and returns the result
        This splitter is only for the dataset that has the format as specified in the example csv. The last column is assumed to be
        the class and all other columns the features.

        The data is returned as a schema, which gets converted to a parquet file in the back.
        """
        column_names = list(DATASET_COLUMNS)
        df = pd.read_csv(dataset, names=column_names)

        # Select all features (every column except the class)
        x = df[list(FEATURE_COLUMNS)]
        # Select only the classes
        y = df[list(CLASSES_COLUMNS)]

        # We will fake train test split. Just return the same dataset multiple times
        return x, x, y, y

    # The field list form is used because the keyword-argument form of
    # typing.NamedTuple is deprecated since Python 3.13.
    nt = typing.NamedTuple("Outputs", [("model", FlyteFile[MODELSER_JOBLIB])])

    @task(cache_version="1.0", cache=True, limits=Resources(mem="200Mi"))
    def fit(x: FlyteSchema[FEATURE_COLUMNS], y: FlyteSchema[CLASSES_COLUMNS],
            hyperparams: dict) -> nt:
        """
        This function takes the given input features and their corresponding classes to train a XGBClassifier.
        NOTE: We have simplified the number of hyper parameters we take for demo purposes
        """
        x_df = x.open().all()
        print(x_df)
        y_df = y.open().all()
        print(y_df)

        hp = XGBoostModelHyperparams.from_dict(hyperparams)
        print(hp)
        # fit model on training data
        # Faking fit

        fname = "model.joblib.dat"
        with open(fname, "w") as f:
            f.write("Some binary data")
        return nt(model=fname)

    @task(cache_version="1.0", cache=True, limits=Resources(mem="200Mi"))
    def predict(
            x: FlyteSchema[FEATURE_COLUMNS],
            model_ser: FlyteFile[MODELSER_JOBLIB]
    ) -> FlyteSchema[CLASSES_COLUMNS]:
        """
        Given any trained model, serialized using joblib (this method can be shared!) and features, this method returns
        predictions.
        """
        # make predictions for test data
        x_df = x.open().all()
        print(x_df)
        class_columns = list(CLASSES_COLUMNS)
        y_pred_df = pd.DataFrame(
            data=[{class_columns[0]: [0, 1]}],
            columns=class_columns,
            dtype="int64",
        )
        # DataFrame.round returns a new frame, so the result must be rebound.
        y_pred_df = y_pred_df.round(0)
        return y_pred_df

    @task(cache_version="1.0", cache=True, limits=Resources(mem="200Mi"))
    def score(predictions: FlyteSchema[CLASSES_COLUMNS],
              y: FlyteSchema[CLASSES_COLUMNS]) -> float:
        """
        Compares the predictions with the actuals and returns the accuracy score.
        """
        pred_df = predictions.open().all()
        print(pred_df)
        y_df = y.open().all()
        print(y_df)
        # evaluate predictions (faked: a constant score is returned)
        return 0.2

    @workflow
    def diabetes_xgboost_model(
        dataset: FlyteFile[typing.TypeVar("csv")],
        # = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv",
        test_split_ratio: float = 0.33,
        seed: int = 7,
    ) -> typing.NamedTuple(
            "Outputs",
            [("model", FlyteFile[MODELSER_JOBLIB]), ("accuracy", float)]):
        """
        This pipeline trains an XGBoost model for any given dataset that matches the schema as specified in
        https://github.com/jbrownlee/Datasets/blob/master/pima-indians-diabetes.names.
        """
        x_train, x_test, y_train, y_test = split_traintest_dataset(
            dataset=dataset, seed=seed, test_split_ratio=test_split_ratio)
        model = fit(x=x_train,
                    y=y_train,
                    hyperparams=XGBoostModelHyperparams(max_depth=4).to_dict())
        predictions = predict(x=x_test, model_ser=model.model)
        return model.model, score(predictions=predictions, y=y_test)
Beispiel #23
0
# Notice we are also generating an output variable called logs, these logs can be used to visualize the training in
# Tensorboard and are the output of the `SummaryWriter` interface
# Refer to section :ref:`pytorch_tensorboard` to visualize the outputs of this example.
# Output type of the training task: per-epoch accuracies, the pickled model
# state and the Tensorboard logs.
# The fields are passed as a list of (name, type) pairs because the
# keyword-argument form of typing.NamedTuple is deprecated since Python 3.13.
TrainingOutputs = typing.NamedTuple(
    "TrainingOutputs",
    [
        ("epoch_accuracies", typing.List[float]),
        ("model_state", PythonPickledFile),
        ("logs", TensorboardLogs),
    ],
)


@task(task_config=PyTorch(num_workers=2, ),
      retries=2,
      cache=True,
      cache_version="1.0",
      requests=Resources(cpu=cpu_request, mem=mem_request, gpu=gpu_request),
      limits=Resources(mem=mem_limit, gpu=gpu_limit))
def mnist_pytorch_job(hp: Hyperparameters) -> TrainingOutputs:
    log_dir = "logs"
    writer = SummaryWriter(log_dir)

    torch.manual_seed(hp.seed)

    use_cuda = torch.cuda.is_available()
    print(f"Use cuda {use_cuda}")
    device = torch.device("cuda" if use_cuda else "cpu")

    print("Using device: {}, world size: {}".format(device, WORLD_SIZE))

    if should_distribute():
        print("Using distributed PyTorch with {} backend".format(hp.backend))
Beispiel #24
0
# #. ``gpu``
#
# To ensure regular tasks that do not require GPUs are not scheduled on GPU nodes, a separate node group for GPU nodes can be configured with `taints <https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/>`_.
#
# To ensure tasks that do require GPUs get the needed tolerations on their pods, set up flytepropeller using the following `configuration <https://github.com/flyteorg/flytepropeller/blob/v0.10.5/config.yaml#L51,L56>`_. Make sure that this toleration config matches the taint config that you have configured to protect your gpu providing nodes from having to deal with regular non-gpu workloads (pods).
#
# The ``storage`` resources option is not yet supported, but coming soon.
#
# The actual values follow the `kubernetes convention <https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes>`_.

import typing

from flytekit import Resources, task, workflow


@task(
    requests=Resources(cpu="1", mem="100Mi"),
    limits=Resources(cpu="2", mem="150Mi"),
)
def count_unique_numbers(x: typing.List[int]) -> int:
    # Duplicates collapse when the list becomes a set, so the set's size is
    # the number of distinct values.
    return len(set(x))


# %%
# Let's create a dummy task that determines the square of a number.
@task
def square(x: int) -> int:
    # Raise the input to the second power.
    return x ** 2


# %%
# We define a ``NotebookTask`` to run the `Jupyter notebook
# <https://github.com/flyteorg/flytesnacks/blob/master/cookbook/case_studies/feature_engineering/eda/supermarket_regression_1.ipynb>`__.
# This notebook returns ``dummified_data`` and ``dataset`` as the outputs.
#
# .. note::
#   ``dummified_data`` is used in this example, and ``dataset`` is used in the upcoming example.
nb = NotebookTask(
    name="eda-feature-eng-nb",
    # The notebook file is looked up in the same directory as this module.
    notebook_path=os.path.join(
        pathlib.Path(__file__).parent.absolute(),
        "supermarket_regression_1.ipynb"),
    # Declared outputs of the notebook and their Python types.
    outputs=kwtypes(dummified_data=pd.DataFrame, dataset=str),
    # Memory requested for running the notebook task.
    requests=Resources(mem="500Mi"),
)


# %%
# Next, we define a ``cross_validate`` function and a ``modeling`` task to compute the MAE score of the data against
# the Gradient Boosting Regressor.
def cross_validate(model, nfolds, feats, targets):
    """Return the mean cross-validated score of ``model``, with the sign flipped.

    :param model: estimator handed to ``cross_val_score``.
    :param nfolds: value passed as ``cv`` to ``cross_val_score``.
    :param feats: feature data passed to ``cross_val_score``.
    :param targets: target data passed to ``cross_val_score``.
    :return: ``np.mean`` of the per-fold scores after multiplying them by -1.
    """
    per_fold_scores = cross_val_score(
        model,
        feats,
        targets,
        cv=nfolds,
        scoring="neg_mean_absolute_error",
    )
    # The scoring is "neg_mean_absolute_error", so multiply by -1 to turn the
    # negated errors back into positive values before averaging.
    return np.mean(-1 * per_fold_scores)


@task
def modeling(
    dataset: pd.DataFrame,
Beispiel #26
0
    mem = "100Mi"
    gpu = "0"
    storage = "500Mi"
else:
    print(f"SANDBOX ENV: '{os.getenv('SANDBOX')}'")

    mem = "3Gi"
    gpu = "1"
    storage = "1Gi"


@task(
    retries=2,
    cache=True,
    cache_version="1.0",
    requests=Resources(gpu=gpu, mem=mem, storage=storage),
    limits=Resources(gpu=gpu, mem=mem, storage=storage),
)
def pytorch_mnist_task(hp: Hyperparameters) -> TrainingOutputs:
    wandb_setup()

    # store the hyperparameters' config in ``wandb``
    wandb.config.update(json.loads(hp.to_json()))

    # set random seed
    torch.manual_seed(hp.seed)

    # ideally, if GPU training is required, and if cuda is not available, we can raise an exception
    # however, as we want this algorithm to work locally as well (and most users don't have a GPU locally), we will fallback to using a CPU
    use_cuda = torch.cuda.is_available()
    print(f"Use cuda {use_cuda}")
Beispiel #27
0
# Next, we define variables that we use throughout the code.
DATASET_LOCAL = "yellow_tripdata_sample_2019-01.csv"
DATASET_REMOTE = "https://raw.githubusercontent.com/superconductive/ge_tutorials/main/data/yellow_tripdata_sample_2019-01.csv"
CONTEXT_ROOT_DIR = "greatexpectations/great_expectations"

# %%
# Simple Type
# ===========
#
# We define a ``GreatExpectationsType`` that checks if the requested ``batch_filter_parameters`` can be used to fetch files from a directory.
# The directory that's being used is defined in ``my_assets``. You can find ``my_assets`` in the Great Expectations config file.
#
# The parameters within the ``data_connector_query`` convey that we're fetching all those files that have "2019" and "01" in the file names.


@task(limits=Resources(mem="500Mi"))
def simple_task(
    directory: GreatExpectationsType[
        str,
        GreatExpectationsFlyteConfig(
            datasource_name="data",  # noqa: F821
            expectation_suite_name="test.demo",  # noqa: F821
            data_connector_name="my_data_connector",  # noqa: F821
            batch_request_config=BatchRequestConfig(
                data_connector_query={
                    "batch_filter_parameters": {  # noqa: F821
                        "year": "2019",  # noqa: F821
                        "month": "01",  # noqa: F821, F722
                    },
                    "limit": 10,  # noqa: F821
                },
 def my_wf(a: typing.List[str]) -> typing.List[str]:
     # Wrap ``t1`` with ``map_task`` and create a node from it, fed with ``a``.
     node = create_node(map_task(t1), a=a)
     # Override the resource requests and limits on that node.
     # NOTE(review): the ``mem`` values carry no unit suffix -- confirm that is intended.
     map_node = node.with_overrides(
         requests=Resources(cpu="1", mem="100"),
         limits=Resources(cpu="2", mem="200"),
     )
     # The workflow returns the node's ``o0`` output.
     return map_node.o0
Beispiel #29
0
#
# #. Loads the MNIST data
# #. Prepares the data for training
# #. Initializes a convnet model
# #. Calls the `training_step()` function to train the model
# #. Saves the model and checkpoint history and returns the result
@task(
    task_config=MPIJob(
        num_workers=2,
        num_launcher_replicas=1,
        slots=1,
    ),
    retries=3,
    cache=True,
    cache_version="0.1",
    requests=Resources(cpu="1", mem="600Mi"),
    limits=Resources(cpu="2"),
)
def horovod_train_task(batch_size: int, buffer_size: int,
                       dataset_size: int) -> FlyteDirectory:
    """
    :param batch_size: Represents the number of consecutive elements of this dataset to combine in a single batch.
    :param buffer_size: Defines the size of the buffer used to hold elements of the dataset used for training.
    :param dataset_size: The number of elements of this dataset that should be taken to form the new dataset when
        running batched training.
    """
    hvd.init()

    (mnist_images,
     mnist_labels), _ = tf.keras.datasets.mnist.load_data(path="mnist-%d.npz" %
                                                          hvd.rank())
Beispiel #30
0
# * ``num_workers``: integer determining the number of worker replicas to be spawned in the cluster for this job
# * ``num_ps_replicas``: number of parameter server replicas to use
# * ``num_chief_replicas``: number of chief replicas to use
#
# MirroredStrategy uses an all-reduce algorithm to communicate the variable updates across the devices.
# Hence, ``num_ps_replicas`` is not useful in our example.
#
# .. note::
#   If you'd like to understand the various TensorFlow strategies in distributed training, refer to the `Types of strategies <https://www.tensorflow.org/guide/distributed_training#types_of_strategies>`__ section in the TensorFlow documentation.
training_outputs = NamedTuple(
    "TrainingOutputs",
    # Fields are passed as a list of (name, type) pairs: the keyword-argument
    # form ``NamedTuple("T", x=int)`` is deprecated since Python 3.13. The
    # resulting type has the same fields in the same order.
    [
        ("accuracy", float),
        ("loss", float),
        ("model_state", FlyteDirectory),
    ],
)

# Pick the resource set used for both requests and limits of the training task.
# NOTE(review): ``os.getenv`` returns ``None`` when SANDBOX is unset and
# ``None != ""`` is true, so the first (smaller) resource set is chosen unless
# SANDBOX is explicitly set to the empty string -- confirm this is intended.
resources = (
    Resources(gpu="0", mem="1000Mi", storage="500Mi", ephemeral_storage="500Mi")
    if os.getenv("SANDBOX") != ""
    else Resources(gpu="2", mem="10Gi", storage="10Gi", ephemeral_storage="500Mi")
)


@task(
    task_config=TfJob(num_workers=2, num_ps_replicas=1, num_chief_replicas=1),
    retries=2,
    cache=True,
    cache_version="1.0",
    requests=resources,
    limits=resources,
)