# NOTE(review): SDK-style workflow classes are normally registered via a
# @workflow_class decorator; none is visible in this chunk — confirm upstream.
class MultiRegionHousePricePredictionModelTrainer(object):
    """
    Workflow that, per region: generates synthetic house-price data, trains an
    XGBoost model on it, and scores each model against its region's test set.

    Each class-level assignment below is a node in the workflow DAG:
    ``split`` -> ``fit_task`` -> ``predicted``, with ``models`` and
    ``accuracies`` exposed as workflow outputs.
    """

    # ---- workflow inputs --------------------------------------------------
    regions = Input(Types.List(Types.String), default=["SFO", "SEA", "DEN"],
                    help="Regions for where to train the model.")
    seed = Input(Types.Integer, default=7, help="Seed to use for splitting.")
    num_houses_per_region = Input(
        Types.Integer,
        default=1000,
        help="Number of houses to generate data for in each region")

    # the actual algorithm
    # One train/val/test split per region (single task over all locations).
    split = generate_and_split_data_multiloc(
        locations=regions,
        number_of_houses_per_location=num_houses_per_region,
        seed=seed)
    # Fit one model per region from the per-region training sets.
    fit_task = parallel_fit(multi_train=split.outputs.train)
    # Score each region's model against that region's test CSV.
    predicted = parallel_predict(multi_models=fit_task.outputs.multi_models,
                                 multi_test=split.outputs.test)

    # Outputs: joblib serialized models per region and accuracy of the model per region.
    # Note we should make this into a map, but for demo we will output a simple list
    models = Output(fit_task.outputs.multi_models, sdk_type=Types.List(Types.Blob))
    accuracies = Output(predicted.outputs.accuracies, sdk_type=Types.List(Types.Float))
from flytekit.sdk.tasks import inputs, outputs, python_task
from flytekit.sdk.types import Types
from flytekit.sdk.workflow import workflow_class, Input, Output
import json


@inputs(custom=Types.Generic)
@outputs(counts=Types.Generic, replicated=Types.List(Types.Generic))
@python_task
def generic_type_task(wf_params, custom, counts, replicated):
    """
    Go through each of the values of the input and if it's a str, count the length
    Also, create a replicated list of the Generic
    """
    wf_params.logging.info("Running custom object task")
    results = {}
    # String values are replaced by their length; all other values pass
    # through unchanged.
    # NOTE(review): `type(v) == str` (not isinstance) means str subclasses
    # would pass through un-measured — confirm that is intended.
    for k, v in custom.items():
        if type(v) == str:
            results[k] = len(v)
        else:
            results[k] = v
    counts.set(results)
    # Emit two copies of the input, exercising the List(Generic) output type.
    replicated.set([custom, custom])


@inputs(replicated=Types.List(Types.Generic))
@outputs(str_repr=Types.String)
@python_task
def generic_to_json(wf_params, replicated, str_repr):
    # NOTE(review): this definition is truncated in this chunk — its docstring
    # opens below but the remainder of the body is not visible here.
    """
import os

from flytekit.sdk.tasks import python_task, inputs, outputs, dynamic_task
from flytekit.sdk.types import Types
from flytekit.sdk.workflow import workflow_class, Input, Output

from demo.house_price_predictor import generate_data, save_to_file, save_to_dir, fit, predict


@inputs(locations=Types.List(Types.String), number_of_houses_per_location=Types.Integer, seed=Types.Integer)
@outputs(train=Types.List(Types.MultiPartCSV), val=Types.List(Types.MultiPartCSV), test=Types.List(Types.CSV))
@python_task(cache=True, cache_version="0.1", memory_request="200Mi")
def generate_and_split_data_multiloc(wf_params, locations, number_of_houses_per_location, seed, train, val, test):
    """
    Generate synthetic house-price data for every location and persist one
    train/val/test split per location (list entries are parallel to
    ``locations``). Results are cached (cache_version "0.1").
    """
    train_sets = []
    val_sets = []
    test_sets = []
    for loc in locations:
        # The same seed is passed for every location; per-region variation
        # presumably comes from `loc` inside generate_data — confirm there.
        _train, _val, _test = generate_data(loc, number_of_houses_per_location, seed)
        # NOTE(review): `dir` shadows the builtin; every iteration writes into
        # the same "multi_data" directory.
        dir = "multi_data"
        os.makedirs(dir, exist_ok=True)
        train_sets.append(save_to_dir(dir, "train", _train))
        val_sets.append(save_to_dir(dir, "val", _val))
        test_sets.append(save_to_file(dir, "test", _test))
    train.set(train_sets)
    # NOTE(review): chunk is truncated here — the matching val.set(val_sets)
    # and test.set(test_sets) calls are presumably below; confirm in the full file.
# We know we are writing just one file, so we will just read the one file df = pd.read_csv(os.path.join(train.local_path, files[0]), header=None) y = df[df.columns[0]] x = df[df.columns[1:]] # fit model no training data m = XGBClassifier() m.fit(x, y) # TODO model Blob should be a file like object fname = "model.joblib.dat" joblib.dump(m, fname) model.set(fname) @inputs(test=Types.CSV, model_ser=Types.Blob) # TODO: format=".joblib.dat")) @outputs(predictions=Types.List(Types.Float), accuracy=Types.Float) @python_task(cache_version='1.0', cache=True, memory_request="200Mi") def predict(ctx, test, model_ser, predictions, accuracy): """ Given a any trained model, serialized using joblib (this method can be shared!) and features, this method returns predictions. """ # Load model model_ser.download() model = joblib.load(model_ser.local_path) # Load test data test.download() test_df = pd.read_csv(test.local_path, header=None) x_df = test_df[test_df.columns[1:]] y_df = test_df[test_df.columns[0]] y_pred = model.predict(x_df)