Example #1
0
    def test_serializer_not_configurable(self):
        """A PrefectResult only ever works with a JSONSerializer."""
        # Default construction builds a fresh JSONSerializer internally.
        default_result = PrefectResult()
        assert isinstance(default_result.serializer, JSONSerializer)

        # An explicit JSONSerializer instance is accepted and kept as-is.
        explicit = JSONSerializer()
        configured = PrefectResult(serializer=explicit)
        assert configured.serializer is explicit

        # The attribute may be reassigned, as long as it is a JSONSerializer.
        replacement = JSONSerializer()
        configured.serializer = replacement
        assert configured.serializer is replacement

        # Any other serializer class is rejected, both on attribute
        # assignment and at construction time.
        with pytest.raises(TypeError):
            configured.serializer = PickleSerializer()
        with pytest.raises(TypeError):
            PrefectResult(serializer=PickleSerializer())
Example #2
0
def test(e: Optional[Executor]):
    """Run a one-task checkpointing flow and verify the JSON file it writes."""
    with TemporaryDirectory() as tmpdir:
        result_cfg = LocalResult(
            tmpdir, serializer=JSONSerializer(), location="{task_name}.json"
        )

        with Flow("write_result", result=result_cfg) as flow:
            task(lambda: 42, checkpoint=True, name="magic")()

        # Checkpointing must be enabled for the result to be persisted.
        with set_temporary_config({"flows.checkpointing": True}):
            with raise_on_exception():
                flow.run(executor=e)

        written = os.listdir(tmpdir)
        assert written == ["magic.json"], written
        with open(os.path.join(tmpdir, written[0]), "rb") as fh:
            loaded = json.load(fh)
        assert loaded == 42
Example #3
0
def test_compressed_serializer_equality() -> None:
    """Equality depends on the inner serializer, format, and compress kwargs."""
    # Same inner serializer and format: equal.
    first = CompressedSerializer(PickleSerializer(), format="bz2")
    second = CompressedSerializer(PickleSerializer(), format="bz2")
    assert first == second

    # Different inner serializer: not equal.
    json_bz2 = CompressedSerializer(JSONSerializer(), format="bz2")
    assert CompressedSerializer(PickleSerializer(), format="bz2") != json_bz2

    # Different compression callables: not equal.
    gz = CompressedSerializer(
        PickleSerializer(), compress=gzip.compress, decompress=gzip.decompress
    )
    assert CompressedSerializer(PickleSerializer(), format="bz2") != gz

    # Same callables but different compress kwargs: not equal.
    gz_level8 = CompressedSerializer(
        PickleSerializer(),
        compress=gzip.compress,
        decompress=gzip.decompress,
        compress_kwargs={"compresslevel": 8},
    )
    assert gz != gz_level8
Example #4
0
 def __init__(self, **kwargs: Any) -> None:
     """Initialize with a JSONSerializer unless the caller supplies one."""
     # setdefault leaves an explicitly-passed serializer untouched.
     kwargs.setdefault("serializer", JSONSerializer())
     super().__init__(**kwargs)
Example #5
0
 def test_serialize_returns_json(self):
     """serialize() must produce json.dumps output, encoded to bytes."""
     payload = ["abc", 123]
     expected = json.dumps(payload).encode()
     assert JSONSerializer().serialize(payload) == expected
Example #6
0
 def test_deserialize_returns_objects(self):
     """A serialize/deserialize round trip reproduces the original value."""
     original = ["abc", 123]
     blob = JSONSerializer().serialize(original)
     assert JSONSerializer().deserialize(blob) == original
Example #7
0
 def test_serialize_returns_bytes(self):
     """Whatever the input, serialize() must yield a bytes payload."""
     blob = JSONSerializer().serialize(["abc", 123])
     assert isinstance(blob, bytes)
Example #8
0
def test_equality():
    """Serializer equality is class-based: same class equal, different not."""
    pickle_ser = PickleSerializer()
    json_ser = JSONSerializer()
    assert pickle_ser == PickleSerializer()
    assert json_ser == JSONSerializer()
    assert pickle_ser != json_ser
Example #9
0
from prefect import task, Flow
from prefect.engine.results import LocalResult
from prefect.engine.serializers import JSONSerializer


@task(target="test.json", result=LocalResult(serializer=JSONSerializer()))
def get_data():
    """Produce a static payload, checkpointed to test.json as JSON."""
    payload = {"asdf": "here"}
    return payload


@task
def print_data(d):
    """Side-effect-only task: echo the upstream result to stdout."""
    print(d)


# Wire the two tasks into a flow: get_data's output feeds print_data.
with Flow("target_serializer") as f:
    d = get_data()
    print_data(d)

# NOTE(review): the flow runs at module import time (module-level side
# effect) — presumably intentional for this example script.
f.run()
Example #10
0
def gen_xgboost_pipeline() -> Flow:
    """Generate a ``xgboost`` fit pipeline.

    Parameters
    ----------
    None

    Returns
    -------
    Flow
        Generated pipeline.
    """
    # Create a time range for AUROC calculation -- start to the end of the fourth quarter
    times = np.arange(2890, step=10)
    # Initialize tasks. Checkpointed tasks persist their results under
    # {output_dir}/models/{today}/xgboost/.
    calib_data = CollapseData(name="Create calibration data")
    train_data = CollapseData(name="Create training data")
    tune_data = CollapseData(name="Create tuning data")
    stop_data = CollapseData(name="Create stopping data")
    tuning = XGBoostTuning(
        name="Run XGBoost hyperparameter tuning",
        checkpoint=True,
        result=LocalResult(
            dir=".",
            location="{output_dir}/models/{today}/xgboost/tuning.pkl",
        ),
    )
    retrieve_best = GetItem(
        name="Get best parameters",
        checkpoint=True,
        result=LocalResult(
            dir=".",
            location="{output_dir}/models/{today}/xgboost/params.json",
            # Best params are plain data, so persist them as readable JSON.
            serializer=JSONSerializer(),
        ),
    )
    tuneplots = PlotTuning(
        name="Plot XGBoost hyperparameter tuning",
        checkpoint=True,
        result=LocalResult(
            serializer=Plot(),
            dir=".",
            location=
            "{output_dir}/models/{today}/xgboost/hyperparameter-tuning.png",
        ),
    )
    trained = FitXGBoost(
        name="Train XGBoost model",
        checkpoint=True,
        result=LocalResult(
            dir=".", location="{output_dir}/models/{today}/xgboost/model.pkl"),
    )
    calcshap = XGBoostShap(name="Calculate SHAP Values")
    plotshap = PlotShapSummary(
        name="Plot SHAP values",
        checkpoint=True,
        result=LocalResult(
            serializer=Plot(),
            dir=".",
            location="{output_dir}/models/{today}/xgboost/shap-summary.png",
        ),
    )
    calc_sprob = WinProbability(name="Calculate survival probability")
    cal = CalibrateClassifier(
        name="Calibrate XGBoost model",
        checkpoint=True,
        result=LocalResult(
            dir=".",
            location="{output_dir}/models/{today}/xgboost/calibrator.pkl"),
    )
    pcal = PlotCalibration(
        name="Plot calibration curve",
        checkpoint=True,
        result=LocalResult(
            serializer=Plot(),
            dir=".",
            location=
            "{output_dir}/models/{today}/xgboost/calibration-curve.png",
        ),
    )

    # Generate the flow.
    # FIX: this flow was named "Train Cox model" -- a copy-paste from the
    # lifelines pipeline -- which both misdescribed the flow and collided
    # with gen_lifelines_pipeline's flow name.
    with Flow(name="Train XGBoost model") as flow:
        # Define some parameters
        _ = Parameter("output_dir", "nba-data")
        data_dir = Parameter("data_dir", "nba-data")
        max_evals = Parameter("max_evals", 100)
        seed = Parameter("seed", 42)
        # Load the data
        rawtrain = load_df(data_dir=data_dir, dataset="train.csv")
        rawtune = load_df(data_dir=data_dir, dataset="tune.csv")
        # Collapse data to the final row
        train = train_data(rawtrain)
        tune = tune_data.map(data=unmapped(rawtune), timestep=times)
        stop = stop_data(rawtune)
        calib_input = calib_data.map(data=unmapped(rawtrain), timestep=times)
        # Run hyperparameter tuning
        params = tuning(
            train_data=train,
            tune_data=tune,
            stopping_data=stop,
            early_stopping_rounds=25,
            num_boost_round=10000,
            max_evals=max_evals,
            seed=seed,
        )
        _ = retrieve_best(task_result=params, key="best")
        tuneplots(params["trials"])
        # Fit the model with the tuned hyperparameters
        model = trained(
            params=params["best"],
            train_data=train,
            stopping_data=stop,
            early_stopping_rounds=25,
            num_boost_round=10000,
            verbose_eval=False,
        )
        # SHAP
        shap_values = calcshap(model=model, train_data=train)
        _ = plotshap(shap_values=shap_values)
        # Calibrate the win probabilities against held-out rows
        sprob = calc_sprob.map(model=unmapped(model), data=calib_input)
        iso = cal(train_data=sprob)
        _ = pcal(data=sprob, calibrator=iso)

    return flow
Example #11
0
def gen_lifelines_pipeline() -> Flow:
    """Generate a ``lifelines`` model fit pipeline.

    Parameters
    ----------
    None

    Returns
    -------
    Flow
        The generated pipeline.
    """
    # Create a time range for AUROC calculation -- start to the end of the fourth quarter
    times = np.arange(2890, step=10)
    # Initialize tasks. Checkpointed tasks persist their results under
    # {output_dir}/models/{today}/lifelines/.
    calib_data = CollapseData(name="Create calibration data")
    tune_data = CollapseData(name="Create tuning data")
    tuning = LifelinesTuning(
        name="Run lifelines hyperparameter tuning",
        checkpoint=True,
        result=LocalResult(
            dir=".",
            location="{output_dir}/models/{today}/lifelines/tuning.pkl"),
    )
    retrieve_best = GetItem(
        name="Get best parameters",
        checkpoint=True,
        result=LocalResult(
            dir=".",
            location="{output_dir}/models/{today}/lifelines/params.json",
            # Best params are plain data, so persist them as readable JSON.
            serializer=JSONSerializer(),
        ),
    )
    tuneplots = PlotTuning(
        name="Plot lifelines hyperparameter tuning",
        checkpoint=True,
        result=LocalResult(
            serializer=Plot(),
            dir=".",
            location=
            "{output_dir}/models/{today}/lifelines/hyperparameter-tuning.png",
        ),
    )
    model = InitializeLifelines(name="Initialize lifelines model")
    trained = FitLifelinesModel(
        name="Train lifelines model",
        checkpoint=True,
        result=LocalResult(
            dir=".",
            location="{output_dir}/models/{today}/lifelines/model.pkl"),
    )
    calc_sprob = WinProbability(name="Calculate survival probability")
    cal = CalibrateClassifier(
        name="Calibrate Lifelines model",
        checkpoint=True,
        result=LocalResult(
            dir=".",
            location="{output_dir}/models/{today}/lifelines/calibrator.pkl"),
    )
    pcal = PlotCalibration(
        name="Plot calibration curve",
        checkpoint=True,
        result=LocalResult(
            serializer=Plot(),
            dir=".",
            location=
            "{output_dir}/models/{today}/lifelines/calibration-curve.png",
        ),
    )

    # Generate the flow
    with Flow(name="Train Cox model") as flow:
        # Define some parameters
        data_dir = Parameter("data_dir", "nba-data")
        max_evals = Parameter("max_evals", 100)
        seed = Parameter("seed", 42)
        # Load the data
        train = load_df(data_dir=data_dir, dataset="train.csv")
        rawtune = load_df(data_dir=data_dir, dataset="tune.csv")
        # Collapse the data to the final row for Concordance calculations
        tune = tune_data.map(data=unmapped(rawtune), timestep=times)
        calib_input = calib_data.map(data=unmapped(train), timestep=times)
        # Run hyperparameter tuning
        params = tuning(train_data=train,
                        tune_data=tune,
                        max_evals=max_evals,
                        seed=seed)
        _ = retrieve_best(task_result=params, key="best")
        tuneplots(params["trials"])
        # Fit the model with the tuned hyperparameters
        model_obj = model(params["best"])
        trained_model = trained(model=model_obj, data=train)
        # Calibrate the win probabilities against the collapsed training rows
        sprob = calc_sprob.map(model=unmapped(trained_model), data=calib_input)
        iso = cal(train_data=sprob)
        _ = pcal(data=sprob, calibrator=iso)

    return flow