def test_serialize_deserialize_is_invariant(self, input_dataframe):
    """Round-tripping a frame through JSON serialization yields an equal frame."""
    pd = pytest.importorskip("pandas", reason="Pandas not installed")
    serializer = PandasSerializer("json")
    round_tripped = serializer.deserialize(serializer.serialize(input_dataframe))
    pd.testing.assert_frame_equal(input_dataframe, round_tripped)
def test_serialize_kwargs_work_as_expected(self, input_dataframe):
    """``serialize_kwargs`` are forwarded to pandas on write.

    With ``sep=":"`` and ``index=False`` the CSV bytes use a colon delimiter
    and omit the index, so a default-settings read collapses each row into a
    single colon-joined column.
    """
    pd = pytest.importorskip("pandas", reason="Pandas not installed")
    serializer = PandasSerializer(
        "csv", serialize_kwargs={"sep": ":", "index": False}
    )
    round_tripped = serializer.deserialize(serializer.serialize(input_dataframe))
    expected = pd.DataFrame({"one:two": ["1:4", "2:5", "3:6"]})
    pd.testing.assert_frame_equal(expected, round_tripped)
def test_deserialize_kwargs_work_as_expected(self, input_dataframe):
    """``deserialize_kwargs`` are forwarded to pandas on read.

    ``na_values=[3, 5]`` makes those cells come back as NaN; the default CSV
    write also keeps the index, which reappears as the "Unnamed: 0" column.
    """
    pd = pytest.importorskip("pandas", reason="Pandas not installed")
    np = pytest.importorskip("numpy", reason="numpy not installed")
    serializer = PandasSerializer("csv", deserialize_kwargs={"na_values": [3, 5]})
    round_tripped = serializer.deserialize(serializer.serialize(input_dataframe))
    expected = pd.DataFrame(
        {
            "Unnamed: 0": [0, 1, 2],
            "one": [1, 2, np.nan],
            "two": [4, np.nan, 6],
        }
    )
    pd.testing.assert_frame_equal(expected, round_tripped)
def test_serialize_returns_bytes(self, file_type, input_dataframe):
    """``serialize`` always produces ``bytes``, whatever the file type.

    Fix: the original bound the ``importorskip`` result to an unused ``pd``
    local; the call is kept only as a skip-if-missing guard.
    """
    pytest.importorskip("pandas", reason="Pandas not installed")
    serialized = PandasSerializer(file_type).serialize(input_dataframe)
    assert isinstance(serialized, bytes)
def test_complains_when_unavailable_file_type_specified(self):
    """An unsupported file type raises ``ValueError`` at construction time.

    Fix: the original bound the ``importorskip`` result to an unused ``pd``
    local; the call is kept only as a skip-if-missing guard.
    """
    pytest.importorskip("pandas", reason="Pandas not installed")
    with pytest.raises(ValueError):
        PandasSerializer("blerg")
def test_pandas_serializer_equality():
    """Serializer equality depends on class, file type, and both kwargs dicts.

    Fix: the original bound the ``importorskip`` result to an unused ``pd``
    local; the call is kept only as a skip-if-missing guard.
    """
    pytest.importorskip("pandas", reason="Pandas not installed")

    # Different serializer classes are never equal.
    assert PickleSerializer() != PandasSerializer("csv")

    # Same file type (and same kwargs) compare equal.
    assert PandasSerializer("csv") == PandasSerializer("csv")
    assert PandasSerializer("csv", serialize_kwargs={"one": 1}) == PandasSerializer(
        "csv", serialize_kwargs={"one": 1}
    )

    # Any difference in file type or either kwargs dict breaks equality.
    assert PandasSerializer("csv") != PandasSerializer("parquet")
    assert PandasSerializer("csv", deserialize_kwargs={"one": 1}) != PandasSerializer(
        "csv", deserialize_kwargs={"one": 2}
    )
    assert PandasSerializer("csv", serialize_kwargs={"one": 1}) != PandasSerializer(
        "csv", serialize_kwargs={"one": 2}
    )
def gen_data_pipeline() -> Flow:
    """Split the entire input set into train, tune, and holdout sets.

    Saves ``train.csv``, ``tune.csv``, and ``holdout.csv`` (pipe-delimited)
    to the ``models`` subfolder of the configured output directory.

    Fixes over the original: "enire" typo; the docstring claimed
    ``build.csv``/``holdout.csv`` under ``data_dir`` while the code writes
    train/tune/holdout under ``{output_dir}/models``; the triplicated
    ``LocalResult`` construction is factored into a helper.

    Returns
    -------
    Flow
        Generated pipeline.
    """

    def _csv_result(filename: str) -> LocalResult:
        # Pipe-delimited CSV checkpoint under the ``models`` subfolder.
        # Note the doubled braces: ``{output_dir}`` is a Prefect location
        # template filled in at run time, not an f-string substitution.
        return LocalResult(
            dir=".",
            location=f"{{output_dir}}/models/{filename}",
            serializer=PandasSerializer(
                file_type="csv", serialize_kwargs={"sep": "|"}
            ),
        )

    # Initialize tasks
    load = LoadData(name="Load clean model data")
    format_data = SurvivalData(name="Convert input data to ranged form")
    segdata = SegmentData(name="Split data")
    retrieve_train = GetItem(
        name="Get training data", checkpoint=True, result=_csv_result("train.csv")
    )
    retrieve_tune = GetItem(
        name="Get tuning data", checkpoint=True, result=_csv_result("tune.csv")
    )
    retrieve_hold = GetItem(
        name="Get holdout data", checkpoint=True, result=_csv_result("holdout.csv")
    )

    # Generate the flow
    with Flow(name="Split data into build and holdout") as flow:
        # Set up parameters
        data_dir = Parameter("data_dir", "nba-data")
        splits = Parameter("splits", [0.6, 0.2, 0.2])
        seed = Parameter("seed", 42)
        # Load the data
        basedata = load(data_dir=data_dir)
        # Format the data, then split it by the configured proportions
        alldata = format_data(basedata)
        data = segdata(
            alldata, splits=splits, keys=["train", "tune", "holdout"], seed=seed
        )
        # Checkpointed retrievals persist each split to disk
        _ = retrieve_train(task_result=data, key="train")
        _ = retrieve_tune(task_result=data, key="tune")
        _ = retrieve_hold(task_result=data, key="holdout")

    return flow
# NOTE(review): tail of a checkpoint-target-name helper whose ``def`` line is
# above this chunk; it builds a CSV filename from the task-run kwargs.
    end_date_string = timestamp_to_date_string(kwargs['end_date'])
    return f"d_{dataset}_w_{window_size}_o_{window_offset}_s_{start_date_string}_f_{end_date_string}.csv"


@task
def create_feature_store(dataset, window_size, window_offset):
    # Thin task wrapper around FeatureStore construction for use in the flow.
    return FeatureStore(dataset, window_size=window_size, window_offset=window_offset)


@task(target=generate_task_run_target_name,
      checkpoint=True,
      result=LocalResult(dir="../../data/processed/prefect_results",
                         serializer=PandasSerializer(
                             file_type="csv", serialize_kwargs={"index": False})))
def create_features(feature_store, start_date, end_date):
    # Checkpointed task: results are persisted as index-free CSV under the
    # target name generated above, so repeat runs with the same parameters
    # can be restored from disk instead of recomputed.
    # start_date/end_date are POSIX timestamps (converted via fromtimestamp).
    feature_store.set_pointer(datetime.fromtimestamp(start_date))
    return feature_store.next_samples_until(datetime.fromtimestamp(end_date))


@task
def add_label_column(df, bots):
    # Mark rows whose "IP" value appears in the known-bot list: 1 = bot, 0 = not.
    df["isBot"] = df["IP"].isin(bots).astype(int)
    return df


# NOTE(review): opening of build_feature_pipeline_flow — the rest of its body
# continues beyond this chunk.
def build_feature_pipeline_flow():
    with Flow("feature-pipeline") as feature_pipeline:
        dataset = Parameter("dataset")
from prefect import task, Flow, Parameter
from prefect.engine.results import LocalResult
from prefect.engine.serializers import PandasSerializer

from embedding import college_embeddings, college_facts, create_wiki, create_dict

# Build the analysis flow: each step's DataFrame result is checkpointed to a
# CSV file via a LocalResult.
#
# Fix over the original: the ``@task(...)`` decorators were written directly
# above assignment statements, which is a SyntaxError — a decorator may only
# precede a ``def`` or ``class``. The intent (run each imported function as a
# checkpointed Prefect task) is preserved by applying ``task(...)`` as an
# explicit wrapper around the function before calling it. The local ``dict``
# was also renamed to ``embedding_dict`` to stop shadowing the builtin.
with Flow("data analysis") as flow:
    bypass = Parameter("bypass", default=False, required=False)

    college_fact = task(
        log_stdout=True,
        nout=4,
        result=LocalResult(serializer=PandasSerializer(file_type='csv'),
                           dir='/', location="facts.csv"),
    )(college_facts)(bypass=bypass)

    wiki_list = task(
        log_stdout=True,
        nout=4,
        result=LocalResult(serializer=PandasSerializer(file_type='csv'),
                           dir='/', location="wiki_list.csv"),
    )(create_wiki)(college_fact, bypass=bypass)

    embedding_dict = task(
        log_stdout=True,
        nout=4,
        result=LocalResult(serializer=PandasSerializer(file_type='csv'),
                           dir='/', location="embedding_dict.csv"),
    )(create_dict)(wiki_list, bypass=bypass)

    college_embedding = task(
        log_stdout=True,
        nout=4,
        result=LocalResult(serializer=PandasSerializer(file_type='csv'),
                           dir='/', location="college_embeddings.csv"),
    )(college_embeddings)(embedding_dict, wiki_list, college_fact, bypass=bypass)

# run functions
flow.register(project_name="college")
# LocalAgent().start()
flow.run(bypass=False)