Example #1
    def test_load_parquet(self, tmp_path, sample_pandas_df):
        temp_path = str(tmp_path / "data")
        local_parquet_set = ParquetLocalDataSet(filepath=temp_path)
        local_parquet_set.save(sample_pandas_df)
        spark_data_set = SparkDataSet(filepath=temp_path)
        spark_df = spark_data_set.load()
        assert spark_df.count() == 4
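The snippet above relies on fixtures and imports from its original test module. A minimal, self-contained sketch of the same pandas-to-Parquet-to-Spark round trip follows; the import paths (Kedro 0.15-era layout) and the sample DataFrame are assumptions on my part, reconstructed from the assertions in the other examples.

# Hedged sketch: write Parquet with ParquetLocalDataSet, read it back with SparkDataSet.
# Import paths assume the Kedro 0.15-era package layout; adjust for your version.
import tempfile
from os.path import join

import pandas as pd
from kedro.io import ParquetLocalDataSet
from kedro.contrib.io.pyspark import SparkDataSet

with tempfile.TemporaryDirectory() as temp_dir:
    temp_path = join(temp_dir, "data")

    # Sample data reconstructed from the assertions in Examples #7 and #8 (assumption).
    pandas_df = pd.DataFrame({"name": ["Alex", "Bob", "Clarke", "Dave"],
                              "age": [31, 12, 65, 29]})

    # Save via pandas, then read the same path back through Spark.
    ParquetLocalDataSet(filepath=temp_path).save(pandas_df)
    spark_df = SparkDataSet(filepath=temp_path).load()
    assert spark_df.count() == 4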
Example #2
def test_load_parquet(tmpdir):
    temp_path = str(tmpdir.join("data"))
    pandas_df = _get_sample_pandas_data_frame()
    local_parquet_set = ParquetLocalDataSet(filepath=temp_path)
    local_parquet_set.save(pandas_df)
    spark_data_set = SparkDataSet(filepath=temp_path)
    spark_df = spark_data_set.load()
    assert spark_df.count() == 4
Example #3
    def test_version_str_repr(self, load_version, save_version):
        """Test that version is in string representation of the class instance
        when applicable."""
        filepath = "data"
        ds = ParquetLocalDataSet(filepath=filepath)
        ds_versioned = ParquetLocalDataSet(filepath=filepath,
                                           version=Version(
                                               load_version, save_version))
        assert filepath in str(ds)
        assert "version" not in str(ds)

        assert filepath in str(ds_versioned)
        ver_str = "version=Version(load={}, save='{}')".format(
            load_version, save_version)
        assert ver_str in str(ds_versioned)
Example #4
def write_parquet_locally(context):
    """Writes DataFrame as Parquet in a temporary directory."""
    file_name = "dummy.parq"
    context.full_path = context.temp_dir / file_name
    context.data_set = ParquetLocalDataSet(str(context.full_path))
    context.data_set.save(context.pandas_df)
    assert context.full_path.exists()
Example #5
    def test_str_representation(self):
        """Test string representation of the data set instance."""
        parquet_data_set = ParquetLocalDataSet("test_file.parquet")
        pattern = (
            "ParquetLocalDataSet(engine=auto, "
            "filepath=test_file.parquet, save_args={})"
        )
        assert pattern in str(parquet_data_set)
Example #6
def build_catalog(root_dir):
    """Creates the Kedro DataCatalog object (conventionally stored as ``io``)."""
    return DataCatalog(
        {
            "raw_cities": CSVHTTPDataSet(
                fileurl="https://people.sc.fsu.edu/~jburkardt/data/csv/cities.csv",
                auth=None,
                load_args=None,
            ),
            "int_cities": ParquetLocalDataSet(
                filepath=root_dir / "data" / "int" / "cities",
            ),
            "pri_cities": ParquetLocalDataSet(
                filepath=root_dir / "data" / "pri" / "cities",
            ),
        }
    )
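Example #6 only constructs the catalog; a brief usage sketch follows. DataCatalog.load and DataCatalog.save are standard Kedro API, but the driver code around them is illustrative and not part of the example.

# Hedged usage sketch for the catalog built by build_catalog (illustrative only).
from pathlib import Path

io = build_catalog(Path.cwd())
raw_cities = io.load("raw_cities")   # downloads the CSV over HTTP into a pandas DataFrame
io.save("int_cities", raw_cities)    # persists it locally as Parquet under data/int/cities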
Example #7
    def test_save_parquet(self, tmp_path, sample_spark_df):
        # To cross-check that the Spark save worked correctly, we save to a
        # single Spark partition and read it back with Kedro's
        # ParquetLocalDataSet.
        temp_dir = Path(str(tmp_path / "test_data"))
        spark_data_set = SparkDataSet(
            filepath=str(temp_dir), save_args={"compression": "none"}
        )
        spark_df = sample_spark_df.coalesce(1)
        spark_data_set.save(spark_df)

        single_parquet = [
            f for f in temp_dir.iterdir() if f.is_file() and f.name.startswith("part")
        ][0]

        local_parquet_data_set = ParquetLocalDataSet(filepath=str(single_parquet))

        pandas_df = local_parquet_data_set.load()

        assert pandas_df[pandas_df["name"] == "Bob"]["age"].iloc[0] == 12
Example #8
def test_save_parquet():
    # To cross-check that the Spark save worked correctly, we save to a
    # single Spark partition and read it back with Kedro's
    # ParquetLocalDataSet.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = join(temp_dir, "test_data")
        spark_data_set = SparkDataSet(filepath=temp_path,
                                      save_args={"compression": "none"})
        spark_df = _get_sample_spark_data_frame().coalesce(1)
        spark_data_set.save(spark_df)

        single_parquet = [
            join(temp_path, f) for f in listdir(temp_path)
            if f.startswith("part")
        ][0]

        local_parquet_data_set = ParquetLocalDataSet(filepath=single_parquet)

        pandas_df = local_parquet_data_set.load()

        assert pandas_df[pandas_df["name"] == "Bob"]["age"].iloc[0] == 12
Example #9
def versioned_parquet_data_set(data_path, load_version, save_version):
    return ParquetLocalDataSet(
        filepath=data_path, version=Version(load_version, save_version)
    )
Example #10
def parquet_data_set(data_path, request):
    return ParquetLocalDataSet(filepath=data_path, **request.param)
Example #11
def multi_catalog():
    csv = CSVLocalDataSet(filepath="abc.csv")
    parq = ParquetLocalDataSet(filepath="xyz.parq")
    return DataCatalog({"abc": csv, "xyz": parq})
Example #12
def multi_catalog(mocker):
    csv = CSVLocalDataSet(filepath="abc.csv")
    parq = ParquetLocalDataSet(filepath="xyz.parq")
    journal = mocker.Mock()
    return DataCatalog({"abc": csv, "xyz": parq}, journal=journal)