Code example #1
    def test_save_and_load_non_existing_dir(self, tmp_path, dummy_dataframe):
        """Test saving and reloading the data set to a non-existing directory."""
        filepath = (tmp_path / "non-existing" / FILENAME).as_posix()
        data_set = ParquetDataSet(filepath=filepath)
        data_set.save(dummy_dataframe)
        reloaded = data_set.load()
        assert_frame_equal(dummy_dataframe, reloaded)
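These snippets come from kedro's ParquetDataSet test suite and rely on module-level scaffolding that the listing omits. Below is a minimal sketch of the imports, constants, and fixtures they assume; the FILENAME value and the dummy_dataframe contents are assumptions, not taken from the source.

# A sketch of the shared test scaffolding, not kedro's exact module.
import pandas as pd
import pyarrow.parquet as pq  # referenced as pq in example #9
import pytest
from pandas.testing import assert_frame_equal

from kedro.extras.datasets.pandas import ParquetDataSet
from kedro.io import DataSetError, Version

FILENAME = "test.parquet"  # assumed value


@pytest.fixture
def dummy_dataframe():
    # Any small frame works for the round-trip tests; these values are made up.
    return pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]})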
Code example #2
    def test_load_parquet(self, tmp_path, sample_pandas_df):
        temp_path = str(tmp_path / "data")
        local_parquet_set = ParquetDataSet(filepath=temp_path)
        local_parquet_set.save(sample_pandas_df)
        spark_data_set = SparkDataSet(filepath=temp_path)
        spark_df = spark_data_set.load()
        assert spark_df.count() == 4
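Examples #2 and #14 exercise a pandas/Spark round trip through sample_pandas_df and sample_spark_df. The assertions pin down a four-row frame containing a "Bob" aged 12; here is a sketch of a fixture consistent with that (the remaining rows are assumptions):

import pandas as pd
import pytest


@pytest.fixture
def sample_pandas_df():
    # Four rows (example #2 asserts count == 4), one of them Bob, age 12
    # (example #14); the other names and ages are made up.
    return pd.DataFrame(
        {"name": ["Alex", "Bob", "Clarke", "Dave"], "age": [31, 12, 65, 29]}
    )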
Code example #3
    def test_catalog_release(self, protocol, path, mocker):
        filepath = protocol + path + FILENAME
        fs_mock = mocker.patch("fsspec.filesystem").return_value
        data_set = ParquetDataSet(filepath=filepath)
        data_set.release()
        if protocol != "https://":
            filepath = path + FILENAME
        fs_mock.invalidate_cache.assert_called_once_with(filepath)
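test_catalog_release receives protocol and path from a parametrize decorator the listing omits. A hypothetical parametrization consistent with the https branch above; the exact pairs are assumptions, and the test is shown module-level for brevity:

import pytest


# fsspec strips the protocol from the cached path for everything except
# http(s), which is why the test rebuilds filepath before the assertion.
@pytest.mark.parametrize(
    "protocol,path",
    [
        ("file://", ""),
        ("s3://", "bucket/"),
        ("https://", "example.com/"),
    ],
)
def test_catalog_release(protocol, path, mocker):
    ...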
Code example #4
    def test_read_from_file(self, mocker):
        fs_mock = mocker.patch("fsspec.filesystem").return_value
        fs_mock.isdir.return_value = False
        mocker.patch("pandas.read_parquet")

        data_set = ParquetDataSet(filepath="/tmp/test.parquet")

        data_set.load()
        fs_mock.isdir.assert_called_once()
        fs_mock.open.assert_called_once()
Code example #5
    def test_save_and_load(self, tmp_path, dummy_dataframe):
        """Test saving and reloading the data set."""
        filepath = (tmp_path / FILENAME).as_posix()
        data_set = ParquetDataSet(filepath=filepath)
        data_set.save(dummy_dataframe)
        reloaded = data_set.load()
        assert_frame_equal(dummy_dataframe, reloaded)
        assert data_set._fs_open_args_load == {}

        files = [child.is_file() for child in tmp_path.iterdir()]
        assert all(files)
        assert len(files) == 1
Code example #6
    def test_read_from_non_local_dir(self, mocker):
        fs_mock = mocker.patch("fsspec.filesystem").return_value
        fs_mock.isdir.return_value = True
        pq_ds_mock = mocker.patch("pyarrow.parquet.ParquetDataset")

        data_set = ParquetDataSet(filepath="s3://bucket/dir")

        data_set.load()
        fs_mock.isdir.assert_called_once()
        assert not fs_mock.open.called
        pq_ds_mock.assert_called_once_with("bucket/dir", filesystem=fs_mock)
        pq_ds_mock().read().to_pandas.assert_called_once_with()
Code example #7
def parquet_data_set(filepath_parquet, load_args, save_args, fs_args):
    return ParquetDataSet(
        filepath=filepath_parquet,
        load_args=load_args,
        save_args=save_args,
        fs_args=fs_args,
    )
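In the source suite, parquet_data_set is a pytest fixture whose arguments are themselves fixtures. A plausible sketch of that wiring; the decorators and default params are assumptions:

import pytest


@pytest.fixture
def filepath_parquet(tmp_path):
    return (tmp_path / "test.parquet").as_posix()


@pytest.fixture(params=[None])
def load_args(request):
    return request.param


@pytest.fixture(params=[None])
def save_args(request):
    return request.param


@pytest.fixture(params=[None])
def fs_args(request):
    return request.param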
Code example #8
    def test_http_filesystem_no_versioning(self):
        pattern = r"HTTP\(s\) DataSet doesn't support versioning\."

        with pytest.raises(DataSetError, match=pattern):
            ParquetDataSet(
                filepath="https://example.com/test.parquet", version=Version(None, None)
            )
Code example #9
    def test_read_partitioned_file(self, mocker, tmp_path, dummy_dataframe):
        """Test read partitioned parquet file from local directory."""
        pq_ds_mock = mocker.patch("pyarrow.parquet.ParquetDataset",
                                  wraps=pq.ParquetDataset)
        dummy_dataframe.to_parquet(str(tmp_path), partition_cols=["col2"])
        data_set = ParquetDataSet(filepath=tmp_path.as_posix())

        reloaded = data_set.load()
        # Sort columns because reading a partitioned file may return
        # them in a different order
        reloaded = reloaded.sort_index(axis=1)
        # dtype for partition column is 'category'
        assert_frame_equal(dummy_dataframe,
                           reloaded,
                           check_dtype=False,
                           check_categorical=False)
        pq_ds_mock.assert_called_once()
Code example #10
    def test_protocol_usage(self, filepath, instance_type):
        data_set = ParquetDataSet(filepath=filepath)
        assert isinstance(data_set._fs, instance_type)

        path = filepath.split(PROTOCOL_DELIMITER, 1)[-1]

        assert str(data_set._filepath) == path
        assert isinstance(data_set._filepath, PurePosixPath)
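test_protocol_usage is likewise parametrized over filepath/instance_type pairs. A hypothetical set consistent with the PROTOCOL_DELIMITER split (http(s) takes a different code path and is covered in example #15):

import pytest
from fsspec.implementations.local import LocalFileSystem
from s3fs import S3FileSystem

PROTOCOL_DELIMITER = "://"


# Hypothetical pairs; any fsspec-backed protocol/filesystem combination works.
@pytest.mark.parametrize(
    "filepath,instance_type",
    [
        ("/tmp/test.parquet", LocalFileSystem),
        ("file:///tmp/test.parquet", LocalFileSystem),
        ("s3://bucket/test.parquet", S3FileSystem),
    ],
)
def test_protocol_usage(filepath, instance_type):
    ...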
Code example #11
    def test_credentials_propagated(self, mocker):
        """Test propagating credentials for connecting to GCS"""
        mock_fs = mocker.patch("fsspec.filesystem")
        credentials = {"key": "value"}

        ParquetDataSet(filepath=FILENAME, credentials=credentials)

        mock_fs.assert_called_once_with("file", auto_mkdir=True, **credentials)
Code example #12
    def test_version_str_repr(self, load_version, save_version):
        """Test that version is in string representation of the class instance
        when applicable."""
        ds = ParquetDataSet(filepath=FILENAME)
        ds_versioned = ParquetDataSet(filepath=FILENAME,
                                      version=Version(load_version,
                                                      save_version))
        assert FILENAME in str(ds)
        assert "version" not in str(ds)

        assert FILENAME in str(ds_versioned)
        ver_str = f"version=Version(load={load_version}, save='{save_version}')"
        assert ver_str in str(ds_versioned)
        assert "ParquetDataSet" in str(ds_versioned)
        assert "ParquetDataSet" in str(ds)
        assert "protocol" in str(ds_versioned)
        assert "protocol" in str(ds)
Code example #13
File: test_data_catalog.py  Project: zeta1999/kedro
def multi_catalog(mocker):
    csv = CSVDataSet(filepath="abc.csv")
    parq = ParquetDataSet(filepath="xyz.parq")
    journal = mocker.Mock()
    layers = {"raw": {"abc.csv"}, "model": {"xyz.parq"}}
    return DataCatalog(
        {"abc": csv, "xyz": parq},
        journal=journal,
        layers=layers,
    )
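A hedged usage example for the fixture above: DataCatalog.list() is part of kedro's public API, but the test itself is hypothetical.

def test_multi_catalog_lists_both_entries(multi_catalog):
    # Hypothetical test: the catalog built above should expose both entries.
    assert set(multi_catalog.list()) == {"abc", "xyz"}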
Code example #14
    def test_save_parquet(self, tmp_path, sample_spark_df):
        # To cross-check the Spark save operation, save to a single Spark
        # partition and read it back with Kedro's ParquetDataSet
        temp_dir = tmp_path / "test_data"
        spark_data_set = SparkDataSet(filepath=str(temp_dir),
                                      save_args={"compression": "none"})
        spark_df = sample_spark_df.coalesce(1)
        spark_data_set.save(spark_df)

        single_parquet = [
            f for f in temp_dir.iterdir()
            if f.is_file() and f.name.startswith("part")
        ][0]

        local_parquet_data_set = ParquetDataSet(filepath=str(single_parquet))

        pandas_df = local_parquet_data_set.load()

        assert pandas_df[pandas_df["name"] == "Bob"]["age"].iloc[0] == 12
Code example #15
    def test_protocol_usage(self, filepath, instance_type):
        data_set = ParquetDataSet(filepath=filepath)
        assert isinstance(data_set._fs, instance_type)

        # _strip_protocol() doesn't strip http(s) protocol
        if data_set._protocol == "https":
            path = filepath.split("://")[-1]
        else:
            path = data_set._fs._strip_protocol(filepath)

        assert str(data_set._filepath) == path
        assert isinstance(data_set._filepath, PurePosixPath)
Code example #16
def multi_catalog(mocker):
    csv = CSVDataSet(filepath="abc.csv")
    parq = ParquetDataSet(filepath="xyz.parq")
    journal = mocker.Mock()
    return DataCatalog({"abc": csv, "xyz": parq}, journal=journal)
Code example #17
    def test_write_to_dir(self, dummy_dataframe, tmp_path):
        data_set = ParquetDataSet(filepath=tmp_path.as_posix())
        pattern = "Saving ParquetDataSet to a directory is not supported"

        with pytest.raises(DataSetError, match=pattern):
            data_set.save(dummy_dataframe)
Code example #18
def versioned_parquet_data_set(filepath_parquet, load_version, save_version):
    return ParquetDataSet(filepath=filepath_parquet,
                          version=Version(load_version, save_version))
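Example #18 depends on load_version and save_version fixtures the listing does not show. A plausible sketch, assuming kedro's generate_timestamp helper for the save side; the defaults are assumptions:

import pytest

from kedro.io.core import generate_timestamp


@pytest.fixture(params=[None])
def load_version(request):
    return request.param


@pytest.fixture(params=[None])
def save_version(request):
    return request.param or generate_timestamp()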