Code example #1
    def test_read_from_file(self, mocker):
        """Test loading a single file: the path is not a directory, so it
        should be opened via the fsspec filesystem and read with
        pandas.read_parquet."""
        fs_mock = mocker.patch("fsspec.filesystem").return_value
        fs_mock.isdir.return_value = False
        mocker.patch("pandas.read_parquet")

        data_set = ParquetDataSet(filepath="/tmp/test.parquet")

        data_set.load()
        fs_mock.isdir.assert_called_once()
        fs_mock.open.assert_called_once()
Code example #2
    def test_read_from_non_local_dir(self, mocker):
        """Test loading a remote directory: it should be read through
        pyarrow.parquet.ParquetDataset rather than opened with fs.open."""
        fs_mock = mocker.patch("fsspec.filesystem").return_value
        fs_mock.isdir.return_value = True
        pq_ds_mock = mocker.patch("pyarrow.parquet.ParquetDataset")

        data_set = ParquetDataSet(filepath="s3://bucket/dir")

        data_set.load()
        fs_mock.isdir.assert_called_once()
        assert not fs_mock.open.called
        pq_ds_mock.assert_called_once_with("bucket/dir", filesystem=fs_mock)
        pq_ds_mock().read().to_pandas.assert_called_once_with()
Code example #3
    def test_save_and_load_non_existing_dir(self, tmp_path, dummy_dataframe):
        """Test saving and reloading the data set to a non-existing directory."""
        filepath = (tmp_path / "non-existing" / FILENAME).as_posix()
        data_set = ParquetDataSet(filepath=filepath)
        data_set.save(dummy_dataframe)
        reloaded = data_set.load()
        assert_frame_equal(dummy_dataframe, reloaded)
Code example #4
    def test_save_and_load(self, tmp_path, dummy_dataframe):
        """Test saving and reloading the data set."""
        filepath = (tmp_path / FILENAME).as_posix()
        data_set = ParquetDataSet(filepath=filepath)
        data_set.save(dummy_dataframe)
        reloaded = data_set.load()
        assert_frame_equal(dummy_dataframe, reloaded)
        assert data_set._fs_open_args_load == {}

        files = [child.is_file() for child in tmp_path.iterdir()]
        assert all(files)
        assert len(files) == 1
Code example #5
    def test_read_partitioned_file(self, mocker, tmp_path, dummy_dataframe):
        """Test read partitioned parquet file from local directory."""
        pq_ds_mock = mocker.patch("pyarrow.parquet.ParquetDataset",
                                  wraps=pq.ParquetDataset)
        dummy_dataframe.to_parquet(str(tmp_path), partition_cols=["col2"])
        data_set = ParquetDataSet(filepath=tmp_path.as_posix())

        reloaded = data_set.load()
        # Sort the columns because reading a partitioned file can return
        # them in a different order
        reloaded = reloaded.sort_index(axis=1)
        # dtype for partition column is 'category'
        assert_frame_equal(dummy_dataframe,
                           reloaded,
                           check_dtype=False,
                           check_categorical=False)
        pq_ds_mock.assert_called_once()
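
Code examples #1 through #5 are method excerpts from a pytest test class and omit the module-level scaffolding they depend on (the mocker fixture comes from the pytest-mock plugin). Below is a minimal sketch of what that scaffolding could look like; the import path, the FILENAME value, and the contents of the dummy_dataframe fixture are assumptions inferred from how the names are used above, not the original test module.

import pandas as pd
import pyarrow.parquet as pq  # wrapped by the partitioned-read test (example #5)
import pytest
from pandas.testing import assert_frame_equal

# Assumed import path for the data set under test.
from kedro.extras.datasets.pandas import ParquetDataSet

FILENAME = "test.parquet"  # assumed value; any parquet file name would do


@pytest.fixture
def dummy_dataframe():
    # Assumed fixture: a small frame that includes "col2", which example #5
    # uses as the partition column.
    return pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]})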
Code example #6
    def test_save_parquet(self, tmp_path, sample_spark_df):
        # To cross-check the Spark save operation, we save to a single
        # Spark partition and read it back with Kedro's ParquetDataSet.
        temp_dir = Path(str(tmp_path / "test_data"))
        spark_data_set = SparkDataSet(filepath=str(temp_dir),
                                      save_args={"compression": "none"})
        spark_df = sample_spark_df.coalesce(1)
        spark_data_set.save(spark_df)

        single_parquet = [
            f for f in temp_dir.iterdir()
            if f.is_file() and f.name.startswith("part")
        ][0]

        local_parquet_data_set = ParquetDataSet(filepath=str(single_parquet))

        pandas_df = local_parquet_data_set.load()

        assert pandas_df[pandas_df["name"] == "Bob"]["age"].iloc[0] == 12
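
Code example #6 additionally needs a Spark session, the sample_spark_df fixture, and the SparkDataSet class. The sketch below shows one way those pieces could be set up; the import paths, the fixture contents (including the "Bob"/12 row that the final assertion checks), and the session configuration are assumptions, not the original test code.

from pathlib import Path

import pytest
from pyspark.sql import SparkSession

# Assumed import path for Kedro's Spark data set.
from kedro.extras.datasets.spark import SparkDataSet


@pytest.fixture(scope="module")
def spark_session():
    # A local Spark session is enough for a single-partition round trip.
    spark = (
        SparkSession.builder.master("local[1]")
        .appName("parquet-tests")
        .getOrCreate()
    )
    yield spark
    spark.stop()


@pytest.fixture
def sample_spark_df(spark_session):
    # Assumed fixture: it must contain a row with name "Bob" and age 12,
    # because that is what example #6 asserts after the round trip.
    return spark_session.createDataFrame(
        [("Alex", 31), ("Bob", 12), ("Clarke", 65)], ["name", "age"]
    )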