Example #1
    def test_partition_cols_supported(self, pa, df_full):
        # GH #23283
        partition_cols = ["bool", "int"]
        df = df_full
        with tm.ensure_clean_dir() as path:
            df.to_parquet(path, partition_cols=partition_cols, compression=None)
            check_partition_names(path, partition_cols)
            assert read_parquet(path).shape == df.shape
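Note: Examples #1 and #2 call a check_partition_names helper defined elsewhere in the test module. A minimal sketch of such a helper, assuming pyarrow's dataset API and hive-style partitioning, could look like this:

import pyarrow.dataset as ds

def check_partition_names(path, expected):
    # Discover the hive-partitioned dataset and compare its partition
    # field names against the expected partition columns.
    dataset = ds.dataset(str(path), partitioning="hive")
    assert dataset.partitioning.schema.names == list(expected)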
Example #2
    def test_partition_cols_string(self, pa, df_full):
        # GH #27117
        partition_cols = "bool"
        partition_cols_list = [partition_cols]
        df = df_full
        with tm.ensure_clean_dir() as path:
            df.to_parquet(path, partition_cols=partition_cols, compression=None)
            check_partition_names(path, partition_cols_list)
            assert read_parquet(path).shape == df.shape
Example #3
    def test_partition_cols_pathlib(self, pa, df_compat, path_type):
        # GH 35902

        partition_cols = "B"
        partition_cols_list = [partition_cols]
        df = df_compat

        with tm.ensure_clean_dir() as path_str:
            path = path_type(path_str)
            df.to_parquet(path, partition_cols=partition_cols_list)
Example #4
    def test_to_csv_zip_infer_name(self, filename, expected_arcname):
        # GH 39465
        df = DataFrame({"ABC": [1]})
        with tm.ensure_clean_dir() as dir:
            path = Path(dir, filename)
            df.to_csv(path, compression="zip")
            with ZipFile(path) as zp:
                assert len(zp.filelist) == 1
                archived_file = zp.filelist[0].filename
                assert archived_file == expected_arcname
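Example #4 relies on pandas inferring the archive member name from the target path when writing a zipped CSV. A standalone sketch of that behavior (the out.csv.zip filename is an illustrative assumption, not taken from the test's parametrization):

import pandas as pd
from pathlib import Path
from tempfile import TemporaryDirectory
from zipfile import ZipFile

with TemporaryDirectory() as tmp:
    path = Path(tmp, "out.csv.zip")
    pd.DataFrame({"ABC": [1]}).to_csv(path, compression="zip")
    with ZipFile(path) as zp:
        # The member name should be the filename with ".zip" stripped.
        print(zp.namelist())  # expected: ['out.csv']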
Example #5
    def test_partition_cols_supported(self, pa, df_full):
        # GH #23283
        partition_cols = ["bool", "int"]
        df = df_full
        with tm.ensure_clean_dir() as path:
            df.to_parquet(path, partition_cols=partition_cols, compression=None)
            import pyarrow.parquet as pq

            dataset = pq.ParquetDataset(path, validate_schema=False)
            assert len(dataset.partitions.partition_names) == 2
            assert dataset.partitions.partition_names == set(partition_cols)
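Examples #5 and #6 use the legacy ParquetDataset API; validate_schema and the .partitions attribute were deprecated and later removed in newer pyarrow releases. A rough equivalent of the same check with the pyarrow.dataset API (a sketch reusing path and partition_cols from the example above, not the test's actual code):

import pyarrow.dataset as ds

dataset = ds.dataset(path, format="parquet", partitioning="hive")
# With hive partitioning, the partition keys appear as fields of the
# partitioning schema.
assert set(dataset.partitioning.schema.names) == set(partition_cols)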
Example #6
    def test_partition_cols_string(self, pa, df_full):
        # GH #27117
        partition_cols = "bool"
        partition_cols_list = [partition_cols]
        df = df_full
        with tm.ensure_clean_dir() as path:
            df.to_parquet(path, partition_cols=partition_cols, compression=None)
            import pyarrow.parquet as pq

            dataset = pq.ParquetDataset(path, validate_schema=False)
            assert len(dataset.partitions.partition_names) == 1
            assert dataset.partitions.partition_names == set(partition_cols_list)
Example #7
    def test_error_on_using_partition_cols_and_partition_on(self, fp, df_full):
        # GH #23283
        partition_cols = ["bool", "int"]
        df = df_full
        with pytest.raises(ValueError):
            with tm.ensure_clean_dir() as path:
                df.to_parquet(
                    path,
                    engine="fastparquet",
                    compression=None,
                    partition_on=partition_cols,
                    partition_cols=partition_cols,
                )
Example #8
    def test_error_on_using_partition_cols_and_partition_on(self, fp, df_full):
        # GH #23283
        partition_cols = ["bool", "int"]
        df = df_full
        msg = (
            "Cannot use both partition_on and partition_cols. Use partition_cols for "
            "partitioning data"
        )
        with pytest.raises(ValueError, match=msg):
            with tm.ensure_clean_dir() as path:
                df.to_parquet(
                    path,
                    engine="fastparquet",
                    compression=None,
                    partition_on=partition_cols,
                    partition_cols=partition_cols,
                )
Example #9
    def test_partition_on_supported(self, fp, df_full):
        # GH #23283
        partition_cols = ["bool", "int"]
        df = df_full
        with tm.ensure_clean_dir() as path:
            df.to_parquet(
                path,
                engine="fastparquet",
                compression=None,
                partition_on=partition_cols,
            )
            assert os.path.exists(path)
            import fastparquet

            # .cats maps each hive partition column to its observed values
            # (the second positional argument to ParquetFile is verify=False).
            actual_partition_cols = fastparquet.ParquetFile(path, False).cats
            assert len(actual_partition_cols) == 2
Example #10
    def test_partition_cols_string(self, fp, df_full):
        # GH #27117
        partition_cols = "bool"
        df = df_full
        with tm.ensure_clean_dir() as path:
            df.to_parquet(
                path,
                engine="fastparquet",
                partition_cols=partition_cols,
                compression=None,
            )
            assert os.path.exists(path)
            import fastparquet

            actual_partition_cols = fastparquet.ParquetFile(path, False).cats
            assert len(actual_partition_cols) == 1
Example #11
def test_ambiguous_archive_tar():
    with tm.ensure_clean_dir() as dir:
        csvAPath = os.path.join(dir, "a.csv")
        with open(csvAPath, "w") as a:
            a.write("foo,bar\n")
        csvBPath = os.path.join(dir, "b.csv")
        with open(csvBPath, "w") as b:
            b.write("foo,bar\n")

        tarpath = os.path.join(dir, "archive.tar")
        with tarfile.TarFile(tarpath, "w") as tar:
            tar.add(csvAPath, "a.csv")
            tar.add(csvBPath, "b.csv")

        with pytest.raises(ValueError, match="Multiple files found in TAR archive"):
            pd.read_csv(tarpath)
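Because read_csv refuses an archive containing more than one file, a caller can instead open the desired member and pass the file object through. A minimal sketch, reusing tarpath from the example above:

import tarfile
import pandas as pd

with tarfile.open(tarpath) as tar:
    # Pick one member of the multi-file archive by name and hand the
    # extracted file object to read_csv directly.
    df = pd.read_csv(tar.extractfile("a.csv"))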
Example #12
def test_create_temp_directory():
    with tm.ensure_clean_dir() as path:
        assert os.path.exists(path)
        assert os.path.isdir(path)
    assert not os.path.exists(path)
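Example #12 pins down the contract of tm.ensure_clean_dir: the temporary directory exists inside the with block and is removed afterwards. A minimal context manager with that contract (a sketch, not necessarily pandas' exact implementation):

import tempfile
from contextlib import contextmanager
from shutil import rmtree

@contextmanager
def ensure_clean_dir():
    # Create a temporary directory, hand its path to the caller, and
    # remove it on exit even if the block raises.
    directory = tempfile.mkdtemp()
    try:
        yield directory
    finally:
        rmtree(directory, ignore_errors=True)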