def __init__(  # pylint: disable=too-many-arguments
    self,
    query: str,
    columns: Optional[List[str]] = None,
    shuffle: bool = False,
    shuffler_capacity: int = 128,
    seed: Optional[int] = None,
    world_size: int = 1,
    rank: int = 0,
):
    self.uri = query
    self.columns = columns
    self.shuffle = shuffle
    self.shuffler_capacity = shuffler_capacity
    self.seed = seed
    self.rank = rank
    self.world_size = world_size
    if self.world_size > 1:
        logger.info(
            "Running in distributed mode, world size=%s, rank=%s", world_size, rank
        )
    # Provide deterministic order between distributed workers.
    self.files = sorted(Resolver.resolve(self.uri))
    logger.info("Loading parquet files: %s", self.files)
    self.spark_row_metadata = Resolver.get_schema(self.uri)
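# A minimal usage sketch for the constructor above, assuming the enclosing
# class is named `Dataset` and that a parquet directory exists at the given
# URI; both names are illustrative, not taken from the source.
dataset = Dataset(
    "file:///tmp/my_table",   # parquet location resolved via Resolver
    columns=["id", "label"],  # optional column projection
    shuffle=True,
    seed=42,
    world_size=2,             # total number of distributed workers
    rank=0,                   # this worker's rank
)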
def test_resolve_local_fs(self):
    with tempfile.TemporaryDirectory() as testdir:
        for i in range(10):
            with open(os.path.join(testdir, f"{i}.parquet"), "w") as fobj:
                fobj.write("123")
        files = Resolver.resolve(testdir)
        expected_files = [
            "file://" + os.path.join(testdir, f"{i}.parquet") for i in range(10)
        ]
        self.assertCountEqual(expected_files, files)

    with tempfile.TemporaryDirectory() as emptydir:
        self.assertEqual([], Resolver.resolve(emptydir))
def test_resolve_local_fs(tmp_path):
    for i in range(10):
        with (tmp_path / f"{i}.parquet").open(mode="w") as fobj:
            fobj.write("123")
    files = Resolver.resolve(tmp_path)
    expected_files = [
        "file://" + str(tmp_path / f"{i}.parquet") for i in range(10)
    ]
    assert_count_equal(expected_files, files)
def test_resolve_empty_dir(tmp_path):
    assert [] == list(Resolver.resolve(tmp_path))
def teardown_function(_):
    Resolver.reset()
def tearDown(self) -> None:
    Resolver.reset()