Exemple #1
0
    def __init__(  # pylint: disable=too-many-arguments
        self,
        query: str,
        columns: Optional[List[str]] = None,
        shuffle: bool = False,
        shuffler_capacity: int = 128,
        seed: Optional[int] = None,
        world_size: int = 1,
        rank: int = 0,
    ):
        """Record the settings and resolve the dataset behind ``query``.

        Args:
            query: Dataset location. Stored as ``self.uri`` and passed to
                ``Resolver`` to list the files and to fetch the schema.
            columns: Optional list of column names, stored unchanged.
            shuffle: Shuffle flag, stored unchanged.
            shuffler_capacity: Shuffler capacity, stored unchanged.
            seed: Optional seed, stored unchanged.
            world_size: World size; when greater than 1 a distributed-mode
                message is logged.
            rank: Rank reported in the distributed-mode log message.
        """
        # Dataset location and requested columns.
        self.uri, self.columns = query, columns

        # Shuffling settings.
        self.shuffle = shuffle
        self.shuffler_capacity = shuffler_capacity
        self.seed = seed

        # Distributed settings.
        self.rank, self.world_size = rank, world_size
        distributed = world_size > 1
        if distributed:
            logger.info(
                "Running in distributed mode, world size=%s, rank=%s",
                world_size,
                rank,
            )

        # Sorting provides a deterministic file order between distributed
        # workers.
        resolved = list(Resolver.resolve(self.uri))
        resolved.sort()
        self.files = resolved
        logger.info("Loading parquet files: %s", self.files)

        self.spark_row_metadata = Resolver.get_schema(self.uri)
Exemple #2
0
    def test_resolve_local_fs(self):
        """Check ``Resolver.resolve`` against a populated and an empty directory.

        The populated directory must resolve to one ``file://`` entry per
        file written (order is not checked); the empty one must resolve to
        ``[]``.
        """
        names = [f"{idx}.parquet" for idx in range(10)]

        with tempfile.TemporaryDirectory() as testdir:
            paths = [os.path.join(testdir, name) for name in names]
            # This test only asserts on the resolved paths, never on the
            # file content, so a short placeholder string is written.
            for path in paths:
                with open(path, "w") as handle:
                    handle.write("123")

            resolved = Resolver.resolve(testdir)
            expected_files = ["file://" + path for path in paths]
            self.assertCountEqual(expected_files, resolved)

        with tempfile.TemporaryDirectory() as emptydir:
            self.assertEqual([], Resolver.resolve(emptydir))
Exemple #3
0
def test_resolve_local_fs(tmp_path):
    """Check that ``Resolver.resolve`` yields a ``file://`` entry per file.

    Ten placeholder ``.parquet`` files are written under ``tmp_path``; the
    resolved entries are compared to the expected ones without regard to
    order.
    """
    targets = [tmp_path / f"{idx}.parquet" for idx in range(10)]
    for target in targets:
        with target.open(mode="w") as handle:
            handle.write("123")

    files = Resolver.resolve(tmp_path)
    expected_files = ["file://" + str(target) for target in targets]
    assert_count_equal(expected_files, files)
Exemple #4
0
def test_resolve_empty_dir(tmp_path):
    """Check that resolving ``tmp_path`` with nothing written to it is empty."""
    resolved = list(Resolver.resolve(tmp_path))
    assert resolved == []
Exemple #5
0
def teardown_function(_):
    """Call ``Resolver.reset()``; the single argument is ignored.

    NOTE(review): the name matches pytest's per-test teardown hook, so this
    presumably runs after every test function in the module -- confirm.
    """
    Resolver.reset()
Exemple #6
0
 def tearDown(self) -> None:
     """Call ``Resolver.reset()``.

     NOTE(review): the name matches ``unittest.TestCase.tearDown``, so this
     presumably runs after each test method of the enclosing class, which
     is not visible here -- confirm.
     """
     Resolver.reset()