def test_construct_from_invalid_sources_raise(multisourcefs):
    child1 = ds.FileSystemDatasetFactory(
        multisourcefs, fs.FileSelector('/plain'),
        format=ds.ParquetFileFormat()
    )
    child2 = ds.FileSystemDatasetFactory(
        multisourcefs, fs.FileSelector('/schema'),
        format=ds.ParquetFileFormat()
    )

    with pytest.raises(TypeError, match='Expected.*FileSystemDatasetFactory'):
        ds.dataset([child1, child2])

    expected = (
        "Expected a list of path-like or dataset objects. The given list "
        "contains the following types: int"
    )
    with pytest.raises(TypeError, match=expected):
        ds.dataset([1, 2, 3])

    expected = (
        "Expected a path-like, list of path-likes or a list of Datasets "
        "instead of the given type: NoneType"
    )
    with pytest.raises(TypeError, match=expected):
        ds.dataset(None)
def _as_generator(self):
    info = self._local_fs_client.get_file_info([self.path])[0]
    if info.type == fs.FileType.NotFound:
        raise FileNotFoundError(f"file {self.path} not found")
    elif info.type == fs.FileType.File:
        for line in self._read_buffer_lines():
            yield line
    else:
        selector = fs.FileSelector(self.path)
        file_infos = self._local_fs_client.get_file_info(selector)
        for file_info in file_infos:
            # Skip hidden and metadata entries such as ".crc" or "_SUCCESS".
            if file_info.base_name.startswith(".") or \
                    file_info.base_name.startswith("_"):
                continue
            assert file_info.is_file, \
                f"{self.path} is a directory containing a subdirectory: {file_info.path}"
            with io.TextIOWrapper(
                buffer=self._local_fs_client.open_input_stream(
                    f"{self._address.file_path}/{file_info.path}"),
                encoding="utf-8",
            ) as reader:
                for line in reader:
                    yield line
def main2():
    # By default, MinIO will listen for unencrypted HTTP traffic.
    minio = fs.S3FileSystem(scheme="http", endpoint_override="10.0.0.2:9000")

    # List all contents in a bucket, recursively.
    file_selector = fs.FileSelector('customer-data-text', recursive=True)
    print_file_info(minio, file_selector)

    print(read_pafs_file(minio, 'customer-data-text/customer.csv'))
    print(read_pafs_stream(minio, 'customer-data-text/customer.csv'))

    endpoint_url = 'http://10.0.0.2:9000'
    print_boto3_buckets(endpoint_url)

    # TODO: read multiple files using dataset
    # https://stackoverflow.com/questions/45082832/how-to-read-partitioned-parquet-files-from-s3-using-pyarrow-in-python
    file_system = get_s3fs()
    print(file_system.ls('example-data'))

    bucket_uri = 's3://example-data/external-data'
    print_parquet_pandas_shape(bucket_uri, file_system)
    print_parquet_dataset_info(bucket_uri, file_system, verbose=False)

    bucket_uri = 's3://example-data/external-clustered'
    print_parquet_pandas_shape(bucket_uri, file_system)
    print_parquet_dataset_info(bucket_uri, file_system, verbose=False)
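A minimal sketch of the TODO above, assuming the same MinIO endpoint and an 'example-data/external-data' prefix holding hive-partitioned Parquet files: pyarrow.dataset can discover and read all files under the prefix through the S3FileSystem directly, without going through s3fs.

import pyarrow.dataset as ds
import pyarrow.fs as fs


def read_partitioned_parquet_sketch():
    # Same unencrypted MinIO endpoint as in main2() above (assumption).
    minio = fs.S3FileSystem(scheme="http", endpoint_override="10.0.0.2:9000")
    # Discover every Parquet file under the prefix and expose them as one dataset.
    dataset = ds.dataset("example-data/external-data", filesystem=minio,
                         format="parquet", partitioning="hive")
    # Materialize the whole dataset; columns=/filter= can be passed to prune the read.
    table = dataset.to_table()
    print(table.num_rows, table.schema)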
def get(self, local_path):
    bucket, path = parsePath(local_path)
    print(bucket, path)
    fs_client = fs.LocalFileSystem()
    file_info_list = fs_client.get_file_info(
        fs.FileSelector(path, recursive=False))
    files = []
    dirs = []
    for info in file_info_list:
        if info.type.value == 2:  # File type
            files.append({
                'name': info.base_name,
                'ext': info.extension,
                'size': info.size,
                'mtime': info.mtime.isoformat()
            })
        elif info.type.value == 3:  # Directory type
            dirs.append({
                'name': info.base_name,
                'mtime': info.mtime.isoformat()
            })
    self.finish(json.dumps({'files': files, 'dirs': dirs}))
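A hedged variant of the type check above: comparing against the fs.FileType enum directly, rather than the raw integer values (2 for File, 3 for Directory), reads more clearly and does not depend on the enum's numbering. The classify helper name is illustrative only.

from pyarrow import fs


def classify(info: fs.FileInfo) -> str:
    # Equivalent to the info.type.value == 2 / == 3 checks above.
    if info.type == fs.FileType.File:
        return 'file'
    if info.type == fs.FileType.Directory:
        return 'dir'
    return 'other'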
def test_open_dataset_from_source_additional_kwargs(multisourcefs):
    child = ds.FileSystemDatasetFactory(
        multisourcefs, fs.FileSelector('/plain'),
        format=ds.ParquetFileFormat()
    )
    with pytest.raises(ValueError, match="cannot pass any additional"):
        ds.dataset(child, format="parquet")
def test_partitioning_factory(mockfs):
    paths_or_selector = fs.FileSelector('subdir', recursive=True)
    format = ds.ParquetFileFormat()

    options = ds.FileSystemFactoryOptions('subdir')
    partitioning_factory = ds.DirectoryPartitioning.discover(['group', 'key'])
    assert isinstance(partitioning_factory, ds.PartitioningFactory)
    options.partitioning_factory = partitioning_factory

    factory = ds.FileSystemDatasetFactory(
        mockfs, paths_or_selector, format, options
    )
    inspected_schema = factory.inspect()
    # i64/f64/str from the data, group/key from the "/1/xxx" and "/2/yyy" paths
    expected_schema = pa.schema([
        ("i64", pa.int64()),
        ("f64", pa.float64()),
        ("str", pa.string()),
        ("group", pa.int32()),
        ("key", pa.string()),
    ])
    assert inspected_schema.equals(expected_schema)

    hive_partitioning_factory = ds.HivePartitioning.discover()
    assert isinstance(hive_partitioning_factory, ds.PartitioningFactory)
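A short follow-up sketch, assuming the same mockfs fixture and 'subdir' layout as the test above: a HivePartitioning factory can be attached to FileSystemFactoryOptions in exactly the same way as the DirectoryPartitioning factory.

def hive_partitioning_factory_sketch(mockfs):
    # Illustrative only: mockfs uses directory-style paths, so hive
    # discovery would simply find no key=value segments here.
    options = ds.FileSystemFactoryOptions('subdir')
    options.partitioning_factory = ds.HivePartitioning.discover()
    factory = ds.FileSystemDatasetFactory(
        mockfs, fs.FileSelector('subdir', recursive=True),
        ds.ParquetFileFormat(), options)
    return factory.inspect()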
def ls(self, path: str, recursive=False) -> List[File]:
    path = self._unwrap_path(path)
    files = []
    try:
        curr_path_info = self._client.get_file_info(path)
        res_files = []
        if curr_path_info.type == fs.FileType.File:
            res_files = [curr_path_info]
        elif curr_path_info.type == fs.FileType.Directory:
            res_files = self._client.get_file_info(
                fs.FileSelector(path, recursive=recursive))
        for file in res_files:
            if file.type == fs.FileType.File:
                files.append(
                    File(
                        path=self._wrap_path(file.path),
                        size=file.size,
                        # ns to seconds
                        mtime=int(file.mtime_ns / 1e9)))
    except RuntimeError as error:
        # Hack: snakebite cannot handle the generator and raises
        # "generator raised StopIteration", which we swallow here.
        if str(error) == 'generator raised StopIteration':
            pass
        else:
            raise
    return files
def test_fileinfo_list(self):
    fn = self.get_fresh_key() + "-listdir"
    self.write_file(fn, b"hello1")
    infs = self.s3.get_file_info(fs.FileSelector("bucket/", recursive=True))
    fns = [x.path for x in infs]
    self.assertIn(fn, fns)
def _as_generator(self):
    info = self._hdfs_client.get_file_info([self.path])[0]
    if info.type == fs.FileType.NotFound:
        raise FileNotFoundError(f"file {self.path} not found")
    elif info.type == fs.FileType.File:
        # todo:
        with io.TextIOWrapper(
            buffer=self._hdfs_client.open_input_stream(self.path),
            encoding="utf-8",
        ) as reader:
            for line in reader:
                yield line
    else:
        selector = fs.FileSelector(os.path.join("/", self._address.path))
        file_infos = self._hdfs_client.get_file_info(selector)
        for file_info in file_infos:
            if file_info.base_name == "_SUCCESS":
                continue
            assert file_info.is_file, \
                f"{self.path} is a directory containing a subdirectory: {file_info.path}"
            with io.TextIOWrapper(
                buffer=self._hdfs_client.open_input_stream(
                    f"{self._address.name_node}/{file_info.path}"),
                encoding="utf-8",
            ) as reader:
                for line in reader:
                    yield line
def dataset(mockfs):
    format = ds.ParquetFileFormat()
    selector = fs.FileSelector('subdir', recursive=True)
    options = ds.FileSystemFactoryOptions('subdir')
    options.partitioning = ds.DirectoryPartitioning(
        pa.schema([
            pa.field('group', pa.int32()),
            pa.field('key', pa.string())
        ]))
    factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options)
    return factory.finish()
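A hedged usage sketch for the fixture above: tests typically consume the finished dataset by materializing it into a table, optionally projecting a subset of columns. The column names here follow the schema discovered from mockfs; the helper name is illustrative only.

def consume_dataset_sketch(dataset):
    # Project the data column plus the two partition fields declared above.
    table = dataset.to_table(columns=['i64', 'group', 'key'])
    return table.num_rows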
def dataset(mockfs):
    format = ds.ParquetFileFormat()
    selector = fs.FileSelector('subdir', recursive=True)
    options = ds.FileSystemDiscoveryOptions('subdir')
    options.partition_scheme = ds.SchemaPartitionScheme(
        pa.schema([
            pa.field('group', pa.int32()),
            pa.field('key', pa.string())
        ]))
    discovery = ds.FileSystemDataSourceDiscovery(mockfs, selector, format,
                                                 options)
    schema = discovery.inspect()
    source = discovery.finish()
    return ds.Dataset([source], schema)
def test_dataset_union(multisourcefs):
    child = ds.FileSystemDatasetFactory(
        multisourcefs, fs.FileSelector('/plain'),
        format=ds.ParquetFileFormat()
    )
    factory = ds.UnionDatasetFactory([child])

    # TODO(bkietz) reintroduce factory.children property
    assert len(factory.inspect_schemas()) == 1
    assert all(isinstance(s, pa.Schema) for s in factory.inspect_schemas())
    assert factory.inspect_schemas()[0].equals(child.inspect())
    assert factory.inspect().equals(child.inspect())
    assert isinstance(factory.finish(), ds.Dataset)
                       pa.field('f64', pa.float64())])
    assert condition.validate(schema) == pa.bool_()

    i64_is_5 = ds.ComparisonExpression(ds.CompareOperator.Equal,
                                       ds.FieldExpression('i64'),
                                       ds.ScalarExpression(5))
    i64_is_7 = ds.ComparisonExpression(ds.CompareOperator.Equal,
                                       ds.FieldExpression('i64'),
                                       ds.ScalarExpression(7))
    assert condition.assume(i64_is_5).equals(ds.ScalarExpression(False))
    assert condition.assume(i64_is_7).equals(ds.ScalarExpression(True))

    assert str(condition) == "(i64 > 5:int64)"


@pytest.mark.parametrize('paths_or_selector', [
    fs.FileSelector('subdir', recursive=True),
    [
        'subdir',
        'subdir/1',
        'subdir/1/xxx',
        'subdir/1/xxx/file0.parquet',
        'subdir/2',
        'subdir/2/yyy',
        'subdir/2/yyy/file1.parquet',
    ]
])
def test_file_system_factory(mockfs, paths_or_selector):
    format = ds.ParquetFileFormat()

    options = ds.FileSystemFactoryOptions('subdir')
    options.partitioning = ds.DirectoryPartitioning(