def read_parquet(self, path, columns=None, metadata=None, schema=None,
                 nthreads=1, use_pandas_metadata=False):
    """
    Read Parquet data from path in file system. Can read from a single file
    or a directory of files

    Parameters
    ----------
    path : str
        Single file path or directory
    columns : List[str], optional
        Subset of columns to read
    metadata : pyarrow.parquet.FileMetaData
        Known metadata to validate files against
    schema : pyarrow.parquet.Schema
        Known schema to validate files against. Alternative to metadata
        argument
    nthreads : int, default 1
        Number of columns to read in parallel. If > 1, requires that the
        underlying file source is threadsafe
    use_pandas_metadata : boolean, default False
        If True and file has custom pandas schema metadata, ensure that
        index columns are also loaded

    Returns
    -------
    table : pyarrow.Table
    """
    from pyarrow.parquet import ParquetDataset
    dataset = ParquetDataset(path, schema=schema, metadata=metadata,
                             filesystem=self)
    return dataset.read(columns=columns, nthreads=nthreads,
                        use_pandas_metadata=use_pandas_metadata)
def read_parquet(self, path, columns=None, metadata=None, schema=None,
                 use_threads=True, use_pandas_metadata=False):
    """
    Read Parquet data from path in file system. Can read from a single file
    or a directory of files

    Parameters
    ----------
    path : str
        Single file path or directory
    columns : List[str], optional
        Subset of columns to read
    metadata : pyarrow.parquet.FileMetaData
        Known metadata to validate files against
    schema : pyarrow.parquet.Schema
        Known schema to validate files against. Alternative to metadata
        argument
    use_threads : boolean, default True
        Perform multi-threaded column reads
    use_pandas_metadata : boolean, default False
        If True and file has custom pandas schema metadata, ensure that
        index columns are also loaded

    Returns
    -------
    table : pyarrow.Table
    """
    from pyarrow.parquet import ParquetDataset
    dataset = ParquetDataset(path, schema=schema, metadata=metadata,
                             filesystem=self)
    return dataset.read(columns=columns, use_threads=use_threads,
                        use_pandas_metadata=use_pandas_metadata)
def read_parquet(self, path, columns=None, metadata=None, schema=None,
                 use_threads=True, use_pandas_metadata=False):
    """
    Read Parquet data from path in file system. Can read from a single file
    or a directory of files.

    Parameters
    ----------
    path : str
        Single file path or directory
    columns : List[str], optional
        Subset of columns to read.
    metadata : pyarrow.parquet.FileMetaData
        Known metadata to validate files against.
    schema : pyarrow.parquet.Schema
        Known schema to validate files against. Alternative to metadata
        argument.
    use_threads : bool, default True
        Perform multi-threaded column reads.
    use_pandas_metadata : bool, default False
        If True and file has custom pandas schema metadata, ensure that
        index columns are also loaded.

    Returns
    -------
    table : pyarrow.Table
    """
    from pyarrow.parquet import ParquetDataset
    dataset = ParquetDataset(path, schema=schema, metadata=metadata,
                             filesystem=self)
    return dataset.read(columns=columns, use_threads=use_threads,
                        use_pandas_metadata=use_pandas_metadata)
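# A minimal usage sketch for the method above, assuming the legacy
# pyarrow.filesystem API it belongs to (removed in newer pyarrow releases);
# the path and column names are illustrative, not taken from the snippets.
from pyarrow.filesystem import LocalFileSystem

fs = LocalFileSystem()
table = fs.read_parquet("/data/events", columns=["user_id", "ts"])
df = table.to_pandas()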
async def test_local_arrow_storage_provider(
    tmp_path: Path, test_values: dt_test_values
) -> None:
    test_table, visit_ids = test_values
    structured_provider = LocalArrowProvider(tmp_path)
    await structured_provider.init()
    for table_name, test_data in test_table.items():
        await structured_provider.store_record(
            TableName(table_name), test_data["visit_id"], test_data
        )
    token_list = []
    for i in visit_ids:
        token_list.append(await structured_provider.finalize_visit_id(i))
    await structured_provider.flush_cache()
    await asyncio.gather(*token_list)
    for table_name, test_data in test_table.items():
        dataset = ParquetDataset(tmp_path / table_name)
        df: DataFrame = dataset.read().to_pandas()
        assert df.shape[0] == 1
        for row in df.itertuples(index=False):
            if test_data["visit_id"] == INVALID_VISIT_ID:
                del test_data["visit_id"]
            assert row._asdict() == test_data
def read(cls, path, engine, columns, **kwargs):
    """Load a parquet object from the file path, returning a DataFrame.

    Ray DataFrame only supports pyarrow engine for now.

    Args:
        path: The filepath of the parquet file.
              We only support local files for now.
        engine: Ray only supports the pyarrow reader.
                This argument doesn't do anything for now.
        kwargs: Pass into parquet's read_pandas function.

    Notes:
        ParquetFile API is used. Please refer to the documentation here
        https://arrow.apache.org/docs/python/parquet.html
    """
    from pyarrow.parquet import ParquetFile, ParquetDataset
    from modin.pandas.io import PQ_INDEX_REGEX

    if os.path.isdir(path):
        partitioned_columns = set()
        directory = True
        # We do a tree walk of the path directory because partitioned
        # parquet directories have a unique column at each directory level.
        # Thus, we can use os.walk(), which does a dfs search, to walk
        # through the different columns that the data is partitioned on
        for (root, dir_names, files) in os.walk(path):
            if dir_names:
                partitioned_columns.add(dir_names[0].split("=")[0])
            if files:
                # Metadata files, git files, .DSStore
                if files[0][0] == ".":
                    continue
                break
        partitioned_columns = list(partitioned_columns)
        if len(partitioned_columns):
            ErrorMessage.default_to_pandas("Mixed Partitioning Columns in Parquet")
            return cls.single_worker_read(
                path, engine=engine, columns=columns, **kwargs
            )
    else:
        directory = False

    if not columns:
        if directory:
            # Path of the sample file that we will read to get the remaining columns
            pd = ParquetDataset(path)
            column_names = pd.schema.names
        else:
            pf = ParquetFile(path)
            column_names = pf.metadata.schema.names
        columns = [name for name in column_names if not PQ_INDEX_REGEX.match(name)]
    return cls.build_query_compiler(path, columns, **kwargs)
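# Illustrative, self-contained sketch of the partition-column detection above.
# The directory layout is hypothetical: for a hive-partitioned tree such as
# data/year=2020/part-0.parquet the walk collects the column name "year".
import os

partitioned_columns = set()
for root, dir_names, files in os.walk("data"):
    if dir_names:
        partitioned_columns.add(dir_names[0].split("=")[0])
    if files and not files[0].startswith("."):
        break
print(sorted(partitioned_columns))  # e.g. ['year']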
def preprocess_data(self):
    from table_utils import filter_outliers, sample

    self.shard = None
    with S3() as s3:
        from pyarrow.parquet import ParquetDataset

        if self.input:
            objs = s3.get_many(self.input)
            table = ParquetDataset([obj.path for obj in objs]).read()
            table = sample(filter_outliers(table, FIELDS), self.sample)
            self.shard = {
                field: table[field].to_numpy() for field in FIELDS
            }
    self.next(self.join)
def preprocess_data(self):
    with S3() as s3:
        from pyarrow.parquet import ParquetDataset

        if self.input:
            objs = s3.get_many(self.input)
            orig_table = ParquetDataset([obj.path for obj in objs]).read()
            self.num_rows_before = orig_table.num_rows
            table = process_data(orig_table)
            self.num_rows_after = table.num_rows
            print('selected %d/%d rows'
                  % (self.num_rows_after, self.num_rows_before))
            self.lat = table['pickup_latitude'].to_numpy()
            self.lon = table['pickup_longitude'].to_numpy()
    self.next(self.join)
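# A local stand-in for the pattern above (file names and the filter predicate
# are illustrative; Table.filter with a compute expression assumes a recent
# pyarrow): read several parquet files as one logical dataset and report row
# counts before and after filtering.
import pyarrow.compute as pc
from pyarrow.parquet import ParquetDataset

table = ParquetDataset(["trips-0.parquet", "trips-1.parquet"]).read()
filtered = table.filter(pc.field("trip_distance") > 0)
print("selected %d/%d rows" % (filtered.num_rows, table.num_rows))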
def _build_node(build_dir, package, node_path, node, checks_contents=None,
                dry_run=False, env='default', ancestor_args={}):
    """
    Parameters
    ----------
    ancestor_args : dict
        any transform inherited from an ancestor plus any inherited handler kwargs
        Users can thus define kwargs that affect entire subtrees
        (e.g. transform: csv for 500 .txt files) and overriding of ancestor
        or peer values. Child transform or kwargs override ancestor k:v pairs.
    """
    if _is_internal_node(node):
        if not dry_run:
            package.save_group(node_path, None)

        # Make a consumable copy. This is to cover a quirk introduced by accepting nodes named
        # like RESERVED keys -- if a RESERVED key is actually matched, it should be removed from
        # the node, or it gets treated like a subnode (or like a node with invalid content)
        node = node.copy()

        # NOTE: YAML parsing does not guarantee key order
        # fetch local transform and kwargs values; we do it using ifs
        # to prevent `key: None` from polluting the update
        local_args = _get_local_args(
            node, [RESERVED['transform'], RESERVED['kwargs']])
        group_args = ancestor_args.copy()
        group_args.update(local_args)
        _consume(node, local_args)

        # if it's not a reserved word it's a group that we can descend
        groups = {k: v for k, v in iteritems(node) if _is_valid_group(v)}
        _consume(node, groups)

        if node:
            # Unused keys -- either keyword typos or node names with invalid values.
            # For now, until build.yml schemas, pointing out one should do.
            key, value = node.popitem()
            raise BuildException(
                "Invalid syntax: expected node data for {!r}, got {!r}".format(
                    key, value))

        for child_name, child_table in groups.items():
            if glob.has_magic(child_name):
                # child_name is a glob string, use it to generate multiple child nodes
                for gchild_name, gchild_table in _gen_glob_data(
                        build_dir, child_name, child_table):
                    _build_node(build_dir, package, node_path + [gchild_name],
                                gchild_table, checks_contents=checks_contents,
                                dry_run=dry_run, env=env,
                                ancestor_args=group_args)
            else:
                if not isinstance(child_name, str) or not is_nodename(child_name):
                    raise StoreException("Invalid node name: %r" % child_name)
                _build_node(build_dir, package, node_path + [child_name],
                            child_table, checks_contents=checks_contents,
                            dry_run=dry_run, env=env,
                            ancestor_args=group_args)
    else:  # leaf node
        # prevent overwriting existing node names
        if '/'.join(node_path) in package:
            raise BuildException(
                "Naming conflict: {!r} added to package more than once".format(
                    '/'.join(node_path)))

        # handle group leaf nodes (empty groups)
        if not node:
            if not dry_run:
                package.save_group(node_path, None)
            return

        include_package = node.get(RESERVED['package'])
        rel_path = node.get(RESERVED['file'])
        if rel_path and include_package:
            raise BuildException(
                "A node must define only one of {0} or {1}".format(
                    RESERVED['file'], RESERVED['package']))
        elif include_package:  # package composition
            team, user, pkgname, subpath = parse_package(include_package,
                                                         allow_subpath=True)
            existing_pkg = PackageStore.find_package(team, user, pkgname)
            if existing_pkg is None:
                raise BuildException("Package not found: %s" % include_package)

            if subpath:
                try:
                    node = existing_pkg["/".join(subpath)]
                except KeyError:
                    msg = "Package {team}:{owner}/{pkg} has no subpackage: {subpath}"
                    raise BuildException(
                        msg.format(team=team, owner=user, pkg=pkgname,
                                   subpath=subpath))
            else:
                node = GroupNode(existing_pkg.get_contents().children)
            package.save_package_tree(node_path, node)
        elif rel_path:  # handle nodes built from input files
            path = os.path.join(build_dir, rel_path)

            rel_meta_path = node.get(RESERVED['meta'])
            if rel_meta_path:
                with open(os.path.join(build_dir, rel_meta_path)) as fd:
                    try:
                        metadata = json.load(fd)
                    except ValueError as ex:
                        raise BuildException("Failed to parse %r as JSON: %s"
                                             % (rel_meta_path, ex))
                if SYSTEM_METADATA in metadata:
                    raise BuildException(
                        "Invalid metadata in %r: not allowed to use key %r"
                        % (rel_meta_path, SYSTEM_METADATA))
            else:
                metadata = None

            # get either the locally defined transform and target or inherit from an ancestor
            transform = node.get(RESERVED['transform']) or ancestor_args.get(
                RESERVED['transform'])

            ID = 'id'  # pylint:disable=C0103
            PARQUET = 'parquet'  # pylint:disable=C0103
            if transform:
                transform = transform.lower()
                if transform in PANDAS_PARSERS:
                    target = TargetType.PANDAS
                elif transform == PARQUET:
                    target = TargetType.PANDAS
                elif transform == ID:
                    target = TargetType.FILE
                else:
                    raise BuildException("Unknown transform '%s' for %s"
                                         % (transform, rel_path))
            else:
                # Guess transform and target based on file extension if not provided
                _, ext = splitext_no_dot(rel_path)

                if ext in PANDAS_PARSERS:
                    transform = ext
                    target = TargetType.PANDAS
                elif ext == PARQUET:
                    transform = ext
                    target = TargetType.PANDAS
                else:
                    transform = ID
                    target = TargetType.FILE
                print("Inferring 'transform: %s' for %s" % (transform, rel_path))

            # TODO: parse/check environments:
            # environments = node.get(RESERVED['environments'])

            checks = node.get(RESERVED['checks'])
            if transform == ID:
                # TODO move this to a separate function
                if checks:
                    with open(path, 'r') as fd:
                        data = fd.read()
                        _run_checks(data, checks, checks_contents, node_path,
                                    rel_path, target, env=env)
                if not dry_run:
                    print("Registering %s..." % path)
                    package.save_file(path, node_path, target, rel_path,
                                      transform, metadata)
            elif transform == PARQUET:
                if checks:
                    from pyarrow.parquet import ParquetDataset
                    dataset = ParquetDataset(path)
                    table = dataset.read(nthreads=4)
                    dataframe = table.to_pandas()
                    _run_checks(dataframe, checks, checks_contents, node_path,
                                rel_path, target, env=env)
                if not dry_run:
                    print("Registering %s..." % path)
                    package.save_file(path, node_path, target, rel_path,
                                      transform, metadata)
            else:
                # copy so we don't modify shared ancestor_args
                handler_args = dict(ancestor_args.get(RESERVED['kwargs'], {}))
                # local kwargs win the update
                handler_args.update(node.get(RESERVED['kwargs'], {}))

                # Check Cache
                store = PackageStore()
                path_hash = _path_hash(path, transform, handler_args)
                source_hash = digest_file(path)

                cachedobjs = []
                if os.path.exists(store.cache_path(path_hash)):
                    with open(store.cache_path(path_hash), 'r') as entry:
                        cache_entry = json.load(entry)
                        if cache_entry['source_hash'] == source_hash:
                            cachedobjs = cache_entry['obj_hashes']
                            assert isinstance(cachedobjs, list)

                # TODO: check for changes in checks else use cache
                # below is a heavy-handed fix but it's OK for check builds to be slow
                if not checks and cachedobjs and all(
                        os.path.exists(store.object_path(obj)) for obj in cachedobjs):
                    # Use existing objects instead of rebuilding
                    package.save_cached_df(cachedobjs, node_path, target,
                                           rel_path, transform, metadata)
                else:
                    # read source file into DataFrame
                    print("Serializing %s..." % path)
                    if _have_pyspark():
                        dataframe = _file_to_spark_data_frame(
                            transform, path, handler_args)
                    else:
                        dataframe = _file_to_data_frame(
                            transform, path, handler_args)

                    if checks:
                        # TODO: test that design works for internal nodes... e.g. iterating
                        # over the children and getting/checking the data, err msgs, etc.
                        _run_checks(dataframe, checks, checks_contents,
                                    node_path, rel_path, target, env=env)

                    # serialize DataFrame to file(s)
                    if not dry_run:
                        print("Saving as binary dataframe...")
                        obj_hashes = package.save_df(dataframe, node_path,
                                                     target, rel_path,
                                                     transform, metadata)

                        # Add to cache
                        cache_entry = dict(source_hash=source_hash,
                                           obj_hashes=obj_hashes)
                        with open(store.cache_path(path_hash), 'w') as entry:
                            json.dump(cache_entry, entry)
        else:  # rel_path and package are both None
            raise BuildException(
                "Leaf nodes must define either a %s or %s key"
                % (RESERVED['file'], RESERVED['package']))
def read_parquet(cls, path, engine, columns, **kwargs):
    """Load a parquet object from the file path, returning a DataFrame.

    Ray DataFrame only supports pyarrow engine for now.

    Args:
        path: The filepath of the parquet file.
              We only support local files for now.
        engine: Ray only supports the pyarrow reader.
                This argument doesn't do anything for now.
        kwargs: Pass into parquet's read_pandas function.

    Notes:
        ParquetFile API is used. Please refer to the documentation here
        https://arrow.apache.org/docs/python/parquet.html
    """
    from pyarrow.parquet import ParquetFile, ParquetDataset

    if cls.read_parquet_remote_task is None:
        return super(RayIO, cls).read_parquet(path, engine, columns, **kwargs)

    file_path = path
    if os.path.isdir(path):
        directory = True
        partitioned_columns = set()
        # We do a tree walk of the path directory because partitioned
        # parquet directories have a unique column at each directory level.
        # Thus, we can use os.walk(), which does a dfs search, to walk
        # through the different columns that the data is partitioned on
        for (root, dir_names, files) in os.walk(path):
            if dir_names:
                partitioned_columns.add(dir_names[0].split("=")[0])
            if files:
                # Metadata files, git files, .DSStore
                if files[0][0] == ".":
                    continue
                file_path = os.path.join(root, files[0])
                break
        partitioned_columns = list(partitioned_columns)
    else:
        directory = False

    if not columns:
        if directory:
            # Path of the sample file that we will read to get the remaining
            # columns.
            from pyarrow import ArrowIOError

            try:
                pd = ParquetDataset(file_path)
            except ArrowIOError:
                pd = ParquetDataset(path)
            column_names = pd.schema.names
        else:
            pf = ParquetFile(path)
            column_names = pf.metadata.schema.names
        columns = [
            name for name in column_names if not PQ_INDEX_REGEX.match(name)
        ]

    # Cannot read in parquet file by only reading in the partitioned column.
    # Thus, we have to remove the partition columns from the columns to
    # ensure that when we do the math for the blocks, the partition column
    # will be read in along with a non partition column.
    if columns and directory and any(col in partitioned_columns for col in columns):
        columns = [col for col in columns if col not in partitioned_columns]
        # If all of the columns wanted are partition columns, return an
        # empty dataframe with the desired columns.
        if len(columns) == 0:
            return cls.query_compiler_cls.from_pandas(
                pandas.DataFrame(columns=partitioned_columns),
                block_partitions_cls=cls.frame_mgr_cls,
            )

    num_partitions = cls.frame_mgr_cls._compute_num_partitions()
    num_splits = min(len(columns), num_partitions)
    # Each item in this list will be a list of column names of the original df
    column_splits = (
        len(columns) // num_partitions
        if len(columns) % num_partitions == 0
        else len(columns) // num_partitions + 1
    )
    col_partitions = [
        columns[i:i + column_splits]
        for i in range(0, len(columns), column_splits)
    ]
    column_widths = [len(c) for c in col_partitions]
    # Each item in this list will be a list of columns of original df
    # partitioned to smaller pieces along rows.
    # We need to transpose the oids array to fit our schema.
    # TODO (williamma12): This part can be parallelized even more if we
    # separate the partitioned parquet file code path from the default one.
    # The workers return multiple objects for each part of the file read:
    # - The first n - 2 objects are partitions of data
    # - The n - 1 object is the length of the partition.
    # - The nth object is the dtypes of the partition. We combine these to
    #   form the final dtypes below.
    blk_partitions = np.array([
        cls.read_parquet_remote_task._remote(
            args=(path, cols + partitioned_columns, num_splits, kwargs),
            num_return_vals=num_splits + 2,
        )
        if directory and cols == col_partitions[len(col_partitions) - 1]
        else cls.read_parquet_remote_task._remote(
            args=(path, cols, num_splits, kwargs),
            num_return_vals=num_splits + 2,
        )
        for cols in col_partitions
    ]).T
    # Metadata
    index_len = ray.get(blk_partitions[-2][0])
    index = pandas.RangeIndex(index_len)
    index_chunksize = compute_chunksize(
        pandas.DataFrame(index=index), num_splits, axis=0
    )
    if index_chunksize > index_len:
        row_lengths = [index_len] + [0 for _ in range(num_splits - 1)]
    else:
        row_lengths = [
            index_chunksize
            if i != num_splits - 1
            else index_len - (index_chunksize * (num_splits - 1))
            for i in range(num_splits)
        ]
    # Compute dtypes concatenating the results from each of the columns splits
    # determined above. This creates a pandas Series that contains a dtype for
    # every column.
    dtypes_ids = list(blk_partitions[-1])
    dtypes = pandas.concat(ray.get(dtypes_ids), axis=0)
    blk_partitions = blk_partitions[:-2]

    remote_partitions = np.array([
        [
            cls.frame_partition_cls(
                blk_partitions[i][j],
                length=row_lengths[i],
                width=column_widths[j],
            )
            for j in range(len(blk_partitions[i]))
        ]
        for i in range(len(blk_partitions))
    ])
    if directory:
        columns += partitioned_columns
    dtypes.index = columns
    new_query_compiler = cls.query_compiler_cls(
        cls.frame_mgr_cls(remote_partitions), index, columns, dtypes=dtypes
    )
    return new_query_compiler
def _read(cls, path, engine, columns, **kwargs):
    """
    Load a parquet object from the file path, returning a query compiler.

    Parameters
    ----------
    path : str, path object or file-like object
        The filepath of the parquet file in local filesystem or hdfs.
    engine : str
        Parquet library to use (only 'PyArrow' is supported for now).
    columns : list
        If not None, only these columns will be read from the file.
    **kwargs : dict
        Keyword arguments.

    Returns
    -------
    BaseQueryCompiler
        A new Query Compiler.

    Notes
    -----
    ParquetFile API is used. Please refer to the documentation here
    https://arrow.apache.org/docs/python/parquet.html
    """
    from pyarrow.parquet import ParquetDataset
    from modin.pandas.io import PQ_INDEX_REGEX

    if isinstance(path, str) and os.path.isdir(path):
        partitioned_columns = set()
        # We do a tree walk of the path directory because partitioned
        # parquet directories have a unique column at each directory level.
        # Thus, we can use os.walk(), which does a dfs search, to walk
        # through the different columns that the data is partitioned on
        for (root, dir_names, files) in os.walk(path):
            if dir_names:
                partitioned_columns.add(dir_names[0].split("=")[0])
            if files:
                # Metadata files, git files, .DSStore
                if files[0][0] == ".":
                    continue
                break
        partitioned_columns = list(partitioned_columns)
        if len(partitioned_columns):
            ErrorMessage.default_to_pandas("Mixed Partitioning Columns in Parquet")
            return cls.single_worker_read(
                path, engine=engine, columns=columns, **kwargs
            )

    if not columns:
        import fsspec.core
        from pandas.io.common import is_fsspec_url

        fs, path_ = (
            fsspec.core.url_to_fs(path, **(kwargs.get("storage_options") or {}))
            if is_fsspec_url(path)
            else (None, path)
        )

        dataset = ParquetDataset(path_, filesystem=fs, use_legacy_dataset=False)
        column_names = dataset.schema.names

        if dataset.schema.pandas_metadata is not None:
            index_columns = dataset.schema.pandas_metadata.get("index_columns", [])
            column_names = [c for c in column_names if c not in index_columns]
        columns = [name for name in column_names if not PQ_INDEX_REGEX.match(name)]
    return cls.build_query_compiler(path, columns, **kwargs)
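# Small sketch of the pandas-metadata lookup used above, assuming a pyarrow
# version where ParquetDataset exposes an Arrow schema (the new dataset API);
# "example.parquet" is an illustrative path.
from pyarrow.parquet import ParquetDataset

dataset = ParquetDataset("example.parquet")
pandas_meta = dataset.schema.pandas_metadata  # None unless written via pandas
if pandas_meta is not None:
    print(pandas_meta.get("index_columns", []))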
def _read(cls, path, engine, columns, **kwargs):
    """
    Load a parquet object from the file path, returning a query compiler.

    Parameters
    ----------
    path : str, path object or file-like object
        The filepath of the parquet file in local filesystem or hdfs.
    engine : str
        Parquet library to use (only 'PyArrow' is supported for now).
    columns : list
        If not None, only these columns will be read from the file.
    **kwargs : dict
        Keyword arguments.

    Returns
    -------
    BaseQueryCompiler
        A new Query Compiler.

    Notes
    -----
    ParquetFile API is used. Please refer to the documentation here
    https://arrow.apache.org/docs/python/parquet.html
    """
    from pyarrow.parquet import ParquetFile, ParquetDataset
    from modin.pandas.io import PQ_INDEX_REGEX

    if isinstance(path, str) and os.path.isdir(path):
        partitioned_columns = set()
        directory = True
        # We do a tree walk of the path directory because partitioned
        # parquet directories have a unique column at each directory level.
        # Thus, we can use os.walk(), which does a dfs search, to walk
        # through the different columns that the data is partitioned on
        for (root, dir_names, files) in os.walk(path):
            if dir_names:
                partitioned_columns.add(dir_names[0].split("=")[0])
            if files:
                # Metadata files, git files, .DSStore
                if files[0][0] == ".":
                    continue
                break
        partitioned_columns = list(partitioned_columns)
        if len(partitioned_columns):
            ErrorMessage.default_to_pandas("Mixed Partitioning Columns in Parquet")
            return cls.single_worker_read(
                path, engine=engine, columns=columns, **kwargs
            )
    else:
        directory = False

    if not columns:
        import s3fs

        if directory:
            # Path of the sample file that we will read to get the remaining columns
            pd = ParquetDataset(path)
            meta = pd.metadata
            column_names = pd.schema.to_arrow_schema().names
        elif isinstance(path, str) and path.startswith("hdfs://"):
            import fsspec.core

            fs, path = fsspec.core.url_to_fs(path)
            pd = ParquetDataset(path, filesystem=fs)
            meta = pd.metadata
            column_names = pd.schema.to_arrow_schema().names
        elif isinstance(path, s3fs.S3File) or (
            isinstance(path, str) and path.startswith("s3://")
        ):
            from botocore.exceptions import NoCredentialsError

            if isinstance(path, s3fs.S3File):
                bucket_path = path.url().split(".s3.amazonaws.com")
                path = "s3://" + bucket_path[0].split("://")[1] + bucket_path[1]
            try:
                fs = s3fs.S3FileSystem()
                pd = ParquetDataset(path, filesystem=fs)
            except NoCredentialsError:
                fs = s3fs.S3FileSystem(anon=True)
                pd = ParquetDataset(path, filesystem=fs)
            meta = pd.metadata
            column_names = pd.schema.to_arrow_schema().names
        else:
            meta = ParquetFile(path).metadata
            column_names = meta.schema.to_arrow_schema().names

        if meta is not None and meta.metadata is not None:
            pandas_metadata = meta.metadata.get(b"pandas", None)
            if pandas_metadata is not None:
                import json

                # This is how we convert the metadata from pyarrow to a python
                # dictionary, from which we then get the index columns.
                # We use these to filter out from the columns in the metadata since
                # the pyarrow storage has no concept of row labels/index.
                # This ensures that our metadata lines up with the partitions without
                # extra communication steps once we have done all the remote
                # computation.
                index_columns = json.loads(pandas_metadata.decode("utf8")).get(
                    "index_columns", []
                )
                column_names = [c for c in column_names if c not in index_columns]
        columns = [name for name in column_names if not PQ_INDEX_REGEX.match(name)]
    return cls.build_query_compiler(path, columns, **kwargs)
def read(cls, path, engine, columns, **kwargs):
    """Load a parquet object from the file path, returning a Modin DataFrame.

    Modin only supports pyarrow engine for now.

    Args:
        path: The filepath of the parquet file.
              We only support local files for now.
        engine: Modin only supports the pyarrow reader.
                This argument doesn't do anything for now.
        kwargs: Pass into parquet's read_pandas function.

    Notes:
        ParquetFile API is used. Please refer to the documentation here
        https://arrow.apache.org/docs/python/parquet.html
    """
    from pyarrow.parquet import ParquetFile, ParquetDataset
    from modin.pandas.io import PQ_INDEX_REGEX

    if os.path.isdir(path):
        partitioned_columns = set()
        directory = True
        # We do a tree walk of the path directory because partitioned
        # parquet directories have a unique column at each directory level.
        # Thus, we can use os.walk(), which does a dfs search, to walk
        # through the different columns that the data is partitioned on
        for (root, dir_names, files) in os.walk(path):
            if dir_names:
                partitioned_columns.add(dir_names[0].split("=")[0])
            if files:
                # Metadata files, git files, .DSStore
                if files[0][0] == ".":
                    continue
                break
        partitioned_columns = list(partitioned_columns)
        if len(partitioned_columns):
            ErrorMessage.default_to_pandas("Mixed Partitioning Columns in Parquet")
            return cls.single_worker_read(
                path, engine=engine, columns=columns, **kwargs
            )
    else:
        directory = False

    if not columns:
        if directory:
            # Path of the sample file that we will read to get the remaining columns
            pd = ParquetDataset(path)
            meta = pd.metadata
            column_names = pd.schema.names
        else:
            meta = ParquetFile(path).metadata
            column_names = meta.schema.names
        if meta is not None:
            # This is how we convert the metadata from pyarrow to a python
            # dictionary, from which we then get the index columns.
            # We use these to filter out from the columns in the metadata since
            # the pyarrow storage has no concept of row labels/index.
            # This ensures that our metadata lines up with the partitions without
            # extra communication steps once we have done all the remote
            # computation.
            index_columns = eval(
                meta.metadata[b"pandas"].replace(b"null", b"None")
            ).get("index_columns", [])
            column_names = [c for c in column_names if c not in index_columns]
        columns = [name for name in column_names if not PQ_INDEX_REGEX.match(name)]
    return cls.build_query_compiler(path, columns, **kwargs)
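# Illustrative sketch (hypothetical file name): the b"pandas" entry in the
# Parquet key-value metadata is a JSON document, so the index columns can
# equally be recovered with json.loads, as the snippets above also do.
import json
from pyarrow.parquet import ParquetFile

meta = ParquetFile("example.parquet").metadata
pandas_blob = (meta.metadata or {}).get(b"pandas")
index_columns = (
    json.loads(pandas_blob.decode("utf8")).get("index_columns", [])
    if pandas_blob
    else []
)
print(index_columns)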
def read_parquet(path, columns, **kwargs):
    from legate.core import Rect

    from .runtime import _runtime as rt

    path = util.to_list_if_scalar(path)
    if len(path) == 1 and os.path.isdir(path[0]):
        from pyarrow.parquet import ParquetDataset

        ds = ParquetDataset(path)
        path = [piece.path for piece in ds.pieces]
    else:
        from pyarrow.parquet import ParquetFile

        ds = ParquetFile(path[0])
        if rt.debug:
            assert all(ParquetFile(p).schema == ds.schema for p in path)

    dedup_names = set()
    for name in ds.schema.names:
        if name in dedup_names:
            raise ValueError(
                "Duplicate column names in schema are not supported."
            )
        dedup_names.add(name)

    schema = ds.schema.to_arrow_schema()

    index_descs = []
    index_materialized = False
    if str.encode("pandas") in ds.metadata.metadata:
        import json

        pandas_metadata = json.loads(ds.metadata.metadata[str.encode("pandas")])
        index_descs = pandas_metadata["index_columns"]
        index_materialized = len(index_descs) > 0 and all(
            isinstance(desc, str) for desc in index_descs
        )

    if columns is None:
        column_names = schema.names
    elif index_materialized:
        column_names = columns + index_descs
    else:
        column_names = columns

    for name in column_names:
        if name not in dedup_names:
            raise ValueError("Field named %s not found in the schema." % name)

    schema = [schema.field(name) for name in column_names]
    del columns

    storage = rt.create_output_storage()
    offsets_storage = None

    columns = []
    for column_info in schema:
        dtype = ty.to_legate_dtype(column_info.type)
        column = storage.create_column(dtype)
        if ty.is_string_dtype(dtype):
            if offsets_storage is None:
                offsets_storage = rt.create_output_storage()
            offsets_column = offsets_storage.create_column(ty.int32, nullable=False)
            chars_storage = rt.create_output_storage()
            char_column = chars_storage.create_column(ty.int8, nullable=False)
            column.add_child(offsets_column)
            column.add_child(char_column)
            column = column.as_string_column()
        columns.append(column)

    plan = Map(rt, OpCode.READ_PARQUET)

    plan.add_scalar_arg(len(path), ty.uint32)
    for f in path:
        plan.add_scalar_arg(f, ty.string)
    plan.add_scalar_arg(len(column_names), ty.uint32)
    for name in column_names:
        plan.add_scalar_arg(name, ty.string)
    plan.add_scalar_arg(len(columns), ty.uint32)
    for column in columns:
        column.add_to_plan_output_only(plan)
    counts = plan.execute(Rect([rt.num_pieces]))

    storage = plan.promote_output_storage(storage)
    rt.register_external_weighted_partition(storage.default_ipart, counts)
    del plan

    size = counts.cast(ty.int64).sum()

    if index_materialized:
        to_filter = set(index_descs)
        index_columns = []
        value_columns = []
        value_column_names = []
        for idx, name in enumerate(column_names):
            if name in to_filter:
                index_columns.append(columns[idx])
            else:
                value_columns.append(columns[idx])
                value_column_names.append(column_names[idx])

        sanitized_names = [
            None if name == f"__index_level_{level}__" else name
            for level, name in enumerate(index_descs)
        ]
        index = create_index_from_columns(index_columns, size, sanitized_names)
    else:
        value_columns = columns
        value_column_names = column_names
        if len(index_descs) > 0:
            assert len(index_descs) == 1
            index_desc = index_descs[0]
            name = index_desc["name"]
            start = rt.create_future(index_desc["start"], ty.int64)
            stop = rt.create_future(index_desc["stop"], ty.int64)
            step = rt.create_future(index_desc["step"], ty.int64)
            index = create_range_index(storage, size, name, start, stop, step)
        else:
            index = create_range_index(storage, size)

    from pandas import Index

    return {
        "frame": Table(rt, index, value_columns),
        "columns": Index(value_column_names),
    }