def run_partition_test(input_file: str, output_dir: str, filters: Optional[list] = None):
    milliseconds_since_epoch = int(time() * 1000)

    print('Parquet metadata: ' + str(pq.read_metadata(input_file)))
    print('Parquet schema: ' + pq.read_schema(input_file).to_string())

    data = pq.read_table(source=input_file, filters=filters)

    # Write a dataset and collect metadata information of all written files
    metadata_collector = []
    root_path = output_dir + 'partitioned_' + str(milliseconds_since_epoch)
    pq.write_to_dataset(data,
                        root_path=root_path,
                        partition_cols=['start_year'],
                        metadata_collector=metadata_collector)

    # Write the ``_common_metadata`` parquet file without row groups statistics
    pq.write_metadata(data.schema, root_path + '/_common_metadata')

    # Write the ``_metadata`` parquet file with row groups statistics of all files
    # Gives the following error:
    #   File "pyarrow/_parquet.pyx", line 616, in pyarrow._parquet.FileMetaData.append_row_groups
    #   RuntimeError: AppendRowGroups requires equal schemas.
    # data.schema has one more column than the partitioned files when partitioning by one column
    # Related? https://github.com/dask/dask/issues/6243
    # pq.write_metadata(data.schema, root_path + '/_metadata',
    #                   metadata_collector=metadata_collector)

    # Read from partitioned dataset
    # use the new generic Dataset API
    start_year = 2018
    value = 50000
    table = pq.read_table(root_path,
                          filters=[('start_year', '>=', start_year), ('value', '>', value)])
    # filters=[('start_year', '>=', start_year)])
    print(table.to_pandas())
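The commented-out `_metadata` write above fails because the files produced by `write_to_dataset` no longer contain the partition column, so their `FileMetaData` schemas differ from `data.schema`. A minimal sketch of one possible workaround, reusing `data`, `root_path`, and `metadata_collector` from the snippet above and assuming the partition column is named 'start_year' as in that example:

# Sketch only: drop the partition column from the schema so it matches the
# schemas of the partitioned files collected in metadata_collector, then
# write ``_metadata`` with the row-group statistics.
metadata_schema = data.schema.remove(data.schema.get_field_index('start_year'))
pq.write_metadata(metadata_schema, root_path + '/_metadata',
                  metadata_collector=metadata_collector)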
def test_read_common_metadata_files(tmpdir):
    import pyarrow.parquet as pq

    N = 100
    df = pd.DataFrame({
        'index': np.arange(N),
        'values': np.random.randn(N)
    }, columns=['index', 'values'])

    base_path = str(tmpdir)
    data_path = pjoin(base_path, 'data.parquet')

    table = pa.Table.from_pandas(df)
    _write_table(table, data_path)

    metadata_path = pjoin(base_path, '_metadata')
    pq.write_metadata(table.schema, metadata_path)

    dataset = pq.ParquetDataset(base_path)
    assert dataset.metadata_path == metadata_path

    pf = pq.ParquetFile(data_path)
    assert dataset.schema.equals(pf.schema)

    # handle list of one directory
    dataset2 = pq.ParquetDataset([base_path])
    assert dataset2.schema.equals(dataset.schema)
def _test_read_common_metadata_files(fs, base_path):
    import pyarrow.parquet as pq

    N = 100
    df = pd.DataFrame({
        'index': np.arange(N),
        'values': np.random.randn(N)
    }, columns=['index', 'values'])

    data_path = pjoin(base_path, 'data.parquet')

    table = pa.Table.from_pandas(df)

    with fs.open(data_path, 'wb') as f:
        _write_table(table, f)

    metadata_path = pjoin(base_path, '_metadata')
    with fs.open(metadata_path, 'wb') as f:
        pq.write_metadata(table.schema, f)

    dataset = pq.ParquetDataset(base_path, filesystem=fs)
    assert dataset.metadata_path == metadata_path

    with fs.open(data_path) as f:
        common_schema = pq.read_metadata(f).schema
    assert dataset.schema.equals(common_schema)

    # handle list of one directory
    dataset2 = pq.ParquetDataset([base_path], filesystem=fs)
    assert dataset2.schema.equals(dataset.schema)
def _write_partition_pyarrow(
    df, path, fs, filename, write_index, partition_on, metadata_path=None, **kwargs
):
    import pyarrow as pa
    from pyarrow import parquet

    t = pa.Table.from_pandas(df, preserve_index=write_index)

    if partition_on:
        parquet.write_to_dataset(
            t,
            path,
            partition_cols=partition_on,
            preserve_index=write_index,
            filesystem=fs,
            **kwargs
        )
    else:
        with fs.open(filename, "wb") as fil:
            parquet.write_table(t, fil, **kwargs)

    if metadata_path is not None:
        with fs.open(metadata_path, "wb") as fil:
            # Get only arguments specified in the function
            kwargs_meta = {
                k: v for k, v in kwargs.items() if k in _pyarrow_write_metadata_kwargs
            }
            parquet.write_metadata(t.schema, fil, **kwargs_meta)
def write_metadata(parts, fmd, fs, path, append=False, **kwargs):
    if parts:
        if not append:
            # Get only arguments specified in the function
            common_metadata_path = fs.sep.join([path, "_common_metadata"])
            keywords = getargspec(pq.write_metadata).args
            kwargs_meta = {k: v for k, v in kwargs.items() if k in keywords}
            with fs.open(common_metadata_path, "wb") as fil:
                pq.write_metadata(parts[0][0]["schema"], fil, **kwargs_meta)

        # Aggregate metadata and write to _metadata file
        metadata_path = fs.sep.join([path, "_metadata"])
        if append and fmd is not None:
            _meta = fmd
            i_start = 0
        else:
            _meta = parts[0][0]["meta"]
            i_start = 1
        for i in range(i_start, len(parts)):
            _meta.append_row_groups(parts[i][0]["meta"])
        with fs.open(metadata_path, "wb") as fil:
            _meta.write_metadata_file(fil)
def write_commonmetadata_file():
    with filesystem.open(os.path.join(path, "part.0.parquet")) as f:
        pf = pq.ParquetFile(f)

    all_metadata = copy.copy(pf.metadata.metadata)
    all_metadata[b'spatialpandas'] = b_spatial_metadata

    new_schema = pf.schema.to_arrow_schema().with_metadata(all_metadata)
    with filesystem.open(os.path.join(path, "_common_metadata"), 'wb') as f:
        pq.write_metadata(new_schema, f)
def _write_partition_pyarrow(df, open_with, filename, write_index,
                             metadata_path=None, **kwargs):
    import pyarrow as pa
    from pyarrow import parquet
    t = pa.Table.from_pandas(df, preserve_index=write_index)
    with open_with(filename, 'wb') as fil:
        parquet.write_table(t, fil, **kwargs)
    if metadata_path is not None:
        with open_with(metadata_path, 'wb') as fil:
            parquet.write_metadata(t.schema, fil, **kwargs)
def _write_partition_pyarrow(df, open_with, filename, write_index,
                             metadata_path=None, **kwargs):
    import pyarrow as pa
    from pyarrow import parquet
    t = pa.Table.from_pandas(df, preserve_index=write_index)
    with open_with(filename, 'wb') as fil:
        parquet.write_table(t, fil, **kwargs)
    if metadata_path is not None:
        with open_with(metadata_path, 'wb') as fil:
            kwargs.pop('compression', None)
            parquet.write_metadata(t.schema, fil, **kwargs)
def test_write_metadata(tempdir):
    path = str(tempdir / "metadata")
    schema = pa.schema([("a", "int64"), ("b", "float64")])

    # write a pyarrow schema
    pq.write_metadata(schema, path)
    parquet_meta = pq.read_metadata(path)
    schema_as_arrow = parquet_meta.schema.to_arrow_schema()
    assert schema_as_arrow.equals(schema)

    # ARROW-8980: Check that the ARROW:schema metadata key was removed
    if schema_as_arrow.metadata:
        assert b'ARROW:schema' not in schema_as_arrow.metadata

    # pass through writer keyword arguments
    for version in ["1.0", "2.0"]:
        pq.write_metadata(schema, path, version=version)
        parquet_meta = pq.read_metadata(path)
        assert parquet_meta.format_version == version

    # metadata_collector: list of FileMetaData objects
    table = pa.table({'a': [1, 2], 'b': [.1, .2]}, schema=schema)
    pq.write_table(table, tempdir / "data.parquet")
    parquet_meta = pq.read_metadata(str(tempdir / "data.parquet"))
    pq.write_metadata(
        schema, path, metadata_collector=[parquet_meta, parquet_meta]
    )
    parquet_meta_mult = pq.read_metadata(path)
    assert parquet_meta_mult.num_row_groups == 2

    # append metadata with different schema raises an error
    with pytest.raises(RuntimeError, match="requires equal schemas"):
        pq.write_metadata(
            pa.schema([("a", "int32"), ("b", "null")]),
            path, metadata_collector=[parquet_meta, parquet_meta]
        )
def to_parquet_dask(
    ddf: DaskGeoDataFrame,
    path,
    compression: Optional[str] = "snappy",
    filesystem: Optional[fsspec.spec.AbstractFileSystem] = None,
    storage_options: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> None:
    assert isinstance(ddf, DaskGeoDataFrame)
    filesystem = validate_coerce_filesystem(path, filesystem)
    if path and filesystem.isdir(path):
        filesystem.rm(path, recursive=True)

    dd_to_parquet(
        ddf,
        path,
        engine="pyarrow",
        compression=compression,
        storage_options=storage_options,
        **kwargs,
    )

    # Write partition bounding boxes to the _metadata file
    partition_bounds = {}
    for series_name in ddf.columns:
        series = ddf[series_name]
        if isinstance(series.dtype, GeometryDtype):
            if series._partition_bounds is None:
                # Bounds are not already computed. Compute bounds from the
                # parquet file that was just written.
                filesystem.invalidate_cache(path)
                series = read_parquet_dask(
                    path,
                    columns=[series_name],
                    filesystem=filesystem,
                    load_divisions=False,
                )[series_name]
            partition_bounds[series_name] = series.partition_bounds.to_dict()

    spatial_metadata = {'partition_bounds': partition_bounds}
    b_spatial_metadata = json.dumps(spatial_metadata).encode('utf')

    pqds = pq.ParquetDataset(path, filesystem=filesystem, validate_schema=False)
    all_metadata = copy.copy(pqds.common_metadata.metadata)
    all_metadata[b'spatialpandas'] = b_spatial_metadata

    schema = pqds.common_metadata.schema.to_arrow_schema()
    new_schema = schema.with_metadata(all_metadata)
    with filesystem.open(pqds.common_metadata_path, 'wb') as f:
        pq.write_metadata(new_schema, f)
def _write_partition_pyarrow(df, open_with, filename, write_index,
                             metadata_path=None, **kwargs):
    import pyarrow as pa
    from pyarrow import parquet
    t = pa.Table.from_pandas(df, preserve_index=write_index)
    with open_with(filename, 'wb') as fil:
        parquet.write_table(t, fil, **kwargs)
    if metadata_path is not None:
        with open_with(metadata_path, 'wb') as fil:
            # Get only arguments specified in the function
            kwargs_meta = {k: v for k, v in kwargs.items()
                           if k in _pyarrow_write_metadata_kwargs}
            parquet.write_metadata(t.schema, fil, **kwargs_meta)
def write_parquet(
    fs: fsspec.AbstractFileSystem,
    path: str,
    df: pd.DataFrame,
    partition_cols: Optional[List[str]],
    schema: pa.Schema,
    **kwargs,
):
    """
    Write a single dataframe to parquet.
    """
    # Check partition values are valid before writing to parquet
    mappings = check_partition_columns(df=df, partition_columns=partition_cols)
    df = clean_partition_cols(df=df, mappings=mappings)

    # Dataframe -> pyarrow Table
    table = pa.Table.from_pandas(df, schema=schema)

    if "basename_template" not in kwargs and "ts_init" in df.columns:
        kwargs["basename_template"] = (
            f"{df['ts_init'].iloc[0]}-{df['ts_init'].iloc[-1]}" + "-{i}.parquet"
        )

    # Write the actual file
    partitions = (
        ds.partitioning(
            schema=pa.schema(fields=[table.schema.field(c) for c in partition_cols]),
            flavor="hive",
        )
        if partition_cols
        else None
    )
    ds.write_dataset(
        data=table,
        base_dir=path,
        filesystem=fs,
        partitioning=partitions,
        format="parquet",
        **kwargs,
    )

    # Write the ``_common_metadata`` parquet file without row groups statistics
    pq.write_metadata(table.schema, f"{path}/_common_metadata", version="2.0", filesystem=fs)

    # Write out any partition columns we had to modify due to filesystem requirements
    if mappings:
        write_partition_column_mappings(fs=fs, path=path, mappings=mappings)
def test_dataset_read_pandas_common_metadata(tmpdir):
    # ARROW-1103
    import pyarrow.parquet as pq

    nfiles = 5
    size = 5

    dirpath = tmpdir.join(guid()).strpath
    os.mkdir(dirpath)

    test_data = []
    frames = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)
        df.index = pd.Index(np.arange(i * size, (i + 1) * size))
        df.index.name = 'index'

        path = pjoin(dirpath, '{0}.parquet'.format(i))

        df_ex_index = df.reset_index(drop=True)
        df_ex_index['index'] = df.index
        table = pa.Table.from_pandas(df_ex_index, preserve_index=False)

        # Obliterate metadata
        table = table.replace_schema_metadata(None)
        assert table.schema.metadata is None

        _write_table(table, path)
        test_data.append(table)
        frames.append(df)
        paths.append(path)

    # Write _metadata common file
    table_for_metadata = pa.Table.from_pandas(df)
    pq.write_metadata(table_for_metadata.schema, pjoin(dirpath, '_metadata'))

    dataset = pq.ParquetDataset(dirpath)
    columns = ['uint8', 'strings']
    result = dataset.read_pandas(columns=columns).to_pandas()
    expected = pd.concat([x[columns] for x in frames])

    tm.assert_frame_equal(result, expected)
def _write_partition_pyarrow(df, open_with, path, fs, filename, write_index,
                             partition_on, metadata_path=None, **kwargs):
    import pyarrow as pa
    from pyarrow import parquet
    t = pa.Table.from_pandas(df, preserve_index=write_index)
    if partition_on:
        parquet.write_to_dataset(t, path, partition_cols=partition_on,
                                 filesystem=fs)
    else:
        with open_with(filename, 'wb') as fil:
            parquet.write_table(t, fil, **kwargs)
    if metadata_path is not None:
        with open_with(metadata_path, 'wb') as fil:
            # Get only arguments specified in the function
            kwargs_meta = {k: v for k, v in kwargs.items()
                           if k in _pyarrow_write_metadata_kwargs}
            parquet.write_metadata(t.schema, fil, **kwargs_meta)
def test_dataset_read_pandas_common_metadata(tempdir, preserve_index):
    # ARROW-1103
    nfiles = 5
    size = 5

    dirpath = tempdir / guid()
    dirpath.mkdir()

    test_data = []
    frames = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)
        df.index = pd.Index(np.arange(i * size, (i + 1) * size), name='index')
        path = dirpath / '{}.parquet'.format(i)

        table = pa.Table.from_pandas(df, preserve_index=preserve_index)

        # Obliterate metadata
        table = table.replace_schema_metadata(None)
        assert table.schema.metadata is None

        _write_table(table, path)
        test_data.append(table)
        frames.append(df)
        paths.append(path)

    # Write _metadata common file
    table_for_metadata = pa.Table.from_pandas(
        df, preserve_index=preserve_index
    )
    pq.write_metadata(table_for_metadata.schema, dirpath / '_metadata')

    dataset = pq.ParquetDataset(dirpath)
    columns = ['uint8', 'strings']
    result = dataset.read_pandas(columns=columns).to_pandas()
    expected = pd.concat([x[columns] for x in frames])
    expected.index.name = (
        df.index.name if preserve_index is not False else None)

    tm.assert_frame_equal(result, expected)
def to_parquet_dask(ddf, path, compression="default", storage_options=None, **kwargs):
    assert isinstance(ddf, DaskGeoDataFrame)

    dd_to_parquet(
        ddf,
        path,
        engine="pyarrow",
        compression=compression,
        storage_options=storage_options,
        **kwargs
    )

    # Write partition bounding boxes to the _metadata file
    partition_bounds = {}
    for series_name in ddf.columns:
        series = ddf[series_name]
        if isinstance(series.dtype, GeometryDtype):
            partition_bounds[series_name] = series.partition_bounds.to_dict()

    spatial_metadata = {'partition_bounds': partition_bounds}
    b_spatial_metadata = json.dumps(spatial_metadata).encode('utf')

    pqds = pq.ParquetDataset(path)
    all_metadata = copy.copy(pqds.common_metadata.metadata)
    all_metadata[b'spatialpandas'] = b_spatial_metadata
    new_schema = pqds.metadata.schema.to_arrow_schema().with_metadata(all_metadata)
    pq.write_metadata(new_schema, pqds.common_metadata_path)
def _schema2bytes(schema):
    buf = pa.BufferOutputStream()
    pq.write_metadata(schema, buf, version="2.0", coerce_timestamps="us")
    return buf.getvalue().to_pybytes()
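The round trip back from these bytes is not shown above. A minimal sketch of the inverse, assuming input produced by `_schema2bytes` (the helper name `_bytes2schema` is illustrative, not part of the snippet's API):

def _bytes2schema(data):
    # Hypothetical counterpart to _schema2bytes: the serialized parquet
    # metadata is file-like once wrapped in a BufferReader, so read_schema
    # can recover the original Arrow schema from it.
    return pq.read_schema(pa.BufferReader(data))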
def _write_chunks(self, chunk, append_only=False, **kwargs):  # noqa: C901
    processed_raw_files = self._load_processed_raw_files()
    log_filenames = kwargs.pop("log_filenames", False)

    # Split objects into their respective tables
    type_conv = {OrderBookDeltas: OrderBookDelta, OrderBookSnapshot: OrderBookDelta}
    tables = defaultdict(dict)
    skip_file = False
    for obj in chunk:
        if skip_file:
            continue
        if isinstance(obj, NewFile):
            if log_filenames:
                print(obj.name)
            if obj.name in processed_raw_files:
                skip_file = True
            else:
                skip_file = False
                processed_raw_files.append(obj.name)
            continue

        # TODO (bm) - better handling of instruments -> currency we're writing a file per instrument
        cls = type_conv.get(type(obj), type(obj))
        for data in maybe_list(_serialize(obj)):
            instrument_id = data.get("instrument_id", None)
            if instrument_id not in tables[cls]:
                tables[cls][instrument_id] = []
            tables[cls][instrument_id].append(data)

    for cls in tables:
        for ins_id in tables[cls]:
            name = f"{camel_to_snake_case(cls.__name__)}.parquet"
            if is_custom_data(cls):
                name = f"{GENERIC_DATA_PREFIX}{camel_to_snake_case(cls.__name__)}.parquet"
            fn = self.root.joinpath(name)

            df = pd.DataFrame(tables[cls][ins_id])

            if df.empty:
                continue

            # Load any existing data, drop dupes
            if not append_only:
                if self.fs.exists(fn):
                    existing = pd.read_parquet(
                        str(fn),
                        fs=self.fs,
                        filters=[("instrument_id", "=", ins_id)],
                    )
                    df = df.append(existing).drop_duplicates()
                    # Remove file, will be written again
                    self.fs.rm(fn, recursive=True)

            df = df.astype(
                {k: "category" for k in category_attributes.get(cls.__name__, [])}
            )
            for col in ("ts_event_ns", "ts_recv_ns", "timestamp_ns"):
                if col in df.columns:
                    df = df.sort_values(col)
                    break
            table = pa.Table.from_pandas(df)

            metadata_collector = []
            pq.write_to_dataset(
                table=table,
                root_path=str(fn),
                filesystem=self.fs,
                partition_cols=["instrument_id"] if ins_id is not None else None,
                # use_legacy_dataset=True,
                version="2.0",
                metadata_collector=metadata_collector,
                **kwargs,
            )
            # Write the ``_common_metadata`` parquet file without row groups statistics
            pq.write_metadata(table.schema, fn / "_common_metadata", version="2.0")

            # Write the ``_metadata`` parquet file with row groups statistics of all files
            pq.write_metadata(table.schema, fn / "_metadata", version="2.0")

    # Save any new processed files
    self._save_processed_raw_files(files=processed_raw_files)
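Note that in the snippet above the collected `metadata_collector` is never passed back to `pq.write_metadata`, so the `_metadata` file is written without per-file row-group statistics despite the comment. A minimal sketch of how the collector could be used, mirroring the `metadata_collector` usage shown in `test_write_metadata` above and assuming the partition column is dropped from the schema first (as in the earlier workaround), since the files are partitioned on instrument_id:

# Sketch only: write ``_metadata`` with the FileMetaData objects gathered by
# write_to_dataset so it actually carries row-group statistics. The partition
# column must be removed from the schema or append_row_groups will raise
# "AppendRowGroups requires equal schemas".
metadata_schema = table.schema
if ins_id is not None:
    metadata_schema = metadata_schema.remove(
        metadata_schema.get_field_index("instrument_id")
    )
pq.write_metadata(
    metadata_schema, fn / "_metadata",
    version="2.0", metadata_collector=metadata_collector,
)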
def write_parquet(
    fs: fsspec.AbstractFileSystem,
    path: str,
    df: pd.DataFrame,
    partition_cols: Optional[List[str]],
    schema: pa.Schema,
    **kwargs,
):
    """
    Write a single dataframe to parquet.
    """
    # Check partition values are valid before writing to parquet
    mappings = check_partition_columns(df=df, partition_columns=partition_cols)
    df = clean_partition_cols(df=df, mappings=mappings)

    # Dataframe -> pyarrow Table
    table = pa.Table.from_pandas(df, schema=schema)

    if "basename_template" not in kwargs and "ts_init" in df.columns:
        kwargs["basename_template"] = (
            f"{df['ts_init'].min()}-{df['ts_init'].max()}" + "-{i}.parquet"
        )

    # Write the actual file
    partitions = (
        ds.partitioning(
            schema=pa.schema(fields=[table.schema.field(c) for c in partition_cols]),
            flavor="hive",
        )
        if partition_cols
        else None
    )
    if pa.__version__ >= "6.0.0":
        kwargs.update(existing_data_behavior="overwrite_or_ignore")
    files = set(fs.glob(f"{path}/**"))
    ds.write_dataset(
        data=table,
        base_dir=path,
        filesystem=fs,
        partitioning=partitions,
        format="parquet",
        **kwargs,
    )

    # Ensure data written by write_dataset is sorted
    new_files = set(fs.glob(f"{path}/**/*.parquet")) - files
    del df
    for fn in new_files:
        ndf = pd.read_parquet(fs.open(fn))
        # assert ndf.shape[0] == shape
        if "ts_init" in ndf.columns:
            ndf = ndf.sort_values("ts_init").reset_index(drop=True)
        pq.write_table(
            table=pa.Table.from_pandas(ndf),
            where=fn,
            filesystem=fs,
        )

    # Write the ``_common_metadata`` parquet file without row groups statistics
    pq.write_metadata(table.schema, f"{path}/_common_metadata", version="2.6", filesystem=fs)

    # Write out any partition columns we had to modify due to filesystem requirements
    if mappings:
        write_partition_column_mappings(fs=fs, path=path, mappings=mappings)
def _write_map_parquet(hsp_map, filepath, clobber=False, nside_io=4):
    """
    Internal method to write a HealSparseMap to a parquet dataset.
    Use the `metadata` property from the map to persist additional
    information in the parquet metadata.

    Parameters
    ----------
    hsp_map : `HealSparseMap`
        HealSparseMap to write to a file.
    filepath : `str`
        Name of dataset to save
    clobber : `bool`, optional
        Clobber existing file?  Not supported.
    nside_io : `int`, optional
        The healpix nside to partition the output map files in parquet.
        Must be less than or equal to nside_coverage, and not greater than 16.

    Raises
    ------
    RuntimeError if file exists.
    ValueError if nside_io is out of range.
    """
    if os.path.isfile(filepath) or os.path.isdir(filepath):
        raise RuntimeError("Filepath %s exists and clobber is not supported." % (filepath))

    if nside_io > hsp_map.nside_coverage:
        raise ValueError("nside_io must be <= nside_coverage.")
    elif nside_io > 16:
        raise ValueError("nside_io must be <= 16")
    elif nside_io < 0:
        raise ValueError("nside_io must be >= 0")

    # Make the path
    os.makedirs(filepath)

    # Create the nside_io paths
    cov_mask = hsp_map.coverage_mask
    cov_pixels = np.where(cov_mask)[0].astype(np.int32)

    bitshift_io = _compute_bitshift(nside_io, hsp_map.nside_coverage)
    cov_pixels_io = np.right_shift(cov_pixels, bitshift_io)

    if hsp_map.is_wide_mask_map:
        wmult = hsp_map.wide_mask_width
    else:
        wmult = 1

    if np.isclose(hsp_map._sentinel, hpg.UNSEEN):
        sentinel_string = 'UNSEEN'
    else:
        sentinel_string = str(hsp_map._sentinel)

    metadata = {
        'healsparse::version': '1',
        'healsparse::nside_sparse': str(hsp_map.nside_sparse),
        'healsparse::nside_coverage': str(hsp_map.nside_coverage),
        'healsparse::nside_io': str(nside_io),
        'healsparse::filetype': 'healsparse',
        'healsparse::primary': '' if hsp_map.primary is None else hsp_map.primary,
        'healsparse::sentinel': sentinel_string,
        'healsparse::widemask': str(hsp_map.is_wide_mask_map),
        'healsparse::wwidth': str(hsp_map._wide_mask_width)
    }

    # Add additional metadata
    if hsp_map.metadata is not None:
        # Use the fits header serialization for compatibility
        hdr_string = _make_header(hsp_map.metadata).tostring()
        metadata['healsparse::header'] = hdr_string

    if not hsp_map.is_rec_array:
        schema = pa.schema([('cov_pix', pa.from_numpy_dtype(np.int32)),
                            ('sparse', pa.from_numpy_dtype(hsp_map.dtype))],
                           metadata=metadata)
    else:
        type_list = [(name, pa.from_numpy_dtype(hsp_map.dtype[name].type))
                     for name in hsp_map.dtype.names]
        type_list[0:0] = [('cov_pix', pa.from_numpy_dtype(np.int32))]
        schema = pa.schema(type_list, metadata=metadata)

    cov_map = hsp_map._cov_map
    sparse_map = hsp_map._sparse_map.ravel()
    cov_index_map_temp = cov_map[:] + np.arange(hpg.nside_to_npixel(hsp_map.nside_coverage),
                                                dtype=np.int64)*cov_map.nfine_per_cov

    pix_arr = np.zeros(cov_map.nfine_per_cov*wmult, dtype=np.int32)

    last_pix_io = -1
    writer = None
    row_groups = np.zeros_like(cov_pixels)
    for ctr, (pix_io, pix) in enumerate(zip(cov_pixels_io, cov_pixels)):
        # These are always going to be sorted
        if pix_io > last_pix_io:
            last_pix_io = pix_io

            if writer is not None:
                writer.close()
                writer = None

            # Create a new file
            pixpath = os.path.join(filepath, f'iopix={pix_io:03d}')
            os.makedirs(pixpath)

            pixfile = os.path.join(pixpath, f'{pix_io:03d}.parquet')
            writer = parquet.ParquetWriter(pixfile, schema)
            row_group_ctr = 0

        sparsepix = sparse_map[cov_index_map_temp[pix]*wmult:
                               (cov_index_map_temp[pix] + cov_map.nfine_per_cov)*wmult]
        pix_arr[:] = pix
        if not hsp_map.is_rec_array:
            arrays = [pa.array(pix_arr),
                      pa.array(sparsepix)]
        else:
            arrays = [pa.array(sparsepix[name]) for name in hsp_map.dtype.names]
            arrays[0:0] = [pa.array(pix_arr)]

        tab = pa.Table.from_arrays(arrays, schema=schema)
        writer.write_table(tab)

        row_groups[ctr] = row_group_ctr
        row_group_ctr += 1

    if writer is not None:
        writer.close()

    # And write the coverage pixels and row groups
    tab = pa.Table.from_pydict({'cov_pix': pa.array(cov_pixels),
                                'row_group': pa.array(row_groups)})
    parquet.write_table(tab, os.path.join(filepath, '_coverage.parquet'))

    # And write the metadata
    parquet.write_metadata(schema, os.path.join(filepath, '_common_metadata'))
    parquet.write_metadata(schema, os.path.join(filepath, '_metadata'))