def from_arrow_schema(cls, parquet_dataset, omit_unsupported_fields=False):
    """
    Convert an Apache Arrow schema into a unischema object. This is useful for datasets of only scalars
    which need no special encoding/decoding. If there is an unsupported type in the arrow schema, it will
    throw an exception. When the omit_unsupported_fields parameter is set to True, unsupported columns
    only produce warnings and are skipped.

    We do not set the codec field in the generated fields since all parquet fields are supported by pyarrow
    out of the box and we do not need to perform any custom decoding.

    :param parquet_dataset: :class:`pyarrow.parquet.ParquetDataset`
    :param omit_unsupported_fields: :class:`Boolean`
    :return: A :class:`Unischema` object.
    """
    meta = compat_get_metadata(parquet_dataset.pieces[0], parquet_dataset.fs.open)
    arrow_schema = meta.schema.to_arrow_schema()
    unischema_fields = []

    for partition in parquet_dataset.partitions:
        if (pa.types.is_binary(partition.dictionary.type) and six.PY2) or \
                (pa.types.is_string(partition.dictionary.type) and six.PY3):
            numpy_dtype = np.str_
        elif pa.types.is_int64(partition.dictionary.type):
            numpy_dtype = np.int64
        else:
            raise RuntimeError(('Expected partition type to be one of currently supported types: string or int64. '
                                'Got {}').format(partition.dictionary.type))

        unischema_fields.append(UnischemaField(partition.name, numpy_dtype, (), None, False))

    for column_name in arrow_schema.names:
        arrow_field = compat_schema_field(arrow_schema, column_name)
        field_type = arrow_field.type
        field_shape = ()
        if isinstance(field_type, ListType):
            if isinstance(field_type.value_type, ListType) or isinstance(field_type.value_type, pyStructType):
                warnings.warn('[ARROW-1644] Ignoring unsupported structure %r for field %r'
                              % (field_type, column_name))
                continue
            field_shape = (None,)
        try:
            np_type = _numpy_and_codec_from_arrow_type(field_type)
        except ValueError:
            if omit_unsupported_fields:
                warnings.warn('Column %r has an unsupported field %r. Ignoring...'
                              % (column_name, field_type))
                continue
            else:
                raise
        unischema_fields.append(UnischemaField(column_name, np_type, field_shape, None, arrow_field.nullable))

    return Unischema('inferred_schema', unischema_fields)
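
# A minimal usage sketch for the classmethod above (illustrative only: it assumes the method is
# exposed as Unischema.from_arrow_schema, as the return statement suggests, and that `dataset_path`
# points at an existing scalar-only parquet dataset readable through the legacy pyarrow
# ParquetDataset API; the function name and path are made up for the example).
def _example_infer_unischema(dataset_path):
    import pyarrow.parquet as pq

    parquet_dataset = pq.ParquetDataset(dataset_path)
    # Strict mode: raises on columns with unsupported arrow types.
    strict_schema = Unischema.from_arrow_schema(parquet_dataset)
    # Lenient mode: unsupported columns only emit a warning and are skipped.
    lenient_schema = Unischema.from_arrow_schema(parquet_dataset, omit_unsupported_fields=True)
    return strict_schema, lenient_schema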
def _split_piece(piece, fs_open):
    metadata = compat_get_metadata(piece, fs_open)
    return [compat_make_parquet_piece(piece.path, fs_open,
                                      row_group=row_group,
                                      partition_keys=piece.partition_keys)
            for row_group in range(metadata.num_row_groups)]
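
# A minimal sketch of how _split_piece can expand every file-level piece of a dataset into one
# piece per row group (illustrative only: assumes `dataset_path` is a parquet dataset readable
# with the legacy pyarrow ParquetDataset API, which exposes the `pieces` and `fs` attributes used
# elsewhere in this module; the function name is made up for the example).
def _example_split_all_pieces(dataset_path):
    import pyarrow.parquet as pq

    dataset = pq.ParquetDataset(dataset_path)
    return [row_group_piece
            for piece in dataset.pieces
            for row_group_piece in _split_piece(piece, dataset.fs.open)]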
def add_to_dataset_metadata(dataset, key, value):
    """
    Adds a key and value to the parquet metadata file of a parquet dataset.

    :param dataset: (ParquetDataset) parquet dataset
    :param key: (str) key of metadata entry
    :param value: (str) value of metadata
    """
    if not isinstance(dataset.paths, str):
        raise ValueError('Expected dataset.paths to be a single path, not a list of paths')

    metadata_file_path = dataset.paths.rstrip('/') + '/_metadata'
    common_metadata_file_path = dataset.paths.rstrip('/') + '/_common_metadata'
    common_metadata_file_crc_path = dataset.paths.rstrip('/') + '/._common_metadata.crc'

    # If the common metadata file already exists, add to it.
    # Otherwise fetch the schema from one of the existing parquet files in the dataset.
    if dataset.fs.exists(common_metadata_file_path):
        with dataset.fs.open(common_metadata_file_path) as f:
            arrow_metadata = pyarrow.parquet.read_metadata(f)
    elif dataset.fs.exists(metadata_file_path):
        # If just the metadata file exists and not the common metadata file, copy the contents of
        # the metadata file to the common_metadata file for backwards compatibility.
        with dataset.fs.open(metadata_file_path) as f:
            arrow_metadata = pyarrow.parquet.read_metadata(f)
    else:
        arrow_metadata = compat_get_metadata(dataset.pieces[0], dataset.fs.open)

    base_schema = arrow_metadata.schema.to_arrow_schema()

    # base_schema.metadata may be None
    metadata_dict = base_schema.metadata or dict()
    metadata_dict[key] = value
    schema = compat_with_metadata(base_schema, metadata_dict)

    with dataset.fs.open(common_metadata_file_path, 'wb') as metadata_file:
        pyarrow.parquet.write_metadata(schema, metadata_file)

    # We have just modified the _common_metadata file, but the filesystem implementation used by pyarrow does not
    # update the .crc value. We better delete the .crc to make sure there is no mismatch between _common_metadata
    # content and the checksum.
    if isinstance(dataset.fs, LocalFileSystem) and dataset.fs.exists(common_metadata_file_crc_path):
        try:
            dataset.fs.rm(common_metadata_file_crc_path)
        except NotImplementedError:
            os.remove(common_metadata_file_crc_path)
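
# A minimal sketch of stamping an application-specific key/value pair into a dataset's
# _common_metadata file using the helper above (illustrative only: the key, value, function name
# and `dataset_path` are made-up examples; the dataset is assumed to live under a single path).
def _example_stamp_metadata(dataset_path):
    import pyarrow.parquet as pq

    dataset = pq.ParquetDataset(dataset_path)
    add_to_dataset_metadata(dataset, 'my_app.writer_version', '1.0.0')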
def test_asymetric_parquet_pieces(reader_factory, tmpdir):
    """Check that a dataset whose parquet files have different numbers of row groups can be fully read."""
    url = 'file://' + tmpdir.strpath

    ROWS_COUNT = 1000
    # id_div_700 forces an asymmetric split between partitions and hopefully gets us files with different numbers
    # of row groups
    create_test_scalar_dataset(url, ROWS_COUNT, partition_by=['id_div_700'])

    # Verify that we indeed have pieces with different numbers of row groups
    dataset = pq.ParquetDataset(tmpdir.strpath)
    row_group_counts = set(compat_get_metadata(piece, dataset.fs.open).num_row_groups for piece in dataset.pieces)
    assert len(row_group_counts) > 1

    # Make sure we are not missing any rows.
    with reader_factory(url, schema_fields=['id']) as reader:
        row_ids_batched = [row.id for row in reader]
        actual_row_ids = list(itertools.chain(*row_ids_batched))

    assert ROWS_COUNT == len(actual_row_ids)
def split_piece(piece):
    # Read metadata from the piece being split, not from the first piece in the dataset:
    # as the test above shows, pieces may have different numbers of row groups.
    metadata = compat_get_metadata(piece, dataset.fs.open)
    return [compat_make_parquet_piece(piece.path, dataset.fs.open,
                                      row_group=row_group,
                                      partition_keys=piece.partition_keys)
            for row_group in range(metadata.num_row_groups)]
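
# Within the enclosing function that defines `dataset`, the closure above is typically applied to
# every file-level piece and the results flattened into a single list of per-row-group pieces,
# for example (a sketch only; the variable name is illustrative):
#
#     rowgroup_pieces = [p for piece in dataset.pieces for p in split_piece(piece)]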