def get_schema(dataset):
    """Retrieve the schema object stored as part of the dataset metadata.

    :param dataset: an instance of :class:`pyarrow.parquet.ParquetDataset`
    :return: A :class:`petastorm.unischema.Unischema` object
    :raises PetastormMetadataError: if the dataset has no common metadata, or if the common metadata has no
      entry stored under ``UNISCHEMA_KEY``
    """
    if not dataset.common_metadata:
        raise PetastormMetadataError(
            'Could not find _common_metadata file. Use materialize_dataset(..) in'
            ' petastorm.etl.dataset_metadata.py to generate this file in your ETL code.'
            ' You can generate it on an existing dataset using petastorm-generate-metadata.py'
        )

    # ``metadata`` may be None when the file carries no key/value pairs. Treat that as an empty mapping so the
    # descriptive error below is raised instead of a TypeError from the membership test.
    dataset_metadata_dict = dataset.common_metadata.metadata or {}

    # Read schema
    if UNISCHEMA_KEY not in dataset_metadata_dict:
        raise PetastormMetadataError(
            'Could not find the unischema in the dataset common metadata file.'
            ' Please provide or generate dataset with the unischema attached.'
            ' Common Metadata file might not be generated properly.'
            ' Make sure to use materialize_dataset(..) in petastorm.etl.dataset_metadata to'
            ' properly generate this file in your ETL code.'
            ' You can generate it on an existing dataset using petastorm-generate-metadata.py'
        )
    ser_schema = dataset_metadata_dict[UNISCHEMA_KEY]

    # Since we have moved the unischema class around few times, unpickling old schemas will not work. In this
    # case we override the old import path to get backwards compatibility.
    # NOTE(review): the helper name suggests the stored bytes are unpickled -- presumably only datasets from
    # trusted sources should be opened; confirm.
    schema = depickle_legacy_package_name_compatible(ser_schema)

    return schema
def get_row_group_indexes(dataset):
    """Extract and return the row group indexes stored in the dataset common metadata.

    :param dataset: dataset object exposing a ``common_metadata`` attribute
    :return: dataset indexes as dictionary
    :raises ValueError: if the dataset has no common metadata, or if the common metadata has no entry stored
      under ``ROWGROUPS_INDEX_KEY``
    """
    if not dataset.common_metadata:
        # The index is read from ``dataset.common_metadata``, so the message names the _common_metadata file.
        raise ValueError(
            'Could not find _common_metadata file. add_dataset_metadata(..) in'
            ' petastorm.etl.dataset_metadata.py should be used to'
            ' generate this file in your ETL code.'
            ' You can generate it on an existing dataset using rowgroup_indexing_run.py'
        )

    # ``metadata`` may be None when the file carries no key/value pairs. Treat that as an empty mapping so the
    # descriptive error below is raised instead of a TypeError from the membership test.
    dataset_metadata_dict = dataset.common_metadata.metadata or {}

    # Load rowgroups_index
    if ROWGROUPS_INDEX_KEY not in dataset_metadata_dict:
        raise ValueError(
            'Row groups index is not available in the dataset metadata file. '
            'You can generate it on an existing dataset using rowgroup_indexing_run.py'
        )
    serialized_indexes = dataset_metadata_dict[ROWGROUPS_INDEX_KEY]

    # NOTE(review): the helper name suggests the stored bytes are unpickled -- presumably only datasets from
    # trusted sources should be opened; confirm.
    index_dict = depickle_legacy_package_name_compatible(serialized_indexes)

    return index_dict