Example #1
0
def get_schema(dataset):
    """Load the schema object stored in the common metadata of a dataset.

    :param dataset: an instance of :class:`pyarrow.parquet.ParquetDataset`
    :return: A :class:`petastorm.unischema.Unischema` object
    :raises PetastormMetadataError: if the dataset has no common metadata, or if the common
        metadata has no entry under ``UNISCHEMA_KEY``
    """
    # Guard: without common metadata there is nothing to read the schema from.
    if not dataset.common_metadata:
        missing_file_message = (
            'Could not find _common_metadata file.'
            ' Use materialize_dataset(..) in petastorm.etl.dataset_metadata.py'
            ' to generate this file in your ETL code.'
            ' You can generate it on an existing dataset using petastorm-generate-metadata.py'
        )
        raise PetastormMetadataError(missing_file_message)

    metadata_map = dataset.common_metadata.metadata

    # Guard: the common metadata exists but carries no serialized unischema entry.
    if UNISCHEMA_KEY not in metadata_map:
        missing_schema_message = (
            'Could not find the unischema in the dataset common metadata file.'
            ' Please provide or generate dataset with the unischema attached.'
            ' Common Metadata file might not be generated properly.'
            ' Make sure to use materialize_dataset(..) in petastorm.etl.dataset_metadata'
            ' to properly generate this file in your ETL code.'
            ' You can generate it on an existing dataset using petastorm-generate-metadata.py'
        )
        raise PetastormMetadataError(missing_schema_message)

    # The unischema class has moved between packages a few times, so plain unpickling of old
    # schemas fails; this helper overrides the old import path to stay backwards compatible.
    # NOTE(review): the payload is unpickled, so this should only be used on trusted datasets.
    return depickle_legacy_package_name_compatible(metadata_map[UNISCHEMA_KEY])


def get_row_group_indexes(dataset):
    """Extract and return the row group indexes stored in the dataset common metadata.

    :param dataset: dataset object; its ``common_metadata.metadata`` mapping is read
    :return: dataset indexes as dictionary
    :raises ValueError: if the dataset has no common metadata, or if the common metadata has no
        entry under ``ROWGROUPS_INDEX_KEY``
    """
    if not dataset.common_metadata:
        # The condition tests ``common_metadata``, so the message names the _common_metadata file
        # (get_schema reports this same condition the same way). ValueError is kept as the
        # exception type so existing callers that catch it keep working.
        raise ValueError(
            'Could not find _common_metadata file. add_dataset_metadata(..) in'
            ' petastorm.etl.dataset_metadata.py should be used to'
            ' generate this file in your ETL code.'
            ' You can generate it on an existing dataset using rowgroup_indexing_run.py'
        )

    dataset_metadata_dict = dataset.common_metadata.metadata

    # The common metadata exists, but no row group index was ever stored in it.
    if ROWGROUPS_INDEX_KEY not in dataset_metadata_dict:
        raise ValueError(
            'Row groups index is not available in the dataset metadata file. '
            'You can generate it on an existing dataset using rowgroup_indexing_run.py'
        )

    # NOTE(review): the helper name indicates the payload is unpickled (with legacy package
    # names remapped) - confirm, and only use this on trusted dataset metadata.
    return depickle_legacy_package_name_compatible(dataset_metadata_dict[ROWGROUPS_INDEX_KEY])