def build_rowgroup_index(dataset_url, spark_context, indexers, hdfs_driver='libhdfs3'):
    """
    Build an index over the given list of fields to use for fast rowgroup selection.

    :param dataset_url: (str) the url for the dataset (or a path if you would like to use the default hdfs config)
    :param spark_context: (SparkContext)
    :param indexers: list of objects used to build row group indexes. Each must support the RowGroupIndexerBase
        interface
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :return: None, upon successful completion the rowgroup predicates will be saved to the _metadata file
    """
    if dataset_url and dataset_url[-1] == '/':
        dataset_url = dataset_url[:-1]

    # Create pyarrow file system
    resolver = FilesystemResolver(dataset_url, spark_context._jsc.hadoopConfiguration(),
                                  hdfs_driver=hdfs_driver, user=spark_context.sparkUser())
    dataset = pq.ParquetDataset(resolver.get_dataset_path(), filesystem=resolver.filesystem(),
                                validate_schema=False)

    split_pieces = dataset_metadata.load_row_groups(dataset)
    schema = dataset_metadata.get_schema(dataset)

    # We need a direct reference to the partitions object
    partitions = dataset.partitions
    pieces_num = len(split_pieces)
    piece_info_list = []
    for piece_index in range(pieces_num):
        # Indexing relies on the ordering of the split dataset pieces.
        # That ordering is determined by how the dataset pieces are split and sorted; although it should not
        # change, it still might, and such a change could break this code.
        piece = split_pieces[piece_index]
        piece_info_list.append(PieceInfo(piece_index, piece.path, piece.row_group, piece.partition_keys))

    start_time = time.time()
    piece_info_rdd = spark_context.parallelize(piece_info_list,
                                               min(len(piece_info_list), PARALLEL_SLICE_NUM))
    indexer_rdd = piece_info_rdd.map(lambda piece_info: _index_columns(piece_info, dataset_url, partitions,
                                                                       indexers, schema, hdfs_driver=hdfs_driver))
    indexer_list = indexer_rdd.reduce(_combine_indexers)

    indexer_dict = {indexer.index_name: indexer for indexer in indexer_list}
    serialized_indexers = pickle.dumps(indexer_dict, pickle.HIGHEST_PROTOCOL)
    utils.add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY, serialized_indexers)

    logger.info("Elapsed time of index creation: %f s", (time.time() - start_time))
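# Usage sketch (hedged): build a row-group index for an existing dataset using the function above.
# Assumptions: petastorm's SingleFieldIndexer is importable from petastorm.etl.rowgroup_indexers and takes
# (index_name, index_field); the dataset URL and the 'id' field name are placeholders.
from pyspark import SparkConf, SparkContext

from petastorm.etl.rowgroup_indexers import SingleFieldIndexer

spark_context = SparkContext(conf=SparkConf().setAppName('build-rowgroup-index').setMaster('local[2]'))

# Index row groups by the values of the hypothetical 'id' field, so that readers can later skip
# row groups whose indexed values do not satisfy a predicate.
build_rowgroup_index('hdfs:///tmp/hello_world_dataset', spark_context,
                     [SingleFieldIndexer('id_index', 'id')])

spark_context.stop()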
def _generate_num_row_groups_per_file(dataset, spark_context, filesystem_factory):
    """
    Generates the metadata entry containing the number of row groups in each file of the parquet dataset
    located at the dataset_url. It does this in spark by opening all parquet files in the dataset on the
    executors and collecting the number of row groups in each file back on the driver.

    :param dataset: :class:`pyarrow.parquet.ParquetDataset`
    :param spark_context: spark context to use for retrieving the number of row groups in each parquet file
        in parallel
    :param filesystem_factory: a serializable, zero-argument callable returning the filesystem used to open
        the parquet files on the executors
    :return: None, upon successful completion the metadata file will exist.
    """
    if not isinstance(dataset.paths, str):
        raise ValueError('Expected dataset.paths to be a single path, not a list of paths')

    # Collect the path of every piece; the dataset base path is used to compute a relative path for each file.
    # Needed pieces from the dataset must be extracted for spark because the dataset object is not serializable
    paths = [piece.path for piece in dataset.pieces]
    base_path = dataset.paths

    def get_row_group_info(path):
        fs = filesystem_factory()
        relative_path = os.path.relpath(path, base_path)
        pq_file = fs.open(path)
        num_row_groups = pq.read_metadata(pq_file).num_row_groups
        pq_file.close()
        return relative_path, num_row_groups

    row_groups = spark_context.parallelize(paths, len(paths)) \
        .map(get_row_group_info) \
        .collect()
    num_row_groups_str = json.dumps(dict(row_groups))
    # Add the dict of the number of row groups in each file to the parquet file metadata footer
    utils.add_to_dataset_metadata(dataset, ROW_GROUPS_PER_FILE_KEY, num_row_groups_str)
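# Read-side sketch (hedged): recover the per-file row group counts written by _generate_num_row_groups_per_file.
# Assumptions: the dataset path is a placeholder; ROW_GROUPS_PER_FILE_KEY below is an illustrative stand-in for
# the constant used above; the key/value metadata is stored as bytes in the _common_metadata footer.
import json

import pyarrow.parquet as pq

ROW_GROUPS_PER_FILE_KEY = b'dataset-toolkit.num_row_groups.v1'  # assumed value of the constant used above

dataset = pq.ParquetDataset('/tmp/hello_world_dataset', validate_schema=False)
key_value_metadata = dataset.common_metadata.schema.to_arrow_schema().metadata
# Maps the relative path of each parquet file to its number of row groups
num_row_groups_per_file = json.loads(key_value_metadata[ROW_GROUPS_PER_FILE_KEY].decode('utf-8'))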
def generate_petastorm_metadata(spark, dataset_url, unischema_class=None, use_summary_metadata=False):
    """
    Generates the metadata necessary to read a petastorm dataset and adds it to an existing dataset.

    :param spark: spark session
    :param dataset_url: url of the existing dataset
    :param unischema_class: (optional) fully qualified dataset unischema class. If not specified will attempt
        to find one already in the dataset.
        (e.g. :class:`examples.hello_world.generate_hello_world_dataset.HelloWorldSchema`)
    :param use_summary_metadata: (optional) whether to also write the Parquet summary metadata (_metadata) file
    """
    sc = spark.sparkContext

    resolver = FilesystemResolver(dataset_url, sc._jsc.hadoopConfiguration())
    dataset = pq.ParquetDataset(resolver.get_dataset_path(), filesystem=resolver.filesystem(),
                                validate_schema=False)

    if unischema_class:
        schema = locate(unischema_class)
        if not isinstance(schema, Unischema):
            raise ValueError('The specified class %s is not an instance of a petastorm.Unischema object.'
                             % unischema_class)
    else:
        try:
            schema = get_schema(dataset)
        except ValueError:
            raise ValueError('Unischema class could not be located in existing dataset,'
                             ' please specify it')

    # In order to be backwards compatible, we retrieve the common metadata from the dataset before
    # overwriting the metadata to keep row group indexes and the old row groups per file index
    arrow_metadata = dataset.common_metadata or None

    with materialize_dataset(spark, dataset_url, schema, use_summary_metadata=use_summary_metadata):
        if use_summary_metadata:
            # Inside the materialize_dataset context we just need to write the summary metadata file, as the
            # schema will be written by the context manager.
            # We use the java ParquetOutputCommitter to write the metadata file for the existing dataset,
            # which reads all the footers of the dataset in parallel and merges them.
            hadoop_config = sc._jsc.hadoopConfiguration()
            Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
            parquet_output_committer = sc._gateway.jvm.org.apache.parquet.hadoop.ParquetOutputCommitter
            parquet_output_committer.writeMetaDataFile(hadoop_config, Path(dataset_url))

    spark.stop()

    if use_summary_metadata and arrow_metadata:
        # Calling writeMetaDataFile overwrites the _common_metadata file, which could have held schema
        # information or row group indexers. We therefore retain this information and add it to the new
        # _common_metadata file. If we were using the old legacy metadata method this file won't be deleted.
        base_schema = arrow_metadata.schema.to_arrow_schema()
        metadata_dict = base_schema.metadata
        if ROW_GROUPS_PER_FILE_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROW_GROUPS_PER_FILE_KEY, metadata_dict[ROW_GROUPS_PER_FILE_KEY])
        if ROWGROUPS_INDEX_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY, metadata_dict[ROWGROUPS_INDEX_KEY])
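# Usage sketch (hedged): regenerate petastorm metadata for a dataset that was written without
# materialize_dataset(). Assumptions: generate_petastorm_metadata as defined above; the dataset URL is a
# placeholder and the unischema class is the example from the docstring.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('generate-petastorm-metadata').master('local[2]').getOrCreate()

# Note: generate_petastorm_metadata stops the Spark session itself (see spark.stop() above),
# so no further Spark work should follow this call.
generate_petastorm_metadata(spark, 'hdfs:///tmp/hello_world_dataset',
                            unischema_class='examples.hello_world.generate_hello_world_dataset.HelloWorldSchema',
                            use_summary_metadata=True)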
def _generate_unischema_metadata(dataset, schema):
    """
    Generates the serialized unischema and adds it to the dataset parquet metadata to be used upon reading.

    :param dataset: (ParquetDataset) Dataset to attach schema
    :param schema: (Unischema) Schema to attach to dataset
    :return: None
    """
    # TODO(robbieg): Simply pickling unischema will break if the UnischemaField class is changed,
    #  or the codec classes are changed. We likely need something more robust.
    serialized_schema = pickle.dumps(schema)
    utils.add_to_dataset_metadata(dataset, UNISCHEMA_KEY, serialized_schema)
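# Read-side sketch (hedged): recover the Unischema written by _generate_unischema_metadata.
# Assumptions: the dataset path is a placeholder and UNISCHEMA_KEY below is an illustrative stand-in for the
# constant used above; unpickling only works while the Unischema/UnischemaField/codec classes are unchanged,
# as the TODO above points out.
import pickle

import pyarrow.parquet as pq

UNISCHEMA_KEY = b'dataset-toolkit.unischema.v1'  # assumed value of the constant used above

dataset = pq.ParquetDataset('/tmp/hello_world_dataset', validate_schema=False)
key_value_metadata = dataset.common_metadata.schema.to_arrow_schema().metadata
unischema = pickle.loads(key_value_metadata[UNISCHEMA_KEY])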
def generate_petastorm_metadata(spark, dataset_url, unischema_class=None):
    """
    Generates the metadata necessary to read a petastorm dataset and adds it to an existing dataset.

    :param spark: spark session
    :param dataset_url: url of the existing dataset
    :param unischema_class: (optional) fully qualified dataset unischema class. If not specified will attempt
        to find one already in the dataset.
        (e.g. examples.hello_world.hello_world_dataset.HelloWorldSchema)
    :return: None
    """
    sc = spark.sparkContext

    resolver = FilesystemResolver(dataset_url, sc._jsc.hadoopConfiguration())
    dataset = pq.ParquetDataset(resolver.parsed_dataset_url().path, filesystem=resolver.filesystem(),
                                validate_schema=False)

    if unischema_class:
        schema = locate(unischema_class)
    else:
        try:
            schema = get_schema(dataset)
        except ValueError:
            raise ValueError('Unischema class could not be located in existing dataset,'
                             ' please specify it')

    # In order to be backwards compatible, we retrieve the common metadata from the dataset before
    # overwriting the metadata to keep row group indexes and the old row groups per file index
    arrow_metadata = dataset.common_metadata or None

    with materialize_dataset(spark, dataset_url, schema):
        # Inside the materialize_dataset context we just need to write the metadata file, as the schema will
        # be written by the context manager.
        # We use the java ParquetOutputCommitter to write the metadata file for the existing dataset,
        # which reads all the footers of the dataset in parallel and merges them.
        hadoop_config = sc._jsc.hadoopConfiguration()
        Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
        parquet_output_committer = sc._gateway.jvm.org.apache.parquet.hadoop.ParquetOutputCommitter
        parquet_output_committer.writeMetaDataFile(hadoop_config, Path(dataset_url))

    if arrow_metadata:
        # If the old row groups per file key or the row groups index key was present, add it to the new
        # dataset metadata
        base_schema = arrow_metadata.schema.to_arrow_schema()
        metadata_dict = base_schema.metadata
        if ROW_GROUPS_PER_FILE_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROW_GROUPS_PER_FILE_KEY, metadata_dict[ROW_GROUPS_PER_FILE_KEY])
        if ROWGROUPS_INDEX_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY, metadata_dict[ROWGROUPS_INDEX_KEY])