Example #1
def build_rowgroup_index(dataset_url, spark_context, indexers):
    """
    Build index for given list of fields to use for fast rowgroup selection
    :param dataset_url: (str) the url for the dataset (or a path if you would like to use the default hdfs config)
    :param spark_context: (SparkContext)
    :param indexers: list of indexer objects used to build row group indexes. Each must support the RowGroupIndexerBase interface
    :return: None, upon successful completion the rowgroup predicates will be saved to _metadata file
    """

    if dataset_url and dataset_url[-1] == '/':
        dataset_url = dataset_url[:-1]

    # Create pyarrow file system
    resolver = FilesystemResolver(dataset_url,
                                  spark_context._jsc.hadoopConfiguration())
    dataset = pq.ParquetDataset(resolver.get_dataset_path(),
                                filesystem=resolver.filesystem(),
                                validate_schema=False)

    split_pieces = dataset_metadata.load_row_groups(dataset)
    schema = dataset_metadata.get_schema(dataset)

    # We need a direct reference to the partitions object
    partitions = dataset.partitions
    pieces_num = len(split_pieces)
    piece_info_list = []
    for piece_index in range(pieces_num):
        # Indexes rely on the ordering of the split dataset pieces. That ordering depends on how the
        # dataset pieces are split and sorted; although it should not change, it could, and we should
        # keep in mind that such a change would break the indexes.
        piece = split_pieces[piece_index]
        piece_info_list.append(
            PieceInfo(piece_index, piece.path, piece.row_group,
                      piece.partition_keys))

    start_time = time.time()
    piece_info_rdd = spark_context.parallelize(
        piece_info_list, min(len(piece_info_list), PARALLEL_SLICE_NUM))
    indexer_rdd = piece_info_rdd.map(lambda piece_info: _index_columns(
        piece_info, dataset_url, partitions, indexers, schema))
    indexer_list = indexer_rdd.reduce(_combine_indexers)

    indexer_dict = {indexer.index_name: indexer for indexer in indexer_list}
    serialized_indexers = pickle.dumps(indexer_dict, pickle.HIGHEST_PROTOCOL)
    utils.add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY,
                                  serialized_indexers)
    logger.info("Elapsed time of index creation: %f s",
                (time.time() - start_time))
def _index_columns(piece_info,
                   dataset_url,
                   partitions,
                   indexers,
                   schema,
                   hdfs_driver='libhdfs3'):
    """
    Builds indexes for the dataset piece described in piece_info
    :param piece_info: description of dataset piece
    :param dataset_url: dataset location
    :param partitions: dataset partitions
    :param indexers: list of indexer objects
    :param schema: dataset schema
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :return: list of indexers containing index data
    """
    # Create pyarrow piece
    piece = pq.ParquetDatasetPiece(piece_info.path,
                                   row_group=piece_info.row_group,
                                   partition_keys=piece_info.partition_keys)

    # Collect column names needed for indexing
    column_names = set()
    for indexer in indexers:
        column_names.update(indexer.column_names)

    # Read columns needed for indexing
    # Resolver in executor context will get hadoop config from environment
    resolver = FilesystemResolver(dataset_url, hdfs_driver=hdfs_driver)
    column_rows = piece.read(
        open_file_func=resolver.filesystem().open,
        columns=list(column_names),
        partitions=partitions).to_pandas().to_dict('records')

    # Decode column values
    decoded_rows = [utils.decode_row(row, schema) for row in column_rows]
    if not decoded_rows:
        raise ValueError(
            'Cannot build index with empty decoded_rows, columns: {}, partitions: {}'
            .format(column_names, partitions))

    # Index column values
    for indexer in indexers:
        indexer.build_index(decoded_rows, piece_info.piece_index)

    # Indexer objects contain the index data; it will be consolidated in the reduce phase
    return indexers
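
A minimal end-to-end sketch of how build_rowgroup_index is typically invoked after a dataset has been materialized. The import paths, the SingleFieldIndexer signature and the field name 'id' are assumptions; adjust them to your petastorm version and schema.

from pyspark.sql import SparkSession
from petastorm.etl.rowgroup_indexing import build_rowgroup_index
from petastorm.etl.rowgroup_indexers import SingleFieldIndexer

spark = SparkSession.builder.appName('index-builder').getOrCreate()
dataset_url = 'hdfs:///path/to/my/dataset'  # placeholder URL

# One indexer per field we want to use for fast row group selection.
# SingleFieldIndexer(index_name, index_field) is assumed here.
indexers = [SingleFieldIndexer('id_index', 'id')]
build_rowgroup_index(dataset_url, spark.sparkContext, indexers)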
Example #3
def get_schema_from_dataset_url(dataset_url, hdfs_driver='libhdfs3'):
    """Returns a :class:`petastorm.unischema.Unischema` object loaded from a dataset specified by a url.

    :param dataset_url: A dataset URL
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :return: A :class:`petastorm.unischema.Unischema` object
    """
    resolver = FilesystemResolver(dataset_url, hdfs_driver=hdfs_driver)
    dataset = pq.ParquetDataset(resolver.get_dataset_path(), filesystem=resolver.filesystem(),
                                validate_schema=False)

    # Get a unischema stored in the dataset metadata.
    stored_schema = get_schema(dataset)

    return stored_schema
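
A short usage sketch, assuming a petastorm dataset with stored metadata already exists at the placeholder URL:

schema = get_schema_from_dataset_url('hdfs:///path/to/my/dataset')
print(list(schema.fields.keys()))  # names of the Unischema fields stored in the dataset metadata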
Example #4
    def test_error_url_cases(self):
        """Various error cases that result in exception raised."""
        # Case 1: Schemeless path asserts
        with self.assertRaises(ValueError):
            FilesystemResolver(ABS_PATH, {})

        # Case 4b: HDFS default path case with NO defaultFS
        with self.assertRaises(RuntimeError):
            FilesystemResolver('hdfs:///some/path', {})

        # Case 4b: Using `default` as host, while apparently a pyarrow convention, is NOT valid
        with self.assertRaises(ArrowIOError):
            FilesystemResolver('hdfs://default', {})

        # Case 5: other schemes result in ValueError; urlparse to cover an else branch!
        with self.assertRaises(ValueError):
            FilesystemResolver(urlparse('http://foo/bar'), {})
        with self.assertRaises(ValueError):
            FilesystemResolver(urlparse('ftp://foo/bar'), {})
        with self.assertRaises(ValueError):
            FilesystemResolver(urlparse('ssh://foo/bar'), {})

        # s3 paths must have the bucket as the netloc
        with self.assertRaises(ValueError):
            FilesystemResolver(urlparse('s3:///foo/bar'), {})
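
For contrast, a hedged sketch of URL forms the resolver is expected to accept, based on the positive test cases further below. Paths are placeholders, the import path is an assumption, and the hdfs and s3 calls require the corresponding storage to be reachable.

from pyspark.sql import SparkSession
from petastorm.fs_utils import FilesystemResolver  # assumed import path

spark = SparkSession.builder.getOrCreate()
hadoop_configuration = spark.sparkContext._jsc.hadoopConfiguration()

# file:// URLs resolve to a local filesystem regardless of the hadoop configuration.
FilesystemResolver('file:///tmp/mydataset', {})
# hdfs:// URLs with an explicit namenode or nameservice rely on the hadoop configuration.
FilesystemResolver('hdfs://namenode:8020/some/path', hadoop_configuration)
# s3:// URLs must carry the bucket in the netloc position.
FilesystemResolver('s3://bucket/some/path', {})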
Example #5
@contextmanager
def materialize_dataset(spark, dataset_url, schema, row_group_size_mb=None):
    """
    A Context Manager which handles all the initialization and finalization necessary
    to generate metadata for a petastorm dataset. This should be used around your
    spark logic to materialize a dataset (specifically the writing of parquet output).

    Note: Any rowgroup indexing should happen outside the materialize_dataset block

    e.g.
    spark = SparkSession.builder...
    dataset_url = 'hdfs:///path/to/my/dataset'
    with materialize_dataset(spark, dataset_url, MyUnischema, 64):
      spark.sparkContext.parallelize(range(0, 10)).\
        ...
        .write.parquet(dataset_url)

    indexers = [SingleFieldIndexer(...)]
    build_rowgroup_index(dataset_url, spark.sparkContext, indexers)

    :param spark: The spark session you are using
    :param dataset_url: The dataset url to output your dataset to (e.g. hdfs:///path/to/dataset)
    :param schema: The unischema definition of your dataset
    :param row_group_size_mb: The parquet row group size to use for your dataset
    """
    spark_config = {}
    _init_spark(spark, spark_config, row_group_size_mb)
    yield

    # After job completes, add the unischema metadata and check for the metadata summary file
    resolver = FilesystemResolver(
        dataset_url, spark.sparkContext._jsc.hadoopConfiguration())
    dataset = pq.ParquetDataset(resolver.parsed_dataset_url().path,
                                filesystem=resolver.filesystem(),
                                validate_schema=False)

    _generate_unischema_metadata(dataset, schema)
    if not dataset.metadata_path:
        raise MetadataGenerationError(
            'Could not find summary metadata file. The dataset will exist but you will need'
            ' to execute petastorm-generate-metadata before you can read your dataset '
            ' in order to generate the necessary metadata.'
            ' Try increasing spark driver memory next time and making sure you are'
            ' using parquet-mr >= 1.8.3')

    _cleanup_spark(spark, spark_config, row_group_size_mb)
def test_atexit(test_ctx):
    lines = """
    from petastorm.spark import SparkDatasetConverter, make_spark_converter
    from pyspark.sql import SparkSession
    import os
    spark = SparkSession.builder.getOrCreate()
    spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF, '{temp_url}')
    df = spark.createDataFrame([(1, 2),(4, 5)], ["col1", "col2"])
    converter = make_spark_converter(df)
    f = open(os.path.join('{tempdir}', 'test_atexit.out'), "w")
    f.write(converter.cache_dir_url)
    f.close()
    """.format(tempdir=test_ctx.tempdir, temp_url=test_ctx.temp_url)
    code_str = "; ".join(line.strip() for line in lines.strip().splitlines())
    ret_code = subprocess.call([sys.executable, "-c", code_str])
    assert 0 == ret_code
    with open(os.path.join(test_ctx.tempdir, 'test_atexit.out')) as f:
        cache_dir_url = f.read()

    fs = FilesystemResolver(cache_dir_url).filesystem()
    assert not fs.exists(urlparse(cache_dir_url).path)
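
A hedged sketch of the converter workflow the test above exercises: cache a DataFrame as a Petastorm dataset under the configured parent cache directory, consume it, and clean up. The atexit hook the test relies on removes the cache automatically when the interpreter exits; make_tf_dataset assumes TensorFlow is installed and is only one of the available consumers.

from pyspark.sql import SparkSession
from petastorm.spark import SparkDatasetConverter, make_spark_converter

spark = SparkSession.builder.getOrCreate()
spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF, 'file:///tmp/petastorm_cache')

df = spark.createDataFrame([(1, 2), (4, 5)], ['col1', 'col2'])
converter = make_spark_converter(df)

with converter.make_tf_dataset() as tf_dataset:  # assumes TensorFlow is installed
    pass  # feed tf_dataset into a training loop here

converter.delete()  # or rely on the atexit hook to remove the cached files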
Example #7
    def __init__(self, dataset_url, schema, ngram, local_cache,
                 worker_predicate):
        """RowGroupLoader responsible for loading one rowgroup at a time. Rows returned are returned encoded.

        :param dataset_url: A url of a parquet dataset.
        :param schema: A unischema corresponding to the data in the dataset
        :param ngram: An instance of NGram if ngrams should be read or None, if each row in the dataset corresponds to
          a single sample returned.
        :param local_cache: An instance of a rowgroup cache (CacheBase interface) object to be used.
        :param worker_predicate: An instance of predicate (PredicateBase interface)
        """
        self._dataset_url_parsed = urlparse(dataset_url)
        self._schema = schema
        self._ngram = ngram
        self._local_cache = local_cache
        self._worker_predicate = worker_predicate

        resolver = FilesystemResolver(self._dataset_url_parsed)
        self._dataset = pq.ParquetDataset(resolver.get_dataset_path(),
                                          filesystem=resolver.filesystem(),
                                          validate_schema=False)
Example #8
    def __init__(self, dataset_url, schema, ngram, local_cache, worker_predicate, hdfs_driver='libhdfs3'):
        """RowGroupLoader responsible for loading one rowgroup at a time. Rows returned are returned encoded.

        :param dataset_url: A url of a parquet dataset.
        :param schema: A unischema corresponding to the data in the dataset
        :param ngram: An instance of NGram if ngrams should be read or None, if each row in the dataset corresponds to
          a single sample returned.
        :param local_cache: An instance of a rowgroup cache (CacheBase interface) object to be used.
        :param worker_predicate: An instance of predicate (PredicateBase interface)
        :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
        """
        self._dataset_url_parsed = urlparse(dataset_url)
        self._schema = schema
        self._ngram = ngram
        self._local_cache = local_cache
        self._worker_predicate = worker_predicate

        resolver = FilesystemResolver(self._dataset_url_parsed, hdfs_driver=hdfs_driver)
        self._dataset = pq.ParquetDataset(
            resolver.get_dataset_path(),
            filesystem=resolver.filesystem(),
            validate_schema=False)
Example #9
    def test_s3_url(self):
        suj = FilesystemResolver('s3://bucket{}'.format(ABS_PATH),
                                 self._hadoop_configuration,
                                 connector=self.mock)
        self.assertTrue(isinstance(suj.filesystem(), S3FSWrapper))
        self.assertEqual('bucket', suj.parsed_dataset_url().netloc)
        self.assertEqual('bucket' + ABS_PATH, suj.get_dataset_path())

        # Make sure we did not capture FilesystemResolver in a closure by mistake
        dill.dumps(suj.filesystem_factory())
Example #10
    def test_file_url(self):
        """ Case 2: File path, agnostic to content of hadoop configuration."""
        suj = FilesystemResolver('file://{}'.format(ABS_PATH),
                                 self._hadoop_configuration,
                                 connector=self.mock)
        self.assertTrue(isinstance(suj.filesystem(), LocalFileSystem))
        self.assertEqual('', suj.parsed_dataset_url().netloc)
        self.assertEqual(ABS_PATH, suj.get_dataset_path())

        # Make sure we did not capture FilesystemResolver in a closure by mistake
        dill.dumps(suj.filesystem_factory())
Example #11
    def test_hdfs_url_no_nameservice(self):
        """ Case 3b: HDFS with no nameservice should connect to default namenode."""
        suj = FilesystemResolver('hdfs:///some/path',
                                 self._hadoop_configuration,
                                 connector=self.mock,
                                 user=self.mock_name)
        self.assertEqual(MockHdfs, type(suj.filesystem()._hdfs))
        self.assertEqual(self.mock_name, suj.filesystem()._user)
        self.assertEqual(HC.WARP_TURTLE, suj.parsed_dataset_url().netloc)
        # ensure path is preserved in parsed URL
        self.assertEqual('/some/path', suj.get_dataset_path())
        self.assertEqual(1, self.mock.connect_attempted(HC.WARP_TURTLE_NN2))
        self.assertEqual(0, self.mock.connect_attempted(HC.WARP_TURTLE_NN1))
        self.assertEqual(0, self.mock.connect_attempted(HC.DEFAULT_NN))

        # Make sure we did not capture FilesystemResolver in a closure by mistake
        dill.dumps(suj.filesystem_factory())
Example #12
    def test_hdfs_url_direct_namenode(self):
        """ Case 4: direct namenode."""
        suj = FilesystemResolver('hdfs://{}/path'.format(HC.WARP_TURTLE_NN1),
                                 self._hadoop_configuration,
                                 connector=self.mock,
                                 user=self.mock_name)
        self.assertEqual(MockHdfs, type(suj.filesystem()))
        self.assertEqual(self.mock_name, suj.filesystem()._user)
        self.assertEqual(HC.WARP_TURTLE_NN1, suj.parsed_dataset_url().netloc)
        self.assertEqual(0, self.mock.connect_attempted(HC.WARP_TURTLE_NN2))
        self.assertEqual(1, self.mock.connect_attempted(HC.WARP_TURTLE_NN1))
        self.assertEqual(0, self.mock.connect_attempted(HC.DEFAULT_NN))

        # Make sure we did not capture FilesystemResolver in a closure by mistake
        dill.dumps(suj.filesystem_factory())
Example #13
    def test_hdfs_url_with_nameservice(self):
        """ Case 3a: HDFS nameservice."""
        suj = FilesystemResolver(HC.WARP_TURTLE_PATH,
                                 self._hadoop_configuration,
                                 connector=self.mock,
                                 user=self.mock_name)
        self.assertEqual(MockHdfs, type(suj.filesystem()._hdfs))
        self.assertEqual(self.mock_name, suj.filesystem()._user)
        self.assertEqual(HC.WARP_TURTLE, suj.parsed_dataset_url().netloc)
        self.assertEqual(1, self.mock.connect_attempted(HC.WARP_TURTLE_NN2))
        self.assertEqual(0, self.mock.connect_attempted(HC.WARP_TURTLE_NN1))
        self.assertEqual(0, self.mock.connect_attempted(HC.DEFAULT_NN))

        # Make sure we did not capture FilesystemResolver in a closure by mistake
        dill.dumps(suj.filesystem_factory())
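
The dill.dumps(suj.filesystem_factory()) assertions above guard against accidentally capturing the resolver (and its JVM-backed hadoop configuration) in a closure. A minimal sketch of why that matters, assuming the factory is shipped to Spark executors; the import path and helper name are placeholders.

from petastorm.fs_utils import FilesystemResolver  # assumed import path

resolver = FilesystemResolver('file:///tmp/mydataset')
filesystem_factory = resolver.filesystem_factory()

def open_on_executor(path):
    # Hypothetical executor-side helper: the factory must be picklable on its own
    # (no resolver in the closure), so each executor re-creates the filesystem locally.
    fs = filesystem_factory()
    return fs.open(path)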
@contextmanager
def materialize_dataset(spark,
                        dataset_url,
                        schema,
                        row_group_size_mb=None,
                        use_summary_metadata=False,
                        filesystem_factory=None):
    """
    A Context Manager which handles all the initialization and finalization necessary
    to generate metadata for a petastorm dataset. This should be used around your
    spark logic to materialize a dataset (specifically the writing of parquet output).

    Note: Any rowgroup indexing should happen outside the materialize_dataset block

    Example:

    >>> spark = SparkSession.builder...
    >>> ds_url = 'hdfs:///path/to/my/dataset'
    >>> with materialize_dataset(spark, ds_url, MyUnischema, 64):
    >>>   spark.sparkContext.parallelize(range(0, 10)).
    >>>     ...
    >>>     .write.parquet(ds_url)
    >>> indexer = [SingleFieldIndexer(...)]
    >>> build_rowgroup_index(ds_url, spark.sparkContext, indexer)

    A user may provide their own recipe for creating a pyarrow filesystem object via the ``filesystem_factory``
    argument (otherwise, petastorm will create a default one based on the url).

    The following example shows how a custom pyarrow HDFS filesystem, instantiated using ``libhdfs`` driver can be used
    during Petastorm dataset generation:

    >>> resolver=FilesystemResolver(dataset_url, spark.sparkContext._jsc.hadoopConfiguration(),
    >>>                             hdfs_driver='libhdfs')
    >>> with materialize_dataset(..., filesystem_factory=resolver.filesystem_factory()):
    >>>     ...


    :param spark: The spark session you are using
    :param dataset_url: The dataset url to output your dataset to (e.g. ``hdfs:///path/to/dataset``)
    :param schema: The :class:`petastorm.unischema.Unischema` definition of your dataset
    :param row_group_size_mb: The parquet row group size to use for your dataset
    :param use_summary_metadata: Whether to use the parquet summary metadata for row group indexing or a custom
      indexing method. The custom indexing method is more scalable for very large datasets.
    :param filesystem_factory: A filesystem factory function to be used when saving Petastorm specific metadata to the
      Parquet store.
    """
    spark_config = {}
    _init_spark(spark, spark_config, row_group_size_mb, use_summary_metadata)
    yield
    # After job completes, add the unischema metadata and check for the metadata summary file
    if filesystem_factory is None:
        resolver = FilesystemResolver(
            dataset_url,
            spark.sparkContext._jsc.hadoopConfiguration(),
            user=spark.sparkContext.sparkUser())
        filesystem_factory = resolver.filesystem_factory()
        dataset_path = resolver.get_dataset_path()
    else:
        dataset_path = get_dataset_path(urlparse(dataset_url))
    filesystem = filesystem_factory()

    dataset = pq.ParquetDataset(dataset_path,
                                filesystem=filesystem,
                                validate_schema=False)

    _generate_unischema_metadata(dataset, schema)
    if not use_summary_metadata:
        _generate_num_row_groups_per_file(dataset, spark.sparkContext,
                                          filesystem_factory)

    # Reload the dataset to take into account the new metadata
    dataset = pq.ParquetDataset(dataset_path,
                                filesystem=filesystem,
                                validate_schema=False)
    try:
        # Try to load the row groups, if it fails that means the metadata was not generated properly
        load_row_groups(dataset)
    except PetastormMetadataError:
        raise PetastormMetadataGenerationError(
            'Could not find summary metadata file. The dataset will exist but you will need'
            ' to execute petastorm-generate-metadata.py before you can read your dataset '
            ' in order to generate the necessary metadata.'
            ' Try increasing spark driver memory next time and making sure you are'
            ' using parquet-mr >= 1.8.3')

    _cleanup_spark(spark, spark_config, row_group_size_mb)
Example #15
    def __init__(self,
                 dataset_url,
                 schema_fields=None,
                 shuffle=None,
                 predicate=None,
                 rowgroup_selector=None,
                 reader_pool=None,
                 num_epochs=1,
                 sequence=None,
                 training_partition=None,
                 num_training_partitions=None,
                 read_timeout_s=None,
                 cache=None,
                 shuffle_options=None):
        """Initializes a reader object.

        :param dataset_url: a filepath or a url to a parquet directory,
            e.g. ``'hdfs://some_hdfs_cluster/user/yevgeni/parquet8'``, or ``'/tmp/mydataset'``.
        :param schema_fields: Either list of unischema fields to subset, or ``None`` to read all fields.
            OR an NGram object, then it will return an NGram of the specified properties.
        :param predicate: instance of predicate object to filter rows to be returned by reader.
        :param rowgroup_selector: instance of row group selector object to select row groups to be read
        :param reader_pool: parallelization pool. ``ThreadPool(10)`` (10 threads) is used by default.
            This pool is a custom implementation used to parallelize reading data from the dataset.
            Any object from workers_pool package can be used
            (e.g. :class:`petastorm.workers_pool.process_pool.ProcessPool`).
        :param num_epochs: An epoch is a single pass over all samples in the dataset. Setting ``num_epochs`` to
            ``None`` will result in an infinite number of epochs.
        :param training_partition: An int denoting the partition number used for multi node training. Each node should
            pass in a unique partition number in the range ``[0, num_training_partitions)``.
            ``num_training_partitions`` must be supplied as well.
        :param num_training_partitions: An int denoting the number of training partitions (how many nodes are performing
            the multi node training).
        :param read_timeout_s: A numeric with the amount of time in seconds you would like to give a read before it
            times out and raises an EmptyResultError. Pass in None for an infinite timeout.
        :param cache: An object conforming to :class:`.CacheBase` interface. Before loading row groups from a parquet
            file the Reader will attempt to load these values from cache. Caching is useful when communication
            to the main data store is either slow or expensive and the local machine has large enough storage
            to store entire dataset (or a partition of a dataset if num_training_partitions is used).
            By default, use the :class:`.NullCache` implementation.
        :param shuffle_options: ShuffleOptions object describing how to shuffle the dataset (supersedes the shuffle
            parameter). Defaults to shuffling row groups but not dropping rows based on partitions.
        :param sequence: *DEPRECATED* To use sequence/ngram, please supply the argument in
            ``schema_fields`` instead.
        :param shuffle: *DEPRECATED* Boolean whether to shuffle the row group order.
            Use ``shuffle_row_groups`` in :class:`.ShuffleOptions` instead.
        """

        # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
        # 2. Get a list of all groups
        # 3. Filter rowgroups
        #    a. predicates
        #    b. row-group selector (our indexing mechanism)
        #    c. partition: used to get a subset of data for distributed training
        # 4. Create a rowgroup ventilator object
        # 5. Start workers pool
        if dataset_url is None or not isinstance(dataset_url,
                                                 six.string_types):
            raise ValueError("""dataset_url must be a string""")

        if not (isinstance(schema_fields, collections.Iterable)
                or isinstance(schema_fields, NGram) or schema_fields is None):
            raise ValueError(
                """Fields must be either None, an iterable collection of Unischema fields or an NGram
            object.""")

        if sequence is not None:
            raise ValueError(
                """'sequence' argument of Reader object is deprecated. Please pass an NGram instance to
            'schema_fields' argument instead.""")

        self.ngram = schema_fields if isinstance(schema_fields,
                                                 NGram) else None

        if self.ngram and not self.ngram.timestamp_overlap and shuffle_options.shuffle_row_drop_partitions > 1:
            raise NotImplementedError(
                'Using timestamp_overlap=False is not implemented with'
                ' shuffle_options.shuffle_row_drop_partitions > 1')

        cache = cache or NullCache()
        dataset_url = dataset_url[:-1] if dataset_url[
            -1] == '/' else dataset_url
        self._workers_pool = reader_pool or ThreadPool(10)

        # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
        logger.debug('dataset_url: %s', dataset_url)
        resolver = FilesystemResolver(dataset_url)
        self.dataset = pq.ParquetDataset(resolver.parsed_dataset_url().path,
                                         filesystem=resolver.filesystem(),
                                         validate_schema=False)

        # Get a unischema stored in the dataset metadata.
        stored_schema = dataset_metadata.get_schema(self.dataset)

        # Make a schema view (a view is a Unischema containing only a subset of fields).
        # Will raise an exception if invalid schema fields are in schema_fields
        fields = schema_fields if isinstance(schema_fields,
                                             collections.Iterable) else None
        self.schema = stored_schema.create_schema_view(
            fields) if fields else stored_schema

        # 2. Get a list of all groups
        row_groups = dataset_metadata.load_row_groups(self.dataset)

        # 3. Filter rowgroups
        filtered_row_group_indexes, worker_predicate = self._filter_row_groups(
            self.dataset, row_groups, predicate, rowgroup_selector,
            training_partition, num_training_partitions)
        # 4. Create a rowgroup ventilator object
        if shuffle_options is None:
            if shuffle is None:
                shuffle = True
            else:
                logger.warning(
                    'shuffle option is deprecated. Please use shuffle_options instead'
                )
            shuffle_options = ShuffleOptions(shuffle)
        self._normalize_shuffle_options(shuffle_options, self.dataset)
        ventilator = self._create_ventilator(filtered_row_group_indexes,
                                             shuffle_options, num_epochs,
                                             worker_predicate)

        # 5. Start workers pool
        self._workers_pool.start(ReaderWorker,
                                 (dataset_url, self.schema, self.ngram,
                                  row_groups, cache, worker_predicate),
                                 ventilator=ventilator)
        self._read_timeout_s = read_timeout_s
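
A hedged usage sketch for this (older) Reader API; the dataset URL is a placeholder, and it assumes the Reader exposes stop() and join() for shutting down its worker pool, as in petastorm.

reader = Reader('hdfs:///path/to/my/dataset', num_epochs=1)
try:
    for row in reader:
        pass  # each row is a namedtuple built from the Unischema fields
finally:
    reader.stop()
    reader.join()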
                        help='Display list of row group indexes')
    parser.add_argument('--print-values', action='store_true',
                        help='Print index values (dataset piece indexes)')
    parser.add_argument('--skip-index', nargs='+', type=str,
                        help='Do not display indexed values for given fields')
    parser.add_argument('--hdfs-driver', type=str, default='libhdfs3',
                        help='A string denoting the hdfs driver to use (if using a dataset on hdfs). '
                             'Current choices are libhdfs (java through JNI) or libhdfs3 (C++)')

    args = parser.parse_args()

    if args.dataset_url and args.dataset_url[-1] == '/':
        args.dataset_url = args.dataset_url[:-1]

    # Create pyarrow file system
    resolver = FilesystemResolver(args.dataset_url, hdfs_driver=args.hdfs_driver)
    dataset = pq.ParquetDataset(resolver.get_dataset_path(), filesystem=resolver.filesystem(),
                                validate_schema=False)

    print_all = not args.schema and not args.index
    if args.schema or print_all:
        print('*** Schema from dataset metadata ***')
        print((dataset_metadata.get_schema(dataset)))

    if args.index or print_all:
        index_dict = rowgroup_indexing.get_row_group_indexes(dataset)
        print('*** Row group indexes from dataset metadata ***')
        for index_name in index_dict:
            print(('Index: {}'.format(index_name)))
            if args.skip_index is None or index_name not in args.skip_index:
                for field_value in index_dict[index_name].indexed_values:
def generate_petastorm_metadata(spark,
                                dataset_url,
                                unischema_class=None,
                                use_summary_metadata=False,
                                hdfs_driver='libhdfs3'):
    """
    Generates the metadata necessary to read a petastorm dataset, adding it to an existing dataset.

    :param spark: spark session
    :param dataset_url: url of existing dataset
    :param unischema_class: (optional) fully qualified dataset unischema class. If not specified will attempt
        to find one already in the dataset. (e.g.
        :class:`examples.hello_world.generate_hello_world_dataset.HelloWorldSchema`)
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :param user: String denoting username when connecting to HDFS
    """
    sc = spark.sparkContext

    resolver = FilesystemResolver(dataset_url,
                                  sc._jsc.hadoopConfiguration(),
                                  hdfs_driver=hdfs_driver,
                                  user=spark.sparkContext.sparkUser())
    fs = resolver.filesystem()
    dataset = pq.ParquetDataset(resolver.get_dataset_path(),
                                filesystem=fs,
                                validate_schema=False)

    if unischema_class:
        schema = locate(unischema_class)
        if not isinstance(schema, Unischema):
            raise ValueError(
                'The specified class %s is not an instance of a petastorm.Unischema object.',
                unischema_class)
    else:

        try:
            schema = get_schema(dataset)
        except ValueError:
            raise ValueError(
                'Unischema class could not be located in existing dataset,'
                ' please specify it')

    # In order to be backwards compatible, we retrieve the common metadata from the dataset before
    # overwriting the metadata to keep row group indexes and the old row group per file index
    arrow_metadata = dataset.common_metadata or None

    with materialize_dataset(spark,
                             dataset_url,
                             schema,
                             use_summary_metadata=use_summary_metadata,
                             filesystem_factory=resolver.filesystem_factory()):
        if use_summary_metadata:
            # Inside the materialize dataset context we just need to write the metadata file as the schema will
            # be written by the context manager.
            # We use the java ParquetOutputCommitter to write the metadata file for the existing dataset
            # which will read all the footers of the dataset in parallel and merge them.
            hadoop_config = sc._jsc.hadoopConfiguration()
            Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
            parquet_output_committer = sc._gateway.jvm.org.apache.parquet.hadoop.ParquetOutputCommitter
            parquet_output_committer.writeMetaDataFile(hadoop_config,
                                                       Path(dataset_url))

    spark.stop()

    if use_summary_metadata and arrow_metadata:
        # When calling writeMetaDataFile it will overwrite the _common_metadata file which could have schema information
        # or row group indexers. Therefore we want to retain this information and will add it to the new
        # _common_metadata file. If we were using the old legacy metadata method this file won't be deleted
        base_schema = arrow_metadata.schema.to_arrow_schema()
        metadata_dict = base_schema.metadata
        if ROW_GROUPS_PER_FILE_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROW_GROUPS_PER_FILE_KEY,
                                    metadata_dict[ROW_GROUPS_PER_FILE_KEY])
        if ROWGROUPS_INDEX_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY,
                                    metadata_dict[ROWGROUPS_INDEX_KEY])
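
A hedged usage sketch for regenerating the metadata of an existing dataset; the URL and the unischema class path are placeholders, and note that the function stops the Spark session itself.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('petastorm-generate-metadata').getOrCreate()
generate_petastorm_metadata(spark,
                            'hdfs:///path/to/existing/dataset',
                            unischema_class='mypackage.my_dataset.MySchema')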
Example #18
@contextmanager
def materialize_dataset_carbon(spark,
                               dataset_url,
                               schema,
                               blocklet_size_mb=None,
                               use_summary_metadata=False,
                               pyarrow_filesystem=None):
    """
  A Context Manager which handles all the initialization and finalization necessary
  to generate metadata for a pycarbon dataset. This should be used around your
  spark logic to materialize a dataset (specifically the writing of carbon output).

  Note: Any blocklet indexing should happen outside the materialize_dataset_carbon block

  Example:

  >>> spark = SparkSession.builder...
  >>> ds_url = 'hdfs:///path/to/my/dataset'
  >>> with materialize_dataset_carbon(spark, ds_url, MyUnischema, 64):
  >>>   spark.sparkContext.parallelize(range(0, 10)).
  >>>     ...
  >>>     .write.save(path=ds_url, format='carbon')

  A user may provide their own instance of pyarrow filesystem object in ``pyarrow_filesystem`` argument (otherwise,
  pycarbon will create a default one based on the url).

  The following example shows how a custom pyarrow HDFS filesystem, instantiated using ``libhdfs`` driver can be used
  during Pycarbon dataset generation:

  >>> resolver=FilesystemResolver(dataset_url, spark.sparkContext._jsc.hadoopConfiguration(),
  >>>                             hdfs_driver='libhdfs')
  >>> with materialize_dataset_carbon(..., pyarrow_filesystem=resolver.filesystem()):
  >>>     ...


  :param spark: The spark session you are using
  :param dataset_url: The dataset url to output your dataset to (e.g. ``hdfs:///path/to/dataset``)
  :param schema: The :class:`petastorm.unischema.Unischema` definition of your dataset
  :param blocklet_size_mb: The carbon blocklet size to use for your dataset
  :param use_summary_metadata: Whether to use the carbon summary metadata for blocklet indexing or a custom
    indexing method. The custom indexing method is more scalable for very large datasets.
  :param pyarrow_filesystem: A pyarrow filesystem object to be used when saving Pycarbon specific metadata to the
    Carbon store.

  """

    spark_config = {}
    _init_spark(spark, spark_config, blocklet_size_mb, use_summary_metadata)
    yield

    # After job completes, add the unischema metadata and check for the metadata summary file
    if pyarrow_filesystem is None:
        resolver = FilesystemResolver(
            dataset_url, spark.sparkContext._jsc.hadoopConfiguration())
        # filesystem = resolver.filesystem()
        dataset_path = resolver.get_dataset_path()
    else:
        # filesystem = pyarrow_filesystem
        dataset_path = urlparse(dataset_url).path

    carbon_dataset = CarbonDataset(dataset_path)
    _generate_unischema_metadata_carbon(carbon_dataset, schema)
    if not use_summary_metadata:
        _generate_num_blocklets_per_file_carbon(carbon_dataset,
                                                spark.sparkContext)

    _cleanup_spark(spark, spark_config, blocklet_size_mb)
Example #19
def copy_dataset(spark,
                 source_url,
                 target_url,
                 field_regex,
                 not_null_fields,
                 overwrite_output,
                 partitions_count,
                 row_group_size_mb,
                 hdfs_driver='libhdfs3'):
    """
    Creates a copy of a dataset. A new dataset will optionally contain a subset of columns. Rows that have NULL
    values in fields defined by ``not_null_fields`` argument are filtered out.


    :param spark: An instance of ``SparkSession`` object
    :param source_url: A url of the dataset to be copied.
    :param target_url: A url specifying location of the target dataset.
    :param field_regex: A list of regex patterns. Only columns that match one of these patterns are copied to the new
      dataset.
    :param not_null_fields: A list of fields that must have non-NULL values in the target dataset.
    :param overwrite_output: If ``False`` and there is an existing path defined by ``target_url``, the operation will
      fail.
    :param partitions_count: If not ``None``, the dataset is repartitioned before write. Number of files in the target
      Parquet store is defined by this parameter.
    :param row_group_size_mb: The size of the rowgroup in the target dataset. Specified in megabytes.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :return: None
    """
    schema = get_schema_from_dataset_url(source_url, hdfs_driver=hdfs_driver)

    fields = match_unischema_fields(schema, field_regex)

    if field_regex and not fields:
        field_names = list(schema.fields.keys())
        raise ValueError(
            'Regular expressions (%s) do not match any fields (%s)',
            str(field_regex), str(field_names))

    if fields:
        subschema = schema.create_schema_view(fields)
    else:
        subschema = schema

    resolver = FilesystemResolver(
        target_url,
        spark.sparkContext._jsc.hadoopConfiguration(),
        hdfs_driver=hdfs_driver)
    with materialize_dataset(spark,
                             target_url,
                             subschema,
                             row_group_size_mb,
                             pyarrow_filesystem=resolver.filesystem()):
        data_frame = spark.read \
            .parquet(source_url)

        if fields:
            data_frame = data_frame.select(*[f.name for f in fields])

        if not_null_fields:
            not_null_condition = reduce(operator.__and__,
                                        (data_frame[f].isNotNull()
                                         for f in not_null_fields))
            data_frame = data_frame.filter(not_null_condition)

        if partitions_count:
            data_frame = data_frame.repartition(partitions_count)

        data_frame.write \
            .mode('overwrite' if overwrite_output else 'error') \
            .option('compression', 'none') \
            .parquet(target_url)
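
A hedged usage sketch for copy_dataset, assuming spark is an existing SparkSession: copy only the columns matching the given regex patterns and drop rows with a NULL 'label' (URLs and field names are placeholders).

copy_dataset(spark,
             source_url='hdfs:///datasets/source',
             target_url='hdfs:///datasets/copy',
             field_regex=['image_.*', 'label'],
             not_null_fields=['label'],
             overwrite_output=True,
             partitions_count=None,
             row_group_size_mb=256)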
Example #20
def _create_dataset(store, df, validation, compress_sparse, num_partitions,
                    num_workers, dataset_idx, parquet_row_group_size_mb,
                    verbose):
    train_data_path = store.get_train_data_path(dataset_idx)
    val_data_path = store.get_val_data_path(dataset_idx)
    if verbose >= 1:
        print('CEREBRO => Time: {}, Writing DataFrames'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
        print('CEREBRO => Time: {}, Train Data Path: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            train_data_path))
        print('CEREBRO => Time: {}, Val Data Path: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            val_data_path))

    schema_cols = df.columns

    if isinstance(validation, str):
        schema_cols.append(validation)
    df = df[schema_cols]

    metadata = None
    if _has_vector_column(df):
        if compress_sparse:
            metadata = _get_metadata(df)
        to_petastorm = to_petastorm_fn(schema_cols, metadata)
        df = df.rdd.map(to_petastorm).toDF()

    train_df, val_df, validation_ratio = _train_val_split(df, validation)

    unischema_fields = []
    metadata = _get_metadata(train_df)
    for k in metadata.keys():
        type = spark_to_petastorm_type(metadata[k]['spark_data_type'])
        shape = petastorm_unischema_shape(metadata[k]['shape'])
        codec = petastorm_unischema_codec(metadata[k]['shape'],
                                          metadata[k]['spark_data_type'])
        unischema_fields.append(UnischemaField(k, type, shape, codec, False))

    petastorm_schema = Unischema('petastorm_schema', unischema_fields)

    train_partitions = max(int(num_partitions * (1.0 - validation_ratio)),
                           num_workers)
    if verbose >= 1:
        print('CEREBRO => Time: {}, Train Partitions: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            train_partitions))

    spark = SparkSession.builder.getOrCreate()
    # FIXME pass hdfs_driver from user interface instead of hardcoded PETASTORM_HDFS_DRIVER
    train_resolver = FilesystemResolver(
        train_data_path,
        spark.sparkContext._jsc.hadoopConfiguration(),
        user=spark.sparkContext.sparkUser(),
        hdfs_driver=constants.PETASTORM_HDFS_DRIVER)
    with materialize_dataset(
            spark,
            train_data_path,
            petastorm_schema,
            parquet_row_group_size_mb,
            filesystem_factory=train_resolver.filesystem_factory()):
        train_rdd = train_df.rdd.map(lambda x: x.asDict()).map(
            lambda x: {k: np.array(x[k], dtype=spark_to_petastorm_type(metadata[k]['spark_data_type'])) for k in x}) \
            .map(lambda x: dict_to_spark_row(petastorm_schema, x))

        spark.createDataFrame(train_rdd, petastorm_schema.as_spark_schema()) \
            .coalesce(train_partitions) \
            .write \
            .mode('overwrite') \
            .parquet(train_data_path)

    if val_df:
        val_partitions = max(int(num_partitions * validation_ratio),
                             num_workers)
        if verbose >= 1:
            print('CEREBRO => Time: {}, Val Partitions: {}'.format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                val_partitions))
        val_resolver = FilesystemResolver(
            val_data_path,
            spark.sparkContext._jsc.hadoopConfiguration(),
            user=spark.sparkContext.sparkUser(),
            hdfs_driver=constants.PETASTORM_HDFS_DRIVER)
        with materialize_dataset(
                spark,
                val_data_path,
                petastorm_schema,
                parquet_row_group_size_mb,
                filesystem_factory=val_resolver.filesystem_factory()):
            val_rdd = val_df.rdd.map(lambda x: x.asDict()).map(
                lambda x: {k: np.array(x[k], dtype=spark_to_petastorm_type(metadata[k]['spark_data_type'])) for k in x}) \
                .map(lambda x: dict_to_spark_row(petastorm_schema, x))

            spark.createDataFrame(val_rdd, petastorm_schema.as_spark_schema()) \
                .coalesce(val_partitions) \
                .write \
                .mode('overwrite') \
                .parquet(val_data_path)

    train_rows, val_rows, pq_metadata, avg_row_size = get_simple_meta_from_parquet(
        store, df.columns, dataset_idx)

    if verbose:
        print('CEREBRO => Time: {}, Train Rows: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), train_rows))
    if val_df:
        if val_rows == 0:
            raise ValueError(
                'Validation DataFrame does not contain any samples with validation param {}'
                .format(validation))
        if verbose:
            print('CEREBRO => Time: {}, Val Rows: {}'.format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                val_rows))

    return train_rows, val_rows, pq_metadata, avg_row_size
Example #21
def make_reader(dataset_url,
                schema_fields=None,
                reader_pool_type='thread', workers_count=10, pyarrow_serialize=False,
                shuffle_row_groups=True, shuffle_row_drop_partitions=1,
                predicate=None,
                rowgroup_selector=None,
                num_epochs=1,
                cur_shard=None, shard_count=None,
                cache_type='null', cache_location=None, cache_size_limit=None,
                cache_row_size_estimate=None, cache_extra_settings=None,
                hdfs_driver='libhdfs3',
                infer_schema=False,
                reader_engine='reader_v1', reader_engine_params=None):
    """
    Factory convenience method for :class:`Reader`.

    :param dataset_url: a filepath or a url to a parquet directory,
        e.g. ``'hdfs://some_hdfs_cluster/user/yevgeni/parquet8'``, or ``'file:///tmp/mydataset'``
        or ``'s3://bucket/mydataset'``.
    :param schema_fields: Either list of unischema fields to subset, or ``None`` to read all fields.
            OR an NGram object, then it will return an NGram of the specified properties.
    :param reader_pool_type: A string denoting the reader pool type. Should be one of ['thread', 'process', 'dummy']
        denoting a thread pool, process pool, or running everything in the master thread. Defaults to 'thread'
    :param workers_count: An int for the number of workers to use in the reader pool. This only is used for the
        thread or process pool. Defaults to 10
    :param pyarrow_serialize: Whether to use pyarrow for serialization. Currently only applicable to process pool.
        Defaults to False.
    :param shuffle_row_groups: Whether to shuffle row groups (the order in which full row groups are read)
    :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
        break up a row group into for increased shuffling in exchange for worse performance (extra reads).
        For example if you specify 2 each row group read will drop half of the rows within every row group and
        read the remaining rows in separate reads. It is recommended to keep this number below the regular row
        group size in order to not waste reads which drop all rows.
    :param predicate: instance of :class:`.PredicateBase` object to filter rows to be returned by reader.
    :param rowgroup_selector: instance of row group selector object to select row groups to be read
    :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to
        ``None`` will result in an infinite number of epochs.
    :param cur_shard: An int denoting the current shard number. Each node reading a shard should
        pass in a unique shard number in the range [0, shard_count). shard_count must be supplied as well.
        Defaults to None
    :param shard_count: An int denoting the number of shards to break this dataset into. Defaults to None
    :param cache_type: A string denoting the cache type, if desired. Options are [None, 'null', 'local-disk'] to
        either have a null/noop cache or a cache implemented using diskcache. Caching is useful when communication
        to the main data store is either slow or expensive and the local machine has large enough storage
        to store entire dataset (or a partition of a dataset if shard_count is used). By default will be a null cache.
    :param cache_location: A string denoting the location or path of the cache.
    :param cache_size_limit: An int specifying the size limit of the cache in bytes
    :param cache_row_size_estimate: An int specifying the estimated size of a row in the dataset
    :param cache_extra_settings: A dictionary of extra settings to pass to the cache implementation,
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :param infer_schema: Whether to infer the unischema object from the parquet schema.
            Only works for schemas containing certain scalar types. This option allows getting around explicitly
            generating petastorm metadata using :func:`petastorm.etl.dataset_metadata.materialize_dataset` or
            petastorm-generate-metadata.py
    :param reader_engine: Multiple engine implementations exist ('reader_v1' and 'experimental_reader_v2'). 'reader_v1'
        (the default value) selects a stable reader implementation.
    :param reader_engine_params: For advanced usage: a dictionary with arguments passed directly to a reader
        implementation constructor chosen by the ``reader_engine`` argument. You should not use this parameter
        unless you are fine-tuning a reader.
    :return: A :class:`Reader` object
    """

    if dataset_url is None or not isinstance(dataset_url, six.string_types):
        raise ValueError("""dataset_url must be a string""")

    dataset_url = dataset_url[:-1] if dataset_url[-1] == '/' else dataset_url
    logger.debug('dataset_url: %s', dataset_url)

    resolver = FilesystemResolver(dataset_url, hdfs_driver=hdfs_driver)
    filesystem = resolver.filesystem()
    dataset_path = resolver.get_dataset_path()

    if cache_type is None or cache_type == 'null':
        cache = NullCache()
    elif cache_type == 'local-disk':
        cache = LocalDiskCache(cache_location, cache_size_limit, cache_row_size_estimate, **cache_extra_settings or {})
    else:
        raise ValueError('Unknown cache_type: {}'.format(cache_type))

    if reader_engine == 'reader_v1':
        if reader_pool_type == 'thread':
            reader_pool = ThreadPool(workers_count)
        elif reader_pool_type == 'process':
            reader_pool = ProcessPool(workers_count, pyarrow_serialize=pyarrow_serialize)
        elif reader_pool_type == 'dummy':
            reader_pool = DummyPool()
        else:
            raise ValueError('Unknown reader_pool_type: {}'.format(reader_pool_type))

        # Create a dictionary with all Reader parameters, so we can merge with reader_engine_params if specified
        kwargs = {
            'schema_fields': schema_fields,
            'reader_pool': reader_pool,
            'shuffle_row_groups': shuffle_row_groups,
            'shuffle_row_drop_partitions': shuffle_row_drop_partitions,
            'predicate': predicate,
            'rowgroup_selector': rowgroup_selector,
            'num_epochs': num_epochs,
            'cur_shard': cur_shard,
            'shard_count': shard_count,
            'cache': cache,
            'infer_schema': infer_schema,
        }

        if reader_engine_params:
            kwargs.update(reader_engine_params)

        return Reader(filesystem, dataset_path, **kwargs)
    elif reader_engine == 'experimental_reader_v2':
        if reader_pool_type == 'thread':
            decoder_pool = ThreadPoolExecutor(workers_count)
        elif reader_pool_type == 'process':
            decoder_pool = ProcessPoolExecutor(workers_count)
        elif reader_pool_type == 'dummy':
            decoder_pool = SameThreadExecutor()
        else:
            raise ValueError('Unknown reader_pool_type: {}'.format(reader_pool_type))

        # TODO(yevgeni): once ReaderV2 is ready to be out of experimental status, we should extend
        # the make_reader interfaces to take shuffling buffer parameters explicitly
        shuffling_queue = RandomShufflingBuffer(1000, 800) if shuffle_row_groups else NoopShufflingBuffer()

        # Create a dictionary with all ReaderV2 parameters, so we can merge with reader_engine_params if specified
        kwargs = {
            'schema_fields': schema_fields,
            'predicate': predicate,
            'rowgroup_selector': rowgroup_selector,
            'num_epochs': num_epochs,
            'cur_shard': cur_shard,
            'shard_count': shard_count,
            'cache': cache,
            'decoder_pool': decoder_pool,
            'shuffling_queue': shuffling_queue,
            'shuffle_row_groups': shuffle_row_groups,
            'shuffle_row_drop_partitions': shuffle_row_drop_partitions,
            'infer_schema': infer_schema,
        }

        if reader_engine_params:
            kwargs.update(reader_engine_params)

        return ReaderV2(dataset_url, **kwargs)

    else:
        raise ValueError('Unexpected value of reader_engine argument \'%s\'. '
                         'Supported reader_engine values are \'reader_v1\' and \'experimental_reader_v2\'',
                         reader_engine)
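
A hedged usage sketch: make_reader is typically used as a context manager and iterated to yield decoded rows (the URL is a placeholder).

from petastorm import make_reader

with make_reader('hdfs:///path/to/my/dataset', num_epochs=1) as reader:
    for row in reader:
        pass  # each row is a namedtuple keyed by the Unischema field names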
Example #22
    def __init__(self,
                 dataset_url,
                 schema_fields=None,
                 predicate=None,
                 rowgroup_selector=None,
                 num_epochs=1,
                 sequence=None,
                 cur_shard=None,
                 shard_count=None,
                 read_timeout_s=None,
                 cache=None,
                 loader_pool=None,
                 decoder_pool=None,
                 shuffling_queue=None,
                 shuffle_row_groups=True,
                 shuffle_row_drop_partitions=1,
                 pyarrow_filesystem=None,
                 hdfs_driver='libhdfs3'):
        """Initializes a reader object.

        :param dataset_url: a filepath or a url to a parquet directory,
                       e.g. 'hdfs://some_hdfs_cluster/user/yevgeni/parquet8', or '/tmp/mydataset'
                       or ``'s3://bucket/mydataset'``.
        :param schema_fields:
            Either list of unischema fields to subset, or None to read all fields.
            OR an NGram object, then it will return an NGram of the specified properties.
        :param predicate: instance of predicate object to filter rows to be returned by reader.
        :param rowgroup_selector: instance of row group selector object to select row groups to be read
        :param reader_pool: parallelization pool. ThreadPool(10) (10 threads) is used by default.
                       This pool is a custom implementation used to parallelize reading data from the dataset.
                       Any object from workers_pool package can be used (e.g. ProcessPool)
        :param num_epochs: An epoch is a single pass over all samples in the dataset. Setting num_epochs to 'None' will
                       result in an infinite number of epochs.
        :param sequence: This is deprecated. To use sequence/ngram, please supply the argument in schema_fields instead.
        :param cur_shard: An int denoting the current shard number. Each node reading a shard should
                       pass in a unique shard number in the range [0, shard_count).
                       shard count must be supplied as well.
        :param shard_count: An int denoting the number of shards to break this dataset into.
        :param read_timeout_s: A numeric with the amount of time in seconds you would like to give a read before it
                       times out and raises an EmptyResultError. Pass in None for an infinite timeout
        :param cache: An object conforming to `cache.CacheBase` interface. Before loading row groups from a parquet file
                       the Reader will attempt to load these values from cache. Caching is useful when communication
                       to the main data store is either slow or expensive and the local machine has large enough storage
                       to store entire dataset (or a partition of a dataset if num_training_partitions is used).
        :param decoder_pool: An instance of a concurrent.futures pool executor used for decoding. If None,
          a default ThreadPoolExecutor(5) will be used.
        :param loader_pool: An instance of a concurrent.futures pool executor used for loading row groups. If None,
          a default ThreadPoolExecutor(5) will be used.
        :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)

        By default, the `NullCache` implementation is used.
        """

        # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
        # 2. Get a list of all groups
        # 3. Filter rowgroups
        #    a. predicates
        #    b. row-group selector (our indexing mechanism)
        #    c. partition: used to get a subset of data for distributed training
        # 4. Launch a new thread running `worker_loop` function.

        if dataset_url is None or not isinstance(dataset_url,
                                                 six.string_types):
            raise ValueError("""dataset_url must be a string""")

        if not (isinstance(schema_fields, collections.Iterable)
                or isinstance(schema_fields, NGram) or schema_fields is None):
            raise ValueError(
                """Fields must be either None, an iterable collection of Unischema fields or an NGram
            object.""")

        if sequence is not None:
            raise ValueError(
                """'sequence' argument of Reader object is deprecated. Please pass an NGram instance to
            'schema_fields' argument instead.""")

        # Can not rely on a check in epochs.py since it runs on a separate thread. Inform user earlier about invalid
        # argument value.
        if num_epochs is not None and (not isinstance(num_epochs, int)
                                       or num_epochs < 1):
            raise ValueError('num_epochs must be a positive integer or None')

        self.ngram = schema_fields if isinstance(schema_fields,
                                                 NGram) else None

        if self.ngram and not self.ngram.timestamp_overlap and shuffle_row_drop_partitions > 1:
            raise NotImplementedError(
                'Using timestamp_overlap=False is not implemented with'
                ' shuffle_row_drop_partitions > 1')

        cache = cache or NullCache()
        dataset_url = dataset_url[:-1] if dataset_url[
            -1] == '/' else dataset_url

        # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
        logger.debug('dataset_url: %s', dataset_url)

        if pyarrow_filesystem is not None:
            filesystem = pyarrow_filesystem
            dataset_path = urlparse(dataset_url).path
        else:
            resolver = FilesystemResolver(dataset_url)
            filesystem = resolver.filesystem()
            dataset_path = resolver.get_dataset_path()

        self._dataset = pq.ParquetDataset(dataset_path,
                                          filesystem=filesystem,
                                          validate_schema=False)

        shuffle_row_drop_partitions = self._normalize_shuffle_options(
            shuffle_row_drop_partitions, self._dataset)

        stored_schema = infer_or_load_unischema(self._dataset)

        # Make a schema view (a view is a Unischema containing only a subset of fields).
        # Will raise an exception if invalid schema fields are in schema_fields
        fields = schema_fields if isinstance(schema_fields,
                                             collections.Iterable) else None
        self.schema = stored_schema.create_schema_view(
            fields) if fields else stored_schema

        # 2. Get a list of all groups
        row_groups = dataset_metadata.load_row_groups(self._dataset)

        # 3. Filter rowgroups
        filtered_row_groups, worker_predicate = self._filter_row_groups(
            self._dataset, row_groups, predicate, rowgroup_selector, cur_shard,
            shard_count)

        epoch_items = self._apply_row_drop_partition(
            filtered_row_groups, shuffle_row_drop_partitions)

        # 4. Launch a new thread running `worker_loop` function.
        def epochs_iterator():
            return epoch_generator(epoch_items, num_epochs, shuffle_row_groups)

        self._results_queue = Queue(_OUTPUT_QUEUE_SIZE)

        loader = RowGroupLoader(dataset_url,
                                self.schema,
                                self.ngram,
                                cache,
                                worker_predicate,
                                hdfs_driver=hdfs_driver)
        decoder = RowDecoder(self.schema, self.ngram)
        self._loader_pool = loader_pool or ThreadPoolExecutor(5)
        self._decoder_pool = decoder_pool or ThreadPoolExecutor(5)
        self._stop_flow_manager_event = threading.Event()
        self._diags = Counter()

        if not shuffling_queue:
            shuffling_queue = NoopShufflingBuffer()

        self._flow_manager_thread = threading.Thread(
            target=worker_loop,
            args=(epochs_iterator, self._loader_pool, loader,
                  self._decoder_pool, decoder, shuffling_queue,
                  self._results_queue, self._stop_flow_manager_event,
                  self._diags))
        self._flow_manager_thread.daemon = True
        self._flow_manager_thread.start()

        self._read_timeout_s = read_timeout_s
        self.batched_output = False
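
The constructor above sets up a producer/consumer handoff: a daemon flow-manager thread keeps self._results_queue filled, and consumers later drain it, honoring read_timeout_s. Below is a minimal, generic sketch of that pattern only; produce_rows and the queue size are made-up illustrations, not Petastorm's actual worker_loop API.

# Illustrative only: a generic producer/consumer handoff similar in spirit to the
# flow-manager thread started above. produce_rows is a hypothetical stand-in.
import threading
from six.moves.queue import Queue, Empty

def produce_rows(results_queue, stop_event):
    # Stand-in for worker_loop: load, decode and enqueue rows until stopped.
    row_id = 0
    while not stop_event.is_set():
        results_queue.put({'id': row_id})
        row_id += 1

results_queue = Queue(maxsize=100)
stop_event = threading.Event()
producer = threading.Thread(target=produce_rows, args=(results_queue, stop_event))
producer.daemon = True
producer.start()

try:
    row = results_queue.get(timeout=30)  # analogous to read_timeout_s
    print(row)
except Empty:
    raise RuntimeError('Timed out waiting for a row')
finally:
    stop_event.set()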
Example #24
0
def make_batch_reader(dataset_url,
                      schema_fields=None,
                      reader_pool_type='thread',
                      workers_count=10,
                      shuffle_row_groups=True,
                      shuffle_row_drop_partitions=1,
                      predicate=None,
                      rowgroup_selector=None,
                      num_epochs=1,
                      cur_shard=None,
                      shard_count=None,
                      cache_type='null',
                      cache_location=None,
                      cache_size_limit=None,
                      cache_row_size_estimate=None,
                      cache_extra_settings=None,
                      hdfs_driver='libhdfs3'):
    """
    Creates an instance of Reader for reading batches out of a non-Petastorm Parquet store.

    Currently, only stores having native scalar parquet data types are supported.
    Use :func:`~petastorm.make_reader` to read Petastorm Parquet stores generated with
    :func:`~petastorm.etl.dataset_metadata.materialize_dataset`.

    NOTE: only scalar columns are currently supported.

    :param dataset_url: a filepath or a url to a parquet directory,
        e.g. ``'hdfs://some_hdfs_cluster/user/yevgeni/parquet8'``, or ``'file:///tmp/mydataset'``
        or ``'s3://bucket/mydataset'``.
    :param schema_fields: A list of regex pattern strings. Only columns matching at least one of the
        patterns in the list will be loaded.
    :param reader_pool_type: A string denoting the reader pool type. Should be one of ['thread', 'process', 'dummy']
        denoting a thread pool, process pool, or running everything in the master thread. Defaults to 'thread'
    :param workers_count: An int for the number of workers to use in the reader pool. This only is used for the
        thread or process pool. Defaults to 10
    :param shuffle_row_groups: Whether to shuffle row groups (the order in which full row groups are read)
    :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
        break up a row group into for increased shuffling in exchange for worse performance (extra reads).
        For example, if you specify 2, each row group read will drop half of the rows within every row group and
        read the remaining rows in separate reads. It is recommended to keep this number below the regular row
        group size in order to not waste reads which drop all rows.
    :param predicate: instance of :class:`.PredicateBase` object to filter rows to be returned by reader. The predicate
        will be passed a pandas DataFrame object and must return a pandas Series with boolean values of matching
        dimensions.
    :param rowgroup_selector: instance of row group selector object to select row groups to be read
    :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to
        ``None`` will result in an infinite number of epochs.
    :param cur_shard: An int denoting the current shard number. Each node reading a shard should
        pass in a unique shard number in the range [0, shard_count). shard_count must be supplied as well.
        Defaults to None
    :param shard_count: An int denoting the number of shards to break this dataset into. Defaults to None
    :param cache_type: A string denoting the cache type, if desired. Options are [None, 'null', 'local-disk'] to
        either have a null/noop cache or a cache implemented using diskcache. Caching is useful when communication
        to the main data store is either slow or expensive and the local machine has large enough storage
        to store entire dataset (or a partition of a dataset if shard_count is used). By default will be a null cache.
    :param cache_location: A string denoting the location or path of the cache.
    :param cache_size_limit: An int specifying the size limit of the cache in bytes
    :param cache_row_size_estimate: An int specifying the estimated size of a row in the dataset
    :param cache_extra_settings: A dictionary of extra settings to pass to the cache implementation.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :return: A :class:`Reader` object
    """

    if dataset_url is None or not isinstance(dataset_url, six.string_types):
        raise ValueError("""dataset_url must be a string""")

    dataset_url = dataset_url[:-1] if dataset_url[-1] == '/' else dataset_url
    logger.debug('dataset_url: %s', dataset_url)

    resolver = FilesystemResolver(dataset_url, hdfs_driver=hdfs_driver)
    filesystem = resolver.filesystem()

    dataset_path = resolver.parsed_dataset_url().path

    if cache_type is None or cache_type == 'null':
        cache = NullCache()
    elif cache_type == 'local-disk':
        cache = LocalDiskArrowTableCache(cache_location, cache_size_limit,
                                         cache_row_size_estimate,
                                         **cache_extra_settings or {})
    else:
        raise ValueError('Unknown cache_type: {}'.format(cache_type))

    if reader_pool_type == 'thread':
        reader_pool = ThreadPool(workers_count)
    elif reader_pool_type == 'process':
        serializer = ArrowTableSerializer()
        reader_pool = ProcessPool(workers_count, serializer)
    elif reader_pool_type == 'dummy':
        reader_pool = DummyPool()
    else:
        raise ValueError(
            'Unknown reader_pool_type: {}'.format(reader_pool_type))

    return Reader(filesystem,
                  dataset_path,
                  schema_fields=schema_fields,
                  worker_class=ArrowReaderWorker,
                  reader_pool=reader_pool,
                  shuffle_row_groups=shuffle_row_groups,
                  shuffle_row_drop_partitions=shuffle_row_drop_partitions,
                  predicate=predicate,
                  rowgroup_selector=rowgroup_selector,
                  num_epochs=num_epochs,
                  cur_shard=cur_shard,
                  shard_count=shard_count,
                  cache=cache)
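
For orientation, a reader returned by make_batch_reader is typically consumed as a context manager that yields columnar batches. A minimal usage sketch follows; the local dataset path is an assumption made purely for illustration.

# Minimal usage sketch for make_batch_reader; the dataset path is illustrative.
from petastorm import make_batch_reader

with make_batch_reader('file:///tmp/mydataset') as reader:
    for batch in reader:
        # Each batch holds columnar data for the columns selected by schema_fields.
        print(batch)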
Example #25
0
def materialize_dataset(spark,
                        dataset_url,
                        schema,
                        row_group_size_mb=None,
                        use_summary_metadata=False):
    """
    A Context Manager which handles all the initialization and finalization necessary
    to generate metadata for a petastorm dataset. This should be used around your
    spark logic to materialize a dataset (specifically the writing of parquet output).

    Note: Any rowgroup indexing should happen outside the materialize_dataset block

    Example:

    >>> spark = SparkSession.builder...
    >>> ds_url = 'hdfs:///path/to/my/dataset'
    >>> with materialize_dataset(spark, ds_url, MyUnischema, 64):
    >>>   spark.sparkContext.parallelize(range(0, 10)).
    >>>     ...
    >>>     .write.parquet(ds_url)
    >>> indexer = [SingleFieldIndexer(...)]
    >>> build_rowgroup_index(ds_url, spark.sparkContext, indexer)

    :param spark: The spark session you are using
    :param dataset_url: The dataset url to output your dataset to (e.g. ``hdfs:///path/to/dataset``)
    :param schema: The :class:`petastorm.unischema.Unischema` definition of your dataset
    :param row_group_size_mb: The parquet row group size to use for your dataset
    :param use_summary_metadata: Whether to use the parquet summary metadata for row group indexing or a custom
            indexing method. The custom indexing method is more scalable for very large datasets.
    """
    spark_config = {}
    _init_spark(spark, spark_config, row_group_size_mb, use_summary_metadata)
    yield

    # After job completes, add the unischema metadata and check for the metadata summary file
    resolver = FilesystemResolver(
        dataset_url, spark.sparkContext._jsc.hadoopConfiguration())
    dataset = pq.ParquetDataset(resolver.parsed_dataset_url().path,
                                filesystem=resolver.filesystem(),
                                validate_schema=False)

    _generate_unischema_metadata(dataset, schema)
    if not use_summary_metadata:
        _generate_num_row_groups_per_file(dataset, spark.sparkContext)

    # Reload the dataset to take into account the new metadata
    dataset = pq.ParquetDataset(resolver.parsed_dataset_url().path,
                                filesystem=resolver.filesystem(),
                                validate_schema=False)
    try:
        # Try to load the row groups, if it fails that means the metadata was not generated properly
        load_row_groups(dataset)
    except PetastormMetadataError:
        raise PetastormMetadataGenerationError(
            'Could not find summary metadata file. The dataset will exist but you will need'
            ' to execute petastorm-generate-metadata.py before you can read your dataset'
            ' in order to generate the necessary metadata.'
            ' Try increasing spark driver memory next time and making sure you are'
            ' using parquet-mr >= 1.8.3')

    _cleanup_spark(spark, spark_config, row_group_size_mb)
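
To put the context manager in perspective, here is a sketch of an end-to-end write loosely following the usage pattern shown in the docstring above. The schema, the row generator, and the output URL are assumptions for illustration, not part of the code above.

# Illustrative end-to-end write using materialize_dataset. The schema, the row
# generator and the output URL below are assumptions, not taken from the code above.
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
from petastorm.codecs import ScalarCodec, NdarrayCodec
from petastorm.etl.dataset_metadata import materialize_dataset
from petastorm.unischema import Unischema, UnischemaField, dict_to_spark_row

HelloWorldSchema = Unischema('HelloWorldSchema', [
    UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
    UnischemaField('vector', np.float32, (10,), NdarrayCodec(), False),
])

spark = SparkSession.builder.master('local[2]').getOrCreate()
output_url = 'file:///tmp/hello_world_dataset'

with materialize_dataset(spark, output_url, HelloWorldSchema, row_group_size_mb=64):
    rows_rdd = spark.sparkContext.parallelize(range(100)) \
        .map(lambda x: {'id': x, 'vector': np.random.rand(10).astype(np.float32)}) \
        .map(lambda r: dict_to_spark_row(HelloWorldSchema, r))
    spark.createDataFrame(rows_rdd, HelloWorldSchema.as_spark_schema()) \
        .write.mode('overwrite').parquet(output_url)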
Example #26
0
def test_s3_without_s3fs(self):
    with mock.patch.dict('sys.modules', s3fs=None):
        # `import s3fs` will fail in this context
        with self.assertRaises(ValueError):
            FilesystemResolver(urlparse('s3://foo/bar'), {})
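
The test works because mock.patch.dict('sys.modules', s3fs=None) temporarily sets sys.modules['s3fs'] to None, which makes any import of s3fs inside the block raise ImportError. A standalone sketch of the same technique, using an arbitrary module name ('json') purely for illustration:

# Standalone illustration: setting a sys.modules entry to None makes the import
# system raise ImportError for that name while the patch is active.
# 'json' is an arbitrary example module, unrelated to the test above.
from unittest import mock

with mock.patch.dict('sys.modules', json=None):
    try:
        import json  # noqa: F401
    except ImportError:
        print('import failed as expected while the module entry is None')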
Example #27
0
def make_reader(dataset_url,
                schema_fields=None,
                reader_pool_type='thread',
                workers_count=10,
                pyarrow_serialize=False,
                results_queue_size=50,
                shuffle_row_groups=True,
                shuffle_row_drop_partitions=1,
                predicate=None,
                rowgroup_selector=None,
                num_epochs=1,
                cur_shard=None,
                shard_count=None,
                cache_type='null',
                cache_location=None,
                cache_size_limit=None,
                cache_row_size_estimate=None,
                cache_extra_settings=None,
                hdfs_driver='libhdfs3',
                reader_engine='reader_v1',
                reader_engine_params=None):
    """
    Creates an instance of Reader for reading Petastorm datasets. A Petastorm dataset is a dataset generated using
    :func:`~petastorm.etl.dataset_metadata.materialize_dataset` context manager as explained
    `here <https://petastorm.readthedocs.io/en/latest/readme_include.html#generating-a-dataset>`_.

    See :func:`~petastorm.make_batch_reader` to read from a Parquet store that was not generated using
    :func:`~petastorm.etl.dataset_metadata.materialize_dataset`.

    :param dataset_url: a filepath or a url to a parquet directory,
        e.g. ``'hdfs://some_hdfs_cluster/user/yevgeni/parquet8'``, or ``'file:///tmp/mydataset'``
        or ``'s3://bucket/mydataset'``.
    :param schema_fields: Can be: a list of unischema fields and/or regex pattern strings; ``None`` to read all fields;
            an NGram object, then it will return an NGram of the specified fields.
    :param reader_pool_type: A string denoting the reader pool type. Should be one of ['thread', 'process', 'dummy']
        denoting a thread pool, process pool, or running everything in the master thread. Defaults to 'thread'
    :param workers_count: An int for the number of workers to use in the reader pool. This only is used for the
        thread or process pool. Defaults to 10
    :param pyarrow_serialize: Whether to use pyarrow for serialization. Currently only applicable to process pool.
        Defaults to False.
    :param results_queue_size: Size of the results queue to store prefetched rows. Currently only applicable to
        thread reader pool type.
    :param shuffle_row_groups: Whether to shuffle row groups (the order in which full row groups are read)
    :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
        break up a row group into for increased shuffling in exchange for worse performance (extra reads).
        For example, if you specify 2, each row group read will drop half of the rows within every row group and
        read the remaining rows in separate reads. It is recommended to keep this number below the regular row
        group size in order to not waste reads which drop all rows.
    :param predicate: instance of :class:`.PredicateBase` object to filter rows to be returned by reader. The predicate
        will be passed a single row and must return a boolean value indicating whether to include it in the results.
    :param rowgroup_selector: instance of row group selector object to select row groups to be read
    :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to
        ``None`` will result in an infinite number of epochs.
    :param cur_shard: An int denoting the current shard number. Each node reading a shard should
        pass in a unique shard number in the range [0, shard_count). shard_count must be supplied as well.
        Defaults to None
    :param shard_count: An int denoting the number of shards to break this dataset into. Defaults to None
    :param cache_type: A string denoting the cache type, if desired. Options are [None, 'null', 'local-disk'] to
        either have a null/noop cache or a cache implemented using diskcache. Caching is useful when communication
        to the main data store is either slow or expensive and the local machine has large enough storage
        to store entire dataset (or a partition of a dataset if shard_count is used). By default will be a null cache.
    :param cache_location: A string denoting the location or path of the cache.
    :param cache_size_limit: An int specifying the size limit of the cache in bytes
    :param cache_row_size_estimate: An int specifying the estimated size of a row in the dataset
    :param cache_extra_settings: A dictionary of extra settings to pass to the cache implementation.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :param reader_engine: Multiple engine implementations exist ('reader_v1' and 'experimental_reader_v2'). 'reader_v1'
        (the default value) selects a stable reader implementation.
    :param reader_engine_params: For advanced usage: a dictionary with arguments passed directly to a reader
        implementation constructor chosen by ``reader_engine`` argument.  You should not use this parameter, unless you
        fine-tuning of a reader.
    :return: A :class:`Reader` object
    """

    if dataset_url is None or not isinstance(dataset_url, six.string_types):
        raise ValueError("""dataset_url must be a string""")

    dataset_url = dataset_url[:-1] if dataset_url[-1] == '/' else dataset_url
    logger.debug('dataset_url: %s', dataset_url)

    resolver = FilesystemResolver(dataset_url, hdfs_driver=hdfs_driver)
    filesystem = resolver.filesystem()
    dataset_path = resolver.get_dataset_path()

    if cache_type is None or cache_type == 'null':
        cache = NullCache()
    elif cache_type == 'local-disk':
        cache = LocalDiskCache(cache_location, cache_size_limit,
                               cache_row_size_estimate, **cache_extra_settings
                               or {})
    else:
        raise ValueError('Unknown cache_type: {}'.format(cache_type))

    # Fail if this is a non-petastorm dataset. Typically, a Parquet store will have hundreds of thousands of rows
    # in a single rowgroup. Using PyDictReaderWorker or the ReaderV2 implementation is very inefficient as it
    # processes data on a row-by-row basis. ArrowReaderWorker (used by make_batch_reader) is much more efficient
    # in these cases.
    try:
        dataset_metadata.get_schema_from_dataset_url(dataset_url)
    except PetastormMetadataError:
        raise RuntimeError(
            'Currently make_reader supports reading only Petastorm datasets. '
            'To read from a non-Petastorm Parquet store use make_batch_reader')

    if reader_engine == 'reader_v1':
        if reader_pool_type == 'thread':
            reader_pool = ThreadPool(workers_count, results_queue_size)
        elif reader_pool_type == 'process':
            if pyarrow_serialize:
                serializer = PyArrowSerializer()
            else:
                serializer = PickleSerializer()
            reader_pool = ProcessPool(workers_count, serializer)
        elif reader_pool_type == 'dummy':
            reader_pool = DummyPool()
        else:
            raise ValueError(
                'Unknown reader_pool_type: {}'.format(reader_pool_type))

        # Create a dictionary with all Reader parameters, so we can merge with reader_engine_params if specified
        kwargs = {
            'schema_fields': schema_fields,
            'reader_pool': reader_pool,
            'shuffle_row_groups': shuffle_row_groups,
            'shuffle_row_drop_partitions': shuffle_row_drop_partitions,
            'predicate': predicate,
            'rowgroup_selector': rowgroup_selector,
            'num_epochs': num_epochs,
            'cur_shard': cur_shard,
            'shard_count': shard_count,
            'cache': cache,
        }

        if reader_engine_params:
            kwargs.update(reader_engine_params)

        try:
            return Reader(filesystem,
                          dataset_path,
                          worker_class=PyDictReaderWorker,
                          **kwargs)
        except PetastormMetadataError as e:
            logger.error('Unexpected exception: %s', str(e))
            raise RuntimeError(
                'make_reader has failed. If you were trying to open a Parquet store that was not '
                'created using Petastorm materialize_dataset and it contains only scalar columns, '
                'you may use make_batch_reader to read it.\n'
                'Inner exception: %s' % str(e))

    elif reader_engine == 'experimental_reader_v2':
        if reader_pool_type == 'thread':
            decoder_pool = ThreadPoolExecutor(workers_count)
        elif reader_pool_type == 'process':
            decoder_pool = ProcessPoolExecutor(workers_count)
        elif reader_pool_type == 'dummy':
            decoder_pool = SameThreadExecutor()
        else:
            raise ValueError(
                'Unknown reader_pool_type: {}'.format(reader_pool_type))

        # TODO(yevgeni): once ReaderV2 is ready to be out of experimental status, we should extend
        # the make_reader interfaces to take shuffling buffer parameters explicitly
        shuffling_queue = RandomShufflingBuffer(
            1000, 800) if shuffle_row_groups else NoopShufflingBuffer()

        # Create a dictionary with all ReaderV2 parameters, so we can merge with reader_engine_params if specified
        kwargs = {
            'schema_fields': schema_fields,
            'predicate': predicate,
            'rowgroup_selector': rowgroup_selector,
            'num_epochs': num_epochs,
            'cur_shard': cur_shard,
            'shard_count': shard_count,
            'cache': cache,
            'decoder_pool': decoder_pool,
            'shuffling_queue': shuffling_queue,
            'shuffle_row_groups': shuffle_row_groups,
            'shuffle_row_drop_partitions': shuffle_row_drop_partitions,
        }

        if reader_engine_params:
            kwargs.update(reader_engine_params)

        return ReaderV2(dataset_url, **kwargs)

    else:
        raise ValueError(
            'Unexpected value of reader_engine argument \'%s\'. '
            'Supported reader_engine values are \'reader_v1\' and \'experimental_reader_v2\'',
            reader_engine)
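
For completeness, a reader created by make_reader is usually iterated the same way as the batch reader, with each row exposed as a namedtuple of decoded Unischema fields. A minimal consumption sketch; the dataset URL and the 'id' field are illustrative assumptions.

# Minimal usage sketch for make_reader; the dataset URL and the 'id' field are
# assumptions for illustration only.
from petastorm import make_reader

with make_reader('file:///tmp/petastorm_dataset', num_epochs=1) as reader:
    for row in reader:
        # Each row is a namedtuple with one attribute per Unischema field.
        print(row.id)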
Example #28
0
                        help='Display list of row group indexes')
    parser.add_argument('--print-values',
                        action='store_true',
                        help='Print index values (dataset piece indexes)')
    parser.add_argument('--skip-index',
                        nargs='+',
                        type=str,
                        help='Do not display indexed values for given fields')

    args = parser.parse_args()

    if args.dataset_url and args.dataset_url[-1] == '/':
        args.dataset_url = args.dataset_url[:-1]

    # Create pyarrow file system
    resolver = FilesystemResolver(args.dataset_url)
    dataset = pq.ParquetDataset(resolver.get_dataset_path(),
                                filesystem=resolver.filesystem(),
                                validate_schema=False)

    print_all = not args.schema and not args.index
    if args.schema or print_all:
        print('*** Schema from dataset metadata ***')
        print((dataset_metadata.get_schema(dataset)))

    if args.index or print_all:
        index_dict = rowgroup_indexing.get_row_group_indexes(dataset)
        print('*** Row group indexes from dataset metadata ***')
        for index_name in index_dict:
            print(('Index: {}'.format(index_name)))
            if args.skip_index is None or index_name not in args.skip_index: