Example #1
def build_rowgroup_index(dataset_url, spark_context, indexers):
    """
    Build index for given list of fields to use for fast rowgroup selection
    :param dataset_url: (str) the url for the dataset (or a path if you would like to use the default hdfs config)
    :param spark_context: (SparkContext)
    :param indexers: list of indexer objects used to build row group indexes. Each must support the RowGroupIndexerBase interface
    :return: None, upon successful completion the rowgroup predicates will be saved to _metadata file
    """

    if dataset_url and dataset_url[-1] == '/':
        dataset_url = dataset_url[:-1]

    # Create pyarrow file system
    resolver = FilesystemResolver(dataset_url,
                                  spark_context._jsc.hadoopConfiguration())
    dataset = pq.ParquetDataset(resolver.get_dataset_path(),
                                filesystem=resolver.filesystem(),
                                validate_schema=False)

    split_pieces = dataset_metadata.load_row_groups(dataset)
    schema = dataset_metadata.get_schema(dataset)

    # We need a direct reference to the partitions object
    partitions = dataset.partitions
    pieces_num = len(split_pieces)
    piece_info_list = []
    for piece_index in range(pieces_num):
        # Indexes rely on the ordering of the split dataset pieces. That ordering depends on how the
        # dataset pieces are split and sorted; although it should not change, it could, and we should
        # keep in mind that such a change would break the indexes.
        piece = split_pieces[piece_index]
        piece_info_list.append(
            PieceInfo(piece_index, piece.path, piece.row_group,
                      piece.partition_keys))

    start_time = time.time()
    piece_info_rdd = spark_context.parallelize(
        piece_info_list, min(len(piece_info_list), PARALLEL_SLICE_NUM))
    indexer_rdd = piece_info_rdd.map(lambda piece_info: _index_columns(
        piece_info, dataset_url, partitions, indexers, schema))
    indexer_list = indexer_rdd.reduce(_combine_indexers)

    indexer_dict = {indexer.index_name: indexer for indexer in indexer_list}
    serialized_indexers = pickle.dumps(indexer_dict, pickle.HIGHEST_PROTOCOL)
    utils.add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY,
                                  serialized_indexers)
    logger.info("Elapsed time of index creation: %f s",
                (time.time() - start_time))
def _index_columns(piece_info,
                   dataset_url,
                   partitions,
                   indexers,
                   schema,
                   hdfs_driver='libhdfs3'):
    """
    Builds indexes for the dataset piece described in piece_info
    :param piece_info: description of dataset piece
    :param dataset_url: dataset location
    :param partitions: dataset partitions
    :param indexers: list of indexer objects
    :param schema: dataset schema
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :return: list of indexers containing index data
    """
    # Create pyarrow piece
    piece = pq.ParquetDatasetPiece(piece_info.path,
                                   row_group=piece_info.row_group,
                                   partition_keys=piece_info.partition_keys)

    # Collect column names needed for indexing
    column_names = set()
    for indexer in indexers:
        column_names.update(indexer.column_names)

    # Read columns needed for indexing
    # Resolver in executor context will get hadoop config from environment
    resolver = FilesystemResolver(dataset_url, hdfs_driver=hdfs_driver)
    column_rows = piece.read(
        open_file_func=resolver.filesystem().open,
        columns=list(column_names),
        partitions=partitions).to_pandas().to_dict('records')

    # Decode column values
    decoded_rows = [utils.decode_row(row, schema) for row in column_rows]
    if not decoded_rows:
        raise ValueError(
            'Cannot build index with empty decoded_rows, columns: {}, partitions: {}'
            .format(column_names, partitions))

    # Index column values
    for indexer in indexers:
        indexer.build_index(decoded_rows, piece_info.piece_index)

    # Indexer objects contain the index data; it will be consolidated in the reduce phase
    return indexers
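
A minimal end-to-end sketch of how build_rowgroup_index is typically invoked after a dataset has been materialized. The import paths, the SingleFieldIndexer signature and the field name 'id' are assumptions; adjust them to your petastorm version and schema.

from pyspark.sql import SparkSession
from petastorm.etl.rowgroup_indexing import build_rowgroup_index
from petastorm.etl.rowgroup_indexers import SingleFieldIndexer

spark = SparkSession.builder.appName('index-builder').getOrCreate()
dataset_url = 'hdfs:///path/to/my/dataset'  # placeholder URL

# One indexer per field we want to use for fast row group selection.
# SingleFieldIndexer(index_name, index_field) is assumed here.
indexers = [SingleFieldIndexer('id_index', 'id')]
build_rowgroup_index(dataset_url, spark.sparkContext, indexers)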
Example #3
def get_schema_from_dataset_url(dataset_url, hdfs_driver='libhdfs3'):
    """Returns a :class:`petastorm.unischema.Unischema` object loaded from a dataset specified by a url.

    :param dataset_url: A dataset URL
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :return: A :class:`petastorm.unischema.Unischema` object
    """
    resolver = FilesystemResolver(dataset_url, hdfs_driver=hdfs_driver)
    dataset = pq.ParquetDataset(resolver.get_dataset_path(), filesystem=resolver.filesystem(),
                                validate_schema=False)

    # Get a unischema stored in the dataset metadata.
    stored_schema = get_schema(dataset)

    return stored_schema
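
A short usage sketch, assuming a petastorm dataset with stored metadata already exists at the placeholder URL:

schema = get_schema_from_dataset_url('hdfs:///path/to/my/dataset')
print(list(schema.fields.keys()))  # names of the Unischema fields stored in the dataset metadata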
Example #4
    def test_error_url_cases(self):
        """Various error cases that result in exception raised."""
        # Case 1: Schemeless path asserts
        with self.assertRaises(ValueError):
            FilesystemResolver(ABS_PATH, {})

        # Case 4b: HDFS default path case with NO defaultFS
        with self.assertRaises(RuntimeError):
            FilesystemResolver('hdfs:///some/path', {})

        # Case 4b: Using `default` as host, while apparently a pyarrow convention, is NOT valid
        with self.assertRaises(ArrowIOError):
            FilesystemResolver('hdfs://default', {})

        # Case 5: other schemes result in ValueError; urlparse to cover an else branch!
        with self.assertRaises(ValueError):
            FilesystemResolver(urlparse('http://foo/bar'), {})
        with self.assertRaises(ValueError):
            FilesystemResolver(urlparse('ftp://foo/bar'), {})
        with self.assertRaises(ValueError):
            FilesystemResolver(urlparse('ssh://foo/bar'), {})

        # s3 paths must have the bucket as the netloc
        with self.assertRaises(ValueError):
            FilesystemResolver(urlparse('s3:///foo/bar'), {})
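
For contrast, a hedged sketch of URL forms the resolver is expected to accept, based on the positive test cases further below. Paths are placeholders, the import path is an assumption, and the hdfs and s3 calls require the corresponding storage to be reachable.

from pyspark.sql import SparkSession
from petastorm.fs_utils import FilesystemResolver  # assumed import path

spark = SparkSession.builder.getOrCreate()
hadoop_configuration = spark.sparkContext._jsc.hadoopConfiguration()

# file:// URLs resolve to a local filesystem regardless of the hadoop configuration.
FilesystemResolver('file:///tmp/mydataset', {})
# hdfs:// URLs with an explicit namenode or nameservice rely on the hadoop configuration.
FilesystemResolver('hdfs://namenode:8020/some/path', hadoop_configuration)
# s3:// URLs must carry the bucket in the netloc position.
FilesystemResolver('s3://bucket/some/path', {})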
Example #5
@contextmanager
def materialize_dataset(spark, dataset_url, schema, row_group_size_mb=None):
    """
    A Context Manager which handles all the initialization and finalization necessary
    to generate metadata for a petastorm dataset. This should be used around your
    spark logic to materialize a dataset (specifically the writing of parquet output).

    Note: Any rowgroup indexing should happen outside the materialize_dataset block

    e.g.
    spark = SparkSession.builder...
    dataset_url = 'hdfs:///path/to/my/dataset'
    with materialize_dataset(spark, dataset_url, MyUnischema, 64):
      spark.sparkContext.parallelize(range(0, 10)).\
        ...
        .write.parquet(dataset_url)

    indexers = [SingleFieldIndexer(...)]
    build_rowgroup_index(dataset_url, spark.sparkContext, indexers)

    :param spark: The spark session you are using
    :param dataset_url: The dataset url to output your dataset to (e.g. hdfs:///path/to/dataset)
    :param schema: The unischema definition of your dataset
    :param row_group_size_mb: The parquet row group size to use for your dataset
    """
    spark_config = {}
    _init_spark(spark, spark_config, row_group_size_mb)
    yield

    # After job completes, add the unischema metadata and check for the metadata summary file
    resolver = FilesystemResolver(
        dataset_url, spark.sparkContext._jsc.hadoopConfiguration())
    dataset = pq.ParquetDataset(resolver.parsed_dataset_url().path,
                                filesystem=resolver.filesystem(),
                                validate_schema=False)

    _generate_unischema_metadata(dataset, schema)
    if not dataset.metadata_path:
        raise MetadataGenerationError(
            'Could not find summary metadata file. The dataset will exist but you will need'
            ' to execute petastorm-generate-metadata before you can read your dataset '
            ' in order to generate the necessary metadata.'
            ' Try increasing spark driver memory next time and making sure you are'
            ' using parquet-mr >= 1.8.3')

    _cleanup_spark(spark, spark_config, row_group_size_mb)
def test_atexit(test_ctx):
    lines = """
    from petastorm.spark import SparkDatasetConverter, make_spark_converter
    from pyspark.sql import SparkSession
    import os
    spark = SparkSession.builder.getOrCreate()
    spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF, '{temp_url}')
    df = spark.createDataFrame([(1, 2),(4, 5)], ["col1", "col2"])
    converter = make_spark_converter(df)
    f = open(os.path.join('{tempdir}', 'test_atexit.out'), "w")
    f.write(converter.cache_dir_url)
    f.close()
    """.format(tempdir=test_ctx.tempdir, temp_url=test_ctx.temp_url)
    code_str = "; ".join(line.strip() for line in lines.strip().splitlines())
    ret_code = subprocess.call([sys.executable, "-c", code_str])
    assert 0 == ret_code
    with open(os.path.join(test_ctx.tempdir, 'test_atexit.out')) as f:
        cache_dir_url = f.read()

    fs = FilesystemResolver(cache_dir_url).filesystem()
    assert not fs.exists(urlparse(cache_dir_url).path)
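
A hedged sketch of the converter workflow the test above exercises: cache a DataFrame as a Petastorm dataset under the configured parent cache directory, consume it, and clean up. The atexit hook the test relies on removes the cache automatically when the interpreter exits; make_tf_dataset assumes TensorFlow is installed and is only one of the available consumers.

from pyspark.sql import SparkSession
from petastorm.spark import SparkDatasetConverter, make_spark_converter

spark = SparkSession.builder.getOrCreate()
spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF, 'file:///tmp/petastorm_cache')

df = spark.createDataFrame([(1, 2), (4, 5)], ['col1', 'col2'])
converter = make_spark_converter(df)

with converter.make_tf_dataset() as tf_dataset:  # assumes TensorFlow is installed
    pass  # feed tf_dataset into a training loop here

converter.delete()  # or rely on the atexit hook to remove the cached files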
Example #7
    def __init__(self, dataset_url, schema, ngram, local_cache,
                 worker_predicate):
        """RowGroupLoader responsible for loading one rowgroup at a time. Rows returned are returned encoded.

        :param dataset_url: A url of a parquet dataset.
        :param schema: A unischema corresponding to the data in the dataset
        :param ngram: An instance of NGram if ngrams should be read or None, if each row in the dataset corresponds to
          a single sample returned.
        :param local_cache: An instance of a rowgroup cache (CacheBase interface) object to be used.
        :param worker_predicate: An instance of predicate (PredicateBase interface)
        """
        self._dataset_url_parsed = urlparse(dataset_url)
        self._schema = schema
        self._ngram = ngram
        self._local_cache = local_cache
        self._worker_predicate = worker_predicate

        resolver = FilesystemResolver(self._dataset_url_parsed)
        self._dataset = pq.ParquetDataset(resolver.get_dataset_path(),
                                          filesystem=resolver.filesystem(),
                                          validate_schema=False)
Example #8
    def __init__(self, dataset_url, schema, ngram, local_cache, worker_predicate, hdfs_driver='libhdfs3'):
        """RowGroupLoader responsible for loading one rowgroup at a time. Rows returned are returned encoded.

        :param dataset_url: A url of a parquet dataset.
        :param schema: A unischema corresponding to the data in the dataset
        :param ngram: An instance of NGram if ngrams should be read or None, if each row in the dataset corresponds to
          a single sample returned.
        :param local_cache: An instance of a rowgroup cache (CacheBase interface) object to be used.
        :param worker_predicate: An instance of predicate (PredicateBase interface)
        :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
        """
        self._dataset_url_parsed = urlparse(dataset_url)
        self._schema = schema
        self._ngram = ngram
        self._local_cache = local_cache
        self._worker_predicate = worker_predicate

        resolver = FilesystemResolver(self._dataset_url_parsed, hdfs_driver=hdfs_driver)
        self._dataset = pq.ParquetDataset(
            resolver.get_dataset_path(),
            filesystem=resolver.filesystem(),
            validate_schema=False)
Example #9
    def test_s3_url(self):
        suj = FilesystemResolver('s3://bucket{}'.format(ABS_PATH),
                                 self._hadoop_configuration,
                                 connector=self.mock)
        self.assertTrue(isinstance(suj.filesystem(), S3FSWrapper))
        self.assertEqual('bucket', suj.parsed_dataset_url().netloc)
        self.assertEqual('bucket' + ABS_PATH, suj.get_dataset_path())

        # Make sure we did not capture FilesystemResolver in a closure by mistake
        dill.dumps(suj.filesystem_factory())
Example #10
    def test_file_url(self):
        """ Case 2: File path, agnostic to content of hadoop configuration."""
        suj = FilesystemResolver('file://{}'.format(ABS_PATH),
                                 self._hadoop_configuration,
                                 connector=self.mock)
        self.assertTrue(isinstance(suj.filesystem(), LocalFileSystem))
        self.assertEqual('', suj.parsed_dataset_url().netloc)
        self.assertEqual(ABS_PATH, suj.get_dataset_path())

        # Make sure we did not capture FilesystemResolver in a closure by mistake
        dill.dumps(suj.filesystem_factory())
Example #11
    def test_hdfs_url_no_nameservice(self):
        """ Case 3b: HDFS with no nameservice should connect to default namenode."""
        suj = FilesystemResolver('hdfs:///some/path',
                                 self._hadoop_configuration,
                                 connector=self.mock,
                                 user=self.mock_name)
        self.assertEqual(MockHdfs, type(suj.filesystem()._hdfs))
        self.assertEqual(self.mock_name, suj.filesystem()._user)
        self.assertEqual(HC.WARP_TURTLE, suj.parsed_dataset_url().netloc)
        # ensure path is preserved in parsed URL
        self.assertEqual('/some/path', suj.get_dataset_path())
        self.assertEqual(1, self.mock.connect_attempted(HC.WARP_TURTLE_NN2))
        self.assertEqual(0, self.mock.connect_attempted(HC.WARP_TURTLE_NN1))
        self.assertEqual(0, self.mock.connect_attempted(HC.DEFAULT_NN))

        # Make sure we did not capture FilesystemResolver in a closure by mistake
        dill.dumps(suj.filesystem_factory())
Example #12
    def test_hdfs_url_direct_namenode(self):
        """ Case 4: direct namenode."""
        suj = FilesystemResolver('hdfs://{}/path'.format(HC.WARP_TURTLE_NN1),
                                 self._hadoop_configuration,
                                 connector=self.mock,
                                 user=self.mock_name)
        self.assertEqual(MockHdfs, type(suj.filesystem()))
        self.assertEqual(self.mock_name, suj.filesystem()._user)
        self.assertEqual(HC.WARP_TURTLE_NN1, suj.parsed_dataset_url().netloc)
        self.assertEqual(0, self.mock.connect_attempted(HC.WARP_TURTLE_NN2))
        self.assertEqual(1, self.mock.connect_attempted(HC.WARP_TURTLE_NN1))
        self.assertEqual(0, self.mock.connect_attempted(HC.DEFAULT_NN))

        # Make sure we did not capture FilesystemResolver in a closure by mistake
        dill.dumps(suj.filesystem_factory())
Example #13
    def test_hdfs_url_with_nameservice(self):
        """ Case 3a: HDFS nameservice."""
        suj = FilesystemResolver(HC.WARP_TURTLE_PATH,
                                 self._hadoop_configuration,
                                 connector=self.mock,
                                 user=self.mock_name)
        self.assertEqual(MockHdfs, type(suj.filesystem()._hdfs))
        self.assertEqual(self.mock_name, suj.filesystem()._user)
        self.assertEqual(HC.WARP_TURTLE, suj.parsed_dataset_url().netloc)
        self.assertEqual(1, self.mock.connect_attempted(HC.WARP_TURTLE_NN2))
        self.assertEqual(0, self.mock.connect_attempted(HC.WARP_TURTLE_NN1))
        self.assertEqual(0, self.mock.connect_attempted(HC.DEFAULT_NN))

        # Make sure we did not capture FilesystemResolver in a closure by mistake
        dill.dumps(suj.filesystem_factory())
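
The dill.dumps(suj.filesystem_factory()) assertions above guard against accidentally capturing the resolver (and its JVM-backed hadoop configuration) in a closure. A minimal sketch of why that matters, assuming the factory is shipped to Spark executors; the import path and helper name are placeholders.

from petastorm.fs_utils import FilesystemResolver  # assumed import path

resolver = FilesystemResolver('file:///tmp/mydataset')
filesystem_factory = resolver.filesystem_factory()

def open_on_executor(path):
    # Hypothetical executor-side helper: the factory must be picklable on its own
    # (no resolver in the closure), so each executor re-creates the filesystem locally.
    fs = filesystem_factory()
    return fs.open(path)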
@contextmanager
def materialize_dataset(spark,
                        dataset_url,
                        schema,
                        row_group_size_mb=None,
                        use_summary_metadata=False,
                        filesystem_factory=None):
    """
    A Context Manager which handles all the initialization and finalization necessary
    to generate metadata for a petastorm dataset. This should be used around your
    spark logic to materialize a dataset (specifically the writing of parquet output).

    Note: Any rowgroup indexing should happen outside the materialize_dataset block

    Example:

    >>> spark = SparkSession.builder...
    >>> ds_url = 'hdfs:///path/to/my/dataset'
    >>> with materialize_dataset(spark, ds_url, MyUnischema, 64):
    >>>   spark.sparkContext.parallelize(range(0, 10)).
    >>>     ...
    >>>     .write.parquet(ds_url)
    >>> indexer = [SingleFieldIndexer(...)]
    >>> build_rowgroup_index(ds_url, spark.sparkContext, indexer)

    A user may provide their own recipe for creating a pyarrow filesystem object via the ``filesystem_factory``
    argument (otherwise, petastorm will create a default one based on the url).

    The following example shows how a custom pyarrow HDFS filesystem, instantiated using ``libhdfs`` driver can be used
    during Petastorm dataset generation:

    >>> resolver=FilesystemResolver(dataset_url, spark.sparkContext._jsc.hadoopConfiguration(),
    >>>                             hdfs_driver='libhdfs')
    >>> with materialize_dataset(..., filesystem_factory=resolver.filesystem_factory()):
    >>>     ...


    :param spark: The spark session you are using
    :param dataset_url: The dataset url to output your dataset to (e.g. ``hdfs:///path/to/dataset``)
    :param schema: The :class:`petastorm.unischema.Unischema` definition of your dataset
    :param row_group_size_mb: The parquet row group size to use for your dataset
    :param use_summary_metadata: Whether to use the parquet summary metadata for row group indexing or a custom
      indexing method. The custom indexing method is more scalable for very large datasets.
    :param filesystem_factory: A filesystem factory function to be used when saving Petastorm specific metadata to the
      Parquet store.
    """
    spark_config = {}
    _init_spark(spark, spark_config, row_group_size_mb, use_summary_metadata)
    yield
    # After job completes, add the unischema metadata and check for the metadata summary file
    if filesystem_factory is None:
        resolver = FilesystemResolver(
            dataset_url,
            spark.sparkContext._jsc.hadoopConfiguration(),
            user=spark.sparkContext.sparkUser())
        filesystem_factory = resolver.filesystem_factory()
        dataset_path = resolver.get_dataset_path()
    else:
        dataset_path = get_dataset_path(urlparse(dataset_url))
    filesystem = filesystem_factory()

    dataset = pq.ParquetDataset(dataset_path,
                                filesystem=filesystem,
                                validate_schema=False)

    _generate_unischema_metadata(dataset, schema)
    if not use_summary_metadata:
        _generate_num_row_groups_per_file(dataset, spark.sparkContext,
                                          filesystem_factory)

    # Reload the dataset to take into account the new metadata
    dataset = pq.ParquetDataset(dataset_path,
                                filesystem=filesystem,
                                validate_schema=False)
    try:
        # Try to load the row groups, if it fails that means the metadata was not generated properly
        load_row_groups(dataset)
    except PetastormMetadataError:
        raise PetastormMetadataGenerationError(
            'Could not find summary metadata file. The dataset will exist but you will need'
            ' to execute petastorm-generate-metadata.py before you can read your dataset '
            ' in order to generate the necessary metadata.'
            ' Try increasing spark driver memory next time and making sure you are'
            ' using parquet-mr >= 1.8.3')

    _cleanup_spark(spark, spark_config, row_group_size_mb)
Example #15
    def __init__(self,
                 dataset_url,
                 schema_fields=None,
                 shuffle=None,
                 predicate=None,
                 rowgroup_selector=None,
                 reader_pool=None,
                 num_epochs=1,
                 sequence=None,
                 training_partition=None,
                 num_training_partitions=None,
                 read_timeout_s=None,
                 cache=None,
                 shuffle_options=None):
        """Initializes a reader object.

        :param dataset_url: a filepath or a url to a parquet directory,
            e.g. ``'hdfs://some_hdfs_cluster/user/yevgeni/parquet8'``, or ``'/tmp/mydataset'``.
        :param schema_fields: Either list of unischema fields to subset, or ``None`` to read all fields.
            OR an NGram object, then it will return an NGram of the specified properties.
        :param predicate: instance of predicate object to filter rows to be returned by reader.
        :param rowgroup_selector: instance of row group selector object to select row groups to be read
        :param reader_pool: parallelization pool. ``ThreadPool(10)`` (10 threads) is used by default.
            This pool is a custom implementation used to parallelize reading data from the dataset.
            Any object from workers_pool package can be used
            (e.g. :class:`petastorm.workers_pool.process_pool.ProcessPool`).
        :param num_epochs: An epoch is a single pass over all samples in the dataset. Setting ``num_epochs`` to
            ``None`` will result in an infinite number of epochs.
        :param training_partition: An int denoting the partition number used for multi node training. Each node should
            pass in a unique partition number in the range ``[0, num_training_partitions)``.
            ``num_training_partitions`` must be supplied as well.
        :param num_training_partitions: An int denoting the number of training partitions (how many nodes are performing
            the multi node training).
        :param read_timeout_s: A numeric with the amount of time in seconds you would like to give a read before it
            times out and raises an EmptyResultError. Pass in None for an infinite timeout.
        :param cache: An object conforming to :class:`.CacheBase` interface. Before loading row groups from a parquet
            file the Reader will attempt to load these values from cache. Caching is useful when communication
            to the main data store is either slow or expensive and the local machine has large enough storage
            to store entire dataset (or a partition of a dataset if num_training_partitions is used).
            By default, use the :class:`.NullCache` implementation.
        :param shuffle_options: ShuffleOptions object describing how to shuffle the dataset (supersedes the shuffle
            parameter). Defaults to shuffling row groups but not dropping rows based on partitions.
        :param sequence: *DEPRECATED* To use sequence/ngram, please supply the argument in
            ``schema_fields`` instead.
        :param shuffle: *DEPRECATED* Boolean whether to shuffle the row group order.
            Use ``shuffle_row_groups`` in :class:`.ShuffleOptions` instead.
        """

        # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
        # 2. Get a list of all groups
        # 3. Filter rowgroups
        #    a. predicates
        #    b. row-group selector (our indexing mechanism)
        #    c. partition: used to get a subset of data for distributed training
        # 4. Create a rowgroup ventilator object
        # 5. Start workers pool
        if dataset_url is None or not isinstance(dataset_url,
                                                 six.string_types):
            raise ValueError("""dataset_url must be a string""")

        if not (isinstance(schema_fields, collections.Iterable)
                or isinstance(schema_fields, NGram) or schema_fields is None):
            raise ValueError(
                """Fields must be either None, an iterable collection of Unischema fields or an NGram
            object.""")

        if sequence is not None:
            raise ValueError(
                """'sequence' argument of Reader object is deprecated. Please pass an NGram instance to
            'schema_fields' argument instead.""")

        self.ngram = schema_fields if isinstance(schema_fields,
                                                 NGram) else None

        if self.ngram and not self.ngram.timestamp_overlap and shuffle_options.shuffle_row_drop_partitions > 1:
            raise NotImplementedError(
                'Using timestamp_overlap=False is not implemented with'
                ' shuffle_options.shuffle_row_drop_partitions > 1')

        cache = cache or NullCache()
        dataset_url = dataset_url[:-1] if dataset_url[
            -1] == '/' else dataset_url
        self._workers_pool = reader_pool or ThreadPool(10)

        # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
        logger.debug('dataset_url: %s', dataset_url)
        resolver = FilesystemResolver(dataset_url)
        self.dataset = pq.ParquetDataset(resolver.parsed_dataset_url().path,
                                         filesystem=resolver.filesystem(),
                                         validate_schema=False)

        # Get a unischema stored in the dataset metadata.
        stored_schema = dataset_metadata.get_schema(self.dataset)

        # Make a schema view (a view is a Unischema containing only a subset of fields).
        # Will raise an exception if invalid schema fields are in schema_fields
        fields = schema_fields if isinstance(schema_fields,
                                             collections.Iterable) else None
        self.schema = stored_schema.create_schema_view(
            fields) if fields else stored_schema

        # 2. Get a list of all groups
        row_groups = dataset_metadata.load_row_groups(self.dataset)

        # 3. Filter rowgroups
        filtered_row_group_indexes, worker_predicate = self._filter_row_groups(
            self.dataset, row_groups, predicate, rowgroup_selector,
            training_partition, num_training_partitions)
        # 4. Create a rowgroup ventilator object
        if shuffle_options is None:
            if shuffle is None:
                shuffle = True
            else:
                logger.warning(
                    'shuffle option is deprecated. Please use shuffle_options instead'
                )
            shuffle_options = ShuffleOptions(shuffle)
        self._normalize_shuffle_options(shuffle_options, self.dataset)
        ventilator = self._create_ventilator(filtered_row_group_indexes,
                                             shuffle_options, num_epochs,
                                             worker_predicate)

        # 5. Start workers pool
        self._workers_pool.start(ReaderWorker,
                                 (dataset_url, self.schema, self.ngram,
                                  row_groups, cache, worker_predicate),
                                 ventilator=ventilator)
        self._read_timeout_s = read_timeout_s
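
A hedged usage sketch for this (older) Reader API; the dataset URL is a placeholder, and it assumes the Reader exposes stop() and join() for shutting down its worker pool, as in petastorm.

reader = Reader('hdfs:///path/to/my/dataset', num_epochs=1)
try:
    for row in reader:
        pass  # each row is a namedtuple built from the Unischema fields
finally:
    reader.stop()
    reader.join()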
                        help='Display list of row group indexes')
    parser.add_argument('--print-values', action='store_true',
                        help='Print index values (dataset piece indexes)')
    parser.add_argument('--skip-index', nargs='+', type=str,
                        help='Do not display indexed values for given fields')
    parser.add_argument('--hdfs-driver', type=str, default='libhdfs3',
                        help='A string denoting the hdfs driver to use (if using a dataset on hdfs). '
                             'Current choices are libhdfs (java through JNI) or libhdfs3 (C++)')

    args = parser.parse_args()

    if args.dataset_url and args.dataset_url[-1] == '/':
        args.dataset_url = args.dataset_url[:-1]

    # Create pyarrow file system
    resolver = FilesystemResolver(args.dataset_url, hdfs_driver=args.hdfs_driver)
    dataset = pq.ParquetDataset(resolver.get_dataset_path(), filesystem=resolver.filesystem(),
                                validate_schema=False)

    print_all = not args.schema and not args.index
    if args.schema or print_all:
        print('*** Schema from dataset metadata ***')
        print((dataset_metadata.get_schema(dataset)))

    if args.index or print_all:
        index_dict = rowgroup_indexing.get_row_group_indexes(dataset)
        print('*** Row group indexes from dataset metadata ***')
        for index_name in index_dict:
            print(('Index: {}'.format(index_name)))
            if args.skip_index is None or index_name not in args.skip_index:
                for field_value in index_dict[index_name].indexed_values:
def generate_petastorm_metadata(spark,
                                dataset_url,
                                unischema_class=None,
                                use_summary_metadata=False,
                                hdfs_driver='libhdfs3'):
    """
    Generates the metadata necessary to read a petastorm dataset, adding it to an existing dataset.

    :param spark: spark session
    :param dataset_url: url of existing dataset
    :param unischema_class: (optional) fully qualified dataset unischema class. If not specified will attempt
        to find one already in the dataset. (e.g.
        :class:`examples.hello_world.generate_hello_world_dataset.HelloWorldSchema`)
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :param user: String denoting username when connecting to HDFS
    """
    sc = spark.sparkContext

    resolver = FilesystemResolver(dataset_url,
                                  sc._jsc.hadoopConfiguration(),
                                  hdfs_driver=hdfs_driver,
                                  user=spark.sparkContext.sparkUser())
    fs = resolver.filesystem()
    dataset = pq.ParquetDataset(resolver.get_dataset_path(),
                                filesystem=fs,
                                validate_schema=False)

    if unischema_class:
        schema = locate(unischema_class)
        if not isinstance(schema, Unischema):
            raise ValueError(
                'The specified class %s is not an instance of a petastorm.Unischema object.',
                unischema_class)
    else:

        try:
            schema = get_schema(dataset)
        except ValueError:
            raise ValueError(
                'Unischema class could not be located in existing dataset,'
                ' please specify it')

    # In order to be backwards compatible, we retrieve the common metadata from the dataset before
    # overwriting the metadata to keep row group indexes and the old row group per file index
    arrow_metadata = dataset.common_metadata or None

    with materialize_dataset(spark,
                             dataset_url,
                             schema,
                             use_summary_metadata=use_summary_metadata,
                             filesystem_factory=resolver.filesystem_factory()):
        if use_summary_metadata:
            # Inside the materialize dataset context we just need to write the metadata file as the schema will
            # be written by the context manager.
            # We use the java ParquetOutputCommitter to write the metadata file for the existing dataset
            # which will read all the footers of the dataset in parallel and merge them.
            hadoop_config = sc._jsc.hadoopConfiguration()
            Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
            parquet_output_committer = sc._gateway.jvm.org.apache.parquet.hadoop.ParquetOutputCommitter
            parquet_output_committer.writeMetaDataFile(hadoop_config,
                                                       Path(dataset_url))

    spark.stop()

    if use_summary_metadata and arrow_metadata:
        # When calling writeMetaDataFile it will overwrite the _common_metadata file which could have schema information
        # or row group indexers. Therefore we want to retain this information and will add it to the new
        # _common_metadata file. If we were using the old legacy metadata method this file won't be deleted
        base_schema = arrow_metadata.schema.to_arrow_schema()
        metadata_dict = base_schema.metadata
        if ROW_GROUPS_PER_FILE_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROW_GROUPS_PER_FILE_KEY,
                                    metadata_dict[ROW_GROUPS_PER_FILE_KEY])
        if ROWGROUPS_INDEX_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY,
                                    metadata_dict[ROWGROUPS_INDEX_KEY])
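
A hedged usage sketch for regenerating the metadata of an existing dataset; the URL and the unischema class path are placeholders, and note that the function stops the Spark session itself.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('petastorm-generate-metadata').getOrCreate()
generate_petastorm_metadata(spark,
                            'hdfs:///path/to/existing/dataset',
                            unischema_class='mypackage.my_dataset.MySchema')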
Example #18
@contextmanager
def materialize_dataset_carbon(spark,
                               dataset_url,
                               schema,
                               blocklet_size_mb=None,
                               use_summary_metadata=False,
                               pyarrow_filesystem=None):
    """
  A Context Manager which handles all the initialization and finalization necessary
  to generate metadata for a pycarbon dataset. This should be used around your
  spark logic to materialize a dataset (specifically the writing of carbon output).

  Note: Any blocklet indexing should happen outside the materialize_dataset_carbon block

  Example:

  >>> spark = SparkSession.builder...
  >>> ds_url = 'hdfs:///path/to/my/dataset'
  >>> with materialize_dataset_carbon(spark, ds_url, MyUnischema, 64):
  >>>   spark.sparkContext.parallelize(range(0, 10)).
  >>>     ...
  >>>     .write.save(path=ds_url, format='carbon')

  A user may provide their own instance of pyarrow filesystem object in ``pyarrow_filesystem`` argument (otherwise,
  pycarbon will create a default one based on the url).

  The following example shows how a custom pyarrow HDFS filesystem, instantiated using ``libhdfs`` driver can be used
  during Pycarbon dataset generation:

  >>> resolver=FilesystemResolver(dataset_url, spark.sparkContext._jsc.hadoopConfiguration(),
  >>>                             hdfs_driver='libhdfs')
  >>> with materialize_dataset_carbon(..., pyarrow_filesystem=resolver.filesystem()):
  >>>     ...


  :param spark: The spark session you are using
  :param dataset_url: The dataset url to output your dataset to (e.g. ``hdfs:///path/to/dataset``)
  :param schema: The :class:`petastorm.unischema.Unischema` definition of your dataset
  :param blocklet_size_mb: The carbon blocklet size to use for your dataset
  :param use_summary_metadata: Whether to use the carbon summary metadata for blocklet indexing or a custom
    indexing method. The custom indexing method is more scalable for very large datasets.
  :param pyarrow_filesystem: A pyarrow filesystem object to be used when saving Pycarbon specific metadata to the
    Carbon store.

  """

    spark_config = {}
    _init_spark(spark, spark_config, blocklet_size_mb, use_summary_metadata)
    yield

    # After job completes, add the unischema metadata and check for the metadata summary file
    if pyarrow_filesystem is None:
        resolver = FilesystemResolver(
            dataset_url, spark.sparkContext._jsc.hadoopConfiguration())
        # filesystem = resolver.filesystem()
        dataset_path = resolver.get_dataset_path()
    else:
        # filesystem = pyarrow_filesystem
        dataset_path = urlparse(dataset_url).path

    carbon_dataset = CarbonDataset(dataset_path)
    _generate_unischema_metadata_carbon(carbon_dataset, schema)
    if not use_summary_metadata:
        _generate_num_blocklets_per_file_carbon(carbon_dataset,
                                                spark.sparkContext)

    _cleanup_spark(spark, spark_config, blocklet_size_mb)
Example #19
def copy_dataset(spark,
                 source_url,
                 target_url,
                 field_regex,
                 not_null_fields,
                 overwrite_output,
                 partitions_count,
                 row_group_size_mb,
                 hdfs_driver='libhdfs3'):
    """
    Creates a copy of a dataset. A new dataset will optionally contain a subset of columns. Rows that have NULL
    values in fields defined by ``not_null_fields`` argument are filtered out.


    :param spark: An instance of ``SparkSession`` object
    :param source_url: A url of the dataset to be copied.
    :param target_url: A url specifying location of the target dataset.
    :param field_regex: A list of regex patterns. Only columns that match one of these patterns are copied to the new
      dataset.
    :param not_null_fields: A list of fields that must have non-NULL values in the target dataset.
    :param overwrite_output: If ``False`` and there is an existing path defined by ``target_url``, the operation will
      fail.
    :param partitions_count: If not ``None``, the dataset is repartitioned before write. Number of files in the target
      Parquet store is defined by this parameter.
    :param row_group_size_mb: The size of the rowgroup in the target dataset. Specified in megabytes.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :return: None
    """
    schema = get_schema_from_dataset_url(source_url, hdfs_driver=hdfs_driver)

    fields = match_unischema_fields(schema, field_regex)

    if field_regex and not fields:
        field_names = list(schema.fields.keys())
        raise ValueError(
            'Regular expressions (%s) do not match any fields (%s)',
            str(field_regex), str(field_names))

    if fields:
        subschema = schema.create_schema_view(fields)
    else:
        subschema = schema

    resolver = FilesystemResolver(
        target_url,
        spark.sparkContext._jsc.hadoopConfiguration(),
        hdfs_driver=hdfs_driver)
    with materialize_dataset(spark,
                             target_url,
                             subschema,
                             row_group_size_mb,
                             pyarrow_filesystem=resolver.filesystem()):
        data_frame = spark.read \
            .parquet(source_url)

        if fields:
            data_frame = data_frame.select(*[f.name for f in fields])

        if not_null_fields:
            not_null_condition = reduce(operator.__and__,
                                        (data_frame[f].isNotNull()
                                         for f in not_null_fields))
            data_frame = data_frame.filter(not_null_condition)

        if partitions_count:
            data_frame = data_frame.repartition(partitions_count)

        data_frame.write \
            .mode('overwrite' if overwrite_output else 'error') \
            .option('compression', 'none') \
            .parquet(target_url)
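
A hedged usage sketch for copy_dataset, assuming spark is an existing SparkSession: copy only the columns matching the given regex patterns and drop rows with a NULL 'label' (URLs and field names are placeholders).

copy_dataset(spark,
             source_url='hdfs:///datasets/source',
             target_url='hdfs:///datasets/copy',
             field_regex=['image_.*', 'label'],
             not_null_fields=['label'],
             overwrite_output=True,
             partitions_count=None,
             row_group_size_mb=256)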
Example #20
def _create_dataset(store, df, validation, compress_sparse, num_partitions,
                    num_workers, dataset_idx, parquet_row_group_size_mb,
                    verbose):
    train_data_path = store.get_train_data_path(dataset_idx)
    val_data_path = store.get_val_data_path(dataset_idx)
    if verbose >= 1:
        print('CEREBRO => Time: {}, Writing DataFrames'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
        print('CEREBRO => Time: {}, Train Data Path: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            train_data_path))
        print('CEREBRO => Time: {}, Val Data Path: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            val_data_path))

    schema_cols = df.columns

    if isinstance(validation, str):
        schema_cols.append(validation)
    df = df[schema_cols]

    metadata = None
    if _has_vector_column(df):
        if compress_sparse:
            metadata = _get_metadata(df)
        to_petastorm = to_petastorm_fn(schema_cols, metadata)
        df = df.rdd.map(to_petastorm).toDF()

    train_df, val_df, validation_ratio = _train_val_split(df, validation)

    unischema_fields = []
    metadata = _get_metadata(train_df)
    for k in metadata.keys():
        type = spark_to_petastorm_type(metadata[k]['spark_data_type'])
        shape = petastorm_unischema_shape(metadata[k]['shape'])
        codec = petastorm_unischema_codec(metadata[k]['shape'],
                                          metadata[k]['spark_data_type'])
        unischema_fields.append(UnischemaField(k, type, shape, codec, False))

    petastorm_schema = Unischema('petastorm_schema', unischema_fields)

    train_partitions = max(int(num_partitions * (1.0 - validation_ratio)),
                           num_workers)
    if verbose >= 1:
        print('CEREBRO => Time: {}, Train Partitions: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            train_partitions))

    spark = SparkSession.builder.getOrCreate()
    # FIXME pass hdfs_driver from user interface instead of hardcoded PETASTORM_HDFS_DRIVER
    train_resolver = FilesystemResolver(
        train_data_path,
        spark.sparkContext._jsc.hadoopConfiguration(),
        user=spark.sparkContext.sparkUser(),
        hdfs_driver=constants.PETASTORM_HDFS_DRIVER)
    with materialize_dataset(
            spark,
            train_data_path,
            petastorm_schema,
            parquet_row_group_size_mb,
            filesystem_factory=train_resolver.filesystem_factory()):
        train_rdd = train_df.rdd.map(lambda x: x.asDict()).map(
            lambda x: {k: np.array(x[k], dtype=spark_to_petastorm_type(metadata[k]['spark_data_type'])) for k in x}) \
            .map(lambda x: dict_to_spark_row(petastorm_schema, x))

        spark.createDataFrame(train_rdd, petastorm_schema.as_spark_schema()) \
            .coalesce(train_partitions) \
            .write \
            .mode('overwrite') \
            .parquet(train_data_path)

    if val_df:
        val_partitions = max(int(num_partitions * validation_ratio),
                             num_workers)
        if verbose >= 1:
            print('CEREBRO => Time: {}, Val Partitions: {}'.format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                val_partitions))
        val_resolver = FilesystemResolver(
            val_data_path,
            spark.sparkContext._jsc.hadoopConfiguration(),
            user=spark.sparkContext.sparkUser(),
            hdfs_driver=constants.PETASTORM_HDFS_DRIVER)
        with materialize_dataset(
                spark,
                val_data_path,
                petastorm_schema,
                parquet_row_group_size_mb,
                filesystem_factory=val_resolver.filesystem_factory()):
            val_rdd = val_df.rdd.map(lambda x: x.asDict()).map(
                lambda x: {k: np.array(x[k], dtype=spark_to_petastorm_type(metadata[k]['spark_data_type'])) for k in x}) \
                .map(lambda x: dict_to_spark_row(petastorm_schema, x))

            spark.createDataFrame(val_rdd, petastorm_schema.as_spark_schema()) \
                .coalesce(val_partitions) \
                .write \
                .mode('overwrite') \
                .parquet(val_data_path)

    train_rows, val_rows, pq_metadata, avg_row_size = get_simple_meta_from_parquet(
        store, df.columns, dataset_idx)

    if verbose:
        print('CEREBRO => Time: {}, Train Rows: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), train_rows))
    if val_df:
        if val_rows == 0:
            raise ValueError(
                'Validation DataFrame does not contain any samples with validation param {}'
                .format(validation))
        if verbose:
            print('CEREBRO => Time: {}, Val Rows: {}'.format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                val_rows))

    return train_rows, val_rows, pq_metadata, avg_row_size
Example #21
def make_reader(dataset_url,
                schema_fields=None,
                reader_pool_type='thread', workers_count=10, pyarrow_serialize=False,
                shuffle_row_groups=True, shuffle_row_drop_partitions=1,
                predicate=None,
                rowgroup_selector=None,
                num_epochs=1,
                cur_shard=None, shard_count=None,
                cache_type='null', cache_location=None, cache_size_limit=None,
                cache_row_size_estimate=None, cache_extra_settings=None,
                hdfs_driver='libhdfs3',
                infer_schema=False,
                reader_engine='reader_v1', reader_engine_params=None):
    """
    Factory convenience method for :class:`Reader`.

    :param dataset_url: a filepath or a url to a parquet directory,
        e.g. ``'hdfs://some_hdfs_cluster/user/yevgeni/parquet8'``, or ``'file:///tmp/mydataset'``
        or ``'s3://bucket/mydataset'``.
    :param schema_fields: Either list of unischema fields to subset, or ``None`` to read all fields.
            OR an NGram object, then it will return an NGram of the specified properties.
    :param reader_pool_type: A string denoting the reader pool type. Should be one of ['thread', 'process', 'dummy']
        denoting a thread pool, process pool, or running everything in the master thread. Defaults to 'thread'
    :param workers_count: An int for the number of workers to use in the reader pool. This only is used for the
        thread or process pool. Defaults to 10
    :param pyarrow_serialize: Whether to use pyarrow for serialization. Currently only applicable to process pool.
        Defaults to False.
    :param shuffle_row_groups: Whether to shuffle row groups (the order in which full row groups are read)
    :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
        break up a row group into for increased shuffling in exchange for worse performance (extra reads).
        For example if you specify 2 each row group read will drop half of the rows within every row group and
        read the remaining rows in separate reads. It is recommended to keep this number below the regular row
        group size in order to not waste reads which drop all rows.
    :param predicate: instance of :class:`.PredicateBase` object to filter rows to be returned by reader.
    :param rowgroup_selector: instance of row group selector object to select row groups to be read
    :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to
        ``None`` will result in an infinite number of epochs.
    :param cur_shard: An int denoting the current shard number. Each node reading a shard should
        pass in a unique shard number in the range [0, shard_count). shard_count must be supplied as well.
        Defaults to None
    :param shard_count: An int denoting the number of shards to break this dataset into. Defaults to None
    :param cache_type: A string denoting the cache type, if desired. Options are [None, 'null', 'local-disk'] to
        either have a null/noop cache or a cache implemented using diskcache. Caching is useful when communication
        to the main data store is either slow or expensive and the local machine has large enough storage
        to store entire dataset (or a partition of a dataset if shard_count is used). By default will be a null cache.
    :param cache_location: A string denoting the location or path of the cache.
    :param cache_size_limit: An int specifying the size limit of the cache in bytes
    :param cache_row_size_estimate: An int specifying the estimated size of a row in the dataset
    :param cache_extra_settings: A dictionary of extra settings to pass to the cache implementation,
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :param infer_schema: Whether to infer the unischema object from the parquet schema.
            Only works for schemas containing certain scalar types. This option allows getting around explicitly
            generating petastorm metadata using :func:`petastorm.etl.dataset_metadata.materialize_dataset` or
            petastorm-generate-metadata.py
    :param reader_engine: Multiple engine implementations exist ('reader_v1' and 'experimental_reader_v2'). 'reader_v1'
        (the default value) selects a stable reader implementation.
    :param reader_engine_params: For advanced usage: a dictionary with arguments passed directly to a reader
        implementation constructor chosen by the ``reader_engine`` argument. You should not use this parameter
        unless you are fine-tuning a reader.
    :return: A :class:`Reader` object
    """

    if dataset_url is None or not isinstance(dataset_url, six.string_types):
        raise ValueError("""dataset_url must be a string""")

    dataset_url = dataset_url[:-1] if dataset_url[-1] == '/' else dataset_url
    logger.debug('dataset_url: %s', dataset_url)

    resolver = FilesystemResolver(dataset_url, hdfs_driver=hdfs_driver)
    filesystem = resolver.filesystem()
    dataset_path = resolver.get_dataset_path()

    if cache_type is None or cache_type == 'null':
        cache = NullCache()
    elif cache_type == 'local-disk':
        cache = LocalDiskCache(cache_location, cache_size_limit, cache_row_size_estimate, **cache_extra_settings or {})
    else:
        raise ValueError('Unknown cache_type: {}'.format(cache_type))

    if reader_engine == 'reader_v1':
        if reader_pool_type == 'thread':
            reader_pool = ThreadPool(workers_count)
        elif reader_pool_type == 'process':
            reader_pool = ProcessPool(workers_count, pyarrow_serialize=pyarrow_serialize)
        elif reader_pool_type == 'dummy':
            reader_pool = DummyPool()
        else:
            raise ValueError('Unknown reader_pool_type: {}'.format(reader_pool_type))

        # Create a dictionary with all Reader parameters, so we can merge with reader_engine_params if specified
        kwargs = {
            'schema_fields': schema_fields,
            'reader_pool': reader_pool,
            'shuffle_row_groups': shuffle_row_groups,
            'shuffle_row_drop_partitions': shuffle_row_drop_partitions,
            'predicate': predicate,
            'rowgroup_selector': rowgroup_selector,
            'num_epochs': num_epochs,
            'cur_shard': cur_shard,
            'shard_count': shard_count,
            'cache': cache,
            'infer_schema': infer_schema,
        }

        if reader_engine_params:
            kwargs.update(reader_engine_params)

        return Reader(filesystem, dataset_path, **kwargs)
    elif reader_engine == 'experimental_reader_v2':
        if reader_pool_type == 'thread':
            decoder_pool = ThreadPoolExecutor(workers_count)
        elif reader_pool_type == 'process':
            decoder_pool = ProcessPoolExecutor(workers_count)
        elif reader_pool_type == 'dummy':
            decoder_pool = SameThreadExecutor()
        else:
            raise ValueError('Unknown reader_pool_type: {}'.format(reader_pool_type))

        # TODO(yevgeni): once ReaderV2 is ready to be out of experimental status, we should extend
        # the make_reader interfaces to take shuffling buffer parameters explicitly
        shuffling_queue = RandomShufflingBuffer(1000, 800) if shuffle_row_groups else NoopShufflingBuffer()

        # Create a dictionary with all ReaderV2 parameters, so we can merge with reader_engine_params if specified
        kwargs = {
            'schema_fields': schema_fields,
            'predicate': predicate,
            'rowgroup_selector': rowgroup_selector,
            'num_epochs': num_epochs,
            'cur_shard': cur_shard,
            'shard_count': shard_count,
            'cache': cache,
            'decoder_pool': decoder_pool,
            'shuffling_queue': shuffling_queue,
            'shuffle_row_groups': shuffle_row_groups,
            'shuffle_row_drop_partitions': shuffle_row_drop_partitions,
            'infer_schema': infer_schema,
        }

        if reader_engine_params:
            kwargs.update(reader_engine_params)

        return ReaderV2(dataset_url, **kwargs)

    else:
        raise ValueError('Unexpected value of reader_engine argument \'%s\'. '
                         'Supported reader_engine values are \'reader_v1\' and \'experimental_reader_v2\'',
                         reader_engine)
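
A hedged usage sketch: make_reader is typically used as a context manager and iterated to yield decoded rows (the URL is a placeholder).

from petastorm import make_reader

with make_reader('hdfs:///path/to/my/dataset', num_epochs=1) as reader:
    for row in reader:
        pass  # each row is a namedtuple keyed by the Unischema field names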
Example #22
    def __init__(self,
                 dataset_url,
                 schema_fields=None,
                 predicate=None,
                 rowgroup_selector=None,
                 num_epochs=1,
                 sequence=None,
                 cur_shard=None,
                 shard_count=None,
                 read_timeout_s=None,
                 cache=None,
                 loader_pool=None,
                 decoder_pool=None,
                 shuffling_queue=None,
                 shuffle_row_groups=True,
                 shuffle_row_drop_partitions=1,
                 pyarrow_filesystem=None,
                 hdfs_driver='libhdfs3'):
        """Initializes a reader object.

        :param dataset_url: a filepath or a url to a parquet directory,
                       e.g. 'hdfs://some_hdfs_cluster/user/yevgeni/parquet8', or '/tmp/mydataset'
                       or ``'s3://bucket/mydataset'``.
        :param schema_fields:
            Either list of unischema fields to subset, or None to read all fields.
            OR an NGram object, then it will return an NGram of the specified properties.
        :param predicate: instance of predicate object to filter rows to be returned by reader.
        :param rowgroup_selector: instance of row group selector object to select row groups to be read
        :param reader_pool: parallelization pool. ThreadPool(10) (10 threads) is used by default.
                       This pool is a custom implementation used to parallelize reading data from the dataset.
                       Any object from workers_pool package can be used (e.g. ProcessPool)
        :param num_epochs: An epoch is a single pass over all samples in the dataset. Setting num_epochs to 'None' will
                       result in an infinite number of epochs.
        :param sequence: This is deprecated. To use sequence/ngram, please supply the argument in schema_fields instead.
        :param cur_shard: An int denoting the current shard number. Each node reading a shard should
                       pass in a unique shard number in the range [0, shard_count).
                       shard count must be supplied as well.
        :param shard_count: An int denoting the number of shards to break this dataset into.
        :param read_timeout_s: A numeric with the amount of time in seconds you would like to give a read before it
                       times out and raises an EmptyResultError. Pass in None for an infinite timeout
        :param cache: An object conforming to `cache.CacheBase` interface. Before loading row groups from a parquet file
                       the Reader will attempt to load these values from cache. Caching is useful when communication
                       to the main data store is either slow or expensive and the local machine has large enough storage
                       to store entire dataset (or a partition of a dataset if num_training_partitions is used).
        :param decoder_pool: An instance of a concurrent.futures pool executor used for decoding. If None,
          a default ThreadPoolExecutor(5) will be used.
        :param loader_pool: An instance of a concurrent.futures pool executor used for loading row groups. If None,
          a default ThreadPoolExecutor(5) will be used.
        :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)

        By default, the `NullCache` implementation is used.
        """

        # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
        # 2. Get a list of all groups
        # 3. Filter rowgroups
        #    a. predicates
        #    b. row-group selector (our indexing mechanism)
        #    c. partition: used to get a subset of data for distributed training
        # 4. Launch a new thread running `worker_loop` function.

        if dataset_url is None or not isinstance(dataset_url,
                                                 six.string_types):
            raise ValueError("""dataset_url must be a string""")

        if not (isinstance(schema_fields, collections.Iterable)
                or isinstance(schema_fields, NGram) or schema_fields is None):
            raise ValueError(
                """Fields must be either None, an iterable collection of Unischema fields or an NGram
            object.""")

        if sequence is not None:
            raise ValueError(
                """'sequence' argument of Reader object is deprecated. Please pass an NGram instance to
            'schema_fields' argument instead.""")

        # Can not rely on a check in epochs.py since it runs on a separate thread. Inform user earlier about invalid
        # argument value.
        if num_epochs is not None and (not isinstance(num_epochs, int)
                                       or num_epochs < 1):
            raise ValueError('num_epochs must be a positive integer or None')

        self.ngram = schema_fields if isinstance(schema_fields,
                                                 NGram) else None

        if self.ngram and not self.ngram.timestamp_overlap and shuffle_row_drop_partitions > 1:
            raise NotImplementedError(
                'Using timestamp_overlap=False is not implemented with'
                ' shuffle_row_drop_partitions > 1')

        cache = cache or NullCache()
        dataset_url = dataset_url[:-1] if dataset_url[
            -1] == '/' else dataset_url

        # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
        logger.debug('dataset_url: %s', dataset_url)

        if pyarrow_filesystem is not None:
            filesystem = pyarrow_filesystem
            dataset_path = urlparse(dataset_url).path
        else:
            resolver = FilesystemResolver(dataset_url)
            filesystem = resolver.filesystem()
            dataset_path = resolver.get_dataset_path()

        self._dataset = pq.ParquetDataset(dataset_path,
                                          filesystem=filesystem,
                                          validate_schema=False)

        shuffle_row_drop_partitions = self._normalize_shuffle_options(
            shuffle_row_drop_partitions, self._dataset)

        stored_schema = infer_or_load_unischema(self._dataset)

        # Make a schema view (a view is a Unischema containing only a subset of fields).
        # Will raise an exception if invalid schema fields are in schema_fields
        fields = schema_fields if isinstance(schema_fields,
                                             collections.Iterable) else None
        self.schema = stored_schema.create_schema_view(
            fields) if fields else stored_schema

        # 2. Get a list of all groups
        row_groups = dataset_metadata.load_row_groups(self._dataset)

        # 3. Filter rowgroups
        filtered_row_groups, worker_predicate = self._filter_row_groups(
            self._dataset, row_groups, predicate, rowgroup_selector, cur_shard,
            shard_count)

        epoch_items = self._apply_row_drop_partition(
            filtered_row_groups, shuffle_row_drop_partitions)

        # 4. Launch a new thread running `worker_loop` function.
        def epochs_iterator():
            return epoch_generator(epoch_items, num_epochs, shuffle_row_groups)

        self._results_queue = Queue(_OUTPUT_QUEUE_SIZE)

        loader = RowGroupLoader(dataset_url,
                                self.schema,
                                self.ngram,
                                cache,
                                worker_predicate,
                                hdfs_driver=hdfs_driver)
        decoder = RowDecoder(self.schema, self.ngram)
        self._loader_pool = loader_pool or ThreadPoolExecutor(5)
        self._decoder_pool = decoder_pool or ThreadPoolExecutor(5)
        self._stop_flow_manager_event = threading.Event()
        self._diags = Counter()

        if not shuffling_queue:
            shuffling_queue = NoopShufflingBuffer()

        self._flow_manager_thread = threading.Thread(
            target=worker_loop,
            args=(epochs_iterator, self._loader_pool, loader,
                  self._decoder_pool, decoder, shuffling_queue,
                  self._results_queue, self._stop_flow_manager_event,
                  self._diags))
        self._flow_manager_thread.daemon = True
        self._flow_manager_thread.start()

        self._read_timeout_s = read_timeout_s
        self.batched_output = False
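
The constructor above sets up a producer/consumer handoff: a daemon flow-manager thread keeps self._results_queue filled, and consumers later drain it, honoring read_timeout_s. Below is a minimal, generic sketch of that pattern only; produce_rows and the queue size are made-up illustrations, not Petastorm's actual worker_loop API.

# Illustrative only: a generic producer/consumer handoff similar in spirit to the
# flow-manager thread started above. produce_rows is a hypothetical stand-in.
import threading
from six.moves.queue import Queue, Empty

def produce_rows(results_queue, stop_event):
    # Stand-in for worker_loop: load, decode and enqueue rows until stopped.
    row_id = 0
    while not stop_event.is_set():
        results_queue.put({'id': row_id})
        row_id += 1

results_queue = Queue(maxsize=100)
stop_event = threading.Event()
producer = threading.Thread(target=produce_rows, args=(results_queue, stop_event))
producer.daemon = True
producer.start()

try:
    row = results_queue.get(timeout=30)  # analogous to read_timeout_s
    print(row)
except Empty:
    raise RuntimeError('Timed out waiting for a row')
finally:
    stop_event.set()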
Example #24
0
def make_batch_reader(dataset_url,
                      schema_fields=None,
                      reader_pool_type='thread',
                      workers_count=10,
                      shuffle_row_groups=True,
                      shuffle_row_drop_partitions=1,
                      predicate=None,
                      rowgroup_selector=None,
                      num_epochs=1,
                      cur_shard=None,
                      shard_count=None,
                      cache_type='null',
                      cache_location=None,
                      cache_size_limit=None,
                      cache_row_size_estimate=None,
                      cache_extra_settings=None,
                      hdfs_driver='libhdfs3'):
    """
    Creates an instance of Reader for reading batches out of a non-Petastorm Parquet store.

    Currently, only stores having native scalar parquet data types are supported.
    Use :func:`~petastorm.make_reader` to read Petastorm Parquet stores generated with
    :func:`~petastorm.etl.dataset_metadata.materialize_dataset`.

    NOTE: only scalar columns are currently supported.

    :param dataset_url: a filepath or a url to a parquet directory,
        e.g. ``'hdfs://some_hdfs_cluster/user/yevgeni/parquet8'``, or ``'file:///tmp/mydataset'``
        or ``'s3://bucket/mydataset'``.
    :param schema_fields: A list of regex pattern strings. Only columns matching at least one of the
        patterns in the list will be loaded.
    :param reader_pool_type: A string denoting the reader pool type. Should be one of ['thread', 'process', 'dummy']
        denoting a thread pool, process pool, or running everything in the master thread. Defaults to 'thread'
    :param workers_count: An int for the number of workers to use in the reader pool. This only is used for the
        thread or process pool. Defaults to 10
    :param shuffle_row_groups: Whether to shuffle row groups (the order in which full row groups are read)
    :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
        break up a row group into for increased shuffling in exchange for worse performance (extra reads).
        For example, if you specify 2, each row group read will drop half of the rows within every row group and
        read the remaining rows in separate reads. It is recommended to keep this number below the regular row
        group size in order to not waste reads which drop all rows.
    :param predicate: instance of :class:`.PredicateBase` object to filter rows to be returned by reader. The predicate
        will be passed a pandas DataFrame object and must return a pandas Series with boolean values of matching
        dimensions.
    :param rowgroup_selector: instance of row group selector object to select row groups to be read
    :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to
        ``None`` will result in an infinite number of epochs.
    :param cur_shard: An int denoting the current shard number. Each node reading a shard should
        pass in a unique shard number in the range [0, shard_count). shard_count must be supplied as well.
        Defaults to None
    :param shard_count: An int denoting the number of shards to break this dataset into. Defaults to None
    :param cache_type: A string denoting the cache type, if desired. Options are [None, 'null', 'local-disk'] to
        either have a null/noop cache or a cache implemented using diskcache. Caching is useful when communication
        to the main data store is either slow or expensive and the local machine has large enough storage
        to store entire dataset (or a partition of a dataset if shard_count is used). By default will be a null cache.
    :param cache_location: A string denoting the location or path of the cache.
    :param cache_size_limit: An int specifying the size limit of the cache in bytes
    :param cache_row_size_estimate: An int specifying the estimated size of a row in the dataset
    :param cache_extra_settings: A dictionary of extra settings to pass to the cache implementation.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :return: A :class:`Reader` object
    """

    if dataset_url is None or not isinstance(dataset_url, six.string_types):
        raise ValueError("""dataset_url must be a string""")

    dataset_url = dataset_url[:-1] if dataset_url[-1] == '/' else dataset_url
    logger.debug('dataset_url: %s', dataset_url)

    resolver = FilesystemResolver(dataset_url, hdfs_driver=hdfs_driver)
    filesystem = resolver.filesystem()

    dataset_path = resolver.parsed_dataset_url().path

    if cache_type is None or cache_type == 'null':
        cache = NullCache()
    elif cache_type == 'local-disk':
        cache = LocalDiskArrowTableCache(cache_location, cache_size_limit,
                                         cache_row_size_estimate,
                                         **cache_extra_settings or {})
    else:
        raise ValueError('Unknown cache_type: {}'.format(cache_type))

    if reader_pool_type == 'thread':
        reader_pool = ThreadPool(workers_count)
    elif reader_pool_type == 'process':
        serializer = ArrowTableSerializer()
        reader_pool = ProcessPool(workers_count, serializer)
    elif reader_pool_type == 'dummy':
        reader_pool = DummyPool()
    else:
        raise ValueError(
            'Unknown reader_pool_type: {}'.format(reader_pool_type))

    return Reader(filesystem,
                  dataset_path,
                  schema_fields=schema_fields,
                  worker_class=ArrowReaderWorker,
                  reader_pool=reader_pool,
                  shuffle_row_groups=shuffle_row_groups,
                  shuffle_row_drop_partitions=shuffle_row_drop_partitions,
                  predicate=predicate,
                  rowgroup_selector=rowgroup_selector,
                  num_epochs=num_epochs,
                  cur_shard=cur_shard,
                  shard_count=shard_count,
                  cache=cache)
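
For orientation, a reader returned by make_batch_reader is typically consumed as a context manager that yields columnar batches. A minimal usage sketch follows; the local dataset path is an assumption made purely for illustration.

# Minimal usage sketch for make_batch_reader; the dataset path is illustrative.
from petastorm import make_batch_reader

with make_batch_reader('file:///tmp/mydataset') as reader:
    for batch in reader:
        # Each batch holds columnar data for the columns selected by schema_fields.
        print(batch)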
Example #25
0
def materialize_dataset(spark,
                        dataset_url,
                        schema,
                        row_group_size_mb=None,
                        use_summary_metadata=False):
    """
    A Context Manager which handles all the initialization and finalization necessary
    to generate metadata for a petastorm dataset. This should be used around your
    spark logic to materialize a dataset (specifically the writing of parquet output).

    Note: Any rowgroup indexing should happen outside the materialize_dataset block

    Example:

    >>> spark = SparkSession.builder...
    >>> ds_url = 'hdfs:///path/to/my/dataset'
    >>> with materialize_dataset(spark, ds_url, MyUnischema, 64):
    >>>   spark.sparkContext.parallelize(range(0, 10)).
    >>>     ...
    >>>     .write.parquet(ds_url)
    >>> indexer = [SingleFieldIndexer(...)]
    >>> build_rowgroup_index(ds_url, spark.sparkContext, indexer)

    :param spark: The spark session you are using
    :param dataset_url: The dataset url to output your dataset to (e.g. ``hdfs:///path/to/dataset``)
    :param schema: The :class:`petastorm.unischema.Unischema` definition of your dataset
    :param row_group_size_mb: The parquet row group size to use for your dataset
    :param use_summary_metadata: Whether to use the parquet summary metadata for row group indexing or a custom
            indexing method. The custom indexing method is more scalable for very large datasets.
    """
    spark_config = {}
    _init_spark(spark, spark_config, row_group_size_mb, use_summary_metadata)
    yield

    # After job completes, add the unischema metadata and check for the metadata summary file
    resolver = FilesystemResolver(
        dataset_url, spark.sparkContext._jsc.hadoopConfiguration())
    dataset = pq.ParquetDataset(resolver.parsed_dataset_url().path,
                                filesystem=resolver.filesystem(),
                                validate_schema=False)

    _generate_unischema_metadata(dataset, schema)
    if not use_summary_metadata:
        _generate_num_row_groups_per_file(dataset, spark.sparkContext)

    # Reload the dataset to take into account the new metadata
    dataset = pq.ParquetDataset(resolver.parsed_dataset_url().path,
                                filesystem=resolver.filesystem(),
                                validate_schema=False)
    try:
        # Try to load the row groups, if it fails that means the metadata was not generated properly
        load_row_groups(dataset)
    except PetastormMetadataError:
        raise PetastormMetadataGenerationError(
            'Could not find summary metadata file. The dataset will exist but you will need'
            ' to execute petastorm-generate-metadata.py before you can read your dataset'
            ' in order to generate the necessary metadata.'
            ' Try increasing spark driver memory next time and making sure you are'
            ' using parquet-mr >= 1.8.3')

    _cleanup_spark(spark, spark_config, row_group_size_mb)
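
To put the context manager in perspective, here is a sketch of an end-to-end write loosely following the usage pattern shown in the docstring above. The schema, the row generator, and the output URL are assumptions for illustration, not part of the code above.

# Illustrative end-to-end write using materialize_dataset. The schema, the row
# generator and the output URL below are assumptions, not taken from the code above.
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
from petastorm.codecs import ScalarCodec, NdarrayCodec
from petastorm.etl.dataset_metadata import materialize_dataset
from petastorm.unischema import Unischema, UnischemaField, dict_to_spark_row

HelloWorldSchema = Unischema('HelloWorldSchema', [
    UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
    UnischemaField('vector', np.float32, (10,), NdarrayCodec(), False),
])

spark = SparkSession.builder.master('local[2]').getOrCreate()
output_url = 'file:///tmp/hello_world_dataset'

with materialize_dataset(spark, output_url, HelloWorldSchema, row_group_size_mb=64):
    rows_rdd = spark.sparkContext.parallelize(range(100)) \
        .map(lambda x: {'id': x, 'vector': np.random.rand(10).astype(np.float32)}) \
        .map(lambda r: dict_to_spark_row(HelloWorldSchema, r))
    spark.createDataFrame(rows_rdd, HelloWorldSchema.as_spark_schema()) \
        .write.mode('overwrite').parquet(output_url)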
Example #26
0
def test_s3_without_s3fs(self):
    with mock.patch.dict('sys.modules', s3fs=None):
        # `import s3fs` will fail in this context
        with self.assertRaises(ValueError):
            FilesystemResolver(urlparse('s3://foo/bar'), {})
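
The test works because mock.patch.dict('sys.modules', s3fs=None) temporarily sets sys.modules['s3fs'] to None, which makes any import of s3fs inside the block raise ImportError. A standalone sketch of the same technique, using an arbitrary module name ('json') purely for illustration:

# Standalone illustration: setting a sys.modules entry to None makes the import
# system raise ImportError for that name while the patch is active.
# 'json' is an arbitrary example module, unrelated to the test above.
from unittest import mock

with mock.patch.dict('sys.modules', json=None):
    try:
        import json  # noqa: F401
    except ImportError:
        print('import failed as expected while the module entry is None')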
Example #27
0
def make_reader(dataset_url,
                schema_fields=None,
                reader_pool_type='thread',
                workers_count=10,
                pyarrow_serialize=False,
                results_queue_size=50,
                shuffle_row_groups=True,
                shuffle_row_drop_partitions=1,
                predicate=None,
                rowgroup_selector=None,
                num_epochs=1,
                cur_shard=None,
                shard_count=None,
                cache_type='null',
                cache_location=None,
                cache_size_limit=None,
                cache_row_size_estimate=None,
                cache_extra_settings=None,
                hdfs_driver='libhdfs3',
                reader_engine='reader_v1',
                reader_engine_params=None):
    """
    Creates an instance of Reader for reading Petastorm datasets. A Petastorm dataset is a dataset generated using
    :func:`~petastorm.etl.dataset_metadata.materialize_dataset` context manager as explained
    `here <https://petastorm.readthedocs.io/en/latest/readme_include.html#generating-a-dataset>`_.

    See :func:`~petastorm.make_batch_reader` to read from a Parquet store that was not generated using
    :func:`~petastorm.etl.dataset_metadata.materialize_dataset`.

    :param dataset_url: a filepath or a url to a parquet directory,
        e.g. ``'hdfs://some_hdfs_cluster/user/yevgeni/parquet8'``, or ``'file:///tmp/mydataset'``
        or ``'s3://bucket/mydataset'``.
    :param schema_fields: Can be: a list of unischema fields and/or regex pattern strings; ``None`` to read all fields;
            an NGram object, then it will return an NGram of the specified fields.
    :param reader_pool_type: A string denoting the reader pool type. Should be one of ['thread', 'process', 'dummy']
        denoting a thread pool, process pool, or running everything in the master thread. Defaults to 'thread'
    :param workers_count: An int for the number of workers to use in the reader pool. This only is used for the
        thread or process pool. Defaults to 10
    :param pyarrow_serialize: Whether to use pyarrow for serialization. Currently only applicable to process pool.
        Defaults to False.
    :param results_queue_size: Size of the results queue to store prefetched rows. Currently only applicable to
        thread reader pool type.
    :param shuffle_row_groups: Whether to shuffle row groups (the order in which full row groups are read)
    :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
        break up a row group into for increased shuffling in exchange for worse performance (extra reads).
        For example, if you specify 2, each row group read will drop half of the rows within every row group and
        read the remaining rows in separate reads. It is recommended to keep this number below the regular row
        group size in order to not waste reads which drop all rows.
    :param predicate: instance of :class:`.PredicateBase` object to filter rows to be returned by reader. The predicate
        will be passed a single row and must return a boolean value indicating whether to include it in the results.
    :param rowgroup_selector: instance of row group selector object to select row groups to be read
    :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to
        ``None`` will result in an infinite number of epochs.
    :param cur_shard: An int denoting the current shard number. Each node reading a shard should
        pass in a unique shard number in the range [0, shard_count). shard_count must be supplied as well.
        Defaults to None
    :param shard_count: An int denoting the number of shards to break this dataset into. Defaults to None
    :param cache_type: A string denoting the cache type, if desired. Options are [None, 'null', 'local-disk'] to
        either have a null/noop cache or a cache implemented using diskcache. Caching is useful when communication
        to the main data store is either slow or expensive and the local machine has large enough storage
        to store entire dataset (or a partition of a dataset if shard_count is used). By default will be a null cache.
    :param cache_location: A string denoting the location or path of the cache.
    :param cache_size_limit: An int specifying the size limit of the cache in bytes
    :param cache_row_size_estimate: An int specifying the estimated size of a row in the dataset
    :param cache_extra_settings: A dictionary of extra settings to pass to the cache implementation.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :param reader_engine: Multiple engine implementations exist ('reader_v1' and 'experimental_reader_v2'). 'reader_v1'
        (the default value) selects a stable reader implementation.
    :param reader_engine_params: For advanced usage: a dictionary with arguments passed directly to a reader
        implementation constructor chosen by ``reader_engine`` argument.  You should not use this parameter, unless you
        fine-tuning of a reader.
    :return: A :class:`Reader` object
    """

    if dataset_url is None or not isinstance(dataset_url, six.string_types):
        raise ValueError("""dataset_url must be a string""")

    dataset_url = dataset_url[:-1] if dataset_url[-1] == '/' else dataset_url
    logger.debug('dataset_url: %s', dataset_url)

    resolver = FilesystemResolver(dataset_url, hdfs_driver=hdfs_driver)
    filesystem = resolver.filesystem()
    dataset_path = resolver.get_dataset_path()

    if cache_type is None or cache_type == 'null':
        cache = NullCache()
    elif cache_type == 'local-disk':
        cache = LocalDiskCache(cache_location, cache_size_limit,
                               cache_row_size_estimate, **cache_extra_settings
                               or {})
    else:
        raise ValueError('Unknown cache_type: {}'.format(cache_type))

    # Fail if this is a non-petastorm dataset. Typically, a Parquet store will have hundreds of thousands of rows
    # in a single rowgroup. Using PyDictReaderWorker or the ReaderV2 implementation is very inefficient as it
    # processes data on a row-by-row basis. ArrowReaderWorker (used by make_batch_reader) is much more efficient
    # in these cases.
    try:
        dataset_metadata.get_schema_from_dataset_url(dataset_url)
    except PetastormMetadataError:
        raise RuntimeError(
            'Currently make_reader supports reading only Petastorm datasets. '
            'To read from a non-Petastorm Parquet store use make_batch_reader')

    if reader_engine == 'reader_v1':
        if reader_pool_type == 'thread':
            reader_pool = ThreadPool(workers_count, results_queue_size)
        elif reader_pool_type == 'process':
            if pyarrow_serialize:
                serializer = PyArrowSerializer()
            else:
                serializer = PickleSerializer()
            reader_pool = ProcessPool(workers_count, serializer)
        elif reader_pool_type == 'dummy':
            reader_pool = DummyPool()
        else:
            raise ValueError(
                'Unknown reader_pool_type: {}'.format(reader_pool_type))

        # Create a dictionary with all Reader parameters, so we can merge with reader_engine_params if specified
        kwargs = {
            'schema_fields': schema_fields,
            'reader_pool': reader_pool,
            'shuffle_row_groups': shuffle_row_groups,
            'shuffle_row_drop_partitions': shuffle_row_drop_partitions,
            'predicate': predicate,
            'rowgroup_selector': rowgroup_selector,
            'num_epochs': num_epochs,
            'cur_shard': cur_shard,
            'shard_count': shard_count,
            'cache': cache,
        }

        if reader_engine_params:
            kwargs.update(reader_engine_params)

        try:
            return Reader(filesystem,
                          dataset_path,
                          worker_class=PyDictReaderWorker,
                          **kwargs)
        except PetastormMetadataError as e:
            logger.error('Unexpected exception: %s', str(e))
            raise RuntimeError(
                'make_reader has failed. If you were trying to open a Parquet store that was not '
                'created using Petastorm materialize_dataset and it contains only scalar columns, '
                'you may use make_batch_reader to read it.\n'
                'Inner exception: %s' % str(e))

    elif reader_engine == 'experimental_reader_v2':
        if reader_pool_type == 'thread':
            decoder_pool = ThreadPoolExecutor(workers_count)
        elif reader_pool_type == 'process':
            decoder_pool = ProcessPoolExecutor(workers_count)
        elif reader_pool_type == 'dummy':
            decoder_pool = SameThreadExecutor()
        else:
            raise ValueError(
                'Unknown reader_pool_type: {}'.format(reader_pool_type))

        # TODO(yevgeni): once ReaderV2 is ready to be out of experimental status, we should extend
        # the make_reader interfaces to take shuffling buffer parameters explicitly
        shuffling_queue = RandomShufflingBuffer(
            1000, 800) if shuffle_row_groups else NoopShufflingBuffer()

        # Create a dictionary with all ReaderV2 parameters, so we can merge with reader_engine_params if specified
        kwargs = {
            'schema_fields': schema_fields,
            'predicate': predicate,
            'rowgroup_selector': rowgroup_selector,
            'num_epochs': num_epochs,
            'cur_shard': cur_shard,
            'shard_count': shard_count,
            'cache': cache,
            'decoder_pool': decoder_pool,
            'shuffling_queue': shuffling_queue,
            'shuffle_row_groups': shuffle_row_groups,
            'shuffle_row_drop_partitions': shuffle_row_drop_partitions,
        }

        if reader_engine_params:
            kwargs.update(reader_engine_params)

        return ReaderV2(dataset_url, **kwargs)

    else:
        raise ValueError(
            'Unexpected value of reader_engine argument \'%s\'. '
            'Supported reader_engine values are \'reader_v1\' and \'experimental_reader_v2\'',
            reader_engine)
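
For completeness, a reader created by make_reader is usually iterated the same way as the batch reader, with each row exposed as a namedtuple of decoded Unischema fields. A minimal consumption sketch; the dataset URL and the 'id' field are illustrative assumptions.

# Minimal usage sketch for make_reader; the dataset URL and the 'id' field are
# assumptions for illustration only.
from petastorm import make_reader

with make_reader('file:///tmp/petastorm_dataset', num_epochs=1) as reader:
    for row in reader:
        # Each row is a namedtuple with one attribute per Unischema field.
        print(row.id)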
Example #28
0
                        help='Display list of row group indexes')
    parser.add_argument('--print-values',
                        action='store_true',
                        help='Print index values (dataset piece indexes)')
    parser.add_argument('--skip-index',
                        nargs='+',
                        type=str,
                        help='Do not display indexed values for given fields')

    args = parser.parse_args()

    if args.dataset_url and args.dataset_url[-1] == '/':
        args.dataset_url = args.dataset_url[:-1]

    # Create pyarrow file system
    resolver = FilesystemResolver(args.dataset_url)
    dataset = pq.ParquetDataset(resolver.get_dataset_path(),
                                filesystem=resolver.filesystem(),
                                validate_schema=False)

    print_all = not args.schema and not args.index
    if args.schema or print_all:
        print('*** Schema from dataset metadata ***')
        print((dataset_metadata.get_schema(dataset)))

    if args.index or print_all:
        index_dict = rowgroup_indexing.get_row_group_indexes(dataset)
        print('*** Row group indexes from dataset metadata ***')
        for index_name in index_dict:
            print(('Index: {}'.format(index_name)))
            if args.skip_index is None or index_name not in args.skip_index: