Example 1
def build_rowgroup_index(dataset_url,
                         spark_context,
                         indexers,
                         hdfs_driver='libhdfs3'):
    """
    Build index for given list of fields to use for fast rowgroup selection
    :param dataset_url: (str) the url for the dataset (or a path if you would like to use the default hdfs config)
    :param spark_context: (SparkContext)
    :param indexers: list of objects to build row groups indexes. Should support RowGroupIndexerBase interface
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
    libhdfs (java through JNI) or libhdfs3 (C++)
    :return: None, upon successful completion the rowgroup predicates will be saved to _metadata file
    """

    if dataset_url and dataset_url[-1] == '/':
        dataset_url = dataset_url[:-1]

    # Create pyarrow file system
    resolver = FilesystemResolver(dataset_url,
                                  spark_context._jsc.hadoopConfiguration(),
                                  hdfs_driver=hdfs_driver,
                                  user=spark_context.sparkUser())
    dataset = pq.ParquetDataset(resolver.get_dataset_path(),
                                filesystem=resolver.filesystem(),
                                validate_schema=False)

    split_pieces = dataset_metadata.load_row_groups(dataset)
    schema = dataset_metadata.get_schema(dataset)

    # We need a direct reference to the partitions object
    partitions = dataset.partitions
    piece_info_list = []
    # Indexing relies on the ordering of the split dataset pieces. That ordering depends on how the
    # pieces are split and sorted; although it should not change, a change there would break this index.
    for piece_index, piece in enumerate(split_pieces):
        piece_info_list.append(
            PieceInfo(piece_index, piece.path, piece.row_group,
                      piece.partition_keys))

    start_time = time.time()
    piece_info_rdd = spark_context.parallelize(
        piece_info_list, min(len(piece_info_list), PARALLEL_SLICE_NUM))
    indexer_rdd = piece_info_rdd.map(
        lambda piece_info: _index_columns(piece_info,
                                          dataset_url,
                                          partitions,
                                          indexers,
                                          schema,
                                          hdfs_driver=hdfs_driver))
    indexer_list = indexer_rdd.reduce(_combine_indexers)

    indexer_dict = {indexer.index_name: indexer for indexer in indexer_list}
    serialized_indexers = pickle.dumps(indexer_dict, pickle.HIGHEST_PROTOCOL)
    utils.add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY,
                                  serialized_indexers)
    logger.info("Elapsed time of index creation: %f s",
                (time.time() - start_time))
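
A minimal usage sketch for the function above (not part of the original example): the dataset url, the index name and the indexed field are placeholders, and SingleFieldIndexer is assumed to be the stock indexer shipped in petastorm.etl.rowgroup_indexers of the installed petastorm version.

from pyspark.sql import SparkSession

from petastorm.etl.rowgroup_indexers import SingleFieldIndexer
from petastorm.etl.rowgroup_indexing import build_rowgroup_index

spark = SparkSession.builder.appName('build-rowgroup-index').getOrCreate()

# Placeholder dataset url; the indexer maps values of the (placeholder) 'id' field to row groups.
indexers = [SingleFieldIndexer('id_index', 'id')]
build_rowgroup_index('hdfs:///path/to/dataset', spark.sparkContext, indexers)

spark.stop()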
Example 2
def dataset_as_rdd(dataset_url, spark_session, schema_fields=None):
    """
    Retrieve a Spark RDD for a given petastorm dataset

    :param dataset_url: A string for the dataset url (e.g. hdfs:///path/to/dataset)
    :param spark_session: A spark session
    :param schema_fields: list of unischema fields to subset, or None to read all fields.
    :return: An RDD of namedtuple records from the dataset
    """
    dataset_url_parsed = urlparse(dataset_url)

    resolver = FilesystemResolver(
        dataset_url_parsed,
        spark_session.sparkContext._jsc.hadoopConfiguration())
    dataset = pq.ParquetDataset(resolver.parsed_dataset_url().path,
                                filesystem=resolver.filesystem(),
                                validate_schema=False)
    schema = dataset_metadata.get_schema(dataset)

    dataset_df = spark_session.read.parquet(resolver.parsed_dataset_url().path)
    if schema_fields is not None:
        # If only a subset of fields is wanted, create a schema view and select just those fields
        schema = schema.create_schema_view(schema_fields)
        field_names = [field.name for field in schema_fields]
        dataset_df = dataset_df.select(*field_names)

    dataset_rows = dataset_df.rdd\
        .map(lambda row: utils.decode_row(row.asDict(), schema))\
        .map(lambda record: schema.make_namedtuple(**record))

    return dataset_rows
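
A hedged usage sketch for dataset_as_rdd: the dataset url and the schema module are placeholders, and the import path petastorm.spark_utils is an assumption about where this function lives in the installed petastorm version.

from pyspark.sql import SparkSession

from petastorm.spark_utils import dataset_as_rdd
# Hypothetical module providing the Unischema that was used to write the dataset.
from my_dataset_schema import MySchema

spark = SparkSession.builder.appName('dataset-as-rdd').getOrCreate()

# Read only two (placeholder) fields of the dataset into an RDD of namedtuples.
rdd = dataset_as_rdd('hdfs:///path/to/dataset', spark,
                     schema_fields=[MySchema.id, MySchema.image1])
print(rdd.count())

spark.stop()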
Example 3
def generate_petastorm_metadata(spark, dataset_url, unischema_class=None, use_summary_metadata=False):
    """
    Generates metadata necessary to read a petastorm dataset to an existing dataset.

    :param spark: spark session
    :param dataset_url: url of existing dataset
    :param unischema_class: (optional) fully qualified dataset unischema class. If not specified will attempt
        to find one already in the dataset. (e.g.
        :class:`examples.hello_world.generate_hello_world_dataset.HelloWorldSchema`)
    """
    sc = spark.sparkContext

    resolver = FilesystemResolver(dataset_url, sc._jsc.hadoopConfiguration())
    dataset = pq.ParquetDataset(
        resolver.get_dataset_path(),
        filesystem=resolver.filesystem(),
        validate_schema=False)

    if unischema_class:
        schema = locate(unischema_class)
        if not isinstance(schema, Unischema):
            raise ValueError('The specified class %s is not an instance of a petastorm.Unischema object.' %
                             unischema_class)
    else:
        try:
            schema = get_schema(dataset)
        except ValueError:
            raise ValueError('Unischema class could not be located in existing dataset,'
                             ' please specify it')

    # In order to be backwards compatible, we retrieve the common metadata from the dataset before
    # overwriting the metadata to keep row group indexes and the old row group per file index
    arrow_metadata = dataset.common_metadata or None

    with materialize_dataset(spark, dataset_url, schema, use_summary_metadata=use_summary_metadata):
        if use_summary_metadata:
            # Inside the materialize dataset context we just need to write the metadata file as the schema will
            # be written by the context manager.
            # We use the java ParquetOutputCommitter to write the metadata file for the existing dataset
            # which will read all the footers of the dataset in parallel and merge them.
            hadoop_config = sc._jsc.hadoopConfiguration()
            Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
            parquet_output_committer = sc._gateway.jvm.org.apache.parquet.hadoop.ParquetOutputCommitter
            parquet_output_committer.writeMetaDataFile(hadoop_config, Path(dataset_url))

    spark.stop()

    if use_summary_metadata and arrow_metadata:
        # writeMetaDataFile overwrites the _common_metadata file, which may contain schema information
        # or row group indexers. We therefore retain that information and add it back to the new
        # _common_metadata file. If the old legacy metadata method is used, this file won't be deleted.
        base_schema = arrow_metadata.schema.to_arrow_schema()
        metadata_dict = base_schema.metadata
        if ROW_GROUPS_PER_FILE_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROW_GROUPS_PER_FILE_KEY, metadata_dict[ROW_GROUPS_PER_FILE_KEY])
        if ROWGROUPS_INDEX_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY, metadata_dict[ROWGROUPS_INDEX_KEY])
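
A hedged usage sketch for generate_petastorm_metadata: the dataset url and the unischema class path are placeholders, and petastorm.etl.petastorm_generate_metadata is an assumption about the module this function lives in. Note that the function stops the Spark session itself.

from pyspark.sql import SparkSession

# Assumed module path for the function defined above.
from petastorm.etl.petastorm_generate_metadata import generate_petastorm_metadata

spark = SparkSession.builder \
    .appName('generate-petastorm-metadata') \
    .getOrCreate()

# Regenerate metadata for an existing dataset whose Unischema class is importable.
generate_petastorm_metadata(spark,
                            'hdfs:///path/to/existing/dataset',
                            unischema_class='my_package.my_dataset.MySchema')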
Example 4
def generate_petastorm_metadata(spark, dataset_url, unischema_class=None):
    """
    Generate metadata necessary to read a petastorm dataset to an existing dataset.
    :param spark: spark session
    :param dataset_url: url of existing dataset
    :param unischema_class: (optional) fully qualified dataset unischema class. If not specified will attempt
        to find one already in the dataset. (e.g. examples.hello_world.hello_world_dataset.HelloWorldSchema)
    :return:
    """
    sc = spark.sparkContext

    resolver = FilesystemResolver(dataset_url, sc._jsc.hadoopConfiguration())
    dataset = pq.ParquetDataset(resolver.parsed_dataset_url().path,
                                filesystem=resolver.filesystem(),
                                validate_schema=False)

    if unischema_class:
        schema = locate(unischema_class)
    else:
        try:
            schema = get_schema(dataset)
        except ValueError:
            raise ValueError(
                'Unischema class could not be located in existing dataset,'
                ' please specify it')

    # In order to be backwards compatible, we retrieve the common metadata from the dataset before
    # overwriting the metadata to keep row group indexes and the old row group per file index
    arrow_metadata = dataset.common_metadata or None

    with materialize_dataset(spark, dataset_url, schema):
        # Inside the materialize dataset context we just need to write the metadata file as the schema will
        # be written by the context manager.
        # We use the java ParquetOutputCommitter to write the metadata file for the existing dataset
        # which will read all the footers of the dataset in parallel and merge them.
        hadoop_config = sc._jsc.hadoopConfiguration()
        Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
        parquet_output_committer = sc._gateway.jvm.org.apache.parquet.hadoop.ParquetOutputCommitter
        parquet_output_committer.writeMetaDataFile(hadoop_config,
                                                   Path(dataset_url))

    if arrow_metadata:
        # If there was the old row groups per file key or the row groups index key, add them to the new dataset metadata
        base_schema = arrow_metadata.schema.to_arrow_schema()
        metadata_dict = base_schema.metadata
        if ROW_GROUPS_PER_FILE_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROW_GROUPS_PER_FILE_KEY,
                                    metadata_dict[ROW_GROUPS_PER_FILE_KEY])
        if ROWGROUPS_INDEX_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY,
                                    metadata_dict[ROWGROUPS_INDEX_KEY])
Example 5
                             'Current choices are libhdfs (java through JNI) or libhdfs3 (C++)')

    args = parser.parse_args()

    if args.dataset_url and args.dataset_url[-1] == '/':
        args.dataset_url = args.dataset_url[:-1]

    # Create pyarrow file system
    resolver = FilesystemResolver(args.dataset_url, hdfs_driver=args.hdfs_driver)
    dataset = pq.ParquetDataset(resolver.get_dataset_path(), filesystem=resolver.filesystem(),
                                validate_schema=False)

    print_all = not args.schema and not args.index
    if args.schema or print_all:
        print('*** Schema from dataset metadata ***')
        print(dataset_metadata.get_schema(dataset))

    if args.index or print_all:
        index_dict = rowgroup_indexing.get_row_group_indexes(dataset)
        print('*** Row group indexes from dataset metadata ***')
        for index_name in index_dict:
            print('Index: {}'.format(index_name))
            if args.skip_index is None or index_name not in args.skip_index:
                for field_value in index_dict[index_name].indexed_values:
                    print('  -- {}({})'.format(field_value,
                                               len(index_dict[index_name].get_row_group_indexes(field_value))))
                    if args.print_values:
                        print(index_dict[index_name].get_row_group_indexes(field_value))
            else:
                print('  (skipped)')
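
The fragment above begins midway through its argparse setup. Below is a hedged reconstruction of the missing preamble, with option names inferred from the attributes used in the fragment; the exact option spellings, defaults, help strings and import paths are assumptions.

import argparse

import pyarrow.parquet as pq

from petastorm.etl import dataset_metadata, rowgroup_indexing
from petastorm.fs_utils import FilesystemResolver

parser = argparse.ArgumentParser(description='Print the schema and row group indexes of a petastorm dataset')
parser.add_argument('dataset_url', help='URL of the dataset, e.g. hdfs:///path/to/dataset')
parser.add_argument('--schema', action='store_true', help='Print only the schema')
parser.add_argument('--index', action='store_true', help='Print only the row group indexes')
parser.add_argument('--skip-index', nargs='+', help='Names of indexes whose values should not be listed')
parser.add_argument('--print-values', action='store_true',
                    help='Print the row group indexes stored for every indexed value')
parser.add_argument('--hdfs-driver', default='libhdfs3',
                    help='A string denoting the hdfs driver to use (if using a dataset on hdfs). '
                         'Current choices are libhdfs (java through JNI) or libhdfs3 (C++)')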
Example 6
    def __init__(self, pyarrow_filesystem, dataset_path, schema_fields=None,
                 shuffle_row_groups=True, shuffle_row_drop_partitions=1,
                 predicate=None, rowgroup_selector=None, reader_pool=None, num_epochs=1,
                 cur_shard=None, shard_count=None, cache=None, infer_schema=False):
        """Initializes a reader object.

        :param pyarrow_filesystem: An instance of ``pyarrow.FileSystem`` that will be used. If not specified,
            then a default one will be selected based on the url (only for ``hdfs://`` or ``file://``; for
            ``s3://`` support, use ``make_reader``). The default hdfs driver is ``libhdfs3``. If you want
            to use ``libhdfs``, use
            ``pyarrow_filesystem=pyarrow.hdfs.connect('hdfs:///some/path', driver='libhdfs')``.
        :param dataset_path: filepath to a parquet directory on the specified filesystem.
            e.g. ``'/user/yevgeni/parquet8'``, or ``'/tmp/mydataset'``.
        :param schema_fields: Either list of unischema fields to subset, or ``None`` to read all fields.
            OR an NGram object, then it will return an NGram of the specified properties.
        :param shuffle_row_groups: Whether to shuffle row groups (the order in which full row groups are read)
        :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
            break up a row group into for increased shuffling in exchange for worse performance (extra reads).
            For example, if you specify 2, each row group read will drop half of the rows within every row group and
            read the remaining rows in separate reads. It is recommended to keep this number below the regular row
            group size in order to not waste reads which drop all rows.
        :param predicate: instance of predicate object to filter rows to be returned by reader.
        :param rowgroup_selector: instance of row group selector object to select row groups to be read
        :param reader_pool: parallelization pool. ``ThreadPool(10)`` (10 threads) is used by default.
            This pool is a custom implementation used to parallelize reading data from the dataset.
            Any object from workers_pool package can be used
            (e.g. :class:`petastorm.workers_pool.process_pool.ProcessPool`).
        :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to
            ``None`` will result in an infinite number of epochs.
        :param cur_shard: An int denoting the current shard number used. Each reader instance should
            pass in a unique shard number in the range ``[0, shard_count)``.
            ``shard_count`` must be supplied as well. Defaults to None
        :param shard_count: An int denoting the number of shard partitions there are. Defaults to None
        :param cache: An object conforming to :class:`.CacheBase` interface. Before loading row groups from a parquet
            file the Reader will attempt to load these values from cache. Caching is useful when communication
            to the main data store is either slow or expensive and the local machine has large enough storage
            to store entire dataset (or a partition of a dataset if shards are used).
            By default, use the :class:`.NullCache` implementation.
        """

        # 1. Open the parquet storage (dataset)
        # 2. Get a list of all groups
        # 3. Filter rowgroups
        #    a. predicates
        #    b. row-group selector (our indexing mechanism)
        #    c. partition: used to get a subset of data for distributed training
        # 4. Create a rowgroup ventilator object
        # 5. Start workers pool
        if not (isinstance(schema_fields, collections.Iterable) or isinstance(schema_fields, NGram)
                or schema_fields is None):
            raise ValueError("""Fields must be either None, an iterable collection of Unischema fields or an NGram
            object.""")

        self.ngram = schema_fields if isinstance(schema_fields, NGram) else None

        if self.ngram and not self.ngram.timestamp_overlap and shuffle_row_drop_partitions > 1:
            raise NotImplementedError('Using timestamp_overlap=False is not implemented with'
                                      ' shuffle_row_drop_partitions > 1')

        cache = cache or NullCache()

        self._workers_pool = reader_pool or ThreadPool(10)
        # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
        self.dataset = pq.ParquetDataset(dataset_path, filesystem=pyarrow_filesystem,
                                         validate_schema=False)

        if infer_schema:
            # If inferring schema, just retrieve the schema from a file of the dataset
            meta = self.dataset.pieces[0].get_metadata(self.dataset.fs.open)
            arrow_schema = meta.schema.to_arrow_schema()
            stored_schema = Unischema.from_arrow_schema(arrow_schema)
        else:
            # Otherwise, get the stored schema
            stored_schema = dataset_metadata.get_schema(self.dataset)

        # Make a schema view (a view is a Unischema containing only a subset of fields).
        # Will raise an exception if invalid schema fields are in schema_fields
        fields = schema_fields if isinstance(schema_fields, collections.Iterable) else None
        self.schema = stored_schema.create_schema_view(fields) if fields else stored_schema

        # 2. Get a list of all groups
        row_groups = dataset_metadata.load_row_groups(self.dataset, infer_schema)

        # 3. Filter rowgroups
        filtered_row_group_indexes, worker_predicate = self._filter_row_groups(self.dataset, row_groups, predicate,
                                                                               rowgroup_selector, cur_shard,
                                                                               shard_count)
        # 4. Create a rowgroup ventilator object
        normalized_shuffle_row_drop_partitions = \
            self._normalize_shuffle_options(shuffle_row_drop_partitions, self.dataset)
        ventilator = self._create_ventilator(filtered_row_group_indexes, shuffle_row_groups,
                                             normalized_shuffle_row_drop_partitions, num_epochs, worker_predicate,
                                             self._workers_pool.workers_count + _VENTILATE_EXTRA_ROWGROUPS)

        # 5. Start workers pool
        self._workers_pool.start(ReaderWorker,
                                 (pyarrow_filesystem, dataset_path, self.schema, self.ngram, row_groups, cache),
                                 ventilator=ventilator)
        logger.debug('Workers pool started')

        self.last_row_consumed = False

        # Buffer for results already fetched from the workers pool but not yet returned to the caller
        self._result_buffer = []
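
A hedged sketch of constructing this Reader variant, which takes an already-resolved pyarrow filesystem and a plain dataset path instead of a url. The class name and import path, the dataset path, pyarrow.localfs, and the iteration/stop()/join() calls are all assumptions based on the petastorm and pyarrow versions this snippet appears to target.

import pyarrow

from petastorm.reader import Reader

# Use the local filesystem for illustration; '/tmp/mydataset' is a placeholder path.
reader = Reader(pyarrow_filesystem=pyarrow.localfs,
                dataset_path='/tmp/mydataset',
                num_epochs=1)
for sample in reader:
    print(sample)      # each sample is a row record (a namedtuple in stock petastorm)
reader.stop()
reader.join()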
Example 7
    def __init__(self,
                 dataset_url,
                 schema_fields=None,
                 shuffle=None,
                 predicate=None,
                 rowgroup_selector=None,
                 reader_pool=None,
                 num_epochs=1,
                 sequence=None,
                 training_partition=None,
                 num_training_partitions=None,
                 read_timeout_s=None,
                 cache=None,
                 shuffle_options=None):
        """Initializes a reader object.

        :param dataset_url: a filepath or a url to a parquet directory,
            e.g. ``'hdfs://some_hdfs_cluster/user/yevgeni/parquet8'``, or ``'/tmp/mydataset'``.
        :param schema_fields: Either list of unischema fields to subset, or ``None`` to read all fields.
            OR an NGram object, then it will return an NGram of the specified properties.
        :param predicate: instance of predicate object to filter rows to be returned by reader.
        :param rowgroup_selector: instance of row group selector object to select row groups to be read
        :param reader_pool: parallelization pool. ``ThreadPool(10)`` (10 threads) is used by default.
            This pool is a custom implementation used to parallelize reading data from the dataset.
            Any object from workers_pool package can be used
            (e.g. :class:`petastorm.workers_pool.process_pool.ProcessPool`).
        :param num_epochs: An epoch is a single pass over all samples in the dataset. Setting ``num_epochs`` to
            ``None`` will result in an infinite number of epochs.
        :param training_partition: An int denoting the partition number used for multi node training. Each node should
            pass in a unique partition number in the range ``[0, num_training_partitions)``.
            ``num_training_partitions`` must be supplied as well.
        :param num_training_partitions: An int denoting the number of training partitions (how many nodes are performing
            the multi node training).
        :param read_timeout_s: A numeric with the amount of time in seconds you would like to give a read before it
            times out and raises an EmptyResultError. Pass in None for an infinite timeout.
        :param cache: An object conforming to :class:`.CacheBase` interface. Before loading row groups from a parquet
            file the Reader will attempt to load these values from cache. Caching is useful when communication
            to the main data store is either slow or expensive and the local machine has large enough storage
            to store entire dataset (or a partition of a dataset if num_training_partitions is used).
            By default, use the :class:`.NullCache` implementation.
        :param shuffle_options: A ShuffleOptions object describing how to shuffle the dataset (supersedes the
            ``shuffle`` parameter). Defaults to shuffling row groups but not dropping rows based on partitions.
        :param sequence: *DEPRECATED* To use sequence/ngram, please supply the argument in
            ``schema_fields`` instead.
        :param shuffle: *DEPRECATED* Boolean whether to shuffle the row group order.
            Use ``shuffle_row_groups`` in :class:`.ShuffleOptions` instead.
        """

        # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
        # 2. Get a list of all groups
        # 3. Filter rowgroups
        #    a. predicates
        #    b. row-group selector (our indexing mechanism)
        #    c. partition: used to get a subset of data for distributed training
        # 4. Create a rowgroup ventilator object
        # 5. Start workers pool
        if dataset_url is None or not isinstance(dataset_url,
                                                 six.string_types):
            raise ValueError("""dataset_url must be a string""")

        if not (isinstance(schema_fields, collections.Iterable)
                or isinstance(schema_fields, NGram) or schema_fields is None):
            raise ValueError(
                """Fields must be either None, an iterable collection of Unischema fields or an NGram
            object.""")

        if sequence is not None:
            raise ValueError(
                """'sequence' argument of Reader object is deprecated. Please pass an NGram instance to
            'schema_fields' argument instead.""")

        self.ngram = schema_fields if isinstance(schema_fields,
                                                 NGram) else None

        if self.ngram and not self.ngram.timestamp_overlap and shuffle_options is not None \
                and shuffle_options.shuffle_row_drop_partitions > 1:
            raise NotImplementedError(
                'Using timestamp_overlap=False is not implemented with'
                ' shuffle_options.shuffle_row_drop_partitions > 1')

        cache = cache or NullCache()
        dataset_url = dataset_url[:-1] if dataset_url[-1] == '/' else dataset_url
        self._workers_pool = reader_pool or ThreadPool(10)

        # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
        logger.debug('dataset_url: %s', dataset_url)
        resolver = FilesystemResolver(dataset_url)
        self.dataset = pq.ParquetDataset(resolver.parsed_dataset_url().path,
                                         filesystem=resolver.filesystem(),
                                         validate_schema=False)

        # Get a unischema stored in the dataset metadata.
        stored_schema = dataset_metadata.get_schema(self.dataset)

        # Make a schema view (a view is a Unischema containing only a subset of fields).
        # Will raise an exception if invalid schema fields are in schema_fields
        fields = schema_fields if isinstance(schema_fields,
                                             collections.Iterable) else None
        self.schema = stored_schema.create_schema_view(
            fields) if fields else stored_schema

        # 2. Get a list of all groups
        row_groups = dataset_metadata.load_row_groups(self.dataset)

        # 3. Filter rowgroups
        filtered_row_group_indexes, worker_predicate = self._filter_row_groups(
            self.dataset, row_groups, predicate, rowgroup_selector,
            training_partition, num_training_partitions)
        # 4. Create a rowgroup ventilator object
        if shuffle_options is None:
            if shuffle is None:
                shuffle = True
            else:
                logger.warning(
                    'shuffle option is deprecated. Please use shuffle_options instead'
                )
            shuffle_options = ShuffleOptions(shuffle)
        self._normalize_shuffle_options(shuffle_options, self.dataset)
        ventilator = self._create_ventilator(filtered_row_group_indexes,
                                             shuffle_options, num_epochs,
                                             worker_predicate)

        # 5. Start workers pool
        self._workers_pool.start(ReaderWorker,
                                 (dataset_url, self.schema, self.ngram,
                                  row_groups, cache, worker_predicate),
                                 ventilator=ventilator)
        self._read_timeout_s = read_timeout_s
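
A hedged usage sketch for this url-based Reader: the dataset url is a placeholder, and the class import path as well as the iteration and stop()/join() shutdown calls are assumptions about the petastorm version this snippet comes from.

from petastorm.reader import Reader

# Placeholder dataset url; the defaults shuffle row groups and read a single epoch.
reader = Reader('hdfs://some_hdfs_cluster/user/yevgeni/parquet8', num_epochs=1)
for row in reader:
    print(row)
reader.stop()
reader.join()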
Example 8
    def __init__(self,
                 dataset_url,
                 schema_fields=None,
                 predicate=None,
                 rowgroup_selector=None,
                 num_epochs=1,
                 sequence=None,
                 cur_shard=None,
                 shard_count=None,
                 read_timeout_s=None,
                 cache=None,
                 loader_pool=None,
                 decoder_pool=None,
                 shuffling_queue=None,
                 shuffle_row_groups=True,
                 shuffle_row_drop_partitions=1,
                 pyarrow_filesystem=None):
        """Initializes a reader object.

        :param dataset_url: a filepath or a url to a parquet directory,
                       e.g. 'hdfs://some_hdfs_cluster/user/yevgeni/parquet8', or '/tmp/mydataset'
                       or ``'s3://bucket/mydataset'``.
        :param schema_fields:
            Either list of unischema fields to subset, or None to read all fields.
            OR an NGram object, then it will return an NGram of the specified properties.
        :param predicate: instance of predicate object to filter rows to be returned by reader.
        :param rowgroup_selector: instance of row group selector object to select row groups to be read
        :param num_epochs: An epoch is a single pass over all samples in the dataset. Setting num_epochs to 'None' will
                       result in an infinite number of epochs.
        :param sequence: This is deprecated. To use sequence/ngram, please supply the argument in schema_fields instead.
        :param cur_shard: An int denoting the current shard number. Each node reading a shard should
                       pass in a unique shard number in the range [0, shard_count).
                       shard count must be supplied as well.
        :param shard_count: An int denoting the number of shards to break this dataset into.
        :param read_timeout_s: A numeric with the amount of time in seconds you would like to give a read before it
                       times out and raises an EmptyResultError. Pass in None for an infinite timeout
        :param cache: An object conforming to `cache.CacheBase` interface. Before loading row groups from a parquet file
                       the Reader will attempt to load these values from cache. Caching is useful when communication
                       to the main data store is either slow or expensive and the local machine has large enough storage
                       to store entire dataset (or a partition of a dataset if num_training_partitions is used).
                       By default, the `NullCache` implementation is used.
        :param decoder_pool: An instance of a concurrent.futures pool executor used for decoding. If None,
          a default ThreadPoolExecutor(5) will be used.
        :param loader_pool: An instance of a concurrent.futures pool executor used for loading row groups. If None,
          a default ThreadPoolExecutor(5) will be used.
        """

        # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
        # 2. Get a list of all groups
        # 3. Filter rowgroups
        #    a. predicates
        #    b. row-group selector (our indexing mechanism)
        #    c. partition: used to get a subset of data for distributed training
        # 4. Launch a new thread running `worker_loop` function.

        if dataset_url is None or not isinstance(dataset_url,
                                                 six.string_types):
            raise ValueError("""dataset_url must be a string""")

        if not (isinstance(schema_fields, collections.Iterable)
                or isinstance(schema_fields, NGram) or schema_fields is None):
            raise ValueError(
                """Fields must be either None, an iterable collection of Unischema fields or an NGram
            object.""")

        if sequence is not None:
            raise ValueError(
                """'sequence' argument of Reader object is deprecated. Please pass an NGram instance to
            'schema_fields' argument instead.""")

        self.ngram = schema_fields if isinstance(schema_fields,
                                                 NGram) else None

        if self.ngram and not self.ngram.timestamp_overlap and shuffle_row_drop_partitions > 1:
            raise NotImplementedError(
                'Using timestamp_overlap=False is not implemented with'
                ' shuffle_row_drop_partitions > 1')

        cache = cache or NullCache()
        dataset_url = dataset_url[:-1] if dataset_url[-1] == '/' else dataset_url

        # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
        logger.debug('dataset_url: %s', dataset_url)

        if pyarrow_filesystem is not None:
            filesystem = pyarrow_filesystem
            dataset_path = urlparse(dataset_url).path
        else:
            resolver = FilesystemResolver(dataset_url)
            filesystem = resolver.filesystem()
            dataset_path = resolver.get_dataset_path()

        self._dataset = pq.ParquetDataset(dataset_path,
                                          filesystem=filesystem,
                                          validate_schema=False)

        shuffle_row_drop_partitions = self._normalize_shuffle_options(
            shuffle_row_drop_partitions, self._dataset)

        # Get a unischema stored in the dataset metadata.
        stored_schema = dataset_metadata.get_schema(self._dataset)

        # Make a schema view (a view is a Unischema containing only a subset of fields).
        # Will raise an exception if invalid schema fields are in schema_fields
        fields = schema_fields if isinstance(schema_fields,
                                             collections.Iterable) else None
        self.schema = stored_schema.create_schema_view(
            fields) if fields else stored_schema

        # 2. Get a list of all groups
        row_groups = dataset_metadata.load_row_groups(self._dataset)

        # 3. Filter rowgroups
        filtered_row_groups, worker_predicate = self._filter_row_groups(
            self._dataset, row_groups, predicate, rowgroup_selector, cur_shard,
            shard_count)

        epoch_items = self._apply_row_drop_partition(
            filtered_row_groups, shuffle_row_drop_partitions)

        # 4. Launch a new thread running `worker_loop` function.
        def epochs_iterator():
            return epoch_generator(epoch_items, num_epochs, shuffle_row_groups)

        self._results_queue = Queue(_OUTPUT_QUEUE_SIZE)

        loader = RowGroupLoader(dataset_url, self.schema, self.ngram, cache,
                                worker_predicate)
        decoder = RowDecoder(self.schema, self.ngram)
        self._loader_pool = loader_pool or ThreadPoolExecutor(5)
        self._decoder_pool = decoder_pool or ThreadPoolExecutor(5)
        self._stop_flow_manager_event = threading.Event()
        self._diags = Counter()

        if not shuffling_queue:
            shuffling_queue = NoopShufflingBuffer()

        self._flow_manager_thread = threading.Thread(
            target=worker_loop,
            args=(epochs_iterator, self._loader_pool, loader,
                  self._decoder_pool, decoder, shuffling_queue,
                  self._results_queue, self._stop_flow_manager_event,
                  self._diags))
        self._flow_manager_thread.daemon = True
        self._flow_manager_thread.start()

        self._read_timeout_s = read_timeout_s
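
A hedged sketch of sharded reading with the constructor above, e.g. one shard per training process: the class name and import path, the dataset url and the shard numbers are placeholders/assumptions, and iteration support is assumed from the rest of the class, which is not shown.

from petastorm.reader import Reader

# Each training process reads a disjoint shard: this one reads shard 0 of 4.
reader = Reader('hdfs://some_hdfs_cluster/user/yevgeni/parquet8',
                cur_shard=0,
                shard_count=4,
                num_epochs=1)
for sample in reader:
    print(sample)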