Example #1
 def test_hdfs_url_direct_namenode_retries(self):
     """ Case 4: direct namenode fails first two times thru, but 2nd retry succeeds."""
     self.mock.set_fail_n_next_connect(2)
     with self.assertRaises(ArrowIOError):
         suj = FilesystemResolver('hdfs://{}/path'.format(
             HC.WARP_TURTLE_NN2),
                                  self._hadoop_configuration,
                                  connector=self.mock,
                                  user=self.mock_name)
     self.assertEqual(1, self.mock.connect_attempted(HC.WARP_TURTLE_NN2))
     self.assertEqual(0, self.mock.connect_attempted(HC.WARP_TURTLE_NN1))
     self.assertEqual(0, self.mock.connect_attempted(HC.DEFAULT_NN))
     with self.assertRaises(ArrowIOError):
         suj = FilesystemResolver('hdfs://{}/path'.format(
             HC.WARP_TURTLE_NN2),
                                  self._hadoop_configuration,
                                  connector=self.mock)
     self.assertEqual(2, self.mock.connect_attempted(HC.WARP_TURTLE_NN2))
     self.assertEqual(0, self.mock.connect_attempted(HC.WARP_TURTLE_NN1))
     self.assertEqual(0, self.mock.connect_attempted(HC.DEFAULT_NN))
     # this one should connect "successfully"
     suj = FilesystemResolver('hdfs://{}/path'.format(HC.WARP_TURTLE_NN2),
                              self._hadoop_configuration,
                              connector=self.mock,
                              user=self.mock_name)
     self.assertEqual(MockHdfs, type(suj.filesystem()))
     self.assertEqual(self.mock_name, suj.filesystem()._user)
     self.assertEqual(HC.WARP_TURTLE_NN2, suj.parsed_dataset_url().netloc)
     self.assertEqual(3, self.mock.connect_attempted(HC.WARP_TURTLE_NN2))
     self.assertEqual(0, self.mock.connect_attempted(HC.WARP_TURTLE_NN1))
     self.assertEqual(0, self.mock.connect_attempted(HC.DEFAULT_NN))
Example #2
 def test_hdfs_url_direct_namenode_driver_libhdfs(self):
     suj = FilesystemResolver('hdfs://{}/path'.format(HC.WARP_TURTLE_NN1),
                              self._hadoop_configuration,
                              connector=self.mock,
                              hdfs_driver='libhdfs',
                              user=self.mock_name)
     self.assertEqual(MockHdfs, type(suj.filesystem()))
     self.assertEqual(self.mock_name, suj.filesystem()._user)
     # Make sure we did not capture FilesystemResolver in a closure by mistake
     dill.dumps(suj.filesystem_factory())
Example #3
    def test_hdfs_url_direct_namenode(self):
        """ Case 4: direct namenode."""
        suj = FilesystemResolver('hdfs://{}/path'.format(HC.WARP_TURTLE_NN1),
                                 self._hadoop_configuration,
                                 connector=self.mock,
                                 user=self.mock_name)
        self.assertEqual(MockHdfs, type(suj.filesystem()))
        self.assertEqual(self.mock_name, suj.filesystem()._user)
        self.assertEqual(HC.WARP_TURTLE_NN1, suj.parsed_dataset_url().netloc)
        self.assertEqual(0, self.mock.connect_attempted(HC.WARP_TURTLE_NN2))
        self.assertEqual(1, self.mock.connect_attempted(HC.WARP_TURTLE_NN1))
        self.assertEqual(0, self.mock.connect_attempted(HC.DEFAULT_NN))

        # Make sure we did not capture FilesystemResolver in a closure by mistake
        dill.dumps(suj.filesystem_factory())
Example #4
    def test_hdfs_url_with_nameservice(self):
        """ Case 3a: HDFS nameservice."""
        suj = FilesystemResolver(HC.WARP_TURTLE_PATH,
                                 self._hadoop_configuration,
                                 connector=self.mock,
                                 user=self.mock_name)
        self.assertEqual(MockHdfs, type(suj.filesystem()._hdfs))
        self.assertEqual(self.mock_name, suj.filesystem()._user)
        self.assertEqual(HC.WARP_TURTLE, suj.parsed_dataset_url().netloc)
        self.assertEqual(1, self.mock.connect_attempted(HC.WARP_TURTLE_NN2))
        self.assertEqual(0, self.mock.connect_attempted(HC.WARP_TURTLE_NN1))
        self.assertEqual(0, self.mock.connect_attempted(HC.DEFAULT_NN))

        # Make sure we did not capture FilesystemResolver in a closure by mistake
        dill.dumps(suj.filesystem_factory())
Example #5
def dataset_as_rdd(dataset_url, spark_session, schema_fields=None):
    """
    Retrieve a Spark RDD for a given petastorm dataset

    :param dataset_url: A string for the dataset url (e.g. hdfs:///path/to/dataset)
    :param spark_session: A spark session
    :param schema_fields: list of unischema fields to subset, or None to read all fields.
    :return: An RDD of dictionary records from the dataset
    """
    dataset_url_parsed = urlparse(dataset_url)

    resolver = FilesystemResolver(
        dataset_url_parsed,
        spark_session.sparkContext._jsc.hadoopConfiguration())
    dataset = pq.ParquetDataset(resolver.parsed_dataset_url().path,
                                filesystem=resolver.filesystem(),
                                validate_schema=False)
    schema = dataset_metadata.get_schema(dataset)

    dataset_df = spark_session.read.parquet(resolver.parsed_dataset_url().path)
    if schema_fields is not None:
        # If only a subset of fields is requested, create the schema view and select those fields
        schema = schema.create_schema_view(schema_fields)
        field_names = [field.name for field in schema_fields]
        dataset_df = dataset_df.select(*field_names)

    dataset_rows = dataset_df.rdd\
        .map(lambda row: utils.decode_row(row.asDict(), schema))\
        .map(lambda record: schema.make_namedtuple(**record))

    return dataset_rows
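
A minimal usage sketch for dataset_as_rdd, assuming a local SparkSession and a Petastorm dataset at the hypothetical URL file:///tmp/hello_world_dataset; both the URL and the Spark settings are illustrative, not taken from the snippet above:

from pyspark.sql import SparkSession

# Hypothetical dataset location; replace with a real Petastorm dataset URL.
DATASET_URL = 'file:///tmp/hello_world_dataset'

spark = SparkSession.builder.master('local[2]').appName('rdd-example').getOrCreate()

# Read every field by passing schema_fields=None; a list of unischema fields
# would restrict the columns that are loaded.
rdd = dataset_as_rdd(DATASET_URL, spark, schema_fields=None)
print(rdd.take(3))

spark.stop()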
Example #6
 def test_s3_url(self):
     suj = FilesystemResolver('s3://bucket{}'.format(ABS_PATH),
                              self._hadoop_configuration,
                              connector=self.mock)
     self.assertTrue(isinstance(suj.filesystem(), S3FSWrapper))
     self.assertEqual('bucket', suj.parsed_dataset_url().netloc)
     self.assertEqual('bucket' + ABS_PATH, suj.get_dataset_path())
Example #7
def build_rowgroup_index(dataset_url,
                         spark_context,
                         indexers,
                         hdfs_driver='libhdfs3'):
    """
    Build an index for the given list of fields, used for fast rowgroup selection.

    :param dataset_url: (str) the url for the dataset (or a path if you would like to use the default hdfs config)
    :param spark_context: (SparkContext)
    :param indexers: list of objects used to build row group indexes. Should support the RowGroupIndexerBase interface
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :return: None. Upon successful completion the rowgroup indexes will be saved to the _metadata file
    """

    if dataset_url and dataset_url[-1] == '/':
        dataset_url = dataset_url[:-1]

    # Create pyarrow file system
    resolver = FilesystemResolver(dataset_url,
                                  spark_context._jsc.hadoopConfiguration(),
                                  hdfs_driver=hdfs_driver,
                                  user=spark_context.sparkUser())
    dataset = pq.ParquetDataset(resolver.get_dataset_path(),
                                filesystem=resolver.filesystem(),
                                validate_schema=False)

    split_pieces = dataset_metadata.load_row_groups(dataset)
    schema = dataset_metadata.get_schema(dataset)

    # We need a direct reference to the partitions object
    partitions = dataset.partitions
    pieces_num = len(split_pieces)
    piece_info_list = []
    for piece_index in range(pieces_num):
        # Indexing relies on the ordering of the split dataset pieces.
        # This in turn relies on how the dataset pieces are split and sorted, which should not change,
        # but still might; we should keep in mind that such a change could break indexing.
        piece = split_pieces[piece_index]
        piece_info_list.append(
            PieceInfo(piece_index, piece.path, piece.row_group,
                      piece.partition_keys))

    start_time = time.time()
    piece_info_rdd = spark_context.parallelize(
        piece_info_list, min(len(piece_info_list), PARALLEL_SLICE_NUM))
    indexer_rdd = piece_info_rdd.map(
        lambda piece_info: _index_columns(piece_info,
                                          dataset_url,
                                          partitions,
                                          indexers,
                                          schema,
                                          hdfs_driver=hdfs_driver))
    indexer_list = indexer_rdd.reduce(_combine_indexers)

    indexer_dict = {indexer.index_name: indexer for indexer in indexer_list}
    serialized_indexers = pickle.dumps(indexer_dict, pickle.HIGHEST_PROTOCOL)
    utils.add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY,
                                  serialized_indexers)
    logger.info("Elapsed time of index creation: %f s",
                (time.time() - start_time))
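
A usage sketch for build_rowgroup_index under stated assumptions: the dataset URL and the indexed 'id' field are hypothetical, and the SingleFieldIndexer import path and constructor arguments (an index name plus a field name) are assumptions about the indexer API rather than something shown above:

from pyspark.sql import SparkSession

# Assumed import path and constructor signature for the indexer class.
from petastorm.etl.rowgroup_indexers import SingleFieldIndexer

DATASET_URL = 'hdfs:///path/to/existing/dataset'  # hypothetical location

spark = SparkSession.builder.appName('rowgroup-index-example').getOrCreate()

# Index the hypothetical 'id' field so readers can later select row groups by its values.
indexers = [SingleFieldIndexer('id_index', 'id')]
build_rowgroup_index(DATASET_URL, spark.sparkContext, indexers)

spark.stop()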
Example #8
    def __init__(self,
                 dataset_url,
                 schema,
                 ngram,
                 local_cache,
                 worker_predicate,
                 hdfs_driver='libhdfs3'):
        """RowGroupLoader responsible for loading one rowgroup at a time. Rows returned are returned encoded.

        :param dataset_url: A url of a parquet dataset.
        :param schema: A unischema corresponding to the data in the dataset
        :param ngram: An instance of NGram if ngrams should be read or None, if each row in the dataset corresponds to
          a single sample returned.
        :param local_cache: An instance of a rowgroup cache (CacheBase interface) object to be used.
        :param worker_predicate: An instance of predicate (PredicateBase interface)
        :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
        """
        self._dataset_url_parsed = urlparse(dataset_url)
        self._schema = schema
        self._ngram = ngram
        self._local_cache = local_cache
        self._worker_predicate = worker_predicate

        resolver = FilesystemResolver(self._dataset_url_parsed,
                                      hdfs_driver=hdfs_driver)
        self._dataset = pq.ParquetDataset(resolver.get_dataset_path(),
                                          filesystem=resolver.filesystem(),
                                          validate_schema=False)
Example #9
 def test_file_url(self):
     """ Case 2: File path, agnostic to content of hadoop configuration."""
     suj = FilesystemResolver('file://{}'.format(ABS_PATH),
                              self._hadoop_configuration,
                              connector=self.mock)
     self.assertTrue(isinstance(suj.filesystem(), LocalFileSystem))
     self.assertEqual('', suj.parsed_dataset_url().netloc)
     self.assertEqual(ABS_PATH, suj.get_dataset_path())
Example #10
    def test_hdfs_url_no_nameservice(self):
        """ Case 3b: HDFS with no nameservice should connect to default namenode."""
        suj = FilesystemResolver('hdfs:///some/path',
                                 self._hadoop_configuration,
                                 connector=self.mock,
                                 user=self.mock_name)
        self.assertEqual(MockHdfs, type(suj.filesystem()._hdfs))
        self.assertEqual(self.mock_name, suj.filesystem()._user)
        self.assertEqual(HC.WARP_TURTLE, suj.parsed_dataset_url().netloc)
        # ensure the path is preserved
        self.assertEqual('/some/path', suj.get_dataset_path())
        self.assertEqual(1, self.mock.connect_attempted(HC.WARP_TURTLE_NN2))
        self.assertEqual(0, self.mock.connect_attempted(HC.WARP_TURTLE_NN1))
        self.assertEqual(0, self.mock.connect_attempted(HC.DEFAULT_NN))

        # Make sure we did not capture FilesystemResolver in a closure by mistake
        dill.dumps(suj.filesystem_factory())
Example #11
def generate_petastorm_metadata(spark, dataset_url, unischema_class=None, use_summary_metadata=False):
    """
    Generates the metadata necessary to read a petastorm dataset and adds it to an existing dataset.

    :param spark: spark session
    :param dataset_url: url of existing dataset
    :param unischema_class: (optional) fully qualified dataset unischema class. If not specified will attempt
        to find one already in the dataset. (e.g.
        :class:`examples.hello_world.generate_hello_world_dataset.HelloWorldSchema`)
    """
    sc = spark.sparkContext

    resolver = FilesystemResolver(dataset_url, sc._jsc.hadoopConfiguration())
    dataset = pq.ParquetDataset(
        resolver.get_dataset_path(),
        filesystem=resolver.filesystem(),
        validate_schema=False)

    if unischema_class:
        schema = locate(unischema_class)
        if not isinstance(schema, Unischema):
            raise ValueError('The specified class %s is not an instance of a petastorm.Unischema object.'
                             % unischema_class)
    else:

        try:
            schema = get_schema(dataset)
        except ValueError:
            raise ValueError('Unischema class could not be located in existing dataset,'
                             ' please specify it')

    # In order to be backwards compatible, we retrieve the common metadata from the dataset before
    # overwriting the metadata to keep row group indexes and the old row group per file index
    arrow_metadata = dataset.common_metadata or None

    with materialize_dataset(spark, dataset_url, schema, use_summary_metadata=use_summary_metadata):
        if use_summary_metadata:
            # Inside the materialize dataset context we just need to write the metadata file as the schema will
            # be written by the context manager.
            # We use the java ParquetOutputCommitter to write the metadata file for the existing dataset
            # which will read all the footers of the dataset in parallel and merge them.
            hadoop_config = sc._jsc.hadoopConfiguration()
            Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
            parquet_output_committer = sc._gateway.jvm.org.apache.parquet.hadoop.ParquetOutputCommitter
            parquet_output_committer.writeMetaDataFile(hadoop_config, Path(dataset_url))

    spark.stop()

    if use_summary_metadata and arrow_metadata:
        # Calling writeMetaDataFile overwrites the _common_metadata file, which could hold schema information
        # or row group indexers. Therefore we retain this information and add it to the new
        # _common_metadata file. If we were using the old legacy metadata method, this file won't be deleted.
        base_schema = arrow_metadata.schema.to_arrow_schema()
        metadata_dict = base_schema.metadata
        if ROW_GROUPS_PER_FILE_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROW_GROUPS_PER_FILE_KEY, metadata_dict[ROW_GROUPS_PER_FILE_KEY])
        if ROWGROUPS_INDEX_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY, metadata_dict[ROWGROUPS_INDEX_KEY])
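
A usage sketch for generate_petastorm_metadata, assuming an existing Parquet dataset at a hypothetical URL and a hypothetical fully qualified Unischema class name; note that the function above stops the Spark session itself:

from pyspark.sql import SparkSession

# Hypothetical dataset location and schema class; substitute your own values.
DATASET_URL = 'file:///tmp/existing_dataset'
SCHEMA_CLASS = 'mypackage.schemas.MySchema'

spark = SparkSession.builder.appName('regenerate-metadata').getOrCreate()

# No explicit spark.stop() afterwards: the helper calls it internally.
generate_petastorm_metadata(spark, DATASET_URL,
                            unischema_class=SCHEMA_CLASS,
                            use_summary_metadata=False)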
Example #12
    def test_s3_url(self):
        suj = FilesystemResolver('s3://bucket{}'.format(ABS_PATH),
                                 self._hadoop_configuration,
                                 connector=self.mock)
        self.assertTrue(isinstance(suj.filesystem(), S3FSWrapper))
        self.assertEqual('bucket', suj.parsed_dataset_url().netloc)
        self.assertEqual('bucket' + ABS_PATH, suj.get_dataset_path())

        # Make sure we did not capture FilesystemResolver in a closure by mistake
        dill.dumps(suj.filesystem_factory())
Example #13
 def test_hdfs_url_direct_namenode(self):
     """ Case 4: direct namenode."""
     suj = FilesystemResolver('hdfs://{}/path'.format(HC.WARP_TURTLE_NN1),
                              self._hadoop_configuration,
                              connector=self.mock)
     self.assertEqual(MockHdfs, type(suj.filesystem()))
     self.assertEqual(HC.WARP_TURTLE_NN1, suj.parsed_dataset_url().netloc)
     self.assertEqual(0, self.mock.connect_attempted(HC.WARP_TURTLE_NN2))
     self.assertEqual(1, self.mock.connect_attempted(HC.WARP_TURTLE_NN1))
     self.assertEqual(0, self.mock.connect_attempted(HC.DEFAULT_NN))
Example #14
 def test_hdfs_url_with_nameservice(self):
     """ Case 3a: HDFS nameservice."""
     suj = FilesystemResolver(HC.WARP_TURTLE_PATH,
                              self._hadoop_configuration,
                              connector=self.mock)
     self.assertEqual(MockHdfs, type(suj.filesystem()._hdfs))
     self.assertEqual(HC.WARP_TURTLE, suj.parsed_dataset_url().netloc)
     self.assertEqual(1, self.mock.connect_attempted(HC.WARP_TURTLE_NN2))
     self.assertEqual(0, self.mock.connect_attempted(HC.WARP_TURTLE_NN1))
     self.assertEqual(0, self.mock.connect_attempted(HC.DEFAULT_NN))
Example #15
 def test_hdfs_url_no_nameservice(self):
     """ Case 3b: HDFS with no nameservice should connect to default namenode."""
     suj = FilesystemResolver('hdfs:///some/path',
                              self._hadoop_configuration,
                              connector=self.mock)
     self.assertEqual(MockHdfs, type(suj.filesystem()._hdfs))
     self.assertEqual(HC.WARP_TURTLE, suj.parsed_dataset_url().netloc)
     # ensure path is preserved in parsed URL
     self.assertEqual('/some/path', suj.parsed_dataset_url().path)
     self.assertEqual(1, self.mock.connect_attempted(HC.WARP_TURTLE_NN2))
     self.assertEqual(0, self.mock.connect_attempted(HC.WARP_TURTLE_NN1))
     self.assertEqual(0, self.mock.connect_attempted(HC.DEFAULT_NN))
Example #16
def _default_delete_dir_handler(dataset_url):
    resolver = FilesystemResolver(dataset_url)
    fs = resolver.filesystem()
    parsed = urlparse(dataset_url)
    if isinstance(fs, LocalFileSystem):
        # pyarrow has a bug: LocalFileSystem.delete() is not implemented.
        # https://issues.apache.org/jira/browse/ARROW-7953
        # We can remove this branch once ARROW-7953 is fixed.
        local_path = parsed.path
        shutil.rmtree(local_path, ignore_errors=False)
    else:
        fs.delete(parsed.path, recursive=True)
Example #17
def generate_petastorm_metadata(spark, dataset_url, unischema_class=None):
    """
    Generates the metadata necessary to read a petastorm dataset and adds it to an existing dataset.

    :param spark: spark session
    :param dataset_url: url of existing dataset
    :param unischema_class: (optional) fully qualified dataset unischema class. If not specified will attempt
        to find one already in the dataset. (e.g. examples.hello_world.hello_world_dataset.HelloWorldSchema)
    :return:
    """
    sc = spark.sparkContext

    resolver = FilesystemResolver(dataset_url, sc._jsc.hadoopConfiguration())
    dataset = pq.ParquetDataset(resolver.parsed_dataset_url().path,
                                filesystem=resolver.filesystem(),
                                validate_schema=False)

    if unischema_class:
        schema = locate(unischema_class)
    else:

        try:
            schema = get_schema(dataset)
        except ValueError:
            raise ValueError(
                'Unischema class could not be located in existing dataset,'
                ' please specify it')

    # In order to be backwards compatible, we retrieve the common metadata from the dataset before
    # overwriting the metadata to keep row group indexes and the old row group per file index
    arrow_metadata = dataset.common_metadata or None

    with materialize_dataset(spark, dataset_url, schema):
        # Inside the materialize dataset context we just need to write the metadata file as the schema will
        # be written by the context manager.
        # We use the java ParquetOutputCommitter to write the metadata file for the existing dataset
        # which will read all the footers of the dataset in parallel and merge them.
        hadoop_config = sc._jsc.hadoopConfiguration()
        Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
        parquet_output_committer = sc._gateway.jvm.org.apache.parquet.hadoop.ParquetOutputCommitter
        parquet_output_committer.writeMetaDataFile(hadoop_config,
                                                   Path(dataset_url))

    if arrow_metadata:
        # If there was the old row groups per file key or the row groups index key, add them to the new dataset metadata
        base_schema = arrow_metadata.schema.to_arrow_schema()
        metadata_dict = base_schema.metadata
        if ROW_GROUPS_PER_FILE_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROW_GROUPS_PER_FILE_KEY,
                                    metadata_dict[ROW_GROUPS_PER_FILE_KEY])
        if ROWGROUPS_INDEX_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY,
                                    metadata_dict[ROWGROUPS_INDEX_KEY])
Example #18
def _index_columns(piece_info,
                   dataset_url,
                   partitions,
                   indexers,
                   schema,
                   hdfs_driver='libhdfs3'):
    """
    Builds indexes for the dataset piece described in piece_info.

    :param piece_info: description of dataset piece
    :param dataset_url: dataset location
    :param partitions: dataset partitions
    :param indexers: list of indexer objects
    :param schema: dataset schema
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :return: list of indexers containing index data
    """
    # Resolver in executor context will get hadoop config from environment
    resolver = FilesystemResolver(dataset_url, hdfs_driver=hdfs_driver)
    fs = resolver.filesystem()

    # Create pyarrow piece
    piece = pq.ParquetDatasetPiece(piece_info.path,
                                   open_file_func=fs.open,
                                   row_group=piece_info.row_group,
                                   partition_keys=piece_info.partition_keys)

    # Collect column names needed for indexing
    column_names = set()
    for indexer in indexers:
        column_names.update(indexer.column_names)

    # Read columns needed for indexing
    column_rows = piece.read(
        columns=list(column_names),
        partitions=partitions).to_pandas().to_dict('records')

    # Decode columns values
    decoded_rows = [utils.decode_row(row, schema) for row in column_rows]
    if not decoded_rows:
        raise ValueError(
            'Cannot build index with empty decoded_rows, columns: {}, partitions: {}'
            .format(column_names, partitions))

    # Index columns values
    for indexer in indexers:
        indexer.build_index(decoded_rows, piece_info.piece_index)

    # Indexer objects contain the index data; it will be consolidated in the reduce phase
    return indexers
Example #19
def get_schema_from_dataset_url(dataset_url):
    """Returns a :class:`petastorm.unischema.Unischema` object loaded from a dataset specified by a url.

    :param dataset_url: A dataset URL
    :return: A :class:`petastorm.unischema.Unischema` object
    """
    resolver = FilesystemResolver(dataset_url)
    dataset = pq.ParquetDataset(resolver.get_dataset_path(), filesystem=resolver.filesystem(),
                                validate_schema=False)

    # Get a unischema stored in the dataset metadata.
    stored_schema = get_schema(dataset)

    return stored_schema
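
A short sketch of reading the stored schema back, assuming a hypothetical dataset URL and the usual Unischema layout in which fields maps field names to UnischemaField tuples:

# Hypothetical dataset URL; any Petastorm dataset with embedded metadata will do.
schema = get_schema_from_dataset_url('file:///tmp/hello_world_dataset')

# Unischema.fields maps field names to UnischemaField entries.
for name, field in schema.fields.items():
    print(name, field.numpy_dtype, field.nullable)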
Example #20
    def process(self, piece_index, worker_predicate, shuffle_row_drop_partition):
        """Main worker function. Loads and returns all rows matching the predicate from a rowgroup

        Looks up the requested piece (a single row-group in a parquet file). If a predicate is specified,
        columns needed by the predicate are loaded first. If no rows in the rowgroup match the predicate criteria,
        the rest of the columns are not loaded.

        :param piece_index: index of the dataset piece (row group) to load
        :param worker_predicate: An instance of predicate (PredicateBase interface) or None
        :param shuffle_row_drop_partition: A tuple 2 of the current row drop partition and the total number
            of partitions.
        :return:
        """

        if not self._dataset:
            resolver = FilesystemResolver(self._dataset_url_parsed)
            self._dataset = pq.ParquetDataset(
                resolver.parsed_dataset_url().path,
                filesystem=resolver.filesystem(),
                validate_schema=False)

        piece = self._split_pieces[piece_index]

        # Open the piece file through the dataset's filesystem
        parquet_file = ParquetFile(self._dataset.fs.open(piece.path))

        if not isinstance(self._local_cache, NullCache):
            if worker_predicate:
                raise RuntimeError('Local cache is not supported together with predicates, '
                                   'unless the dataset is partitioned by the column the predicate operates on.')
            if shuffle_row_drop_partition[1] != 1:
                raise RuntimeError('Local cache is not supported together with shuffle_row_drop_partitions > 1')

        if worker_predicate:
            all_cols = self._load_rows_with_predicate(parquet_file, piece, worker_predicate, shuffle_row_drop_partition)
        else:
            # Using hash of the dataset url with the relative path in order to:
            #  1. Make sure if a common cache serves multiple processes (e.g. redis), we don't have conflicts
            #  2. Dataset url is hashed, to make sure we don't create overly long keys, which may be incompatible with
            #     some cache implementations
            #  3. Still leave relative path and the piece_index in plain text to make it easier to debug
            cache_key = '{}:{}:{}'.format(hashlib.md5(urlunparse(self._dataset_url_parsed).encode('utf-8')).hexdigest(),
                                          piece.path, piece_index)
            all_cols = self._local_cache.get(cache_key,
                                             lambda: self._load_rows(parquet_file, piece, shuffle_row_drop_partition))

        if self._ngram:
            all_cols = self._ngram.form_ngram(data=all_cols, schema=self._schema)

        for item in all_cols:
            self.publish_func(item)
Example #21
def get_schema_from_dataset_url(dataset_url):
    """Returns a Unischema object loaded from a dataset specified by a url.

    :param dataset_url: A dataset url
    :return: A Unischema object
    """
    resolver = FilesystemResolver(dataset_url)
    dataset = pq.ParquetDataset(resolver.parsed_dataset_url().path,
                                filesystem=resolver.filesystem(),
                                validate_schema=False)

    # Get a unischema stored in the dataset metadata.
    stored_schema = get_schema(dataset)

    return stored_schema
Example #22
def _default_delete_dir_handler(dataset_url):
    resolver = FilesystemResolver(dataset_url)
    fs = resolver.filesystem()
    _dataset_url = strip_protocol(dataset_url)

    if isinstance(fs, LocalFileSystem):
        # pyarrow has a bug: LocalFileSystem.delete() is not implemented.
        # https://issues.apache.org/jira/browse/ARROW-7953
        # We can remove this branch once ARROW-7953 is fixed.
        local_path = _dataset_url
        if os.path.exists(local_path):
            shutil.rmtree(local_path, ignore_errors=False)
    else:
        if fs.exists(_dataset_url):
            fs.delete(_dataset_url, recursive=True)
Example #23
def get_schema_from_dataset_url(dataset_url, hdfs_driver='libhdfs3'):
    """Returns a :class:`petastorm.unischema.Unischema` object loaded from a dataset specified by a url.

    :param dataset_url: A dataset URL
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :return: A :class:`petastorm.unischema.Unischema` object
    """
    resolver = FilesystemResolver(dataset_url, hdfs_driver=hdfs_driver)
    dataset = pq.ParquetDataset(resolver.get_dataset_path(), filesystem=resolver.filesystem(),
                                validate_schema=False)

    # Get a unischema stored in the dataset metadata.
    stored_schema = get_schema(dataset)

    return stored_schema
Example #24
def materialize_dataset(spark, dataset_url, schema, row_group_size_mb=None):
    """
    A Context Manager which handles all the initialization and finalization necessary
    to generate metadata for a petastorm dataset. This should be used around your
    spark logic to materialize a dataset (specifically the writing of parquet output).

    Note: Any rowgroup indexing should happen outside the materialize_dataset block

    e.g.
    spark = SparkSession.builder...
    dataset_url = 'hdfs:///path/to/my/dataset'
    with materialize_dataset(spark, dataset_url, MyUnischema, 64):
      spark.sparkContext.parallelize(range(0, 10)).\
        ...
        .write.parquet(dataset_url)

    indexers = [SingleFieldIndexer(...)]
    build_rowgroup_index(dataset_url, spark.sparkContext, indexers)

    :param spark: The spark session you are using
    :param dataset_url: The dataset url to output your dataset to (e.g. hdfs:///path/to/dataset)
    :param schema: The unischema definition of your dataset
    :param row_group_size_mb: The parquet row group size to use for your dataset
    """
    spark_config = {}
    _init_spark(spark, spark_config, row_group_size_mb)
    yield

    # After job completes, add the unischema metadata and check for the metadata summary file
    resolver = FilesystemResolver(
        dataset_url, spark.sparkContext._jsc.hadoopConfiguration())
    dataset = pq.ParquetDataset(resolver.parsed_dataset_url().path,
                                filesystem=resolver.filesystem(),
                                validate_schema=False)

    _generate_unischema_metadata(dataset, schema)
    if not dataset.metadata_path:
        raise MetadataGenerationError(
            'Could not find summary metadata file. The dataset will exist but you will need'
            ' to execute petastorm-generate-metadata before you can read your dataset '
            ' in order to generate the necessary metadata.'
            ' Try increasing spark driver memory next time and making sure you are'
            ' using parquet-mr >= 1.8.3')

    _cleanup_spark(spark, spark_config, row_group_size_mb)
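
A write-side sketch that mirrors the docstring example above. MySchema stands in for a Unischema defined elsewhere, and the output URL, module path, and the 'id' field are all hypothetical; the sketch assumes materialize_dataset is exposed as a contextlib context manager (as its docstring describes) and that dict_to_spark_row and as_spark_schema() are the standard petastorm.unischema helpers:

from pyspark.sql import SparkSession
from petastorm.unischema import dict_to_spark_row

from mypackage.schemas import MySchema  # hypothetical module holding a Unischema

DATASET_URL = 'file:///tmp/my_dataset'  # hypothetical output location

spark = SparkSession.builder.master('local[2]').appName('write-example').getOrCreate()
sc = spark.sparkContext

with materialize_dataset(spark, DATASET_URL, MySchema, row_group_size_mb=64):
    # Build rows as dicts keyed by the schema's field names, then convert to Spark rows.
    rows_rdd = sc.parallelize(range(10)) \
        .map(lambda i: {'id': i}) \
        .map(lambda d: dict_to_spark_row(MySchema, d))
    spark.createDataFrame(rows_rdd, MySchema.as_spark_schema()) \
        .write.mode('overwrite').parquet(DATASET_URL)

spark.stop()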
Example #25
    def __init__(self, dataset_url, schema, ngram, local_cache,
                 worker_predicate):
        """RowGroupLoader responsible for loading one rowgroup at a time. Rows returned are returned encoded.

        :param dataset_url: A url of a parquet dataset.
        :param schema: A unischema corresponding to the data in the dataset
        :param ngram: An instance of NGram if ngrams should be read or None, if each row in the dataset corresponds to
          a single sample returned.
        :param local_cache: An instance of a rowgroup cache (CacheBase interface) object to be used.
        :param worker_predicate: An instance of predicate (PredicateBase interface)
        """
        self._dataset_url_parsed = urlparse(dataset_url)
        self._schema = schema
        self._ngram = ngram
        self._local_cache = local_cache
        self._worker_predicate = worker_predicate

        resolver = FilesystemResolver(self._dataset_url_parsed)
        self._dataset = pq.ParquetDataset(resolver.get_dataset_path(),
                                          filesystem=resolver.filesystem(),
                                          validate_schema=False)
Example #26
def make_reader(dataset_url,
                schema_fields=None,
                reader_pool_type='thread',
                workers_count=10,
                pyarrow_serialize=False,
                results_queue_size=50,
                shuffle_row_groups=True,
                shuffle_row_drop_partitions=1,
                predicate=None,
                rowgroup_selector=None,
                num_epochs=1,
                cur_shard=None,
                shard_count=None,
                cache_type='null',
                cache_location=None,
                cache_size_limit=None,
                cache_row_size_estimate=None,
                cache_extra_settings=None,
                hdfs_driver='libhdfs3',
                reader_engine='reader_v1',
                reader_engine_params=None):
    """
    Creates an instance of Reader for reading Petastorm datasets. A Petastorm dataset is a dataset generated using
    :func:`~petastorm.etl.dataset_metadata.materialize_dataset` context manager as explained
    `here <https://petastorm.readthedocs.io/en/latest/readme_include.html#generating-a-dataset>`_.

    See :func:`~petastorm.make_batch_reader` to read from a Parquet store that was not generated using
    :func:`~petastorm.etl.dataset_metadata.materialize_dataset`.

    :param dataset_url: a file path or a url to a parquet directory,
        e.g. ``'hdfs://some_hdfs_cluster/user/yevgeni/parquet8'``, or ``'file:///tmp/mydataset'``
        or ``'s3://bucket/mydataset'``.
    :param schema_fields: Can be: a list of unischema fields and/or regex pattern strings; ``None`` to read all fields;
            an NGram object, then it will return an NGram of the specified fields.
    :param reader_pool_type: A string denoting the reader pool type. Should be one of ['thread', 'process', 'dummy']
        denoting a thread pool, process pool, or running everything in the master thread. Defaults to 'thread'
    :param workers_count: An int for the number of workers to use in the reader pool. This only is used for the
        thread or process pool. Defaults to 10
    :param pyarrow_serialize: Whether to use pyarrow for serialization. Currently only applicable to process pool.
        Defaults to False.
    :param results_queue_size: Size of the results queue to store prefetched rows. Currently only applicable to
        thread reader pool type.
    :param shuffle_row_groups: Whether to shuffle row groups (the order in which full row groups are read)
    :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
        break up a row group into for increased shuffling in exchange for worse performance (extra reads).
        For example if you specify 2 each row group read will drop half of the rows within every row group and
        read the remaining rows in separate reads. It is recommended to keep this number below the regular row
        group size in order to not waste reads which drop all rows.
    :param predicate: instance of :class:`.PredicateBase` object to filter rows to be returned by reader. The predicate
        will be passed a single row and must return a boolean value indicating whether to include it in the results.
    :param rowgroup_selector: instance of row group selector object to select row groups to be read
    :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to
        ``None`` will result in an infinite number of epochs.
    :param cur_shard: An int denoting the current shard number. Each node reading a shard should
        pass in a unique shard number in the range [0, shard_count). shard_count must be supplied as well.
        Defaults to None
    :param shard_count: An int denoting the number of shards to break this dataset into. Defaults to None
    :param cache_type: A string denoting the cache type, if desired. Options are [None, 'null', 'local-disk'] to
        either have a null/noop cache or a cache implemented using diskcache. Caching is useful when communication
        to the main data store is either slow or expensive and the local machine has large enough storage
        to store entire dataset (or a partition of a dataset if shard_count is used). By default will be a null cache.
    :param cache_location: A string denoting the location or path of the cache.
    :param cache_size_limit: An int specifying the size limit of the cache in bytes
    :param cache_row_size_estimate: An int specifying the estimated size of a row in the dataset
    :param cache_extra_settings: A dictionary of extra settings to pass to the cache implementation,
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :param reader_engine: Multiple engine implementations exist ('reader_v1' and 'experimental_reader_v2'). 'reader_v1'
        (the default value) selects a stable reader implementation.
    :param reader_engine_params: For advanced usage: a dictionary with arguments passed directly to a reader
        implementation constructor chosen by ``reader_engine`` argument. You should not use this parameter unless you
        are fine-tuning a reader.
    :return: A :class:`Reader` object
    """

    if dataset_url is None or not isinstance(dataset_url, six.string_types):
        raise ValueError("""dataset_url must be a string""")

    dataset_url = dataset_url[:-1] if dataset_url[-1] == '/' else dataset_url
    logger.debug('dataset_url: %s', dataset_url)

    resolver = FilesystemResolver(dataset_url, hdfs_driver=hdfs_driver)
    filesystem = resolver.filesystem()
    dataset_path = resolver.get_dataset_path()

    if cache_type is None or cache_type == 'null':
        cache = NullCache()
    elif cache_type == 'local-disk':
        cache = LocalDiskCache(cache_location, cache_size_limit,
                               cache_row_size_estimate, **cache_extra_settings
                               or {})
    else:
        raise ValueError('Unknown cache_type: {}'.format(cache_type))

    # Fail if this is a non-petastorm dataset. Typically, a Parquet store will have hundreds of thousands of rows in a single
    # rowgroup. Using PyDictReaderWorker or ReaderV2 implementation is very inefficient as it processes data on a
    # row by row basis. ArrowReaderWorker (used by make_batch_reader) is much more efficient in these cases.
    try:
        dataset_metadata.get_schema_from_dataset_url(dataset_url)
    except PetastormMetadataError:
        raise RuntimeError(
            'Currently make_reader supports reading only Petastorm datasets. '
            'To read from a non-Petastorm Parquet store use make_batch_reader')

    if reader_engine == 'reader_v1':
        if reader_pool_type == 'thread':
            reader_pool = ThreadPool(workers_count, results_queue_size)
        elif reader_pool_type == 'process':
            if pyarrow_serialize:
                serializer = PyArrowSerializer()
            else:
                serializer = PickleSerializer()
            reader_pool = ProcessPool(workers_count, serializer)
        elif reader_pool_type == 'dummy':
            reader_pool = DummyPool()
        else:
            raise ValueError(
                'Unknown reader_pool_type: {}'.format(reader_pool_type))

        # Create a dictionary with all Reader parameters, so we can merge with reader_engine_params if specified
        kwargs = {
            'schema_fields': schema_fields,
            'reader_pool': reader_pool,
            'shuffle_row_groups': shuffle_row_groups,
            'shuffle_row_drop_partitions': shuffle_row_drop_partitions,
            'predicate': predicate,
            'rowgroup_selector': rowgroup_selector,
            'num_epochs': num_epochs,
            'cur_shard': cur_shard,
            'shard_count': shard_count,
            'cache': cache,
        }

        if reader_engine_params:
            kwargs.update(reader_engine_params)

        try:
            return Reader(filesystem,
                          dataset_path,
                          worker_class=PyDictReaderWorker,
                          **kwargs)
        except PetastormMetadataError as e:
            logger.error('Unexpected exception: %s', str(e))
            raise RuntimeError(
                'make_reader has failed. If you were trying to open a Parquet store that was not '
                'created using Petastorm materialize_dataset and it contains only scalar columns, '
                'you may use make_batch_reader to read it.\n'
                'Inner exception: %s' % str(e))

    elif reader_engine == 'experimental_reader_v2':
        if reader_pool_type == 'thread':
            decoder_pool = ThreadPoolExecutor(workers_count)
        elif reader_pool_type == 'process':
            decoder_pool = ProcessPoolExecutor(workers_count)
        elif reader_pool_type == 'dummy':
            decoder_pool = SameThreadExecutor()
        else:
            raise ValueError(
                'Unknown reader_pool_type: {}'.format(reader_pool_type))

        # TODO(yevgeni): once ReaderV2 is ready to be out of experimental status, we should extend
        # the make_reader interfaces to take shuffling buffer parameters explicitly
        shuffling_queue = RandomShufflingBuffer(
            1000, 800) if shuffle_row_groups else NoopShufflingBuffer()

        # Create a dictionary with all ReaderV2 parameters, so we can merge with reader_engine_params if specified
        kwargs = {
            'schema_fields': schema_fields,
            'predicate': predicate,
            'rowgroup_selector': rowgroup_selector,
            'num_epochs': num_epochs,
            'cur_shard': cur_shard,
            'shard_count': shard_count,
            'cache': cache,
            'decoder_pool': decoder_pool,
            'shuffling_queue': shuffling_queue,
            'shuffle_row_groups': shuffle_row_groups,
            'shuffle_row_drop_partitions': shuffle_row_drop_partitions,
        }

        if reader_engine_params:
            kwargs.update(reader_engine_params)

        return ReaderV2(dataset_url, **kwargs)

    else:
        raise ValueError(
            'Unexpected value of reader_engine argument \'%s\'. '
            'Supported reader_engine values are \'reader_v1\' and \'experimental_reader_v2\''
            % reader_engine)
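
A minimal read loop for make_reader; the dataset URL is hypothetical, and the sketch assumes the returned Reader can be used as a context manager:

# Iterate over decoded rows of a Petastorm dataset (hypothetical URL).
with make_reader('file:///tmp/hello_world_dataset', num_epochs=1) as reader:
    for row in reader:
        # Each row is a namedtuple built from the dataset's Unischema fields.
        print(row)
        break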
Example #27
def make_batch_reader(dataset_url,
                      schema_fields=None,
                      reader_pool_type='thread',
                      workers_count=10,
                      shuffle_row_groups=True,
                      shuffle_row_drop_partitions=1,
                      predicate=None,
                      rowgroup_selector=None,
                      num_epochs=1,
                      cur_shard=None,
                      shard_count=None,
                      cache_type='null',
                      cache_location=None,
                      cache_size_limit=None,
                      cache_row_size_estimate=None,
                      cache_extra_settings=None,
                      hdfs_driver='libhdfs3'):
    """
    Creates an instance of Reader for reading batches out of a non-Petastorm Parquet store.

    Currently, only stores having native scalar parquet data types are supported.
    Use :func:`~petastorm.make_reader` to read Petastorm Parquet stores generated with
    :func:`~petastorm.etl.dataset_metadata.materialize_dataset`.

    NOTE: only scalar columns are currently supported.

    :param dataset_url: a file path or a url to a parquet directory,
        e.g. ``'hdfs://some_hdfs_cluster/user/yevgeni/parquet8'``, or ``'file:///tmp/mydataset'``
        or ``'s3://bucket/mydataset'``.
    :param schema_fields: A list of regex pattern strings. Only columns matching at least one of the
        patterns in the list will be loaded.
    :param reader_pool_type: A string denoting the reader pool type. Should be one of ['thread', 'process', 'dummy']
        denoting a thread pool, process pool, or running everything in the master thread. Defaults to 'thread'
    :param workers_count: An int for the number of workers to use in the reader pool. This only is used for the
        thread or process pool. Defaults to 10
    :param shuffle_row_groups: Whether to shuffle row groups (the order in which full row groups are read)
    :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
        break up a row group into for increased shuffling in exchange for worse performance (extra reads).
        For example if you specify 2 each row group read will drop half of the rows within every row group and
        read the remaining rows in separate reads. It is recommended to keep this number below the regular row
        group size in order to not waste reads which drop all rows.
    :param predicate: instance of :class:`.PredicateBase` object to filter rows to be returned by reader. The predicate
        will be passed a pandas DataFrame object and must return a pandas Series with boolean values of matching
        dimensions.
    :param rowgroup_selector: instance of row group selector object to select row groups to be read
    :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to
        ``None`` will result in an infinite number of epochs.
    :param cur_shard: An int denoting the current shard number. Each node reading a shard should
        pass in a unique shard number in the range [0, shard_count). shard_count must be supplied as well.
        Defaults to None
    :param shard_count: An int denoting the number of shards to break this dataset into. Defaults to None
    :param cache_type: A string denoting the cache type, if desired. Options are [None, 'null', 'local-disk'] to
        either have a null/noop cache or a cache implemented using diskcache. Caching is useful when communication
        to the main data store is either slow or expensive and the local machine has large enough storage
        to store entire dataset (or a partition of a dataset if shard_count is used). By default will be a null cache.
    :param cache_location: A string denoting the location or path of the cache.
    :param cache_size_limit: An int specifying the size limit of the cache in bytes
    :param cache_row_size_estimate: An int specifying the estimated size of a row in the dataset
    :param cache_extra_settings: A dictionary of extra settings to pass to the cache implementation,
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :return: A :class:`Reader` object
    """

    if dataset_url is None or not isinstance(dataset_url, six.string_types):
        raise ValueError("""dataset_url must be a string""")

    dataset_url = dataset_url[:-1] if dataset_url[-1] == '/' else dataset_url
    logger.debug('dataset_url: %s', dataset_url)

    resolver = FilesystemResolver(dataset_url, hdfs_driver=hdfs_driver)
    filesystem = resolver.filesystem()

    dataset_path = resolver.parsed_dataset_url().path

    if cache_type is None or cache_type == 'null':
        cache = NullCache()
    elif cache_type == 'local-disk':
        cache = LocalDiskArrowTableCache(cache_location, cache_size_limit,
                                         cache_row_size_estimate,
                                         **cache_extra_settings or {})
    else:
        raise ValueError('Unknown cache_type: {}'.format(cache_type))

    if reader_pool_type == 'thread':
        reader_pool = ThreadPool(workers_count)
    elif reader_pool_type == 'process':
        serializer = ArrowTableSerializer()
        reader_pool = ProcessPool(workers_count, serializer)
    elif reader_pool_type == 'dummy':
        reader_pool = DummyPool()
    else:
        raise ValueError(
            'Unknown reader_pool_type: {}'.format(reader_pool_type))

    return Reader(filesystem,
                  dataset_path,
                  schema_fields=schema_fields,
                  worker_class=ArrowReaderWorker,
                  reader_pool=reader_pool,
                  shuffle_row_groups=shuffle_row_groups,
                  shuffle_row_drop_partitions=shuffle_row_drop_partitions,
                  predicate=predicate,
                  rowgroup_selector=rowgroup_selector,
                  num_epochs=num_epochs,
                  cur_shard=cur_shard,
                  shard_count=shard_count,
                  cache=cache)
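
A corresponding sketch for make_batch_reader against a plain (non-Petastorm) Parquet store; the URL is hypothetical, and the assumption is that each yielded item is a namedtuple of column value arrays rather than a single row:

# Iterate over column batches of a plain Parquet store (hypothetical URL).
with make_batch_reader('file:///tmp/plain_parquet_store', num_epochs=1) as reader:
    for batch in reader:
        # Assumed shape: a namedtuple whose fields hold arrays of column values.
        print(type(batch))
        break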
Example #28
def materialize_dataset(spark,
                        dataset_url,
                        schema,
                        row_group_size_mb=None,
                        use_summary_metadata=False):
    """
    A Context Manager which handles all the initialization and finalization necessary
    to generate metadata for a petastorm dataset. This should be used around your
    spark logic to materialize a dataset (specifically the writing of parquet output).

    Note: Any rowgroup indexing should happen outside the materialize_dataset block

    Example:

    >>> spark = SparkSession.builder...
    >>> ds_url = 'hdfs:///path/to/my/dataset'
    >>> with materialize_dataset(spark, ds_url, MyUnischema, 64):
    >>>   spark.sparkContext.parallelize(range(0, 10)).
    >>>     ...
    >>>     .write.parquet(ds_url)
    >>> indexer = [SingleFieldIndexer(...)]
    >>> build_rowgroup_index(ds_url, spark.sparkContext, indexer)

    :param spark: The spark session you are using
    :param dataset_url: The dataset url to output your dataset to (e.g. ``hdfs:///path/to/dataset``)
    :param schema: The :class:`petastorm.unischema.Unischema` definition of your dataset
    :param row_group_size_mb: The parquet row group size to use for your dataset
    :param use_summary_metadata: Whether to use the parquet summary metadata for row group indexing or a custom
            indexing method. The custom indexing method is more scalable for very large datasets.
    """
    spark_config = {}
    _init_spark(spark, spark_config, row_group_size_mb, use_summary_metadata)
    yield

    # After job completes, add the unischema metadata and check for the metadata summary file
    resolver = FilesystemResolver(
        dataset_url, spark.sparkContext._jsc.hadoopConfiguration())
    dataset = pq.ParquetDataset(resolver.parsed_dataset_url().path,
                                filesystem=resolver.filesystem(),
                                validate_schema=False)

    _generate_unischema_metadata(dataset, schema)
    if not use_summary_metadata:
        _generate_num_row_groups_per_file(dataset, spark.sparkContext)

    # Reload the dataset to take into account the new metadata
    dataset = pq.ParquetDataset(resolver.parsed_dataset_url().path,
                                filesystem=resolver.filesystem(),
                                validate_schema=False)
    try:
        # Try to load the row groups, if it fails that means the metadata was not generated properly
        load_row_groups(dataset)
    except PetastormMetadataError:
        raise PetastormMetadataGenerationError(
            'Could not find summary metadata file. The dataset will exist but you will need'
            ' to execute petastorm-generate-metadata.py before you can read your dataset '
            ' in order to generate the necessary metadata.'
            ' Try increasing spark driver memory next time and making sure you are'
            ' using parquet-mr >= 1.8.3')

    _cleanup_spark(spark, spark_config, row_group_size_mb)
Example #29
    parser.add_argument('--print-values', action='store_true',
                        help='Print index values (dataset piece indexes)')
    parser.add_argument('--skip-index', nargs='+', type=str,
                        help='Do not display indexed values for the given fields')
    parser.add_argument('--hdfs-driver', type=str, default='libhdfs3',
                        help='A string denoting the hdfs driver to use (if using a dataset on hdfs). '
                             'Current choices are libhdfs (java through JNI) or libhdfs3 (C++)')

    args = parser.parse_args()

    if args.dataset_url and args.dataset_url[-1] == '/':
        args.dataset_url = args.dataset_url[:-1]

    # Create pyarrow file system
    resolver = FilesystemResolver(args.dataset_url, hdfs_driver=args.hdfs_driver)
    dataset = pq.ParquetDataset(resolver.get_dataset_path(), filesystem=resolver.filesystem(),
                                validate_schema=False)

    print_all = not args.schema and not args.index
    if args.schema or print_all:
        print('*** Schema from dataset metadata ***')
        print((dataset_metadata.get_schema(dataset)))

    if args.index or print_all:
        index_dict = rowgroup_indexing.get_row_group_indexes(dataset)
        print('*** Row group indexes from dataset metadata ***')
        for index_name in index_dict:
            print(('Index: {}'.format(index_name)))
            if args.skip_index is None or index_name not in args.skip_index:
                for field_value in index_dict[index_name].indexed_values:
                    print('  -- {}({})'.format(field_value,
Example #30
def copy_dataset(spark,
                 source_url,
                 target_url,
                 field_regex,
                 not_null_fields,
                 overwrite_output,
                 partitions_count,
                 row_group_size_mb,
                 hdfs_driver='libhdfs3'):
    """
    Creates a copy of a dataset. A new dataset will optionally contain a subset of columns. Rows that have NULL
    values in fields defined by the ``not_null_fields`` argument are filtered out.


    :param spark: An instance of ``SparkSession`` object
    :param source_url: A url of the dataset to be copied.
    :param target_url: A url specifying location of the target dataset.
    :param field_regex: A list of regex patterns. Only columns that match one of these patterns are copied to the new
      dataset.
    :param not_null_fields: A list of fields that must have non-NULL values in the target dataset.
    :param overwrite_output: If ``False`` and there is an existing path defined by ``target_url``, the operation will
      fail.
    :param partitions_count: If not ``None``, the dataset is repartitioned before write. Number of files in the target
      Parquet store is defined by this parameter.
    :param row_group_size_mb: The size of the rowgroup in the target dataset. Specified in megabytes.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :return: None
    """
    schema = get_schema_from_dataset_url(source_url, hdfs_driver=hdfs_driver)

    fields = match_unischema_fields(schema, field_regex)

    if field_regex and not fields:
        field_names = list(schema.fields.keys())
        raise ValueError(
            'Regular expressions (%s) do not match any fields (%s)'
            % (str(field_regex), str(field_names)))

    if fields:
        subschema = schema.create_schema_view(fields)
    else:
        subschema = schema

    resolver = FilesystemResolver(
        target_url,
        spark.sparkContext._jsc.hadoopConfiguration(),
        hdfs_driver=hdfs_driver)
    with materialize_dataset(spark,
                             target_url,
                             subschema,
                             row_group_size_mb,
                             pyarrow_filesystem=resolver.filesystem()):
        data_frame = spark.read \
            .parquet(source_url)

        if fields:
            data_frame = data_frame.select(*[f.name for f in fields])

        if not_null_fields:
            not_null_condition = reduce(operator.__and__,
                                        (data_frame[f].isNotNull()
                                         for f in not_null_fields))
            data_frame = data_frame.filter(not_null_condition)

        if partitions_count:
            data_frame = data_frame.repartition(partitions_count)

        data_frame.write \
            .mode('overwrite' if overwrite_output else 'error') \
            .option('compression', 'none') \
            .parquet(target_url)
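
A usage sketch for copy_dataset; the source and target URLs, the field regex, and the 'id' field are hypothetical, while the keyword arguments follow the signature shown above:

from pyspark.sql import SparkSession

# Hypothetical source and target locations.
SOURCE_URL = 'hdfs:///datasets/original'
TARGET_URL = 'hdfs:///datasets/copy_subset'

spark = SparkSession.builder.appName('copy-example').getOrCreate()

# Copy only columns whose names start with 'id', drop rows where 'id' is NULL,
# repartition into 10 output files and use 128 MB row groups.
copy_dataset(spark, SOURCE_URL, TARGET_URL,
             field_regex=['^id'],
             not_null_fields=['id'],
             overwrite_output=True,
             partitions_count=10,
             row_group_size_mb=128)

spark.stop()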