Example 1
    def test_hdfs_url_direct_namenode_driver_libhdfs(self):
        suj = FilesystemResolver('hdfs://{}/path'.format(HC.WARP_TURTLE_NN1),
                                 self._hadoop_configuration,
                                 connector=self.mock,
                                 hdfs_driver='libhdfs')
        self.assertEqual(MockHdfs, type(suj.filesystem()))

        # Make sure we did not capture FilesystemResolver in a closure by mistake
        dill.dumps(suj.filesystem_factory())
Example 2
    def test_s3_url(self):
        suj = FilesystemResolver('s3://bucket{}'.format(ABS_PATH),
                                 self._hadoop_configuration,
                                 connector=self.mock)
        self.assertTrue(isinstance(suj.filesystem(), S3FSWrapper))
        self.assertEqual('bucket', suj.parsed_dataset_url().netloc)
        self.assertEqual('bucket' + ABS_PATH, suj.get_dataset_path())

        # Make sure we did not capture FilesystemResolver in a closure by mistake
        dill.dumps(suj.filesystem_factory())
Example 3
    def test_file_url(self):
        """ Case 2: File path, agnostic to content of hadoop configuration."""
        suj = FilesystemResolver('file://{}'.format(ABS_PATH),
                                 self._hadoop_configuration,
                                 connector=self.mock)
        self.assertTrue(isinstance(suj.filesystem(), LocalFileSystem))
        self.assertEqual('', suj.parsed_dataset_url().netloc)
        self.assertEqual(ABS_PATH, suj.get_dataset_path())

        # Make sure we did not capture FilesystemResolver in a closure by mistake
        dill.dumps(suj.filesystem_factory())
Example 4
    def test_hdfs_url_with_nameservice(self):
        """ Case 3a: HDFS nameservice."""
        suj = FilesystemResolver(HC.WARP_TURTLE_PATH,
                                 self._hadoop_configuration,
                                 connector=self.mock)
        self.assertEqual(MockHdfs, type(suj.filesystem()._hdfs))
        self.assertEqual(HC.WARP_TURTLE, suj.parsed_dataset_url().netloc)
        self.assertEqual(1, self.mock.connect_attempted(HC.WARP_TURTLE_NN2))
        self.assertEqual(0, self.mock.connect_attempted(HC.WARP_TURTLE_NN1))
        self.assertEqual(0, self.mock.connect_attempted(HC.DEFAULT_NN))

        # Make sure we did not capture FilesystemResolver in a closure by mistake
        dill.dumps(suj.filesystem_factory())
Example 5
    def test_hdfs_url_direct_namenode(self):
        """ Case 4: direct namenode."""
        suj = FilesystemResolver('hdfs://{}/path'.format(HC.WARP_TURTLE_NN1),
                                 self._hadoop_configuration,
                                 connector=self.mock)
        self.assertEqual(MockHdfs, type(suj.filesystem()))
        self.assertEqual(HC.WARP_TURTLE_NN1, suj.parsed_dataset_url().netloc)
        self.assertEqual(0, self.mock.connect_attempted(HC.WARP_TURTLE_NN2))
        self.assertEqual(1, self.mock.connect_attempted(HC.WARP_TURTLE_NN1))
        self.assertEqual(0, self.mock.connect_attempted(HC.DEFAULT_NN))

        # Make sure we did not capture FilesystemResolver in a closure by mistake
        dill.dumps(suj.filesystem_factory())
Example 6
    def test_hdfs_url_no_nameservice(self):
        """ Case 3b: HDFS with no nameservice should connect to default namenode."""
        suj = FilesystemResolver('hdfs:///some/path',
                                 self._hadoop_configuration,
                                 connector=self.mock)
        self.assertEqual(MockHdfs, type(suj.filesystem()._hdfs))
        self.assertEqual(HC.WARP_TURTLE, suj.parsed_dataset_url().netloc)
        # ensure path is preserved in parsed URL
        self.assertEqual('/some/path', suj.get_dataset_path())
        self.assertEqual(1, self.mock.connect_attempted(HC.WARP_TURTLE_NN2))
        self.assertEqual(0, self.mock.connect_attempted(HC.WARP_TURTLE_NN1))
        self.assertEqual(0, self.mock.connect_attempted(HC.DEFAULT_NN))

        # Make sure we did not capture FilesystemResolver in a closure by mistake
        dill.dumps(suj.filesystem_factory())
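
The six tests above exercise FilesystemResolver's URL-resolution cases (HDFS nameservice, direct namenode, S3, local files). As a rough sketch of how the same API is typically used outside of a test, assuming the petastorm.fs_utils import path and a placeholder URL:

# Hedged usage sketch (not part of the test suite above).
import pyarrow.parquet as pq
from petastorm.fs_utils import FilesystemResolver

dataset_url = 'file:///tmp/some_dataset'  # hypothetical location
resolver = FilesystemResolver(dataset_url)  # pass a Hadoop configuration object for hdfs:// URLs
filesystem = resolver.filesystem()
dataset = pq.ParquetDataset(resolver.get_dataset_path(),
                            filesystem=filesystem,
                            validate_schema=False)

# filesystem_factory() returns a picklable zero-argument callable, which is why
# the tests above verify it can be serialized with dill.
filesystem_factory = resolver.filesystem_factory()
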
Example 7
@contextmanager  # from contextlib
def materialize_dataset(spark,
                        dataset_url,
                        schema,
                        row_group_size_mb=None,
                        use_summary_metadata=False,
                        filesystem_factory=None):
    """
    A Context Manager which handles all the initialization and finalization necessary
    to generate metadata for a petastorm dataset. This should be used around your
    spark logic to materialize a dataset (specifically the writing of parquet output).

    Note: Any rowgroup indexing should happen outside the materialize_dataset block

    Example:

    >>> spark = SparkSession.builder...
    >>> ds_url = 'hdfs:///path/to/my/dataset'
    >>> with materialize_dataset(spark, ds_url, MyUnischema, 64):
    >>>   spark.sparkContext.parallelize(range(0, 10)).
    >>>     ...
    >>>     .write.parquet(ds_url)
    >>> indexer = [SingleFieldIndexer(...)]
    >>> build_rowgroup_index(ds_url, spark.sparkContext, indexer)

    A user may provide their own recipe for creating a pyarrow filesystem object via the ``filesystem_factory``
    argument (otherwise, petastorm will create a default one based on the URL).

    The following example shows how a custom pyarrow HDFS filesystem, instantiated using the ``libhdfs`` driver, can be
    used during Petastorm dataset generation:

    >>> resolver=FilesystemResolver(dataset_url, spark.sparkContext._jsc.hadoopConfiguration(),
    >>>                             hdfs_driver='libhdfs')
    >>> with materialize_dataset(..., filesystem_factory=resolver.filesystem_factory()):
    >>>     ...


    :param spark: The spark session you are using
    :param dataset_url: The dataset url to output your dataset to (e.g. ``hdfs:///path/to/dataset``)
    :param schema: The :class:`petastorm.unischema.Unischema` definition of your dataset
    :param row_group_size_mb: The parquet row group size to use for your dataset
    :param use_summary_metadata: Whether to use the parquet summary metadata for row group indexing or a custom
      indexing method. The custom indexing method is more scalable for very large datasets.
    :param filesystem_factory: A filesystem factory function to be used when saving Petastorm specific metadata to the
      Parquet store.
    """
    spark_config = {}
    _init_spark(spark, spark_config, row_group_size_mb, use_summary_metadata)
    yield
    # After job completes, add the unischema metadata and check for the metadata summary file
    if filesystem_factory is None:
        resolver = FilesystemResolver(
            dataset_url,
            spark.sparkContext._jsc.hadoopConfiguration(),
            user=spark.sparkContext.sparkUser())
        filesystem_factory = resolver.filesystem_factory()
        dataset_path = resolver.get_dataset_path()
    else:
        dataset_path = get_dataset_path(urlparse(dataset_url))
    filesystem = filesystem_factory()

    dataset = pq.ParquetDataset(dataset_path,
                                filesystem=filesystem,
                                validate_schema=False)

    _generate_unischema_metadata(dataset, schema)
    if not use_summary_metadata:
        _generate_num_row_groups_per_file(dataset, spark.sparkContext,
                                          filesystem_factory)

    # Reload the dataset to take into account the new metadata
    dataset = pq.ParquetDataset(dataset_path,
                                filesystem=filesystem,
                                validate_schema=False)
    try:
        # Try to load the row groups, if it fails that means the metadata was not generated properly
        load_row_groups(dataset)
    except PetastormMetadataError:
        raise PetastormMetadataGenerationError(
            'Could not find summary metadata file. The dataset will exist but you will need'
            ' to execute petastorm-generate-metadata.py before you can read your dataset'
            ' in order to generate the necessary metadata.'
            ' Try increasing spark driver memory next time and making sure you are'
            ' using parquet-mr >= 1.8.3')

    _cleanup_spark(spark, spark_config, row_group_size_mb)
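
For context, a minimal end-to-end write with materialize_dataset might look like the sketch below; the schema, field names, output URL, and codec choices are illustrative assumptions rather than the library's prescribed recipe:

# Hedged sketch: write a tiny dataset inside a materialize_dataset block.
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
from petastorm.codecs import ScalarCodec, NdarrayCodec
from petastorm.unischema import Unischema, UnischemaField, dict_to_spark_row

HelloSchema = Unischema('HelloSchema', [
    UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
    UnischemaField('value', np.float32, (10,), NdarrayCodec(), False),
])

spark = SparkSession.builder.master('local[2]').getOrCreate()
ds_url = 'file:///tmp/hello_dataset'  # hypothetical output location

with materialize_dataset(spark, ds_url, HelloSchema, row_group_size_mb=64):
    rows = spark.sparkContext.parallelize(range(10)) \
        .map(lambda i: {'id': np.int32(i),
                        'value': np.random.rand(10).astype(np.float32)}) \
        .map(lambda d: dict_to_spark_row(HelloSchema, d))
    spark.createDataFrame(rows, HelloSchema.as_spark_schema()) \
        .coalesce(1) \
        .write.mode('overwrite').parquet(ds_url)
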
Example 8
def generate_petastorm_metadata(spark,
                                dataset_url,
                                unischema_class=None,
                                use_summary_metadata=False,
                                hdfs_driver='libhdfs3'):
    """
    Generates the metadata necessary to read a petastorm dataset and adds it to an existing dataset.

    :param spark: spark session
    :param dataset_url: url of existing dataset
    :param unischema_class: (optional) fully qualified dataset unischema class. If not specified, will attempt
        to find one already stored in the dataset (e.g.
        :class:`examples.hello_world.generate_hello_world_dataset.HelloWorldSchema`).
    :param use_summary_metadata: Whether to use the parquet summary metadata for row group indexing or a custom
        indexing method. The custom indexing method is more scalable for very large datasets.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    """
    sc = spark.sparkContext

    resolver = FilesystemResolver(dataset_url,
                                  sc._jsc.hadoopConfiguration(),
                                  hdfs_driver=hdfs_driver,
                                  user=spark.sparkContext.sparkUser())
    fs = resolver.filesystem()
    dataset = pq.ParquetDataset(resolver.get_dataset_path(),
                                filesystem=fs,
                                validate_schema=False)

    if unischema_class:
        schema = locate(unischema_class)
        if not isinstance(schema, Unischema):
            raise ValueError(
                'The specified class %s is not an instance of a petastorm.Unischema object.' %
                unischema_class)
    else:

        try:
            schema = get_schema(dataset)
        except ValueError:
            raise ValueError(
                'Unischema class could not be located in existing dataset,'
                ' please specify it')

    # In order to be backwards compatible, we retrieve the common metadata from the dataset before
    # overwriting the metadata to keep row group indexes and the old row group per file index
    arrow_metadata = dataset.common_metadata or None

    with materialize_dataset(spark,
                             dataset_url,
                             schema,
                             use_summary_metadata=use_summary_metadata,
                             filesystem_factory=resolver.filesystem_factory()):
        if use_summary_metadata:
            # Inside the materialize dataset context we just need to write the metadata file as the schema will
            # be written by the context manager.
            # We use the java ParquetOutputCommitter to write the metadata file for the existing dataset
            # which will read all the footers of the dataset in parallel and merge them.
            hadoop_config = sc._jsc.hadoopConfiguration()
            Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
            parquet_output_committer = sc._gateway.jvm.org.apache.parquet.hadoop.ParquetOutputCommitter
            parquet_output_committer.writeMetaDataFile(hadoop_config,
                                                       Path(dataset_url))

    spark.stop()

    if use_summary_metadata and arrow_metadata:
        # When calling writeMetaDataFile it will overwrite the _common_metadata file, which could have schema
        # information or row group indexers. Therefore we want to retain this information and will add it to the
        # new _common_metadata file. If we were using the old legacy metadata method, this file won't be deleted.
        base_schema = arrow_metadata.schema.to_arrow_schema()
        metadata_dict = base_schema.metadata
        if ROW_GROUPS_PER_FILE_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROW_GROUPS_PER_FILE_KEY,
                                    metadata_dict[ROW_GROUPS_PER_FILE_KEY])
        if ROWGROUPS_INDEX_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY,
                                    metadata_dict[ROWGROUPS_INDEX_KEY])
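
A possible invocation of generate_petastorm_metadata, sketched under the assumption that the dataset already exists and that its Unischema was stored at write time (the URL and Spark settings below are placeholders):

# Hedged sketch: regenerate Petastorm metadata for an existing dataset.
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName('petastorm-generate-metadata') \
    .getOrCreate()

# If the Unischema cannot be located inside the dataset, pass its fully
# qualified class name via unischema_class.
generate_petastorm_metadata(spark,
                            'hdfs:///path/to/existing/dataset',  # hypothetical URL
                            use_summary_metadata=False,
                            hdfs_driver='libhdfs3')
# Note: generate_petastorm_metadata stops the Spark session before returning.
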
Example 9
def _create_dataset(store, df, validation, compress_sparse, num_partitions,
                    num_workers, dataset_idx, parquet_row_group_size_mb,
                    verbose):
    train_data_path = store.get_train_data_path(dataset_idx)
    val_data_path = store.get_val_data_path(dataset_idx)
    if verbose >= 1:
        print('CEREBRO => Time: {}, Writing DataFrames'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
        print('CEREBRO => Time: {}, Train Data Path: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            train_data_path))
        print('CEREBRO => Time: {}, Val Data Path: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            val_data_path))

    schema_cols = df.columns

    if isinstance(validation, str):
        schema_cols.append(validation)
    df = df[schema_cols]

    metadata = None
    if _has_vector_column(df):
        if compress_sparse:
            metadata = _get_metadata(df)
        to_petastorm = to_petastorm_fn(schema_cols, metadata)
        df = df.rdd.map(to_petastorm).toDF()

    train_df, val_df, validation_ratio = _train_val_split(df, validation)

    unischema_fields = []
    metadata = _get_metadata(train_df)
    for k in metadata.keys():
        field_type = spark_to_petastorm_type(metadata[k]['spark_data_type'])
        shape = petastorm_unischema_shape(metadata[k]['shape'])
        codec = petastorm_unischema_codec(metadata[k]['shape'],
                                          metadata[k]['spark_data_type'])
        unischema_fields.append(UnischemaField(k, field_type, shape, codec, False))

    petastorm_schema = Unischema('petastorm_schema', unischema_fields)

    train_partitions = max(int(num_partitions * (1.0 - validation_ratio)),
                           num_workers)
    if verbose >= 1:
        print('CEREBRO => Time: {}, Train Partitions: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            train_partitions))

    spark = SparkSession.builder.getOrCreate()
    # FIXME pass hdfs_driver from user interface instead of hardcoded PETASTORM_HDFS_DRIVER
    train_resolver = FilesystemResolver(
        train_data_path,
        spark.sparkContext._jsc.hadoopConfiguration(),
        user=spark.sparkContext.sparkUser(),
        hdfs_driver=constants.PETASTORM_HDFS_DRIVER)
    with materialize_dataset(
            spark,
            train_data_path,
            petastorm_schema,
            parquet_row_group_size_mb,
            filesystem_factory=train_resolver.filesystem_factory()):
        train_rdd = train_df.rdd.map(lambda x: x.asDict()).map(
            lambda x: {k: np.array(x[k], dtype=spark_to_petastorm_type(metadata[k]['spark_data_type'])) for k in x}) \
            .map(lambda x: dict_to_spark_row(petastorm_schema, x))

        spark.createDataFrame(train_rdd, petastorm_schema.as_spark_schema()) \
            .coalesce(train_partitions) \
            .write \
            .mode('overwrite') \
            .parquet(train_data_path)

    if val_df:
        val_partitions = max(int(num_partitions * validation_ratio),
                             num_workers)
        if verbose >= 1:
            print('CEREBRO => Time: {}, Val Partitions: {}'.format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                val_partitions))
        val_resolver = FilesystemResolver(
            val_data_path,
            spark.sparkContext._jsc.hadoopConfiguration(),
            user=spark.sparkContext.sparkUser(),
            hdfs_driver=constants.PETASTORM_HDFS_DRIVER)
        with materialize_dataset(
                spark,
                val_data_path,
                petastorm_schema,
                parquet_row_group_size_mb,
                filesystem_factory=val_resolver.filesystem_factory()):
            val_rdd = val_df.rdd.map(lambda x: x.asDict()).map(
                lambda x: {k: np.array(x[k], dtype=spark_to_petastorm_type(metadata[k]['spark_data_type'])) for k in x}) \
                .map(lambda x: dict_to_spark_row(petastorm_schema, x))

            spark.createDataFrame(val_rdd, petastorm_schema.as_spark_schema()) \
                .coalesce(val_partitions) \
                .write \
                .mode('overwrite') \
                .parquet(val_data_path)

    train_rows, val_rows, pq_metadata, avg_row_size = get_simple_meta_from_parquet(
        store, df.columns, dataset_idx)

    if verbose:
        print('CEREBRO => Time: {}, Train Rows: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), train_rows))
    if val_df:
        if val_rows == 0:
            raise ValueError(
                'Validation DataFrame does not contain any samples with validation param {}'
                .format(validation))
        if verbose:
            print('CEREBRO => Time: {}, Val Rows: {}'.format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                val_rows))

    return train_rows, val_rows, pq_metadata, avg_row_size
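
The per-row mapping inside the materialize_dataset blocks above converts each Spark Row into numpy-typed values and then re-encodes it against the Unischema. A toy illustration of that conversion for a single row (field names, shapes, and types are hypothetical):

# Hedged toy illustration of the Row -> numpy dict -> Unischema-encoded Row
# conversion used above (hypothetical schema).
import numpy as np
from pyspark.sql.types import FloatType
from petastorm.codecs import ScalarCodec, NdarrayCodec
from petastorm.unischema import Unischema, UnischemaField, dict_to_spark_row

toy_schema = Unischema('toy_schema', [
    UnischemaField('features', np.float32, (3,), NdarrayCodec(), False),
    UnischemaField('label', np.float32, (), ScalarCodec(FloatType()), False),
])

row_dict = {'features': [0.1, 0.2, 0.3], 'label': 1.0}
# Cast every value to its numpy dtype, mirroring the .map() lambdas above.
typed = {k: np.array(v, dtype=np.float32) for k, v in row_dict.items()}
spark_row = dict_to_spark_row(toy_schema, typed)  # ready for createDataFrame
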
Example 10
def copy_dataset(spark,
                 source_url,
                 target_url,
                 field_regex,
                 not_null_fields,
                 overwrite_output,
                 partitions_count,
                 row_group_size_mb,
                 hdfs_driver='libhdfs3'):
    """
    Creates a copy of a dataset. The new dataset may optionally contain a subset of the columns. Rows that have NULL
    values in the fields named by the ``not_null_fields`` argument are filtered out.


    :param spark: An instance of ``SparkSession`` object
    :param source_url: A url of the dataset to be copied.
    :param target_url: A url specifying location of the target dataset.
    :param field_regex: A list of regex patterns. Only columns that match one of these patterns are copied to the new
      dataset.
    :param not_null_fields: A list of fields that must have non-NULL values in the target dataset.
    :param overwrite_output: If ``False`` and there is an existing path defined by ``target_url``, the operation will
      fail.
    :param partitions_count: If not ``None``, the dataset is repartitioned before write. Number of files in the target
      Parquet store is defined by this parameter.
    :param row_group_size_mb: The size of the rowgroup in the target dataset. Specified in megabytes.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :return: None
    """
    schema = get_schema_from_dataset_url(source_url, hdfs_driver=hdfs_driver)

    fields = match_unischema_fields(schema, field_regex)

    if field_regex and not fields:
        field_names = list(schema.fields.keys())
        raise ValueError(
            'Regular expressions (%s) do not match any fields (%s)' %
            (str(field_regex), str(field_names)))

    if fields:
        subschema = schema.create_schema_view(fields)
    else:
        subschema = schema

    resolver = FilesystemResolver(
        target_url,
        spark.sparkContext._jsc.hadoopConfiguration(),
        hdfs_driver=hdfs_driver)
    with materialize_dataset(spark,
                             target_url,
                             subschema,
                             row_group_size_mb,
                             filesystem_factory=resolver.filesystem_factory()):
        data_frame = spark.read \
            .parquet(source_url)

        if fields:
            data_frame = data_frame.select(*[f.name for f in fields])

        if not_null_fields:
            not_null_condition = reduce(operator.__and__,
                                        (data_frame[f].isNotNull()
                                         for f in not_null_fields))
            data_frame = data_frame.filter(not_null_condition)

        if partitions_count:
            data_frame = data_frame.repartition(partitions_count)

        data_frame.write \
            .mode('overwrite' if overwrite_output else 'error') \
            .option('compression', 'none') \
            .parquet(target_url)
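
A sketch of how copy_dataset might be invoked to produce a column-subset copy of an existing dataset (the URLs, regex patterns, and field names below are placeholders):

# Hedged sketch: copy selected columns into a new dataset, dropping rows
# where 'label' is NULL.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('copy-dataset').getOrCreate()
copy_dataset(spark,
             source_url='hdfs:///path/to/source_dataset',
             target_url='hdfs:///path/to/target_dataset',
             field_regex=['^id$', '^label$'],
             not_null_fields=['label'],
             overwrite_output=False,
             partitions_count=None,
             row_group_size_mb=128,
             hdfs_driver='libhdfs3')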