Example 1
def test_serialize_filesystem_factory(tmpdir):
    SimpleSchema = Unischema('SimpleSchema', [
        UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('foo', np.int32, (), ScalarCodec(IntegerType()), False),
    ])

    class BogusFS(pyarrow.LocalFileSystem):
        def __getstate__(self):
            raise RuntimeError("can not serialize")

    rows_count = 10
    output_url = "file://{0}/fs_factory_test".format(tmpdir)
    rowgroup_size_mb = 256
    spark = SparkSession.builder.config('spark.driver.memory',
                                        '2g').master('local[2]').getOrCreate()
    sc = spark.sparkContext
    with materialize_dataset(spark,
                             output_url,
                             SimpleSchema,
                             rowgroup_size_mb,
                             filesystem_factory=BogusFS):
        rows_rdd = sc.parallelize(range(rows_count))\
            .map(lambda x: {'id': x, 'foo': x})\
            .map(lambda x: dict_to_spark_row(SimpleSchema, x))

        spark.createDataFrame(rows_rdd, SimpleSchema.as_spark_schema()) \
            .write \
            .parquet(output_url)
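For reference, a dataset written this way can be read back with petastorm's make_reader; a minimal sketch (the URL is a placeholder for the tmpdir-based output_url built in the test):

from petastorm import make_reader

# Placeholder URL: substitute the "file://{tmpdir}/fs_factory_test" URL used above.
with make_reader('file:///tmp/fs_factory_test') as reader:
    for row in reader:
        print(row.id, row.foo)  # SimpleSchema fields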
Example 2
    def write(self, table: DataFrameMetadata, rows: Batch):
        """
        Write rows into the dataframe.

        Arguments:
            table: table metadata object to write into
            rows : batch to be persisted in the storage.
        """

        if rows.empty():
            return
        # ToDo
        # Throw an error if the row schema doesn't match the table schema

        with materialize_dataset(self.spark_session, self._spark_url(table),
                                 table.schema.petastorm_schema):

            records = rows.frames
            columns = records.keys()
            rows_rdd = self.spark_context.parallelize(records.values) \
                .map(lambda x: dict(zip(columns, x))) \
                .map(lambda x: dict_to_spark_row(table.schema.petastorm_schema,
                                                 x))
            self.spark_session.createDataFrame(rows_rdd,
                                               table.schema.pyspark_schema) \
                .coalesce(1) \
                .write \
                .mode('append') \
                .parquet(self._spark_url(table))
Example 3
    def write_featureframe(self):
        """
        Writes a dataframe of data as a training dataset on HDFS in the petastorm format

        Returns:
            None

        Raises:
              :ValueError: if no petastorm schema was provided
        """
        spark = util._find_spark()
        if constants.PETASTORM_CONFIG.SCHEMA in self.petastorm_args:
            schema = self.petastorm_args[constants.PETASTORM_CONFIG.SCHEMA]
            del self.petastorm_args[constants.PETASTORM_CONFIG.SCHEMA]
        else:
            raise ValueError(
                "Required petastorm argument 'schema' is not defined in petastorm_args dict"
            )
        if constants.PETASTORM_CONFIG.FILESYSTEM_FACTORY in self.petastorm_args:
            filesystem_factory = self.petastorm_args[
                constants.PETASTORM_CONFIG.FILESYSTEM_FACTORY]
            del self.petastorm_args[
                constants.PETASTORM_CONFIG.FILESYSTEM_FACTORY]
        else:
            filesystem_factory = lambda: pa.hdfs.connect(
                driver=constants.PETASTORM_CONFIG.LIBHDFS)
        with materialize_dataset(spark,
                                 self.path,
                                 schema,
                                 filesystem_factory=filesystem_factory,
                                 **self.petastorm_args):
            self.df.write.mode(self.write_mode).parquet(self.path)
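When the training dataset is written to a local file:// path rather than HDFS, the pa.hdfs.connect default can be bypassed by supplying a filesystem factory through petastorm_args. A hedged sketch; the key names are assumptions about what constants.PETASTORM_CONFIG resolves to ('schema' matches the error message above, 'filesystem_factory' is inferred from the constant name), and the schema itself is a hypothetical example:

import numpy as np
import pyarrow
from petastorm.codecs import ScalarCodec
from petastorm.unischema import Unischema, UnischemaField
from pyspark.sql.types import IntegerType

# Hypothetical feature-frame schema; in practice this would describe the real feature columns.
MyUnischema = Unischema('MyUnischema', [
    UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
])

# Key names are assumptions; the factory returns the local filesystem instead of an HDFS connection.
petastorm_args = {
    'schema': MyUnischema,
    'filesystem_factory': lambda: pyarrow.LocalFileSystem(),
}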
Example 4
def mnist_data_to_petastorm_dataset(download_dir, output_url, spark_master=None, parquet_files_count=1,
                                    mnist_data=None):
    """Converts a directory with MNIST data into a petastorm dataset.

    Data files are as specified in http://yann.lecun.com/exdb/mnist/:
        * train-images-idx3-ubyte.gz:  training set images (9912422 bytes)
        * train-labels-idx1-ubyte.gz:  training set labels (28881 bytes)
        * t10k-images-idx3-ubyte.gz:   test set images (1648877 bytes)
        * t10k-labels-idx1-ubyte.gz:   test set labels (4542 bytes)

    The images and labels are stored in the IDX file format for vectors and multidimensional matrices of
    various numerical types, as defined in the same URL.

    :param download_dir: the path to where the MNIST data will be downloaded.
    :param output_url: the location where your dataset will be written to. Should be a url: either
      file://... or hdfs://...
    :param spark_master: A master parameter used by spark session builder. Use default value (None) to use system
      environment configured spark cluster. Use 'local[*]' to run on a local box.
    :param parquet_files_count: the number of Parquet files the output dataset will be coalesced into.
    :param mnist_data: A dictionary of MNIST data, with the name of the dataset as key and the dataset object as value;
      if None is supplied, the data is downloaded.
    :return: None
    """
    session_builder = SparkSession \
        .builder \
        .appName('MNIST Dataset Creation')
    if spark_master:
        session_builder.master(spark_master)

    spark = session_builder.getOrCreate()

    # Get training and test data
    if mnist_data is None:
        mnist_data = {
            'train': download_mnist_data(download_dir, train=True),
            'test': download_mnist_data(download_dir, train=False)
        }

    # The MNIST data is small enough to do everything here in Python
    for dset, data in mnist_data.items():
        dset_output_url = '{}/{}'.format(output_url, dset)
        with materialize_dataset(spark, dset_output_url, MnistSchema):
            # List of [(idx, image, digit), ...]
            # where image is shaped as a 28x28 numpy matrix
            idx_image_digit_list = map(lambda idx_image_digit: {
                MnistSchema.idx.name: idx_image_digit[0],
                MnistSchema.digit.name: idx_image_digit[1][1],
                MnistSchema.image.name: np.array(list(idx_image_digit[1][0].getdata()), dtype=np.uint8).reshape(28, 28)
                }, enumerate(data))

            # Convert to pyspark.sql.Row
            sql_rows = map(lambda r: dict_to_spark_row(MnistSchema, r), idx_image_digit_list)

            # Write out the result
            spark.createDataFrame(sql_rows, MnistSchema.as_spark_schema()) \
                .coalesce(parquet_files_count) \
                .write \
                .option('compression', 'none') \
                .parquet(dset_output_url)
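Once written, each split can be consumed directly from Python; a hedged sketch using petastorm's PyTorch DataLoader (the URL is a placeholder for '<output_url>/train'):

from petastorm import make_reader
from petastorm.pytorch import DataLoader

# Placeholder URL: substitute the dset_output_url produced above for the 'train' split.
with DataLoader(make_reader('file:///tmp/mnist_petastorm/train'), batch_size=64) as loader:
    for batch in loader:
        images, digits = batch['image'], batch['digit']  # MnistSchema field names
        break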
Example 5
def generate_petastorm_metadata(spark, dataset_url, unischema_class=None, use_summary_metadata=False):
    """
    Generates the metadata necessary to read a petastorm dataset and adds it to an existing dataset.

    :param spark: spark session
    :param dataset_url: url of existing dataset
    :param unischema_class: (optional) fully qualified dataset unischema class. If not specified will attempt
        to find one already in the dataset. (e.g.
        :class:`examples.hello_world.generate_hello_world_dataset.HelloWorldSchema`)
    """
    sc = spark.sparkContext

    resolver = FilesystemResolver(dataset_url, sc._jsc.hadoopConfiguration())
    dataset = pq.ParquetDataset(
        resolver.get_dataset_path(),
        filesystem=resolver.filesystem(),
        validate_schema=False)

    if unischema_class:
        schema = locate(unischema_class)
        if not isinstance(schema, Unischema):
            raise ValueError('The specified class %s is not an instance of a petastorm.Unischema object.'
                             % unischema_class)
    else:

        try:
            schema = get_schema(dataset)
        except ValueError:
            raise ValueError('Unischema class could not be located in existing dataset,'
                             ' please specify it')

    # In order to be backwards compatible, we retrieve the common metadata from the dataset before
    # overwriting the metadata to keep row group indexes and the old row group per file index
    arrow_metadata = dataset.common_metadata or None

    with materialize_dataset(spark, dataset_url, schema, use_summary_metadata=use_summary_metadata):
        if use_summary_metadata:
            # Inside the materialize dataset context we just need to write the metadata file as the schema will
            # be written by the context manager.
            # We use the java ParquetOutputCommitter to write the metadata file for the existing dataset
            # which will read all the footers of the dataset in parallel and merge them.
            hadoop_config = sc._jsc.hadoopConfiguration()
            Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
            parquet_output_committer = sc._gateway.jvm.org.apache.parquet.hadoop.ParquetOutputCommitter
            parquet_output_committer.writeMetaDataFile(hadoop_config, Path(dataset_url))

    spark.stop()

    if use_summary_metadata and arrow_metadata:
        # When calling writeMetaDataFile it will overwrite the _common_metadata file which could have schema information
        # or row group indexers. Therefore we want to retain this information and will add it to the new
        # _common_metadata file. If we were using the old legacy metadata method this file won't be deleted
        base_schema = arrow_metadata.schema.to_arrow_schema()
        metadata_dict = base_schema.metadata
        if ROW_GROUPS_PER_FILE_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROW_GROUPS_PER_FILE_KEY, metadata_dict[ROW_GROUPS_PER_FILE_KEY])
        if ROWGROUPS_INDEX_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY, metadata_dict[ROWGROUPS_INDEX_KEY])
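A hypothetical invocation of the function above; the dataset URL and the schema class path are placeholders:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('regenerate-metadata').getOrCreate()
generate_petastorm_metadata(
    spark,
    'hdfs:///datasets/existing_dataset',
    unischema_class='examples.hello_world.generate_hello_world_dataset.HelloWorldSchema',
    use_summary_metadata=False)
# Note that the function stops the SparkSession itself before returning.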
Example 6
def generate_parquet(feature_path, mask_path, output_path):
    """[summary]
    Generate parquet file with two columns
        - First column: npG_array representing image
        - Second column: np_array representing mask

    Arguments:
        feature_path {[type]} -- path to all images
        mask_path {[type]} -- path to masks of images
        output_path {[type]} -- parquet path
    """

    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession, Row
    from pyspark.sql.types import _infer_schema
    from pyspark.sql.functions import monotonically_increasing_id

    rowgroup_size_mb = 256
    spark_conf = SparkConf().setAppName('Image preprocess')
    sc = SparkContext(conf=spark_conf)
    session = SparkSession(sc)

    # Load images, pair each with an index and convert them to a dataframe
    images_rdd = sc.binaryFiles(feature_path).values().zipWithIndex()
    image_flat_numpy_rdd = images_rdd.map(lambda pair_raw_image_id: (raw_image_to_numpy_array(pair_raw_image_id[0]), pair_raw_image_id[1])) \
                                     .map(lambda pair_np_array_id: {'features': pair_np_array_id[0], 'id': pair_np_array_id[1]}) \
                                     .map(lambda x: dict_to_spark_row(FeatureSchema, x))

    image_df = session.createDataFrame(image_flat_numpy_rdd,
                                       FeatureSchema.as_spark_schema())
    # .withColumn("id", monotonically_increasing_id()) # Generate table row id
    # Load masks and convert them to a dataframe
    mask_rdd = sc.binaryFiles(mask_path).values().zipWithIndex()

    # Convert mask rgb value to 0 for not building and 1 for building
    mask_flat_numpy_rdd = mask_rdd.map(lambda pair_raw_image_id: (raw_image_to_numpy_array(pair_raw_image_id[0]), pair_raw_image_id[1])) \
                                           .map(lambda pair_np_array_id: ((pair_np_array_id[0] / 255).astype(np.uint8), pair_np_array_id[1])) \
                                           .map(lambda pair_std_np_array_id: {'masks': pair_std_np_array_id[0], 'id': pair_std_np_array_id[1]}) \
                                           .map(lambda x: dict_to_spark_row(MaskSchema, x))

    mask_df = session.createDataFrame(mask_flat_numpy_rdd,
                                      MaskSchema.as_spark_schema())
    #.withColumn("id", monotonically_increasing_id()) # Generate table row id
    mask_df.show(5, False)
    # Concat image_df and mask_df row by row
    train_df = image_df.join(mask_df, "id", "inner").drop('id')

    #print("Summary =>>>>>>>>>>>>>>>>>>>>>>>....>>>")
    #print("Image count {} , mask count {}, train_count {}".format(image_df.count(), mask_df.count(), train_df.count()))
    #print("=======================================")
    with materialize_dataset(session, output_path, TrainSchema,
                             rowgroup_size_mb):
        train_df.write \
                 .mode('overwrite') \
                 .parquet(output_path)
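raw_image_to_numpy_array is referenced above but not shown; a plausible sketch, assuming the images decode with Pillow into fixed-size uint8 arrays matching FeatureSchema:

import io

import numpy as np
from PIL import Image


def raw_image_to_numpy_array(raw_bytes):
    """Decode raw image bytes into a numpy uint8 array (hypothetical implementation)."""
    with Image.open(io.BytesIO(raw_bytes)) as img:
        return np.asarray(img, dtype=np.uint8)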
Example 7
def create_test_dataset(tmp_url, rows, num_files=2, spark=None):
    """
    Creates a test dataset with TestSchema under tmp_url, split across num_files files.
    :param tmp_url: The URL of the temp directory to store the test dataset in.
    :param rows: An iterable of row ids to generate; each id is expanded into a random row.
    :param num_files: The number of files to partition the data between.
    :param spark: An optional spark session to use
    :return: A list of the dataset dictionary.
    """

    shutdown = False
    if not spark:
        spark_session = SparkSession \
            .builder \
            .appName('petastorm_end_to_end_test') \
            .master('local[8]')

        spark = spark_session.getOrCreate()
        shutdown = True
    spark_context = spark.sparkContext

    with materialize_dataset(spark, tmp_url, TestSchema):
        id_rdd = spark_context.parallelize(rows, numSlices=40)

        # Make up some random data and store it for referencing in the tests
        random_dicts_rdd = id_rdd.map(_randomize_row).cache()
        dataset_dicts = random_dicts_rdd.collect()

        random_rows_rdd = random_dicts_rdd.map(
            partial(dict_to_spark_row, TestSchema))

        # Create a spark dataframe with the random rows
        dataframe = spark. \
            createDataFrame(random_rows_rdd, TestSchema.as_spark_schema()).sort('id')

        # Save a parquet
        dataframe. \
            coalesce(num_files). \
            write.option('compression', 'none'). \
            partitionBy('partition_key'). \
            mode('overwrite'). \
            parquet(tmp_url)

    # Create list of objects to build row group indexes
    indexers = [
        SingleFieldIndexer(TestSchema.id.name, TestSchema.id.name),
        SingleFieldIndexer(TestSchema.sensor_name.name,
                           TestSchema.sensor_name.name),
        SingleFieldIndexer(TestSchema.string_array_nullable.name,
                           TestSchema.string_array_nullable.name),
    ]
    build_rowgroup_index(tmp_url, spark_context, indexers)

    if shutdown:
        spark.stop()

    return dataset_dicts
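A hypothetical usage of the helper above: write 100 randomized rows to a local URL and keep the returned dictionaries as reference data for assertions:

reference_data = create_test_dataset('file:///tmp/petastorm_e2e_test', range(100))
assert len(reference_data) == 100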
Example 8
def save_parquet_for_petastorm_parquet(spark: SparkSession, df: pyspark.sql.DataFrame, output_path: str,
                                       schema: Unischema):
    output_path = Path(output_path).absolute().as_uri()
    with materialize_dataset(spark, output_path, schema, row_group_size_mb=256):
        (
            df
                .write
                .mode('overwrite')
                .parquet(output_path)
        )
Example 9
def copy_dataset(spark, source_url, target_url, field_regex, not_null_fields,
                 overwrite_output, partitions_count, row_group_size_mb):
    """Creates a copy of a dataset. A new dataset will optionally contain a subset of columns. Rows that have NULL
    values in fields defined by ``not_null_fields`` argument are filtered out.


    :param spark: An instance of ``SparkSession`` object
    :param source_url: A url of the dataset to be copied.
    :param target_url: A url specifying location of the target dataset.
    :param field_regex: A list of regex patterns. Only columns that match one of these patterns are copied to the new
      dataset.
    :param not_null_fields: A list of fields that must have non-NULL values in the target dataset.
    :param overwrite_output: If ``False`` and there is an existing path defined by ``target_url``, the operation will
      fail.
    :param partitions_count: If not ``None``, the dataset is repartitioned before write. Number of files in the target
      Parquet store is defined by this parameter.
    :param row_group_size_mb: The size of the rowgroup in the target dataset. Specified in megabytes.
    :return: None
    """
    schema = get_schema_from_dataset_url(source_url)

    fields = match_unischema_fields(schema, field_regex)

    if field_regex and not fields:
        field_names = list(schema.fields.keys())
        raise ValueError(
            'Regular expressions (%s) do not match any fields (%s)'
            % (str(field_regex), str(field_names)))

    if fields:
        subschema = schema.create_schema_view(fields)
    else:
        subschema = schema

    with materialize_dataset(spark, target_url, subschema, row_group_size_mb):
        data_frame = spark.read \
            .parquet(source_url)

        if fields:
            data_frame = data_frame.select(*[f.name for f in fields])

        if not_null_fields:
            not_null_condition = reduce(operator.__and__,
                                        (data_frame[f].isNotNull()
                                         for f in not_null_fields))
            data_frame = data_frame.filter(not_null_condition)

        if partitions_count:
            data_frame = data_frame.repartition(partitions_count)

        data_frame.write \
            .mode('overwrite' if overwrite_output else 'error') \
            .option('compression', 'none') \
            .parquet(target_url)
Example 10
def generate_petastorm_metadata(spark, dataset_url, unischema_class=None):
    """
    Generate the metadata necessary to read a petastorm dataset and add it to an existing dataset.
    :param spark: spark session
    :param dataset_url: url of existing dataset
    :param unischema_class: (optional) fully qualified dataset unischema class. If not specified will attempt
        to find one already in the dataset. (e.g. examples.hello_world.hello_world_dataset.HelloWorldSchema)
    :return:
    """
    sc = spark.sparkContext

    resolver = FilesystemResolver(dataset_url, sc._jsc.hadoopConfiguration())
    dataset = pq.ParquetDataset(resolver.parsed_dataset_url().path,
                                filesystem=resolver.filesystem(),
                                validate_schema=False)

    if unischema_class:
        schema = locate(unischema_class)
    else:

        try:
            schema = get_schema(dataset)
        except ValueError:
            raise ValueError(
                'Unischema class could not be located in existing dataset,'
                ' please specify it')

    # In order to be backwards compatible, we retrieve the common metadata from the dataset before
    # overwriting the metadata to keep row group indexes and the old row group per file index
    arrow_metadata = dataset.common_metadata or None

    with materialize_dataset(spark, dataset_url, schema):
        # Inside the materialize dataset context we just need to write the metadata file as the schema will
        # be written by the context manager.
        # We use the java ParquetOutputCommitter to write the metadata file for the existing dataset
        # which will read all the footers of the dataset in parallel and merge them.
        hadoop_config = sc._jsc.hadoopConfiguration()
        Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
        parquet_output_committer = sc._gateway.jvm.org.apache.parquet.hadoop.ParquetOutputCommitter
        parquet_output_committer.writeMetaDataFile(hadoop_config,
                                                   Path(dataset_url))

    if arrow_metadata:
        # If the old row-groups-per-file key or the row-group index key was present, add it to the new dataset metadata
        base_schema = arrow_metadata.schema.to_arrow_schema()
        metadata_dict = base_schema.metadata
        if ROW_GROUPS_PER_FILE_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROW_GROUPS_PER_FILE_KEY,
                                    metadata_dict[ROW_GROUPS_PER_FILE_KEY])
        if ROWGROUPS_INDEX_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY,
                                    metadata_dict[ROWGROUPS_INDEX_KEY])
Example 11
def generate_parquet(feature_path, mask_path, output_path):
    """[summary]
    Generate parquet file with two columns
        - First column: np_array representing image
        - Second column: np_array representing mask

    Arguments:
        feature_path {[type]} -- path to all images
        mask_path {[type]} -- path to masks of images
        output_path {[type]} -- parquet path
    """

    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession, Row
    from pyspark.sql.types import _infer_schema
    from pyspark.sql.functions import monotonically_increasing_id

    rowgroup_size_mb = 256
    spark_conf = SparkConf().setAppName('Image preprocess')
    sc = SparkContext(conf=spark_conf)
    session = SparkSession(sc)

    # Load images and convert them to a dataframe
    images_rdd = sc.binaryFiles(feature_path)
    image_flat_numpy_rdd = images_rdd.values().map(raw_image_to_numpy_array) \
                                            .map(lambda x: {'features': x}) \
                                            .map(lambda x: dict_to_spark_row(FeatureSchema, x))
    image_df = session.createDataFrame(image_flat_numpy_rdd, FeatureSchema.as_spark_schema()) \
                        .withColumn("id", monotonically_increasing_id()) # Generate table row id

    # Load masks and convert them to a dataframe
    mask_rdd = sc.binaryFiles(mask_path)
    mask_flat_numpy_rdd = mask_rdd.values().map(raw_image_to_numpy_array) \
                                           .map(lambda image_np_array: (image_np_array / 255).astype(np.uint8)) \
                                           .map(lambda x: {'masks': x}) \
                                           .map(lambda x: dict_to_spark_row(MaskSchema, x))

    mask_df = session.createDataFrame(mask_flat_numpy_rdd, MaskSchema.as_spark_schema()) \
                        .withColumn("id", monotonically_increasing_id()) # Generate table row id

    # Concat image_df and mask_df row by row
    train_df = image_df.join(mask_df, "id", "outer").drop("id")
    with materialize_dataset(session, output_path, TrainSchema,
                             rowgroup_size_mb):
        train_df.write \
                .mode('overwrite') \
                .parquet(output_path)
Example 12
    def create(self, table: DataFrameMetadata):
        """
        Create an empty dataframe in petastorm.
        """
        empty_rdd = self.spark_context.emptyRDD()

        with materialize_dataset(self.spark_session, self._spark_url(table),
                                 table.schema.petastorm_schema):

            self.spark_session.createDataFrame(empty_rdd,
                                               table.schema.pyspark_schema) \
                .coalesce(1) \
                .write \
                .mode('overwrite') \
                .parquet(self._spark_url(table))
Example 13
def test_predicate_on_partitioned_dataset(tmpdir):
    """
    Generates a partitioned dataset and ensures that readers evaluate the type of the partition
    column according to the type given in the Unischema.
    """
    TestSchema = Unischema('TestSchema', [
        UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('id2', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('test_field', np.int32,
                       (), ScalarCodec(IntegerType()), False),
    ])

    def test_row_generator(x):
        """Returns a single entry in the generated dataset."""
        return {'id': x, 'id2': x + 1, 'test_field': x * x}

    rowgroup_size_mb = 256
    dataset_url = "file://{0}/partitioned_test_dataset".format(tmpdir)

    spark = SparkSession.builder.config('spark.driver.memory',
                                        '2g').master('local[2]').getOrCreate()
    sc = spark.sparkContext

    rows_count = 10
    with materialize_dataset(spark, dataset_url, TestSchema, rowgroup_size_mb):

        rows_rdd = sc.parallelize(range(rows_count))\
            .map(test_row_generator)\
            .map(lambda x: dict_to_spark_row(TestSchema, x))

        spark.createDataFrame(rows_rdd, TestSchema.as_spark_schema()) \
            .write \
            .partitionBy('id', 'id2') \
            .parquet(dataset_url)

    with make_reader(dataset_url,
                     predicate=in_lambda(['id'], lambda x: x == 3)) as reader:
        assert next(reader).id == 3
    with make_reader(dataset_url,
                     predicate=in_lambda(['id2'], lambda x: x == 5)) as reader:
        assert next(reader).id == 4  # id2 == 5 corresponds to the row with id == 4
    with make_reader(dataset_url,
                     predicate=in_lambda(['id'],
                                         lambda x: x == '3')) as reader:
        with pytest.raises(StopIteration):
            # Predicate should have selected none, so a StopIteration should be raised.
            next(reader)
Example 14
def generate_dataset(data_directory,
                     sensor,
                     output_url,
                     year=2018,
                     max_files=100000,
                     dayofyear=None):
    """
    Write L1b patches to petastorm database for training
    Args:
        data_directory: directory of L1b data
        sensor: Select sensor from (G16,G17,H8)
        output_url: Directory to write petastorm database (file:///...)
        year: Integer (depending on directory, 2017-2020)
        max_files: Maximum number of files to iterate over
        dayofyear: 1-366
    Returns:
        None
    """
    rowgroup_size_mb = 256

    spark = SparkSession.builder.config('spark.driver.memory',
                                        '2g').master('local[4]').getOrCreate()
    sc = spark.sparkContext

    geo = geonexl1b.GeoNEXL1b(data_directory=data_directory, sensor=sensor)
    tiles = geo.tiles()
    files = geo.files(year=year, dayofyear=dayofyear)
    files['v'] = files['tile'].map(lambda t: int(t[4:6]))
    files['h'] = files['tile'].map(lambda t: int(t[1:3]))

    idxs = np.random.randint(0, files.shape[0], max_files)
    files = files.iloc[idxs]
    files = files.reset_index()

    with materialize_dataset(spark, output_url, L1bSchema, rowgroup_size_mb):
        filerdd = spark.createDataFrame(files)\
             .select("year", "dayofyear", "hour", "minute", "v", "h", "file")\
             .rdd.map(tuple)\
             .flatMap(sample_generator)\
             .map(lambda x: dict_to_spark_row(L1bSchema, x))

        spark.createDataFrame(filerdd, L1bSchema.as_spark_schema())\
            .coalesce(50) \
            .write \
            .mode('overwrite') \
            .parquet(output_url)
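A hedged sketch for consuming the dataset written above from TensorFlow; the URL is a placeholder for output_url:

from petastorm import make_reader
from petastorm.tf_utils import make_petastorm_dataset

with make_reader('file:///tmp/l1b_patches') as reader:
    dataset = make_petastorm_dataset(reader)
    for sample in dataset.take(1):
        # Each element is a namedtuple of tensors keyed by the L1bSchema field names.
        print({name: tensor.shape for name, tensor in sample._asdict().items()})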
Example 15
def test_pass_in_pyarrow_filesystem_to_materialize_dataset(synthetic_dataset, tmpdir):
    a_moved_path = tmpdir.join('moved').strpath
    copytree(synthetic_dataset.path, a_moved_path)

    local_fs = pyarrow.LocalFileSystem
    os.remove(a_moved_path + '/_common_metadata')

    spark = SparkSession.builder.getOrCreate()

    with materialize_dataset(spark, a_moved_path, TestSchema, filesystem_factory=local_fs):
        pass

    with make_reader('file://{}'.format(a_moved_path), reader_pool_type='dummy') as reader:
        _check_simple_reader(reader, synthetic_dataset.data)

    spark.stop()
    rmtree(a_moved_path)
Example 16
def create_dataframe(df_metadata: DataFrameMetadata):

    spark = Session().get_session()
    spark_context = Session().get_context()

    # Create an empty RDD
    empty_rdd = spark_context.emptyRDD()
    print("url", df_metadata.file_url)
    # Use petastorm to create dataframe
    with materialize_dataset(spark, df_metadata.file_url,
                             df_metadata.schema.petastorm_schema):

        spark.createDataFrame(empty_rdd,
                              df_metadata.schema.pyspark_schema) \
            .coalesce(1) \
            .write \
            .mode('overwrite') \
            .parquet(df_metadata.file_url)
Example 17
def append_rows(df_metadata: DataFrameMetadata, rows):

    spark = Session().get_session()

    # Convert a list of rows to RDD
    rows_df = spark.createDataFrame(rows,
                                    df_metadata.get_dataframe_pyspark_schema())
    rows_rdd = rows_df.rdd

    # Use petastorm to append rows
    with materialize_dataset(spark, df_metadata.get_dataframe_file_url(),
                             df_metadata.get_dataframe_petastorm_schema()):

        spark.createDataFrame(rows_rdd,
                              df_metadata.get_dataframe_pyspark_schema()) \
            .coalesce(1) \
            .write \
            .mode('append') \
            .parquet(df_metadata.get_dataframe_file_url())
Example 18
def append_rows(df_metadata: DataFrameMetadata, rows):

    spark = Session().get_session()
    spark_context = Session().get_context()

    # Use petastorm to append rows
    with materialize_dataset(spark, df_metadata.file_url,
                             df_metadata.schema.petastorm_schema):
        # Convert a list of rows to RDD
        rows_rdd = spark_context.parallelize(
            rows).map(lambda x: dict_to_spark_row(
                df_metadata.schema.petastorm_schema, x))

        spark.createDataFrame(rows_rdd,
                              df_metadata.schema.pyspark_schema) \
            .coalesce(1) \
            .write \
            .mode('append') \
            .parquet(df_metadata.file_url)
Example 19
def generate_shuffle_analysis_dataset(spark, output_dataset_url, num_rows=1000, row_group_size=100):
    """
    Generates a small dataset useful for doing analysis on shuffling algorithms

    :param spark: spark session
    :param output_dataset_url: location to write dataset
    :param num_rows: how many rows should the dataset include
    :param row_group_size: how many rows in each row group (there is a minimum of 5)
    :return:
    """
    spark_context = spark.sparkContext
    with materialize_dataset(spark, output_dataset_url, _ShuffleAnalysisSchema):
        rows_rdd = spark_context.parallelize(range(num_rows), numSlices=50) \
            .map(lambda i: {'id': i}) \
            .map(partial(dict_to_spark_row, _ShuffleAnalysisSchema))
        spark.createDataFrame(rows_rdd, _ShuffleAnalysisSchema.as_spark_schema()) \
            .sort('id') \
            .coalesce(max(1, int(num_rows / row_group_size))) \
            .write.option('compression', 'none') \
            .parquet(output_dataset_url)
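A hypothetical invocation, writing 1,000 rows in row groups of roughly 100 rows each to a local URL:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[2]').getOrCreate()
generate_shuffle_analysis_dataset(spark, 'file:///tmp/shuffle_analysis',
                                  num_rows=1000, row_group_size=100)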
Example 20
def generate_petastorm_dataset(output_url='file:///tmp/hello_world_dataset'):
    rowgroup_size_mb = 256

    spark = SparkSession.builder.config('spark.driver.memory',
                                        '2g').master('local[2]').getOrCreate()
    sc = spark.sparkContext

    # Wrap the dataset materialization portion. This takes care of setting up the Spark environment variables
    # as well as saving the petastorm-specific metadata.
    rows_count = 10
    with materialize_dataset(spark, output_url, HelloWorldSchema,
                             rowgroup_size_mb):

        rows_rdd = sc.parallelize(range(rows_count))\
            .map(row_generator)\
            .map(lambda x: dict_to_spark_row(HelloWorldSchema, x))

        spark.createDataFrame(rows_rdd, HelloWorldSchema.as_spark_schema()) \
            .coalesce(10) \
            .write \
            .mode('overwrite') \
            .parquet(output_url)
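row_generator is referenced above but not shown; its exact shape depends on HelloWorldSchema. A hypothetical sketch for a schema with a scalar 'id' field and a uint8 'image1' field:

import numpy as np


def row_generator(x):
    """Build one dataset row keyed by the (assumed) HelloWorldSchema field names."""
    return {'id': x,
            'image1': np.random.randint(0, 255, dtype=np.uint8, size=(128, 256, 3))}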
Example 21
    def __init__(self, dataset_name: str, frame_metadata: FrameInfo):

        self.dataset_name = dataset_name
        self.H = frame_metadata.height
        self.W = frame_metadata.width
        self.C = frame_metadata.num_channels

        # The schema defines what the dataset looks like
        self.dataset_schema = Unischema(self.dataset_name, [
            UnischemaField('frame_id', np.int32,
                           (), ScalarCodec(IntegerType()), False),
            UnischemaField('frame_data', np.uint8, (self.H, self.W, self.C),
                           CompressedNdarrayCodec(), False),
        ])

        # Construct output location
        eva_dir = ConfigurationManager().get_value("core", "location")
        output_url = os.path.join(eva_dir, self.dataset_name)

        # Get session handle
        session = Session()
        spark = session.get_session()
        spark_context = session.get_context()

        # Wrap dataset materialization portion.
        rows_count = 10
        with materialize_dataset(spark, output_url, self.dataset_schema):

            rows_rdd = spark_context.parallelize(range(rows_count))\
                .map(lambda x: row_generator(x, self.H, self.W, self.C))\
                .map(lambda x: dict_to_spark_row(self.dataset_schema, x))

            spark.createDataFrame(rows_rdd,
                                  self.dataset_schema.as_spark_schema()) \
                .coalesce(10) \
                .write \
                .mode('overwrite') \
                .parquet(output_url)
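The row_generator(x, H, W, C) helper used above is not shown; a plausible sketch matching the Unischema defined in __init__ (hypothetical implementation):

import numpy as np


def row_generator(frame_id, height, width, num_channels):
    """Build one row: a frame id plus a random uint8 frame of shape (H, W, C)."""
    return {'frame_id': frame_id,
            'frame_data': np.random.randint(0, 255,
                                            size=(height, width, num_channels),
                                            dtype=np.uint8)}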
Example 22
def save_parquet(spark, df, path, schema):
    output_path = path.absolute().as_uri()
    with materialize_dataset(spark, output_path, schema,
                             row_group_size_mb=256):
        (df.write.mode('overwrite').parquet(output_path))
Example 23
def _create_dataset(store, df, validation, compress_sparse, num_partitions,
                    num_workers, dataset_idx, parquet_row_group_size_mb,
                    verbose):
    train_data_path = store.get_train_data_path(dataset_idx)
    val_data_path = store.get_val_data_path(dataset_idx)
    if verbose >= 1:
        print('CEREBRO => Time: {}, Writing DataFrames'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
        print('CEREBRO => Time: {}, Train Data Path: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            train_data_path))
        print('CEREBRO => Time: {}, Val Data Path: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            val_data_path))

    schema_cols = df.columns

    if isinstance(validation, str):
        schema_cols.append(validation)
    df = df[schema_cols]

    metadata = None
    if _has_vector_column(df):
        if compress_sparse:
            metadata = _get_metadata(df)
        to_petastorm = to_petastorm_fn(schema_cols, metadata)
        df = df.rdd.map(to_petastorm).toDF()

    train_df, val_df, validation_ratio = _train_val_split(df, validation)

    unischema_fields = []
    metadata = _get_metadata(train_df)
    for k in metadata.keys():
        field_type = spark_to_petastorm_type(metadata[k]['spark_data_type'])
        shape = petastorm_unischema_shape(metadata[k]['shape'])
        codec = petastorm_unischema_codec(metadata[k]['shape'],
                                          metadata[k]['spark_data_type'])
        unischema_fields.append(UnischemaField(k, field_type, shape, codec, False))

    petastorm_schema = Unischema('petastorm_schema', unischema_fields)

    train_partitions = max(int(num_partitions * (1.0 - validation_ratio)),
                           num_workers)
    if verbose >= 1:
        print('CEREBRO => Time: {}, Train Partitions: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            train_partitions))

    spark = SparkSession.builder.getOrCreate()
    # FIXME pass hdfs_driver from user interface instead of hardcoded PETASTORM_HDFS_DRIVER
    train_resolver = FilesystemResolver(
        train_data_path,
        spark.sparkContext._jsc.hadoopConfiguration(),
        user=spark.sparkContext.sparkUser(),
        hdfs_driver=constants.PETASTORM_HDFS_DRIVER)
    with materialize_dataset(
            spark,
            train_data_path,
            petastorm_schema,
            parquet_row_group_size_mb,
            filesystem_factory=train_resolver.filesystem_factory()):
        train_rdd = train_df.rdd.map(lambda x: x.asDict()).map(
            lambda x: {k: np.array(x[k], dtype=spark_to_petastorm_type(metadata[k]['spark_data_type'])) for k in x}) \
            .map(lambda x: dict_to_spark_row(petastorm_schema, x))

        spark.createDataFrame(train_rdd, petastorm_schema.as_spark_schema()) \
            .coalesce(train_partitions) \
            .write \
            .mode('overwrite') \
            .parquet(train_data_path)

    if val_df:
        val_partitions = max(int(num_partitions * validation_ratio),
                             num_workers)
        if verbose >= 1:
            print('CEREBRO => Time: {}, Val Partitions: {}'.format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                val_partitions))
        val_resolver = FilesystemResolver(
            val_data_path,
            spark.sparkContext._jsc.hadoopConfiguration(),
            user=spark.sparkContext.sparkUser(),
            hdfs_driver=constants.PETASTORM_HDFS_DRIVER)
        with materialize_dataset(
                spark,
                val_data_path,
                petastorm_schema,
                parquet_row_group_size_mb,
                filesystem_factory=val_resolver.filesystem_factory()):
            val_rdd = val_df.rdd.map(lambda x: x.asDict()).map(
                lambda x: {k: np.array(x[k], dtype=spark_to_petastorm_type(metadata[k]['spark_data_type'])) for k in x}) \
                .map(lambda x: dict_to_spark_row(petastorm_schema, x))

            spark.createDataFrame(val_rdd, petastorm_schema.as_spark_schema()) \
                .coalesce(val_partitions) \
                .write \
                .mode('overwrite') \
                .parquet(val_data_path)

    train_rows, val_rows, pq_metadata, avg_row_size = get_simple_meta_from_parquet(
        store, df.columns, dataset_idx)

    if verbose:
        print('CEREBRO => Time: {}, Train Rows: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), train_rows))
    if val_df:
        if val_rows == 0:
            raise ValueError(
                'Validation DataFrame does not contain any samples with validation param {}'
                .format(validation))
        if verbose:
            print('CEREBRO => Time: {}, Val Rows: {}'.format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                val_rows))

    return train_rows, val_rows, pq_metadata, avg_row_size
Example 24
def imagenet_directory_to_petastorm_dataset(imagenet_path, output_url, spark_master=None, parquet_files_count=100,
                                            noun_id_to_text=None):
    """Converts a directory with imagenet data into a petastorm dataset.

    Expected directory format is:

    >>> nXXXXXXXX/
    >>>    *.JPEG

    >>> nZZZZZZZZ/
    >>>    *.JPEG

    :param imagenet_path: a path to the directory containing ``n*/`` subdirectories. If you are running this script on
      a Spark cluster, you should have this file be mounted and accessible to executors.
    :param output_url: the location where your dataset will be written to. Should be a url: either
      ``file://...`` or ``hdfs://...``
    :param spark_master: A master parameter used by spark session builder. Use default value (``None``) to use system
      environment configured spark cluster. Use ``local[*]`` to run on a local box.
    :param noun_id_to_text: A dictionary: ``{noun_id : text}``. If ``None``, this function will download the dictionary
      from the Internet.
    :return: ``None``
    """
    session_builder = SparkSession \
        .builder \
        .appName('Imagenet Dataset Creation') \
        .config('spark.executor.memory', '10g') \
        .config('spark.driver.memory', '10g')  # Increase the memory if running locally with high number of executors
    if spark_master:
        session_builder.master(spark_master)

    spark = session_builder.getOrCreate()
    sc = spark.sparkContext

    # Get a list of noun_ids
    noun_ids = os.listdir(imagenet_path)
    if not all(noun_id.startswith('n') for noun_id in noun_ids):
        raise RuntimeError('Directory {} expected to contain only subdirectories with name '
                           'starting with "n".'.format(imagenet_path))

    if not noun_id_to_text:
        noun_id_to_text = download_nouns_mapping()

    ROWGROUP_SIZE_MB = 256
    with materialize_dataset(spark, output_url, ImagenetSchema, ROWGROUP_SIZE_MB):
        # list of [(nXXXX, 'noun-text'), ...]
        noun_id_text_list = map(lambda noun_id: (noun_id, noun_id_to_text[noun_id]), noun_ids)

        # rdd of [(nXXXX, 'noun-text', path), ...]
        noun_id_text_image_path_rdd = sc.parallelize(noun_id_text_list, min(len(noun_ids) / 10 + 1, 10000)) \
            .flatMap(lambda word_id_label: [word_id_label + (image_path,) for image_path in
                                            glob.glob(os.path.join(imagenet_path, word_id_label[0], '*.JPEG'))])

        # rdd of [(nXXXX, 'noun-text', image), ...]
        noun_id_text_image_rdd = noun_id_text_image_path_rdd \
            .map(lambda id_word_image_path:
                 {ImagenetSchema.noun_id.name: id_word_image_path[0],
                  ImagenetSchema.text.name: id_word_image_path[1],
                  ImagenetSchema.image.name: cv2.imread(id_word_image_path[2])})

        # Convert to pyspark.sql.Row
        sql_rows_rdd = noun_id_text_image_rdd.map(lambda r: dict_to_spark_row(ImagenetSchema, r))

        # Write out the result
        spark.createDataFrame(sql_rows_rdd, ImagenetSchema.as_spark_schema()) \
            .coalesce(parquet_files_count) \
            .write \
            .mode('overwrite') \
            .option('compression', 'none') \
            .parquet(output_url)
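A hedged read-back sketch for the dataset written above; the URL is a placeholder for output_url:

from petastorm import make_reader

with make_reader('file:///tmp/imagenet_petastorm') as reader:
    sample = next(reader)
    print(sample.noun_id, sample.text, sample.image.shape)  # ImagenetSchema fields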
Example 25
def main():
    SPARK_MASTER_URL = 'spark://...' # Change the Spark master URL.
    H5_PRE_PROCESSED_DATA_DIR = 'file://...' # Change pre-processed data input path. Should be accessible from all Spark workers.
    OUTPUT_PATH = 'file:///...' # Change Petastorm output path. Should be accessible from all Spark workers.
    TRAIN_FRACTION = 0.7 # Fraction of train data. Remaining is validation data.
    
    ROW_GROUP_SIZE_MB = 512 # Size of Parquet row group size.
    NUM_PARTITIONS = 100 # Number of Parquet partitions for train and val data each.
    
    spark = SparkSession \
            .builder \
            .master(SPARK_MASTER_URL) \
            .appName("Deep Postures Example - Petastorm Data Generation") \
            .getOrCreate()

    input_data = []
    if H5_PRE_PROCESSED_DATA_DIR.startswith('hdfs://'):
        args = "hdfs dfs -ls " + H5_PRE_PROCESSED_DATA_DIR + " | awk '{print $8}'"
        proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)

        s_output, s_err = proc.communicate()
        input_data = ['hdfs://' + path for path in s_output.decode().split()]
    elif H5_PRE_PROCESSED_DATA_DIR.startswith('file://'):
        for dirname in os.listdir(H5_PRE_PROCESSED_DATA_DIR):
            if not dirname.startswith('.'):
                input_data.append(str(os.path.join(H5_PRE_PROCESSED_DATA_DIR, dirname)))
    else:
        raise Exception('Unsupported file system in: {}'.format(H5_PRE_PROCESSED_DATA_DIR))

    random.shuffle(input_data)
    n_train = int(len(input_data) * TRAIN_FRACTION)
    train_data = input_data[:n_train]
    val_data = input_data[n_train:]

    backend = SparkBackend(spark_context=spark.sparkContext)
    store = LocalStore(OUTPUT_PATH, train_path=os.path.join(OUTPUT_PATH, 'train_data'), val_path=os.path.join(OUTPUT_PATH, 'val_data'))
    
    schema = Unischema('schema', [
        UnischemaField('id', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('time', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('data', np.float32, (100, 3), NdarrayCodec(), False),
        UnischemaField('non_wear', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('sleeping', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('label', np.int32, (), ScalarCodec(IntegerType()), False)
    ])

    with materialize_dataset(spark, os.path.join(OUTPUT_PATH, 'train_data'), schema, ROW_GROUP_SIZE_MB):
        rdd = spark.sparkContext.parallelize(train_data)
        rdd = rdd.flatMap(lambda x: load_h5(x)).map(lambda item: {'id': item[0], 'time': item[1], 'data': item[2], 'non_wear': item[3], 'sleeping': item[4], 'label': item[5]})
        rdd = rdd.map(lambda x: dict_to_spark_row(schema, x))

        df = spark.createDataFrame(rdd, schema=schema.as_spark_schema())
        df.orderBy("id", "time").coalesce(NUM_PARTITIONS).write.mode('overwrite').parquet(os.path.join(OUTPUT_PATH, 'train_data'))


    with materialize_dataset(spark, os.path.join(OUTPUT_PATH, 'val_data'), schema, ROW_GROUP_SIZE_MB):
        rdd = spark.sparkContext.parallelize(val_data)
        rdd = rdd.flatMap(lambda x: load_h5(x)).map(lambda item: {'id': item[0], 'time': item[1], 'data': item[2], 'non_wear': item[3], 'sleeping': item[4], 'label': item[5]})
        rdd = rdd.map(lambda x: dict_to_spark_row(schema, x))

        df = spark.createDataFrame(rdd, schema=schema.as_spark_schema())
        df.orderBy("id", "time").coalesce(NUM_PARTITIONS).write.mode('overwrite').parquet(os.path.join(OUTPUT_PATH, 'val_data'))

if __name__ == "__main__":
    main()
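A hedged sketch for consuming the train split written above with petastorm's PyTorch loader; the path is a placeholder for os.path.join(OUTPUT_PATH, 'train_data'):

from petastorm import make_reader
from petastorm.pytorch import DataLoader

with DataLoader(make_reader('file:///tmp/deep_postures/train_data'), batch_size=256) as loader:
    for batch in loader:
        features, labels = batch['data'], batch['label']  # field names from the Unischema above
        break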
Example 26
def copy_dataset(spark,
                 source_url,
                 target_url,
                 field_regex,
                 not_null_fields,
                 overwrite_output,
                 partitions_count,
                 row_group_size_mb,
                 hdfs_driver='libhdfs3'):
    """
    Creates a copy of a dataset. A new dataset will optionally contain a subset of columns. Rows that have NULL
    values in fields defined by ``not_null_fields`` argument are filtered out.


    :param spark: An instance of ``SparkSession`` object
    :param source_url: A url of the dataset to be copied.
    :param target_url: A url specifying location of the target dataset.
    :param field_regex: A list of regex patterns. Only columns that match one of these patterns are copied to the new
      dataset.
    :param not_null_fields: A list of fields that must have non-NULL values in the target dataset.
    :param overwrite_output: If ``False`` and there is an existing path defined by ``target_url``, the operation will
      fail.
    :param partitions_count: If not ``None``, the dataset is repartitioned before write. Number of files in the target
      Parquet store is defined by this parameter.
    :param row_group_size_mb: The size of the rowgroup in the target dataset. Specified in megabytes.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :return: None
    """
    schema = get_schema_from_dataset_url(source_url, hdfs_driver=hdfs_driver)

    fields = match_unischema_fields(schema, field_regex)

    if field_regex and not fields:
        field_names = list(schema.fields.keys())
        raise ValueError(
            'Regular expressions (%s) do not match any fields (%s)'
            % (str(field_regex), str(field_names)))

    if fields:
        subschema = schema.create_schema_view(fields)
    else:
        subschema = schema

    resolver = FilesystemResolver(
        target_url,
        spark.sparkContext._jsc.hadoopConfiguration(),
        hdfs_driver=hdfs_driver,
        user=spark.sparkContext.sparkUser())
    with materialize_dataset(spark,
                             target_url,
                             subschema,
                             row_group_size_mb,
                             filesystem_factory=resolver.filesystem_factory()):
        data_frame = spark.read \
            .parquet(source_url)

        if fields:
            data_frame = data_frame.select(*[f.name for f in fields])

        if not_null_fields:
            not_null_condition = reduce(operator.__and__,
                                        (data_frame[f].isNotNull()
                                         for f in not_null_fields))
            data_frame = data_frame.filter(not_null_condition)

        if partitions_count:
            data_frame = data_frame.repartition(partitions_count)

        data_frame.write \
            .mode('overwrite' if overwrite_output else 'error') \
            .option('compression', 'none') \
            .parquet(target_url)
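A hypothetical invocation of copy_dataset: copy only columns matching 'id.*', drop rows where 'sensor_name' is NULL, and repartition into 10 output files (both URLs are placeholders):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[4]').getOrCreate()
copy_dataset(spark,
             source_url='hdfs:///datasets/source',
             target_url='hdfs:///datasets/source_subset',
             field_regex=['id.*'],
             not_null_fields=['sensor_name'],
             overwrite_output=True,
             partitions_count=10,
             row_group_size_mb=256,
             hdfs_driver='libhdfs3')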