def test_serialize_filesystem_factory(tmpdir):
    """Materialize a small dataset with a filesystem factory whose instances cannot be pickled.

    ``BogusFS.__getstate__`` always raises, so the test fails if anything on the write path
    invokes ``__getstate__`` on a ``BogusFS`` instance.
    """

    class BogusFS(pyarrow.LocalFileSystem):
        def __getstate__(self):
            raise RuntimeError("can not serialize")

    SimpleSchema = Unischema('SimpleSchema', [
        UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('foo', np.int32, (), ScalarCodec(IntegerType()), False),
    ])

    rows_count = 10
    rowgroup_size_mb = 256
    output_url = "file://{0}/fs_factory_test".format(tmpdir)

    spark = SparkSession.builder.config('spark.driver.memory', '2g').master('local[2]').getOrCreate()
    sc = spark.sparkContext

    def to_spark_row(value):
        # Both columns carry the same integer.
        return dict_to_spark_row(SimpleSchema, {'id': value, 'foo': value})

    with materialize_dataset(spark, output_url, SimpleSchema, rowgroup_size_mb, filesystem_factory=BogusFS):
        rows_rdd = sc.parallelize(range(rows_count)).map(to_spark_row)
        data_frame = spark.createDataFrame(rows_rdd, SimpleSchema.as_spark_schema())
        data_frame.write.parquet(output_url)
def write(self, table: DataFrameMetadata, rows: Batch):
    """
    Append the rows of a batch to the parquet store backing a table.

    Does nothing when the batch is empty.

    Arguments:
        table: table metadata object to write into
        rows : batch to be persisted in the storage.
    """
    if rows.empty():
        return

    # TODO: throw an error if the row schema doesn't match the table schema
    target_url = self._spark_url(table)
    with materialize_dataset(self.spark_session, target_url, table.schema.petastorm_schema):
        frames = rows.frames
        column_names = frames.keys()

        def to_spark_row(values):
            # Pair each value with its column name, then encode through the petastorm schema.
            as_dict = dict(zip(column_names, values))
            return dict_to_spark_row(table.schema.petastorm_schema, as_dict)

        rows_rdd = self.spark_context.parallelize(frames.values).map(to_spark_row)
        data_frame = self.spark_session.createDataFrame(rows_rdd, table.schema.pyspark_schema)
        data_frame.coalesce(1).write.mode('append').parquet(target_url)
def write_featureframe(self):
    """
    Writes a dataframe of data as a training dataset on HDFS in the petastorm format

    The schema entry and the optional filesystem-factory entry of ``self.petastorm_args`` are
    passed to ``materialize_dataset`` explicitly; every remaining entry is forwarded as a keyword
    argument. ``self.petastorm_args`` itself is not modified, so the method can be called again.

    Returns:
        None

    Raises:
        :ValueError: if no petastorm schema was provided
    """
    spark = util._find_spark()

    # Work on a shallow copy: removing entries from self.petastorm_args directly would make any
    # later call fail with "schema is not defined".
    petastorm_args = dict(self.petastorm_args)

    if constants.PETASTORM_CONFIG.SCHEMA not in petastorm_args:
        raise ValueError(
            "Required petastorm argument 'schema' is not defined in petastorm_args dict"
        )
    schema = petastorm_args.pop(constants.PETASTORM_CONFIG.SCHEMA)

    if constants.PETASTORM_CONFIG.FILESYSTEM_FACTORY in petastorm_args:
        filesystem_factory = petastorm_args.pop(constants.PETASTORM_CONFIG.FILESYSTEM_FACTORY)
    else:
        def filesystem_factory():
            # Default: connect to HDFS through the configured libhdfs driver.
            return pa.hdfs.connect(driver=constants.PETASTORM_CONFIG.LIBHDFS)

    with materialize_dataset(spark, self.path, schema,
                             filesystem_factory=filesystem_factory,
                             **petastorm_args):
        self.df.write.mode(self.write_mode).parquet(self.path)
def mnist_data_to_petastorm_dataset(download_dir, output_url, spark_master=None, parquet_files_count=1,
                                    mnist_data=None):
    """Converts a directory with MNIST data into a petastorm dataset.

    Data files are as specified in http://yann.lecun.com/exdb/mnist/:
        * train-images-idx3-ubyte.gz:  training set images (9912422 bytes)
        * train-labels-idx1-ubyte.gz:  training set labels (28881 bytes)
        * t10k-images-idx3-ubyte.gz:   test set images (1648877 bytes)
        * t10k-labels-idx1-ubyte.gz:   test set labels (4542 bytes)

    The images and labels are stored in the IDX file format for vectors and multidimensional matrices of
    various numerical types, as defined in the same URL.

    One dataset is written per key of ``mnist_data``, under ``<output_url>/<key>``.

    :param download_dir: the path to where the MNIST data will be downloaded.
    :param output_url: the location where your dataset will be written to. Should be a url: either
      file://... or hdfs://...
    :param spark_master: A master parameter used by spark session builder. Use default value (None) to use system
      environment configured spark cluster. Use 'local[*]' to run on a local box.
    :param parquet_files_count: passed to ``coalesce`` before each dataset is written.
    :param mnist_data: A dictionary of MNIST data, with name of dataset as key, and the dataset
      object as value; if None is supplied, download it.
    :return: None
    """
    session_builder = SparkSession.builder.appName('MNIST Dataset Creation')
    if spark_master:
        session_builder.master(spark_master)
    spark = session_builder.getOrCreate()

    # Fall back to downloading both splits when the caller did not supply the data
    if mnist_data is None:
        mnist_data = {
            'train': download_mnist_data(download_dir, train=True),
            'test': download_mnist_data(download_dir, train=False)
        }

    def to_row_dict(idx_and_sample):
        # idx_and_sample is (idx, sample) where sample[0] is the image and sample[1] the digit;
        # the image pixels are reshaped into a 28x28 uint8 numpy matrix.
        idx, sample = idx_and_sample
        return {
            MnistSchema.idx.name: idx,
            MnistSchema.digit.name: sample[1],
            MnistSchema.image.name: np.array(list(sample[0].getdata()), dtype=np.uint8).reshape(28, 28)
        }

    # The MNIST data is small enough to do everything here in Python
    for dset, data in mnist_data.items():
        dset_output_url = '{}/{}'.format(output_url, dset)
        with materialize_dataset(spark, dset_output_url, MnistSchema):
            # Lazily convert every sample to a pyspark.sql.Row
            sql_rows = (dict_to_spark_row(MnistSchema, to_row_dict(entry)) for entry in enumerate(data))

            # Write out the result
            data_frame = spark.createDataFrame(sql_rows, MnistSchema.as_spark_schema())
            data_frame.coalesce(parquet_files_count) \
                .write \
                .option('compression', 'none') \
                .parquet(dset_output_url)
def generate_petastorm_metadata(spark, dataset_url, unischema_class=None, use_summary_metadata=False):
    """
    Generates metadata necessary to read a petastorm dataset to an existing dataset.

    Note that the spark session is stopped before this function returns.

    :param spark: spark session
    :param dataset_url: url of existing dataset
    :param unischema_class: (optional) fully qualified dataset unischema class. If not specified will attempt
        to find one already in the dataset. (e.g.
        :class:`examples.hello_world.generate_hello_world_dataset.HelloWorldSchema`)
    :param use_summary_metadata: forwarded to ``materialize_dataset``. When true, the parquet metadata file is
        additionally written with the java ``ParquetOutputCommitter`` and the row-group keys found in the
        previous common metadata are added back afterwards.
    :raises ValueError: if ``unischema_class`` does not resolve to a ``Unischema`` instance, or if no
        ``unischema_class`` was given and no schema could be found in the dataset.
    """
    sc = spark.sparkContext

    resolver = FilesystemResolver(dataset_url, sc._jsc.hadoopConfiguration())
    dataset = pq.ParquetDataset(
        resolver.get_dataset_path(),
        filesystem=resolver.filesystem(),
        validate_schema=False)

    if unischema_class:
        schema = locate(unischema_class)
        if not isinstance(schema, Unischema):
            # The name must be interpolated here: extra positional arguments to ValueError are not
            # formatted into the message.
            raise ValueError('The specified class %s is not an instance of a petastorm.Unischema object.'
                             % (unischema_class,))
    else:
        try:
            schema = get_schema(dataset)
        except ValueError:
            raise ValueError('Unischema class could not be located in existing dataset,'
                             ' please specify it')

    # In order to be backwards compatible, we retrieve the common metadata from the dataset before
    # overwriting the metadata to keep row group indexes and the old row group per file index
    arrow_metadata = dataset.common_metadata or None

    with materialize_dataset(spark, dataset_url, schema, use_summary_metadata=use_summary_metadata):
        if use_summary_metadata:
            # Inside the materialize dataset context we just need to write the metadata file as the schema will
            # be written by the context manager.
            # We use the java ParquetOutputCommitter to write the metadata file for the existing dataset
            # which will read all the footers of the dataset in parallel and merge them.
            hadoop_config = sc._jsc.hadoopConfiguration()
            Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
            parquet_output_committer = sc._gateway.jvm.org.apache.parquet.hadoop.ParquetOutputCommitter
            parquet_output_committer.writeMetaDataFile(hadoop_config, Path(dataset_url))

    spark.stop()

    if use_summary_metadata and arrow_metadata:
        # When calling writeMetaDataFile it will overwrite the _common_metadata file which could have schema
        # information or row group indexers. Therefore we want to retain this information and will add it to the
        # new _common_metadata file. If we were using the old legacy metadata method this file won't be deleted
        base_schema = arrow_metadata.schema.to_arrow_schema()
        metadata_dict = base_schema.metadata
        if ROW_GROUPS_PER_FILE_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROW_GROUPS_PER_FILE_KEY, metadata_dict[ROW_GROUPS_PER_FILE_KEY])
        if ROWGROUPS_INDEX_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY, metadata_dict[ROWGROUPS_INDEX_KEY])
def generate_parquet(feature_path, mask_path, output_path):
    """Generate a parquet dataset with two columns.

    - First column: np_array representing image
    - Second column: np_array representing mask

    Images and masks are each numbered with ``zipWithIndex`` and joined on that index, so the
    n-th image is paired with the n-th mask.
    NOTE(review): this assumes both ``binaryFiles`` listings come back in matching order - confirm.

    Arguments:
        feature_path -- path to all images
        mask_path -- path to masks of images
        output_path -- parquet path
    """
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession

    rowgroup_size_mb = 256
    spark_conf = SparkConf().setAppName('Image preprocess')
    sc = SparkContext(conf=spark_conf)
    session = SparkSession(sc)

    # Load images and convert them to a dataframe.
    # zipWithIndex is required: the lambdas below index each element as a (raw_image, id) pair,
    # exactly like the mask pipeline further down.
    images_rdd = sc.binaryFiles(feature_path).values().zipWithIndex()
    image_flat_numpy_rdd = images_rdd \
        .map(lambda pair_raw_image_id: (raw_image_to_numpy_array(pair_raw_image_id[0]), pair_raw_image_id[1])) \
        .map(lambda pair_np_array_id: {'features': pair_np_array_id[0], 'id': pair_np_array_id[1]}) \
        .map(lambda x: dict_to_spark_row(FeatureSchema, x))
    image_df = session.createDataFrame(image_flat_numpy_rdd, FeatureSchema.as_spark_schema())

    # Load masks and convert them to a dataframe
    mask_rdd = sc.binaryFiles(mask_path).values().zipWithIndex()
    # Convert mask rgb value to 0 for not building and 1 for building
    mask_flat_numpy_rdd = mask_rdd \
        .map(lambda pair_raw_image_id: (raw_image_to_numpy_array(pair_raw_image_id[0]), pair_raw_image_id[1])) \
        .map(lambda pair_np_array_id: ((pair_np_array_id[0] / 255).astype(np.uint8), pair_np_array_id[1])) \
        .map(lambda pair_std_np_array_id: {'masks': pair_std_np_array_id[0], 'id': pair_std_np_array_id[1]}) \
        .map(lambda x: dict_to_spark_row(MaskSchema, x))
    mask_df = session.createDataFrame(mask_flat_numpy_rdd, MaskSchema.as_spark_schema())

    mask_df.show(5, False)

    # Concat image_df and mask_df row by row
    train_df = image_df.join(mask_df, "id", "inner").drop('id')

    with materialize_dataset(session, output_path, TrainSchema, rowgroup_size_mb):
        train_df.write \
            .mode('overwrite') \
            .parquet(output_path)
def create_test_dataset(tmp_url, rows, num_files=2, spark=None):
    """
    Creates a test dataset under tmp_url, with rows and num_files that has TestSchema.

    If no spark session is supplied, a local one is created and is stopped again before returning,
    also when dataset generation fails.

    :param tmp_url: The URL of the temp directory to store the test dataset in.
    :param rows: Passed to ``parallelize``; each element is fed to ``_randomize_row``.
        NOTE(review): presumably an iterable of row ids rather than a count - confirm against callers.
    :param num_files: The number of files to partition the data between.
    :param spark: An optional spark session to use
    :return: A list of the dataset dictionary.
    """
    shutdown = False
    if not spark:
        spark_session = SparkSession \
            .builder \
            .appName('petastorm_end_to_end_test') \
            .master('local[8]')

        spark = spark_session.getOrCreate()
        shutdown = True

    try:
        spark_context = spark.sparkContext

        with materialize_dataset(spark, tmp_url, TestSchema):
            id_rdd = spark_context.parallelize(rows, numSlices=40)

            # Make up some random data and store it for referencing in the tests
            random_dicts_rdd = id_rdd.map(_randomize_row).cache()
            dataset_dicts = random_dicts_rdd.collect()

            random_rows_rdd = random_dicts_rdd.map(partial(dict_to_spark_row, TestSchema))

            # Create a spark dataframe with the random rows
            dataframe = spark. \
                createDataFrame(random_rows_rdd, TestSchema.as_spark_schema()).sort('id')

            # Save a parquet
            dataframe. \
                coalesce(num_files). \
                write.option('compression', 'none'). \
                partitionBy('partition_key'). \
                mode('overwrite'). \
                parquet(tmp_url)

        # Create list of objects to build row group indexes
        indexers = [
            SingleFieldIndexer(TestSchema.id.name, TestSchema.id.name),
            SingleFieldIndexer(TestSchema.sensor_name.name, TestSchema.sensor_name.name),
            SingleFieldIndexer(TestSchema.string_array_nullable.name, TestSchema.string_array_nullable.name),
        ]
        build_rowgroup_index(tmp_url, spark_context, indexers)
    finally:
        # Only stop a session created here; a caller-supplied session stays alive.
        if shutdown:
            spark.stop()

    return dataset_dicts
def save_parquet_for_petastorm_parquet(spark: SparkSession, df: pyspark.sql.DataFrame, output_path: str,
                                       schema: Unischema):
    """Write ``df`` as parquet inside a ``materialize_dataset`` block, overwriting any existing output.

    :param spark: spark session handed to ``materialize_dataset``
    :param df: dataframe to write
    :param output_path: filesystem path; it is made absolute and converted to a URI before use
    :param schema: Unischema handed to ``materialize_dataset``
    """
    target_uri = Path(output_path).absolute().as_uri()
    with materialize_dataset(spark, target_uri, schema, row_group_size_mb=256):
        writer = df.write.mode('overwrite')
        writer.parquet(target_uri)
def copy_dataset(spark, source_url, target_url, field_regex, not_null_fields, overwrite_output, partitions_count,
                 row_group_size_mb):
    """Creates a copy of a dataset. A new dataset will optionally contain a subset of columns. Rows that have NULL
    values in fields defined by ``not_null_fields`` argument are filtered out.

    :param spark: An instance of ``SparkSession`` object
    :param source_url: A url of the dataset to be copied.
    :param target_url: A url specifying location of the target dataset.
    :param field_regex: A list of regex patterns. Only columns that match one of these patterns are copied to the new
      dataset.
    :param not_null_fields: A list of fields that must have non-NULL values in the target dataset.
    :param overwrite_output: If ``False`` and there is an existing path defined by ``target_url``, the operation will
      fail.
    :param partitions_count: If not ``None``, the dataset is repartitioned before write. Number of files in the target
      Parquet store is defined by this parameter.
    :param row_group_size_mb: The size of the rowgroup in the target dataset. Specified in megabytes.
    :raises ValueError: if ``field_regex`` is given but matches no field of the source schema.
    :return: None
    """
    schema = get_schema_from_dataset_url(source_url)

    fields = match_unischema_fields(schema, field_regex)

    if field_regex and not fields:
        field_names = list(schema.fields.keys())
        # Interpolate explicitly: extra positional arguments to ValueError are not formatted into the message.
        raise ValueError('Regular expressions (%s) do not match any fields (%s)'
                         % (str(field_regex), str(field_names)))

    if fields:
        subschema = schema.create_schema_view(fields)
    else:
        subschema = schema

    with materialize_dataset(spark, target_url, subschema, row_group_size_mb):
        data_frame = spark.read \
            .parquet(source_url)

        if fields:
            data_frame = data_frame.select(*[f.name for f in fields])

        if not_null_fields:
            # AND together one "is not null" condition per requested field
            not_null_condition = reduce(operator.__and__, (data_frame[f].isNotNull() for f in not_null_fields))
            data_frame = data_frame.filter(not_null_condition)

        if partitions_count:
            data_frame = data_frame.repartition(partitions_count)

        data_frame.write \
            .mode('overwrite' if overwrite_output else 'error') \
            .option('compression', 'none') \
            .parquet(target_url)
def generate_petastorm_metadata(spark, dataset_url, unischema_class=None):
    """
    Generate metadata necessary to read a petastorm dataset to an existing dataset.

    :param spark: spark session
    :param dataset_url: url of existing dataset
    :param unischema_class: (optional) fully qualified dataset unischema class. If not specified will attempt
        to find one already in the dataset. (e.g. examples.hello_world.hello_world_dataset.HelloWorldSchema)
    :raises ValueError: if ``unischema_class`` cannot be located, or if no ``unischema_class`` was given and no
        schema could be found in the dataset.
    :return: None
    """
    sc = spark.sparkContext

    resolver = FilesystemResolver(dataset_url, sc._jsc.hadoopConfiguration())
    dataset = pq.ParquetDataset(resolver.parsed_dataset_url().path,
                                filesystem=resolver.filesystem(),
                                validate_schema=False)

    if unischema_class:
        schema = locate(unischema_class)
        if schema is None:
            # locate() returns None for an unresolvable name; refuse to materialize with no schema.
            raise ValueError('The specified unischema class {} could not be located.'.format(unischema_class))
    else:
        try:
            schema = get_schema(dataset)
        except ValueError:
            raise ValueError(
                'Unischema class could not be located in existing dataset,'
                ' please specify it')

    # In order to be backwards compatible, we retrieve the common metadata from the dataset before
    # overwriting the metadata to keep row group indexes and the old row group per file index
    arrow_metadata = dataset.common_metadata or None

    with materialize_dataset(spark, dataset_url, schema):
        # Inside the materialize dataset context we just need to write the metadata file as the schema will
        # be written by the context manager.
        # We use the java ParquetOutputCommitter to write the metadata file for the existing dataset
        # which will read all the footers of the dataset in parallel and merge them.
        hadoop_config = sc._jsc.hadoopConfiguration()
        Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
        parquet_output_committer = sc._gateway.jvm.org.apache.parquet.hadoop.ParquetOutputCommitter
        parquet_output_committer.writeMetaDataFile(hadoop_config, Path(dataset_url))

    if arrow_metadata:
        # If there was the old row groups per file key or the row groups index key, add them to the new dataset
        # metadata
        base_schema = arrow_metadata.schema.to_arrow_schema()
        metadata_dict = base_schema.metadata
        if ROW_GROUPS_PER_FILE_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROW_GROUPS_PER_FILE_KEY, metadata_dict[ROW_GROUPS_PER_FILE_KEY])
        if ROWGROUPS_INDEX_KEY in metadata_dict:
            add_to_dataset_metadata(dataset, ROWGROUPS_INDEX_KEY, metadata_dict[ROWGROUPS_INDEX_KEY])
def generate_parquet(feature_path, mask_path, output_path):
    """Generate a parquet dataset with two columns.

    - First column: np_array representing image
    - Second column: np_array representing mask (pixel values divided by 255 and cast to uint8)

    Both dataframes get an ``id`` column from ``monotonically_increasing_id`` and are outer-joined on it.

    Arguments:
        feature_path -- path to all images
        mask_path -- path to masks of images
        output_path -- parquet path
    """
    from pyspark import SparkConf, SparkContext
    from pyspark.sql import Row, SparkSession
    from pyspark.sql.functions import monotonically_increasing_id
    from pyspark.sql.types import _infer_schema

    rowgroup_size_mb = 256

    sc = SparkContext(conf=SparkConf().setAppName('Image preprocess'))
    session = SparkSession(sc)

    def to_feature_row(image_np_array):
        return dict_to_spark_row(FeatureSchema, {'features': image_np_array})

    def to_mask_row(image_np_array):
        scaled_mask = (image_np_array / 255).astype(np.uint8)
        return dict_to_spark_row(MaskSchema, {'masks': scaled_mask})

    # NOTE(review): ids from monotonically_increasing_id are only documented as unique and increasing,
    # not consecutive; pairing the n-th image with the n-th mask presumably relies on both frames
    # being partitioned identically - confirm.

    # Load images and convert them to a dataframe with a generated row id
    feature_rows_rdd = sc.binaryFiles(feature_path).values().map(raw_image_to_numpy_array).map(to_feature_row)
    image_df = session.createDataFrame(feature_rows_rdd, FeatureSchema.as_spark_schema())
    image_df = image_df.withColumn("id", monotonically_increasing_id())

    # Load masks and convert them to a dataframe with a generated row id
    mask_rows_rdd = sc.binaryFiles(mask_path).values().map(raw_image_to_numpy_array).map(to_mask_row)
    mask_df = session.createDataFrame(mask_rows_rdd, MaskSchema.as_spark_schema())
    mask_df = mask_df.withColumn("id", monotonically_increasing_id())

    # Concat image_df and mask_df row by row
    train_df = image_df.join(mask_df, "id", "outer").drop("id")

    with materialize_dataset(session, output_path, TrainSchema, rowgroup_size_mb):
        writer = train_df.write.mode('overwrite')
        writer.parquet(output_path)
def create(self, table: DataFrameMetadata):
    """
    Create an empty dataframe in petastorm.

    An empty RDD is written with the table's pyspark schema, overwriting whatever is stored at the
    table's url.

    Arguments:
        table: table metadata object describing the dataframe to create
    """
    target_url = self._spark_url(table)
    no_rows_rdd = self.spark_context.emptyRDD()

    with materialize_dataset(self.spark_session, target_url, table.schema.petastorm_schema):
        empty_frame = self.spark_session.createDataFrame(no_rows_rdd, table.schema.pyspark_schema)
        empty_frame.coalesce(1).write.mode('overwrite').parquet(target_url)
def test_predicate_on_partitioned_dataset(tmpdir):
    """
    Generates a partitioned dataset and ensures that readers evaluate the type of the partition
    column according to the type given in the Unischema.
    """
    TestSchema = Unischema('TestSchema', [
        UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('id2', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('test_field', np.int32, (), ScalarCodec(IntegerType()), False),
    ])

    def test_row_generator(x):
        """Returns a single entry in the generated dataset."""
        return {'id': x,
                'id2': x + 1,
                'test_field': x * x}

    rowgroup_size_mb = 256
    dataset_url = "file://{0}/partitioned_test_dataset".format(tmpdir)

    spark = SparkSession.builder.config('spark.driver.memory', '2g').master('local[2]').getOrCreate()
    sc = spark.sparkContext

    rows_count = 10
    with materialize_dataset(spark, dataset_url, TestSchema, rowgroup_size_mb):
        rows_rdd = sc.parallelize(range(rows_count))\
            .map(test_row_generator)\
            .map(lambda x: dict_to_spark_row(TestSchema, x))

        spark.createDataFrame(rows_rdd, TestSchema.as_spark_schema()) \
            .write \
            .partitionBy('id', 'id2') \
            .parquet(dataset_url)

    with make_reader(dataset_url, predicate=in_lambda(['id'], lambda x: x == 3)) as reader:
        assert next(reader).id == 3
    with make_reader(dataset_url, predicate=in_lambda(['id2'], lambda x: x == 5)) as reader:
        # The predicate is on id2, so assert on id2: the row with id2 == 5 has id == 4.
        assert next(reader).id2 == 5
    with make_reader(dataset_url, predicate=in_lambda(['id'], lambda x: x == '3')) as reader:
        with pytest.raises(StopIteration):
            # Predicate should have selected none, so a StopIteration should be raised.
            next(reader)
def generate_dataset(data_directory, sensor, output_url, year=2018, max_files=100000, dayofyear=None):
    """
    Write L1b patches to petastorm database for training

    A random subset of at most ``max_files`` distinct files is selected; each selected file is expanded
    into samples by ``sample_generator``.

    Args:
        data_directory: directory of L1b data
        sensor: Select sensor from (G16,G17,H8)
        output_url: Directory to write petastorm database (file:///...)
        year: Integer (depending on directory, 2017-2020)
        max_files: Maximum number of files to iterate over
        dayofyear: 1-366
    Returns:
        None
    Raises:
        ValueError: if no files are found for the requested year / dayofyear
    """
    rowgroup_size_mb = 256

    spark = SparkSession.builder.config('spark.driver.memory', '2g').master('local[4]').getOrCreate()
    sc = spark.sparkContext

    geo = geonexl1b.GeoNEXL1b(data_directory=data_directory, sensor=sensor)
    # NOTE(review): the result is unused; the call is kept in case tiles() has side effects - confirm.
    tiles = geo.tiles()
    files = geo.files(year=year, dayofyear=dayofyear)
    # Tile names carry the vertical index at characters 4:6 and the horizontal index at 1:3
    files['v'] = files['tile'].map(lambda t: int(t[4:6]))
    files['h'] = files['tile'].map(lambda t: int(t[1:3]))

    file_count = files.shape[0]
    if file_count == 0:
        raise ValueError('No files found in {} for year={}, dayofyear={}'.format(data_directory, year, dayofyear))

    # Shuffle and truncate: sampling without replacement, so no file is selected twice and
    # max_files is a true upper bound even when fewer files exist.
    idxs = np.random.permutation(file_count)[:max_files]
    files = files.iloc[idxs]
    files = files.reset_index()

    with materialize_dataset(spark, output_url, L1bSchema, rowgroup_size_mb):
        filerdd = spark.createDataFrame(files)\
            .select("year", "dayofyear", "hour", "minute", "v", "h", "file")\
            .rdd.map(tuple)\
            .flatMap(sample_generator)\
            .map(lambda x: dict_to_spark_row(L1bSchema, x))

        spark.createDataFrame(filerdd, L1bSchema.as_spark_schema())\
            .coalesce(50) \
            .write \
            .mode('overwrite') \
            .parquet(output_url)
def test_pass_in_pyarrow_filesystem_to_materialize_dataset(synthetic_dataset, tmpdir):
    """Run ``materialize_dataset`` over a copied dataset with an explicitly supplied filesystem factory.

    The synthetic dataset is copied, the copy's ``_common_metadata`` file is deleted, and an empty
    ``materialize_dataset`` block (given ``pyarrow.LocalFileSystem`` as ``filesystem_factory``) is run
    over it. The copy must then be readable with ``make_reader``.
    """
    moved_dir = tmpdir.join('moved').strpath
    copytree(synthetic_dataset.path, moved_dir)
    os.remove(moved_dir + '/_common_metadata')

    filesystem_factory = pyarrow.LocalFileSystem
    spark = SparkSession.builder.getOrCreate()

    with materialize_dataset(spark, moved_dir, TestSchema, filesystem_factory=filesystem_factory):
        pass

    moved_url = 'file://{}'.format(moved_dir)
    with make_reader(moved_url, reader_pool_type='dummy') as reader:
        _check_simple_reader(reader, synthetic_dataset.data)

    spark.stop()
    rmtree(moved_dir)
def create_dataframe(df_metadata: DataFrameMetadata):
    """Create an empty petastorm dataframe at ``df_metadata.file_url``, overwriting existing data.

    The target url is printed before writing.
    """
    spark = Session().get_session()
    spark_context = Session().get_context()

    # An RDD without rows: only the schema ends up in the written dataframe
    no_rows_rdd = spark_context.emptyRDD()
    print("url", df_metadata.file_url)

    target_url = df_metadata.file_url
    # Use petastorm to create dataframe
    with materialize_dataset(spark, target_url, df_metadata.schema.petastorm_schema):
        empty_frame = spark.createDataFrame(no_rows_rdd, df_metadata.schema.pyspark_schema)
        empty_frame.coalesce(1).write.mode('overwrite').parquet(target_url)
def append_rows(df_metadata: DataFrameMetadata, rows):
    """Append ``rows`` to the petastorm dataframe described by ``df_metadata``.

    ``rows`` is first turned into a dataframe using the pyspark schema; the RDD of that dataframe is
    then written in append mode inside a ``materialize_dataset`` block.
    """
    spark = Session().get_session()

    # Convert a list of rows to RDD
    pyspark_schema = df_metadata.get_dataframe_pyspark_schema()
    rows_rdd = spark.createDataFrame(rows, pyspark_schema).rdd

    # Use petastorm to append rows
    target_url = df_metadata.get_dataframe_file_url()
    with materialize_dataset(spark, target_url, df_metadata.get_dataframe_petastorm_schema()):
        data_frame = spark.createDataFrame(rows_rdd, pyspark_schema)
        data_frame.coalesce(1).write.mode('append').parquet(target_url)
def append_rows(df_metadata: DataFrameMetadata, rows):
    """Append ``rows`` to the petastorm dataframe stored at ``df_metadata.file_url``.

    Each element of ``rows`` is converted with ``dict_to_spark_row`` against the petastorm schema
    before being written in append mode.
    """
    spark = Session().get_session()
    spark_context = Session().get_context()

    target_url = df_metadata.file_url

    def to_spark_row(row):
        return dict_to_spark_row(df_metadata.schema.petastorm_schema, row)

    # Use petastorm to append rows
    with materialize_dataset(spark, target_url, df_metadata.schema.petastorm_schema):
        # Convert a list of rows to RDD
        rows_rdd = spark_context.parallelize(rows).map(to_spark_row)

        data_frame = spark.createDataFrame(rows_rdd, df_metadata.schema.pyspark_schema)
        data_frame.coalesce(1).write.mode('append').parquet(target_url)
def generate_shuffle_analysis_dataset(spark, output_dataset_url, num_rows=1000, row_group_size=100):
    """
    Generates a small dataset useful for doing analysis on shuffling algorithms

    The dataset has one row per integer in ``range(num_rows)``, sorted by ``id`` and coalesced into
    ``max(1, int(num_rows / row_group_size))`` partitions before it is written uncompressed.

    :param spark: spark session
    :param output_dataset_url: location to write dataset
    :param num_rows: how many rows should the dataset include
    :param row_group_size: how many rows in each row group. NOTE(review): earlier documentation mentions
        a minimum of 5; nothing in this function enforces it.
    :return:
    """
    spark_context = spark.sparkContext
    partition_count = max(1, int(num_rows / row_group_size))
    to_spark_row = partial(dict_to_spark_row, _ShuffleAnalysisSchema)

    with materialize_dataset(spark, output_dataset_url, _ShuffleAnalysisSchema):
        ids_rdd = spark_context.parallelize(range(num_rows), numSlices=50)
        rows_rdd = ids_rdd.map(lambda i: {'id': i}).map(to_spark_row)

        data_frame = spark.createDataFrame(rows_rdd, _ShuffleAnalysisSchema.as_spark_schema())
        data_frame.sort('id') \
            .coalesce(partition_count) \
            .write.option('compression', 'none') \
            .parquet(output_dataset_url)
def generate_petastorm_dataset(output_url='file:///tmp/hello_world_dataset'):
    """Write a 10-row ``HelloWorldSchema`` dataset to ``output_url``, overwriting existing data.

    :param output_url: url of the dataset to create
    """
    rowgroup_size_mb = 256
    rows_count = 10

    spark = SparkSession.builder.config('spark.driver.memory', '2g').master('local[2]').getOrCreate()
    sc = spark.sparkContext

    def to_spark_row(row_dict):
        return dict_to_spark_row(HelloWorldSchema, row_dict)

    # Wrap dataset materialization portion. Will take care of setting up spark environment variables as
    # well as save petastorm specific metadata
    with materialize_dataset(spark, output_url, HelloWorldSchema, rowgroup_size_mb):
        rows_rdd = sc.parallelize(range(rows_count)).map(row_generator).map(to_spark_row)

        data_frame = spark.createDataFrame(rows_rdd, HelloWorldSchema.as_spark_schema())
        data_frame.coalesce(10).write.mode('overwrite').parquet(output_url)
def __init__(self, dataset_name: str, frame_metadata: FrameInfo):
    """Define the frame dataset schema and materialize a 10-row dataset for it.

    The dataset is written (overwriting existing data) to the directory named ``dataset_name`` under
    the configured "core" / "location" directory.

    Arguments:
        dataset_name: name of the dataset; also used as the schema name and output directory name
        frame_metadata: provides height, width and num_channels of the frames
    """
    self.dataset_name = dataset_name
    self.H = frame_metadata.height
    self.W = frame_metadata.width
    self.C = frame_metadata.num_channels

    # The schema defines what a row of the dataset looks like
    frame_id_field = UnischemaField('frame_id', np.int32, (), ScalarCodec(IntegerType()), False)
    frame_data_field = UnischemaField('frame_data', np.uint8, (self.H, self.W, self.C),
                                      CompressedNdarrayCodec(), False)
    self.dataset_schema = Unischema(self.dataset_name, [
        frame_id_field,
        frame_data_field,
    ])

    # Construct output location
    eva_dir = ConfigurationManager().get_value("core", "location")
    output_url = os.path.join(eva_dir, self.dataset_name)

    # Get session handle
    session = Session()
    spark = session.get_session()
    spark_context = session.get_context()

    # Wrap dataset materialization portion.
    rows_count = 10
    with materialize_dataset(spark, output_url, self.dataset_schema):
        generated_rdd = spark_context.parallelize(range(rows_count)) \
            .map(lambda x: row_generator(x, self.H, self.W, self.C))
        rows_rdd = generated_rdd.map(lambda x: dict_to_spark_row(self.dataset_schema, x))

        data_frame = spark.createDataFrame(rows_rdd, self.dataset_schema.as_spark_schema())
        data_frame.coalesce(10).write.mode('overwrite').parquet(output_url)
def save_parquet(spark, df, path, schema):
    """Write ``df`` as parquet inside a ``materialize_dataset`` block, overwriting existing output.

    :param spark: spark session handed to ``materialize_dataset``
    :param df: dataframe to write
    :param path: object providing ``absolute().as_uri()`` (NOTE(review): presumably a ``pathlib.Path``)
    :param schema: schema handed to ``materialize_dataset``
    """
    target_uri = path.absolute().as_uri()
    with materialize_dataset(spark, target_uri, schema, row_group_size_mb=256):
        writer = df.write.mode('overwrite')
        writer.parquet(target_uri)
def _create_dataset(store, df, validation, compress_sparse, num_partitions, num_workers, dataset_idx,
                    parquet_row_group_size_mb, verbose):
    """Split ``df`` into train / validation parts and write each as a petastorm dataset.

    Steps:
      1. Select the dataframe columns, adding the ``validation`` column name when it is a string that is
         not already among them.
      2. Convert vector columns through ``to_petastorm_fn`` when the dataframe has any.
      3. Split with ``_train_val_split`` and build a Unischema from the train dataframe metadata.
      4. Write the train dataset, and the validation dataset when there is one.

    :param store: provides the train / val data paths for ``dataset_idx``
    :param df: Spark dataframe to materialize
    :param validation: forwarded to ``_train_val_split``; when a string it is treated as a column name
    :param compress_sparse: when true, metadata is collected before the vector columns are converted
    :param num_partitions: partitions are divided between train and val according to the validation ratio
    :param num_workers: lower bound for the partition count of each written dataset
    :param dataset_idx: index forwarded to ``store`` and ``get_simple_meta_from_parquet``
    :param parquet_row_group_size_mb: row group size handed to ``materialize_dataset``
    :param verbose: progress is printed when ``>= 1``
    :return: ``(train_rows, val_rows, pq_metadata, avg_row_size)`` from ``get_simple_meta_from_parquet``
    :raises ValueError: if a validation dataframe exists but the written validation data has no rows
    """
    def timestamp():
        return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    train_data_path = store.get_train_data_path(dataset_idx)
    val_data_path = store.get_val_data_path(dataset_idx)
    if verbose >= 1:
        print('CEREBRO => Time: {}, Writing DataFrames'.format(timestamp()))
        print('CEREBRO => Time: {}, Train Data Path: {}'.format(timestamp(), train_data_path))
        print('CEREBRO => Time: {}, Val Data Path: {}'.format(timestamp(), val_data_path))

    schema_cols = list(df.columns)
    # Only add the validation column when it is missing: appending a name that is already in
    # df.columns would select the same column twice.
    if isinstance(validation, str) and validation not in schema_cols:
        schema_cols.append(validation)
    df = df[schema_cols]

    metadata = None
    if _has_vector_column(df):
        if compress_sparse:
            metadata = _get_metadata(df)
        to_petastorm = to_petastorm_fn(schema_cols, metadata)
        df = df.rdd.map(to_petastorm).toDF()

    train_df, val_df, validation_ratio = _train_val_split(df, validation)

    # Describe every column of the train dataframe as a non-nullable Unischema field
    unischema_fields = []
    metadata = _get_metadata(train_df)
    for k in metadata.keys():
        field_type = spark_to_petastorm_type(metadata[k]['spark_data_type'])
        shape = petastorm_unischema_shape(metadata[k]['shape'])
        codec = petastorm_unischema_codec(metadata[k]['shape'], metadata[k]['spark_data_type'])
        unischema_fields.append(UnischemaField(k, field_type, shape, codec, False))

    petastorm_schema = Unischema('petastorm_schema', unischema_fields)

    train_partitions = max(int(num_partitions * (1.0 - validation_ratio)), num_workers)
    if verbose >= 1:
        print('CEREBRO => Time: {}, Train Partitions: {}'.format(timestamp(), train_partitions))

    spark = SparkSession.builder.getOrCreate()
    _write_petastorm_dataset(spark, train_df, train_data_path, petastorm_schema, metadata,
                             train_partitions, parquet_row_group_size_mb)

    if val_df:
        val_partitions = max(int(num_partitions * validation_ratio), num_workers)
        if verbose >= 1:
            print('CEREBRO => Time: {}, Val Partitions: {}'.format(timestamp(), val_partitions))

        _write_petastorm_dataset(spark, val_df, val_data_path, petastorm_schema, metadata,
                                 val_partitions, parquet_row_group_size_mb)

    train_rows, val_rows, pq_metadata, avg_row_size = get_simple_meta_from_parquet(
        store, df.columns, dataset_idx)

    if verbose:
        print('CEREBRO => Time: {}, Train Rows: {}'.format(timestamp(), train_rows))
    if val_df:
        if val_rows == 0:
            raise ValueError(
                'Validation DataFrame does not any samples with validation param {}'
                .format(validation))
        if verbose:
            print('CEREBRO => Time: {}, Val Rows: {}'.format(timestamp(), val_rows))

    return train_rows, val_rows, pq_metadata, avg_row_size


def _write_petastorm_dataset(spark, df, data_path, petastorm_schema, metadata, partitions,
                             parquet_row_group_size_mb):
    """Write ``df`` to ``data_path`` (overwriting) as a petastorm dataset with ``petastorm_schema``."""
    # FIXME pass hdfs_driver from user interface instead of hardcoded PETASTORM_HDFS_DRIVER
    resolver = FilesystemResolver(
        data_path,
        spark.sparkContext._jsc.hadoopConfiguration(),
        user=spark.sparkContext.sparkUser(),
        hdfs_driver=constants.PETASTORM_HDFS_DRIVER)

    def to_numpy_dict(row_dict):
        # Convert every value to a numpy array of the petastorm type recorded for its column
        return {k: np.array(row_dict[k], dtype=spark_to_petastorm_type(metadata[k]['spark_data_type']))
                for k in row_dict}

    with materialize_dataset(
            spark,
            data_path,
            petastorm_schema,
            parquet_row_group_size_mb,
            filesystem_factory=resolver.filesystem_factory()):
        rows_rdd = df.rdd.map(lambda x: x.asDict()) \
            .map(to_numpy_dict) \
            .map(lambda x: dict_to_spark_row(petastorm_schema, x))

        spark.createDataFrame(rows_rdd, petastorm_schema.as_spark_schema()) \
            .coalesce(partitions) \
            .write \
            .mode('overwrite') \
            .parquet(data_path)
def imagenet_directory_to_petastorm_dataset(imagenet_path, output_url, spark_master=None, parquet_files_count=100,
                                            noun_id_to_text=None):
    """Converts a directory with imagenet data into a petastorm dataset.

    Expected directory format is:

    >>> nXXXXXXXX/
    >>>    *.JPEG

    >>> nZZZZZZZZ/
    >>>    *.JPEG

    :param imagenet_path: a path to the directory containing ``n*/`` subdirectories. If you are running this script on
      a Spark cluster, you should have this directory be mounted and accessible to executors.
    :param output_url: the location where your dataset will be written to. Should be a url: either
      ``file://...`` or ``hdfs://...``
    :param spark_master: A master parameter used by spark session builder. Use default value (``None``) to use system
      environment configured spark cluster. Use ``local[*]`` to run on a local box.
    :param parquet_files_count: Number of partitions the data is coalesced into before it is written out.
    :param noun_id_to_text: A dictionary: ``{noun_id : text}``. If ``None``, the dictionary is obtained by calling
      ``download_nouns_mapping()``.
    :return: ``None``
    :raises RuntimeError: if ``imagenet_path`` contains an entry whose name does not start with ``n``.
    """
    session_builder = SparkSession \
        .builder \
        .appName('Imagenet Dataset Creation') \
        .config('spark.executor.memory', '10g') \
        .config('spark.driver.memory', '10g')  # Increase the memory if running locally with high number of executors
    if spark_master:
        # Keep the builder returned by ``master`` instead of relying on in-place mutation.
        session_builder = session_builder.master(spark_master)

    spark = session_builder.getOrCreate()
    sc = spark.sparkContext

    # Get a list of noun_ids
    noun_ids = os.listdir(imagenet_path)
    if not all(noun_id.startswith('n') for noun_id in noun_ids):
        raise RuntimeError('Directory {} expected to contain only subdirectories with name '
                           'starting with "n".'.format(imagenet_path))

    if not noun_id_to_text:
        noun_id_to_text = download_nouns_mapping()

    ROWGROUP_SIZE_MB = 256
    with materialize_dataset(spark, output_url, ImagenetSchema, ROWGROUP_SIZE_MB):
        # list of [(nXXXX, 'noun-text'), ...]
        noun_id_text_list = [(noun_id, noun_id_to_text[noun_id]) for noun_id in noun_ids]

        # Roughly one partition per ten noun directories, capped at 10000. Integer division keeps
        # the partition count an int under Python 3 (true division would yield a float).
        partitions_count = min(len(noun_ids) // 10 + 1, 10000)

        # rdd of [(nXXXX, 'noun-text', path), ...]
        noun_id_text_image_path_rdd = sc.parallelize(noun_id_text_list, partitions_count) \
            .flatMap(lambda word_id_label:
                     [word_id_label + (image_path,) for image_path in
                      glob.glob(os.path.join(imagenet_path, word_id_label[0], '*.JPEG'))])

        # rdd of [(nXXXX, 'noun-text', image), ...]
        # NOTE(review): cv2.imread presumably yields None for unreadable files - confirm that
        # dict_to_spark_row rejects or handles such rows as intended.
        noun_id_text_image_rdd = noun_id_text_image_path_rdd \
            .map(lambda id_word_image_path:
                 {ImagenetSchema.noun_id.name: id_word_image_path[0],
                  ImagenetSchema.text.name: id_word_image_path[1],
                  ImagenetSchema.image.name: cv2.imread(id_word_image_path[2])})

        # Convert to pyspark.sql.Row
        sql_rows_rdd = noun_id_text_image_rdd.map(lambda r: dict_to_spark_row(ImagenetSchema, r))

        # Write out the result
        spark.createDataFrame(sql_rows_rdd, ImagenetSchema.as_spark_schema()) \
            .coalesce(parquet_files_count) \
            .write \
            .mode('overwrite') \
            .option('compression', 'none') \
            .parquet(output_url)
def main():
    """Convert pre-processed HDF5 inputs into Petastorm train and validation datasets.

    The input location is listed (via ``hdfs dfs -ls`` for ``hdfs://`` locations, via
    ``os.listdir`` for ``file://`` locations), the entries are shuffled and split by
    ``TRAIN_FRACTION``, and each split is loaded with ``load_h5`` and written as a Petastorm
    dataset under ``OUTPUT_PATH/train_data`` and ``OUTPUT_PATH/val_data``.

    :raises Exception: if ``H5_PRE_PROCESSED_DATA_DIR`` uses neither the ``hdfs://`` nor the
        ``file://`` scheme.
    """
    SPARK_MASTER_URL = 'spark://...'  # Change the Spark master URL.
    H5_PRE_PROCESSED_DATA_DIR = 'file://...'  # Change pre-processed data input path. Should be accessible from all Spark workers.
    OUTPUT_PATH = 'file:///...'  # Change Petastorm output path. Should be accessible from all Spark workers.
    TRAIN_FRACTION = 0.7  # Fraction of train data. Remaining is validation data.
    ROW_GROUP_SIZE_MB = 512  # Size of Parquet row group size.
    NUM_PARTITIONS = 100  # Number of Parquet partitions for train and val data each.

    spark = SparkSession \
        .builder \
        .master(SPARK_MASTER_URL) \
        .appName("Deep Postures Example - Petastorm Data Generation") \
        .getOrCreate()

    input_data = []
    if H5_PRE_PROCESSED_DATA_DIR.startswith('hdfs://'):
        # Run the listing without a shell; text mode makes the output ``str`` rather than bytes.
        proc = subprocess.Popen(['hdfs', 'dfs', '-ls', H5_PRE_PROCESSED_DATA_DIR],
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                universal_newlines=True)
        s_output, _ = proc.communicate()
        for line in s_output.splitlines():
            fields = line.split()
            # The path is the 8th whitespace-separated field of a listing line; shorter lines
            # (such as the "Found N items" header) carry no path.
            if len(fields) >= 8:
                # NOTE(review): assumes the listing prints paths without the scheme - confirm.
                input_data.append('hdfs://' + fields[7])
    elif H5_PRE_PROCESSED_DATA_DIR.startswith('file://'):
        # os.listdir needs a plain filesystem path, so strip the URL scheme for the listing only;
        # the collected entries keep the full URL form.
        local_dir = H5_PRE_PROCESSED_DATA_DIR[len('file://'):]
        for dirname in os.listdir(local_dir):
            if not dirname.startswith('.'):  # skip hidden entries
                input_data.append(os.path.join(H5_PRE_PROCESSED_DATA_DIR, dirname))
    else:
        raise Exception('Unsupported file system in: {}'.format(H5_PRE_PROCESSED_DATA_DIR))

    random.shuffle(input_data)
    n_train = int(len(input_data) * TRAIN_FRACTION)
    train_data = input_data[:n_train]
    val_data = input_data[n_train:]

    # NOTE(review): ``backend`` and ``store`` are not used below; they are still constructed in
    # case their constructors have side effects - confirm and remove if not needed.
    backend = SparkBackend(spark_context=spark.sparkContext)
    store = LocalStore(OUTPUT_PATH, train_path=os.path.join(OUTPUT_PATH, 'train_data'),
                       val_path=os.path.join(OUTPUT_PATH, 'val_data'))

    schema = Unischema('schema', [
        UnischemaField('id', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('time', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('data', np.float32, (100, 3), NdarrayCodec(), False),
        UnischemaField('non_wear', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('sleeping', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('label', np.int32, (), ScalarCodec(IntegerType()), False)
    ])

    def write_split(paths, split_name):
        # Load every input in ``paths`` and write the rows to ``OUTPUT_PATH/<split_name>``.
        split_url = os.path.join(OUTPUT_PATH, split_name)
        with materialize_dataset(spark, split_url, schema, ROW_GROUP_SIZE_MB):
            rdd = spark.sparkContext.parallelize(paths)
            rdd = rdd.flatMap(lambda x: load_h5(x)) \
                .map(lambda item: {'id': item[0], 'time': item[1], 'data': item[2],
                                   'non_wear': item[3], 'sleeping': item[4], 'label': item[5]})
            rdd = rdd.map(lambda x: dict_to_spark_row(schema, x))

            df = spark.createDataFrame(rdd, schema=schema.as_spark_schema())
            df.orderBy("id", "time") \
                .coalesce(NUM_PARTITIONS) \
                .write \
                .mode('overwrite') \
                .parquet(split_url)

    write_split(train_data, 'train_data')
    write_split(val_data, 'val_data')


if __name__ == "__main__":
    main()
def copy_dataset(spark, source_url, target_url, field_regex, not_null_fields, overwrite_output, partitions_count,
                 row_group_size_mb, hdfs_driver='libhdfs3'):
    """
    Creates a copy of a dataset. A new dataset will optionally contain a subset of columns. Rows that have NULL
    values in fields defined by ``not_null_fields`` argument are filtered out.

    The target filesystem is resolved using the Spark context's Hadoop configuration and Spark user.

    :param spark: An instance of ``SparkSession`` object
    :param source_url: A url of the dataset to be copied.
    :param target_url: A url specifying location of the target dataset.
    :param field_regex: A list of regex patterns. Only columns that match one of these patterns are copied to the new
      dataset.
    :param not_null_fields: A list of fields that must have non-NULL values in the target dataset.
    :param overwrite_output: If ``False`` and there is an existing path defined by ``target_url``, the operation will
      fail.
    :param partitions_count: If truthy, the dataset is repartitioned before write. Number of files in the target
      Parquet store is defined by this parameter.
    :param row_group_size_mb: The size of the rowgroup in the target dataset. Specified in megabytes.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
      libhdfs (java through JNI) or libhdfs3 (C++)
    :return: None
    :raises ValueError: if ``field_regex`` is given but matches none of the fields of the source schema.
    """
    schema = get_schema_from_dataset_url(source_url, hdfs_driver=hdfs_driver)

    fields = match_unischema_fields(schema, field_regex)

    if field_regex and not fields:
        field_names = list(schema.fields.keys())
        # Format the message explicitly: exceptions do not interpolate logging-style arguments.
        raise ValueError('Regular expressions (%s) do not match any fields (%s)'
                         % (str(field_regex), str(field_names)))

    # Copy only the matched columns; without a match list the full schema is copied.
    if fields:
        subschema = schema.create_schema_view(fields)
    else:
        subschema = schema

    resolver = FilesystemResolver(target_url, spark.sparkContext._jsc.hadoopConfiguration(),
                                  hdfs_driver=hdfs_driver, user=spark.sparkContext.sparkUser())
    with materialize_dataset(spark, target_url, subschema, row_group_size_mb,
                             filesystem_factory=resolver.filesystem_factory()):
        data_frame = spark.read \
            .parquet(source_url)

        if fields:
            data_frame = data_frame.select(*[f.name for f in fields])

        if not_null_fields:
            # Keep a row only when every listed field is non-NULL.
            not_null_condition = reduce(operator.__and__, (data_frame[f].isNotNull() for f in not_null_fields))
            data_frame = data_frame.filter(not_null_condition)

        if partitions_count:
            data_frame = data_frame.repartition(partitions_count)

        data_frame.write \
            .mode('overwrite' if overwrite_output else 'error') \
            .option('compression', 'none') \
            .parquet(target_url)