def test_dict_to_spark_row_field_validation_ndarrays(self):
    """Test various validations done on data types when converting a dictionary to a spark row"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('tensor3d', np.float32, (10, 20, 30), NdarrayCodec(), False),
    ])

    self.assertTrue(isinstance(
        dict_to_spark_row(TestSchema, {'tensor3d': np.zeros((10, 20, 30), dtype=np.float32)}), Row))

    # Null value into not nullable field
    with self.assertRaises(ValueError):
        isinstance(dict_to_spark_row(TestSchema, {'tensor3d': None}), Row)

    # Wrong dimensions
    with self.assertRaises(ValueError):
        isinstance(dict_to_spark_row(TestSchema, {'tensor3d': np.zeros((1, 2, 3), dtype=np.float32)}), Row)
def generate_parquet(feature_path, mask_path, output_path):
    """Generate a parquet file with two columns:

    - First column: np_array representing an image
    - Second column: np_array representing the corresponding mask

    Arguments:
        feature_path -- path to all images
        mask_path -- path to masks of the images
        output_path -- output parquet path
    """
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession

    rowgroup_size_mb = 256
    spark_conf = SparkConf().setAppName('Image preprocess')
    sc = SparkContext(conf=spark_conf)
    session = SparkSession(sc)

    # Load images, pair each one with an index (used as the join key below) and convert them to a dataframe
    images_rdd = sc.binaryFiles(feature_path).values().zipWithIndex()
    image_flat_numpy_rdd = images_rdd \
        .map(lambda pair_raw_image_id: (raw_image_to_numpy_array(pair_raw_image_id[0]), pair_raw_image_id[1])) \
        .map(lambda pair_np_array_id: {'features': pair_np_array_id[0], 'id': pair_np_array_id[1]}) \
        .map(lambda x: dict_to_spark_row(FeatureSchema, x))

    image_df = session.createDataFrame(image_flat_numpy_rdd, FeatureSchema.as_spark_schema())

    # Load masks, pair each one with an index and convert them to a dataframe.
    # Convert mask rgb value to 0 for not building and 1 for building.
    mask_rdd = sc.binaryFiles(mask_path).values().zipWithIndex()
    mask_flat_numpy_rdd = mask_rdd \
        .map(lambda pair_raw_image_id: (raw_image_to_numpy_array(pair_raw_image_id[0]), pair_raw_image_id[1])) \
        .map(lambda pair_np_array_id: ((pair_np_array_id[0] / 255).astype(np.uint8), pair_np_array_id[1])) \
        .map(lambda pair_std_np_array_id: {'masks': pair_std_np_array_id[0], 'id': pair_std_np_array_id[1]}) \
        .map(lambda x: dict_to_spark_row(MaskSchema, x))

    mask_df = session.createDataFrame(mask_flat_numpy_rdd, MaskSchema.as_spark_schema())
    mask_df.show(5, False)

    # Join image_df and mask_df row by row on the shared id
    train_df = image_df.join(mask_df, "id", "inner").drop('id')

    with materialize_dataset(session, output_path, TrainSchema, rowgroup_size_mb):
        train_df.write \
            .mode('overwrite') \
            .parquet(output_path)
def generate_parquet(feature_path, mask_path, output_path):
    """Generate a parquet file with two columns:

    - First column: np_array representing an image
    - Second column: np_array representing the corresponding mask

    Arguments:
        feature_path -- path to all images
        mask_path -- path to masks of the images
        output_path -- output parquet path
    """
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import monotonically_increasing_id

    rowgroup_size_mb = 256
    spark_conf = SparkConf().setAppName('Image preprocess')
    sc = SparkContext(conf=spark_conf)
    session = SparkSession(sc)

    # Load images and convert them to a dataframe
    images_rdd = sc.binaryFiles(feature_path)
    image_flat_numpy_rdd = images_rdd.values().map(raw_image_to_numpy_array) \
        .map(lambda x: {'features': x}) \
        .map(lambda x: dict_to_spark_row(FeatureSchema, x))

    image_df = session.createDataFrame(image_flat_numpy_rdd, FeatureSchema.as_spark_schema()) \
        .withColumn("id", monotonically_increasing_id())  # Generate table row id

    # Load masks and convert them to a dataframe.
    # Convert mask rgb value to 0 for not building and 1 for building.
    mask_rdd = sc.binaryFiles(mask_path)
    mask_flat_numpy_rdd = mask_rdd.values().map(raw_image_to_numpy_array) \
        .map(lambda image_np_array: (image_np_array / 255).astype(np.uint8)) \
        .map(lambda x: {'masks': x}) \
        .map(lambda x: dict_to_spark_row(MaskSchema, x))

    mask_df = session.createDataFrame(mask_flat_numpy_rdd, MaskSchema.as_spark_schema()) \
        .withColumn("id", monotonically_increasing_id())  # Generate table row id

    # Join image_df and mask_df row by row on the generated id
    train_df = image_df.join(mask_df, "id", "outer").drop("id")

    with materialize_dataset(session, output_path, TrainSchema, rowgroup_size_mb):
        train_df.write \
            .mode('overwrite') \
            .parquet(output_path)
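Not part of the original snippets: a minimal read-back sketch for the dataset written by generate_parquet above. It assumes output_path points at the materialized Petastorm dataset and that TrainSchema exposes 'features' and 'masks' fields; it only shows petastorm's make_reader iterating decoded rows.

# Hedged sketch (assumed path and field names): inspect the dataset written by generate_parquet.
from petastorm import make_reader

def inspect_train_dataset(output_path='file:///tmp/train_dataset'):  # hypothetical example path
    with make_reader(output_path) as reader:
        for row in reader:
            # Each row is a namedtuple whose attributes follow TrainSchema ('features', 'masks' assumed)
            print(row.features.shape, row.masks.shape)
            break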
def test_dict_to_spark_row_field_validation_scalar_types():
    """Test various validations done on data types when converting a dictionary to a spark row"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    assert isinstance(dict_to_spark_row(TestSchema, {'string_field': 'abc'}), Row)

    # Not a nullable field
    with pytest.raises(ValueError):
        isinstance(dict_to_spark_row(TestSchema, {'string_field': None}), Row)

    # Wrong field type
    with pytest.raises(TypeError):
        isinstance(dict_to_spark_row(TestSchema, {'string_field': []}), Row)
def test_serialize_filesystem_factory(tmpdir):
    SimpleSchema = Unischema('SimpleSchema', [
        UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('foo', np.int32, (), ScalarCodec(IntegerType()), False),
    ])

    class BogusFS(pyarrow.LocalFileSystem):
        def __getstate__(self):
            raise RuntimeError("can not serialize")

    rows_count = 10
    output_url = "file://{0}/fs_factory_test".format(tmpdir)
    rowgroup_size_mb = 256
    spark = SparkSession.builder.config('spark.driver.memory', '2g').master('local[2]').getOrCreate()
    sc = spark.sparkContext
    with materialize_dataset(spark, output_url, SimpleSchema, rowgroup_size_mb, filesystem_factory=BogusFS):
        rows_rdd = sc.parallelize(range(rows_count))\
            .map(lambda x: {'id': x, 'foo': x})\
            .map(lambda x: dict_to_spark_row(SimpleSchema, x))

        spark.createDataFrame(rows_rdd, SimpleSchema.as_spark_schema()) \
            .write \
            .parquet(output_url)
def test_predicate_on_dataset(tmpdir):
    TestSchema = Unischema('TestSchema', [
        UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('test_field', np.int32, (), ScalarCodec(IntegerType()), False),
    ])

    def test_row_generator(x):
        """Returns a single entry in the generated dataset."""
        return {'id': x, 'test_field': x * x}

    blocklet_size_mb = 256
    dataset_url = "file://{0}/partitioned_test_dataset".format(tmpdir)
    spark = SparkSession.builder.config('spark.driver.memory', '2g').master('local[2]').getOrCreate()
    sc = spark.sparkContext

    rows_count = 10
    with materialize_dataset_carbon(spark, dataset_url, TestSchema, blocklet_size_mb):
        rows_rdd = sc.parallelize(range(rows_count)) \
            .map(test_row_generator) \
            .map(lambda x: dict_to_spark_row(TestSchema, x))

        spark.createDataFrame(rows_rdd, TestSchema.as_spark_schema()) \
            .write \
            .save(path=dataset_url, format='carbon')

    with make_carbon_reader(dataset_url, predicate=in_lambda(['id'], lambda x: x == 3)) as reader:
        assert next(reader).id == 3

    with make_carbon_reader(dataset_url, predicate=in_lambda(['id'], lambda x: x == '3')) as reader:
        with pytest.raises(StopIteration):
            # Predicate should have selected none, so a StopIteration should be raised.
            next(reader)
def change_df_schema(spark, df, schema):
    rows_rdd = (
        df.rdd
        .map(row_generator)
        .map(lambda x: dict_to_spark_row(schema, x)))

    df = spark.createDataFrame(rows_rdd, schema.as_spark_schema())
    return df
def write(self, table: DataFrameMetadata, rows: Batch):
    """
    Write rows into the dataframe.

    Arguments:
        table: table metadata object to write into
        rows: batch to be persisted in the storage.
    """
    if rows.empty():
        return
    # ToDo: throw an error if the row schema doesn't match the table schema

    with materialize_dataset(self.spark_session,
                             self._spark_url(table),
                             table.schema.petastorm_schema):

        records = rows.frames
        columns = records.keys()
        rows_rdd = self.spark_context.parallelize(records.values) \
            .map(lambda x: dict(zip(columns, x))) \
            .map(lambda x: dict_to_spark_row(table.schema.petastorm_schema, x))
        self.spark_session.createDataFrame(rows_rdd, table.schema.pyspark_schema) \
            .coalesce(1) \
            .write \
            .mode('append') \
            .parquet(self._spark_url(table))
def test_dict_to_spark_row_field_validation_scalar_nullable():
    """Test various validations done on data types when converting a dictionary to a spark row"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), True),
        UnischemaField('nullable_implicitly_set', np.string_, (), ScalarCodec(StringType()), True),
    ])

    assert isinstance(dict_to_spark_row(TestSchema, {'string_field': None}), Row)
def mnist_data_to_petastorm_dataset(download_dir, output_url, spark_master=None, parquet_files_count=1,
                                    mnist_data=None):
    """Converts a directory with MNIST data into a petastorm dataset.

    Data files are as specified in http://yann.lecun.com/exdb/mnist/:
        * train-images-idx3-ubyte.gz:  training set images (9912422 bytes)
        * train-labels-idx1-ubyte.gz:  training set labels (28881 bytes)
        * t10k-images-idx3-ubyte.gz:   test set images (1648877 bytes)
        * t10k-labels-idx1-ubyte.gz:   test set labels (4542 bytes)

    The images and labels are stored in the IDX file format for vectors and multidimensional matrices of
    various numerical types, as defined in the same URL.

    :param download_dir: the path to where the MNIST data will be downloaded.
    :param output_url: the location where your dataset will be written to. Should be a url: either
      file://... or hdfs://...
    :param spark_master: A master parameter used by spark session builder. Use default value (None) to use system
      environment configured spark cluster. Use 'local[*]' to run on a local box.
    :param parquet_files_count: the number of parquet files the output of each dataset is coalesced into.
    :param mnist_data: A dictionary of MNIST data, with name of dataset as key, and the dataset object as value;
      if None is supplied, download it.
    :return: None
    """
    session_builder = SparkSession \
        .builder \
        .appName('MNIST Dataset Creation')
    if spark_master:
        session_builder.master(spark_master)

    spark = session_builder.getOrCreate()

    # Get training and test data
    if mnist_data is None:
        mnist_data = {
            'train': download_mnist_data(download_dir, train=True),
            'test': download_mnist_data(download_dir, train=False)
        }

    # The MNIST data is small enough to do everything here in Python
    for dset, data in mnist_data.items():
        dset_output_url = '{}/{}'.format(output_url, dset)
        with materialize_dataset(spark, dset_output_url, MnistSchema):
            # List of [(idx, image, digit), ...]
            # where image is shaped as a 28x28 numpy matrix
            idx_image_digit_list = map(lambda idx_image_digit: {
                MnistSchema.idx.name: idx_image_digit[0],
                MnistSchema.digit.name: idx_image_digit[1][1],
                MnistSchema.image.name: np.array(list(idx_image_digit[1][0].getdata()), dtype=np.uint8).reshape(28, 28)
            }, enumerate(data))

            # Convert to pyspark.sql.Row
            sql_rows = map(lambda r: dict_to_spark_row(MnistSchema, r), idx_image_digit_list)

            # Write out the result
            spark.createDataFrame(sql_rows, MnistSchema.as_spark_schema()) \
                .coalesce(parquet_files_count) \
                .write \
                .option('compression', 'none') \
                .parquet(dset_output_url)
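A minimal read-back sketch for the MNIST dataset generated above, not from the original source. The URL 'file:///tmp/mnist/train' is only an assumed example of '<output_url>/train'; the field names idx, digit and image come from MnistSchema as used in the function.

from petastorm import make_reader

# Assumed example location: '<output_url>/train' as written by mnist_data_to_petastorm_dataset
with make_reader('file:///tmp/mnist/train') as reader:
    sample = next(reader)
    # The image field decodes back to a 28x28 uint8 numpy array
    print(sample.idx, sample.digit, sample.image.shape)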
def test_row_decoding():
    expected_row = _rand_row()
    encoded_row = dict_to_spark_row(TestSchema, expected_row).asDict()
    decoder = RowDecoder(TestSchema, None)
    actual_row = decoder.decode(encoded_row)._asdict()

    # Out-of-the-box pytest `assert actual_row == expected_row` can not compare dictionaries properly
    np.testing.assert_equal(actual_row, expected_row)
def test_dict_to_spark_row_order():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('float_col', np.float64, ()),
        UnischemaField('int_col', np.int64, ()),
    ])
    row_dict = {
        TestSchema.int_col.name: 3,
        TestSchema.float_col.name: 2.0,
    }
    spark_row = dict_to_spark_row(TestSchema, row_dict)
    schema_field_names = list(TestSchema.fields)
    assert spark_row[0] == row_dict[schema_field_names[0]]
    assert spark_row[1] == row_dict[schema_field_names[1]]
def change_df_schema(spark: SparkSession,
                     schema: Unischema,
                     df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:
    rows_rdd = (
        df
        .rdd
        .map(row_generator)
        .map(lambda x: dict_to_spark_row(schema, x))
    )

    df = spark.createDataFrame(
        rows_rdd,
        schema.as_spark_schema()
    )

    return df
def test_predicate_on_partitioned_dataset(tmpdir):
    """
    Generates a partitioned dataset and ensures that readers evaluate the type of the partition
    column according to the type given in the Unischema.
    """
    TestSchema = Unischema('TestSchema', [
        UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('id2', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('test_field', np.int32, (), ScalarCodec(IntegerType()), False),
    ])

    def test_row_generator(x):
        """Returns a single entry in the generated dataset."""
        return {'id': x, 'id2': x + 1, 'test_field': x * x}

    rowgroup_size_mb = 256
    dataset_url = "file://{0}/partitioned_test_dataset".format(tmpdir)

    spark = SparkSession.builder.config('spark.driver.memory', '2g').master('local[2]').getOrCreate()
    sc = spark.sparkContext

    rows_count = 10
    with materialize_dataset(spark, dataset_url, TestSchema, rowgroup_size_mb):
        rows_rdd = sc.parallelize(range(rows_count))\
            .map(test_row_generator)\
            .map(lambda x: dict_to_spark_row(TestSchema, x))

        spark.createDataFrame(rows_rdd, TestSchema.as_spark_schema()) \
            .write \
            .partitionBy('id', 'id2') \
            .parquet(dataset_url)

    with make_reader(dataset_url, predicate=in_lambda(['id'], lambda x: x == 3)) as reader:
        assert next(reader).id == 3

    with make_reader(dataset_url, predicate=in_lambda(['id2'], lambda x: x == 5)) as reader:
        assert next(reader).id2 == 5

    with make_reader(dataset_url, predicate=in_lambda(['id'], lambda x: x == '3')) as reader:
        with pytest.raises(StopIteration):
            # Predicate should have selected none, so a StopIteration should be raised.
            next(reader)
def generate_dataset(data_directory, sensor, output_url, year=2018, max_files=100000, dayofyear=None):
    """
    Write L1b patches to petastorm database for training

    Args:
        data_directory: directory of L1b data
        sensor: Select sensor from (G16, G17, H8)
        output_url: Directory to write petastorm database (file:///...)
        year: Integer (depending on directory, 2017-2020)
        max_files: Maximum number of files to iterate over
        dayofyear: 1-366
    Returns:
        None
    """
    rowgroup_size_mb = 256
    spark = SparkSession.builder.config('spark.driver.memory', '2g').master('local[4]').getOrCreate()
    sc = spark.sparkContext

    geo = geonexl1b.GeoNEXL1b(data_directory=data_directory, sensor=sensor)
    tiles = geo.tiles()

    files = geo.files(year=year, dayofyear=dayofyear)
    files['v'] = files['tile'].map(lambda t: int(t[4:6]))
    files['h'] = files['tile'].map(lambda t: int(t[1:3]))

    idxs = np.random.randint(0, files.shape[0], max_files)
    files = files.iloc[idxs]
    files = files.reset_index()

    with materialize_dataset(spark, output_url, L1bSchema, rowgroup_size_mb):
        filerdd = spark.createDataFrame(files)\
            .select("year", "dayofyear", "hour", "minute", "v", "h", "file")\
            .rdd.map(tuple)\
            .flatMap(sample_generator)\
            .map(lambda x: dict_to_spark_row(L1bSchema, x))

        spark.createDataFrame(filerdd, L1bSchema.as_spark_schema())\
            .coalesce(50) \
            .write \
            .mode('overwrite') \
            .parquet(output_url)
def append_rows(df_metadata: DataFrameMetadata, rows):
    spark = Session().get_session()
    spark_context = Session().get_context()

    # Use petastorm to append rows
    with materialize_dataset(spark,
                             df_metadata.file_url,
                             df_metadata.schema.petastorm_schema):

        # Convert a list of rows to an RDD
        rows_rdd = spark_context.parallelize(rows) \
            .map(lambda x: dict_to_spark_row(df_metadata.schema.petastorm_schema, x))

        spark.createDataFrame(rows_rdd, df_metadata.schema.pyspark_schema) \
            .coalesce(1) \
            .write \
            .mode('append') \
            .parquet(df_metadata.file_url)
def generate_benchmark_dataset(output_url='file:///tmp/benchmark_dataset'):
    """Creates an example dataset at output_url in Carbon format"""
    blocklet_size_mb = 256

    spark = SparkSession.builder \
        .master('local[2]') \
        .getOrCreate()
    sc = spark.sparkContext

    rows_count = ROW_COUNT
    with materialize_dataset_carbon(spark, output_url, BenchmarkSchema, blocklet_size_mb):
        rows_rdd = sc.parallelize(range(rows_count)) \
            .map(row_generator) \
            .map(lambda x: dict_to_spark_row(BenchmarkSchema, x))

        spark.createDataFrame(rows_rdd, BenchmarkSchema.as_spark_schema()) \
            .write \
            .mode('overwrite') \
            .save(path=output_url, format='carbon')
def generate_pycarbon_dataset(output_url='file:///tmp/carbon_pycarbon_dataset'):
    blocklet_size_mb = 256

    spark = SparkSession.builder.config('spark.driver.memory', '2g').master('local[2]').getOrCreate()
    sc = spark.sparkContext

    # Wrap dataset materialization portion. Will take care of setting up spark environment variables as
    # well as saving pycarbon specific metadata
    rows_count = 10
    with materialize_dataset_carbon(spark, output_url, HelloWorldSchema, blocklet_size_mb):
        rows_rdd = sc.parallelize(range(rows_count)) \
            .map(row_generator) \
            .map(lambda x: dict_to_spark_row(HelloWorldSchema, x))

        spark.createDataFrame(rows_rdd, HelloWorldSchema.as_spark_schema()) \
            .coalesce(10) \
            .write \
            .mode('overwrite') \
            .save(path=output_url, format='carbon')
def __init__(self, dataset_name: str, frame_metadata: FrameInfo):
    self.dataset_name = dataset_name
    self.H = frame_metadata.height
    self.W = frame_metadata.width
    self.C = frame_metadata.num_channels

    # The Unischema defines the layout of the dataset
    self.dataset_schema = Unischema(self.dataset_name, [
        UnischemaField('frame_id', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('frame_data', np.uint8, (self.H, self.W, self.C), CompressedNdarrayCodec(), False),
    ])

    # Construct output location
    eva_dir = ConfigurationManager().get_value("core", "location")
    output_url = os.path.join(eva_dir, self.dataset_name)

    # Get session handle
    session = Session()
    spark = session.get_session()
    spark_context = session.get_context()

    # Wrap the dataset materialization portion.
    rows_count = 10
    with materialize_dataset(spark, output_url, self.dataset_schema):
        rows_rdd = spark_context.parallelize(range(rows_count))\
            .map(lambda x: row_generator(x, self.H, self.W, self.C))\
            .map(lambda x: dict_to_spark_row(self.dataset_schema, x))

        spark.createDataFrame(rows_rdd, self.dataset_schema.as_spark_schema()) \
            .coalesce(10) \
            .write \
            .mode('overwrite') \
            .parquet(output_url)
def test_ngram_decoding():
    N = 5
    ngram_spec = NGram(
        {
            -1: [TestSchema.some_number, TestSchema.some_matrix],
            0: [TestSchema.some_number],
            1: [TestSchema.some_number, TestSchema.some_matrix],
        }, 2, TestSchema.some_number)

    expected_rows = [_rand_row(n) for n in range(N)]
    encoded_rows = [dict_to_spark_row(TestSchema, row).asDict() for row in expected_rows]
    encoded_ngrams = ngram_spec.form_ngram(encoded_rows, TestSchema)

    decoder = RowDecoder(TestSchema, ngram_spec)

    # decoded_ngrams is a list of 3 dictionaries, each with -1, 0 and 1 keys.
    decoded_ngrams = [decoder.decode(encoded) for encoded in encoded_ngrams]

    # Verify we got 3 dictionaries
    assert 3 == len(decoded_ngrams)

    single_sample = decoded_ngrams[0]
    # A single decoded ngram looks like this:
    #   -1: some_number, some_matrix
    #    0: some_number
    #    1: some_number, some_matrix
    assert 2 == len(single_sample[-1])
    assert 0 == single_sample[-1].some_number
    assert 1 == len(single_sample[0])
    assert 1 == single_sample[0].some_number
    assert 2 == len(single_sample[1])
    assert 2 == single_sample[1].some_number
def main():
    SPARK_MASTER_URL = 'spark://...'  # Change the Spark master URL.
    H5_PRE_PROCESSED_DATA_DIR = 'file://...'  # Change pre-processed data input path. Should be accessible from all Spark workers.
    OUTPUT_PATH = 'file:///...'  # Change Petastorm output path. Should be accessible from all Spark workers.
    TRAIN_FRACTION = 0.7  # Fraction of train data. Remaining is validation data.
    ROW_GROUP_SIZE_MB = 512  # Size of Parquet row group.
    NUM_PARTITIONS = 100  # Number of Parquet partitions for train and val data each.

    spark = SparkSession \
        .builder \
        .master(SPARK_MASTER_URL) \
        .appName("Deep Postures Example - Petastorm Data Generation") \
        .getOrCreate()

    input_data = []
    if H5_PRE_PROCESSED_DATA_DIR.startswith('hdfs://'):
        args = "hdfs dfs -ls " + H5_PRE_PROCESSED_DATA_DIR + " | awk '{print $8}'"
        proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
        s_output, s_err = proc.communicate()
        input_data = ['hdfs://' + path.decode() for path in s_output.split()]
    elif H5_PRE_PROCESSED_DATA_DIR.startswith('file://'):
        for dirname in os.listdir(H5_PRE_PROCESSED_DATA_DIR):
            if not dirname.startswith('.'):
                input_data.append(str(os.path.join(H5_PRE_PROCESSED_DATA_DIR, dirname)))
    else:
        raise Exception('Unsupported file system in: {}'.format(H5_PRE_PROCESSED_DATA_DIR))

    random.shuffle(input_data)
    n_train = int(len(input_data) * TRAIN_FRACTION)
    train_data = input_data[:n_train]
    val_data = input_data[n_train:]

    backend = SparkBackend(spark_context=spark.sparkContext)
    store = LocalStore(OUTPUT_PATH,
                       train_path=os.path.join(OUTPUT_PATH, 'train_data'),
                       val_path=os.path.join(OUTPUT_PATH, 'val_data'))

    schema = Unischema('schema', [
        UnischemaField('id', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('time', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('data', np.float32, (100, 3), NdarrayCodec(), False),
        UnischemaField('non_wear', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('sleeping', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('label', np.int32, (), ScalarCodec(IntegerType()), False)
    ])

    with materialize_dataset(spark, os.path.join(OUTPUT_PATH, 'train_data'), schema, ROW_GROUP_SIZE_MB):
        rdd = spark.sparkContext.parallelize(train_data)
        rdd = rdd.flatMap(lambda x: load_h5(x)) \
            .map(lambda item: {'id': item[0], 'time': item[1], 'data': item[2],
                               'non_wear': item[3], 'sleeping': item[4], 'label': item[5]})
        rdd = rdd.map(lambda x: dict_to_spark_row(schema, x))

        df = spark.createDataFrame(rdd, schema=schema.as_spark_schema())
        df.orderBy("id", "time").coalesce(NUM_PARTITIONS) \
            .write.mode('overwrite').parquet(os.path.join(OUTPUT_PATH, 'train_data'))

    with materialize_dataset(spark, os.path.join(OUTPUT_PATH, 'val_data'), schema, ROW_GROUP_SIZE_MB):
        rdd = spark.sparkContext.parallelize(val_data)
        rdd = rdd.flatMap(lambda x: load_h5(x)) \
            .map(lambda item: {'id': item[0], 'time': item[1], 'data': item[2],
                               'non_wear': item[3], 'sleeping': item[4], 'label': item[5]})
        rdd = rdd.map(lambda x: dict_to_spark_row(schema, x))

        df = spark.createDataFrame(rdd, schema=schema.as_spark_schema())
        df.orderBy("id", "time").coalesce(NUM_PARTITIONS) \
            .write.mode('overwrite').parquet(os.path.join(OUTPUT_PATH, 'val_data'))


if __name__ == "__main__":
    main()
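A hedged sketch, not from the original source, of consuming the train split written by main() above with petastorm's PyTorch DataLoader. OUTPUT_PATH is the same placeholder used in main(); selecting only the numeric 'data' and 'label' fields and the batch size of 64 are arbitrary example choices.

from petastorm import make_reader
from petastorm.pytorch import DataLoader

OUTPUT_PATH = 'file:///...'  # same placeholder as in main(); replace with the real output path

# Read only numeric fields so they can be collated into torch tensors (assumed selection).
with DataLoader(make_reader(OUTPUT_PATH + '/train_data',
                            schema_fields=['data', 'label'],
                            num_epochs=1),
                batch_size=64) as loader:
    for batch in loader:
        # batch is a dict of tensors keyed by the Unischema field names
        print(batch['data'].shape, batch['label'].shape)
        break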
def imagenet_directory_to_petastorm_dataset(imagenet_path, output_url, spark_master=None, parquet_files_count=100,
                                            noun_id_to_text=None):
    """Converts a directory with imagenet data into a petastorm dataset.

    Expected directory format is:

    >>> nXXXXXXXX/
    >>>    *.JPEG
    >>> nZZZZZZZZ/
    >>>    *.JPEG

    :param imagenet_path: a path to the directory containing ``n*/`` subdirectories. If you are running this script
      on a Spark cluster, you should have this file be mounted and accessible to executors.
    :param output_url: the location where your dataset will be written to. Should be a url: either
      ``file://...`` or ``hdfs://...``
    :param spark_master: A master parameter used by spark session builder. Use default value (``None``) to use system
      environment configured spark cluster. Use ``local[*]`` to run on a local box.
    :param parquet_files_count: the number of parquet files the output dataset is coalesced into.
    :param noun_id_to_text: A dictionary: ``{noun_id : text}``. If ``None``, this function will download the
      dictionary from the Internet.
    :return: ``None``
    """
    session_builder = SparkSession \
        .builder \
        .appName('Imagenet Dataset Creation') \
        .config('spark.executor.memory', '10g') \
        .config('spark.driver.memory', '10g')  # Increase the memory if running locally with high number of executors
    if spark_master:
        session_builder.master(spark_master)

    spark = session_builder.getOrCreate()
    sc = spark.sparkContext

    # Get a list of noun_ids
    noun_ids = os.listdir(imagenet_path)
    if not all(noun_id.startswith('n') for noun_id in noun_ids):
        raise RuntimeError('Directory {} expected to contain only subdirectories with name '
                           'starting with "n".'.format(imagenet_path))

    if not noun_id_to_text:
        noun_id_to_text = download_nouns_mapping()

    ROWGROUP_SIZE_MB = 256
    with materialize_dataset(spark, output_url, ImagenetSchema, ROWGROUP_SIZE_MB):
        # list of [(nXXXX, 'noun-text'), ...]
        noun_id_text_list = map(lambda noun_id: (noun_id, noun_id_to_text[noun_id]), noun_ids)

        # rdd of [(nXXXX, 'noun-text', path), ...]
        noun_id_text_image_path_rdd = sc.parallelize(noun_id_text_list, min(len(noun_ids) // 10 + 1, 10000)) \
            .flatMap(lambda word_id_label:
                     [word_id_label + (image_path,)
                      for image_path in glob.glob(os.path.join(imagenet_path, word_id_label[0], '*.JPEG'))])

        # rdd of [(nXXXX, 'noun-text', image), ...]
        noun_id_text_image_rdd = noun_id_text_image_path_rdd \
            .map(lambda id_word_image_path:
                 {ImagenetSchema.noun_id.name: id_word_image_path[0],
                  ImagenetSchema.text.name: id_word_image_path[1],
                  ImagenetSchema.image.name: cv2.imread(id_word_image_path[2])})

        # Convert to pyspark.sql.Row
        sql_rows_rdd = noun_id_text_image_rdd.map(lambda r: dict_to_spark_row(ImagenetSchema, r))

        # Write out the result
        spark.createDataFrame(sql_rows_rdd, ImagenetSchema.as_spark_schema()) \
            .coalesce(parquet_files_count) \
            .write \
            .mode('overwrite') \
            .option('compression', 'none') \
            .parquet(output_url)
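A hedged invocation sketch for the ImageNet converter above, not taken from the original source; the paths are hypothetical examples and should point at a real ImageNet-style directory and a writable output URL.

# Hypothetical example call; adjust paths to your environment.
if __name__ == '__main__':
    imagenet_directory_to_petastorm_dataset(
        imagenet_path='/data/imagenet/train',          # directory containing nXXXXXXXX/ subdirectories
        output_url='file:///tmp/imagenet_petastorm',   # petastorm output location
        spark_master='local[*]',                       # run locally; use None for a configured cluster
        parquet_files_count=100)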
def _create_dataset(store, df, validation, compress_sparse, num_partitions, num_workers, dataset_idx,
                    parquet_row_group_size_mb, verbose):
    train_data_path = store.get_train_data_path(dataset_idx)
    val_data_path = store.get_val_data_path(dataset_idx)
    if verbose >= 1:
        print('CEREBRO => Time: {}, Writing DataFrames'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
        print('CEREBRO => Time: {}, Train Data Path: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), train_data_path))
        print('CEREBRO => Time: {}, Val Data Path: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), val_data_path))

    schema_cols = df.columns
    if isinstance(validation, str):
        schema_cols.append(validation)
    df = df[schema_cols]

    metadata = None
    if _has_vector_column(df):
        if compress_sparse:
            metadata = _get_metadata(df)
        to_petastorm = to_petastorm_fn(schema_cols, metadata)
        df = df.rdd.map(to_petastorm).toDF()

    train_df, val_df, validation_ratio = _train_val_split(df, validation)

    unischema_fields = []
    metadata = _get_metadata(train_df)
    for k in metadata.keys():
        type = spark_to_petastorm_type(metadata[k]['spark_data_type'])
        shape = petastorm_unischema_shape(metadata[k]['shape'])
        codec = petastorm_unischema_codec(metadata[k]['shape'], metadata[k]['spark_data_type'])
        unischema_fields.append(UnischemaField(k, type, shape, codec, False))

    petastorm_schema = Unischema('petastorm_schema', unischema_fields)

    train_partitions = max(int(num_partitions * (1.0 - validation_ratio)), num_workers)
    if verbose >= 1:
        print('CEREBRO => Time: {}, Train Partitions: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), train_partitions))

    spark = SparkSession.builder.getOrCreate()
    # FIXME pass hdfs_driver from user interface instead of hardcoded PETASTORM_HDFS_DRIVER
    train_resolver = FilesystemResolver(train_data_path,
                                        spark.sparkContext._jsc.hadoopConfiguration(),
                                        user=spark.sparkContext.sparkUser(),
                                        hdfs_driver=constants.PETASTORM_HDFS_DRIVER)
    with materialize_dataset(spark, train_data_path, petastorm_schema, parquet_row_group_size_mb,
                             filesystem_factory=train_resolver.filesystem_factory()):
        train_rdd = train_df.rdd.map(lambda x: x.asDict()) \
            .map(lambda x: {k: np.array(x[k], dtype=spark_to_petastorm_type(metadata[k]['spark_data_type']))
                            for k in x}) \
            .map(lambda x: dict_to_spark_row(petastorm_schema, x))

        spark.createDataFrame(train_rdd, petastorm_schema.as_spark_schema()) \
            .coalesce(train_partitions) \
            .write \
            .mode('overwrite') \
            .parquet(train_data_path)

    if val_df:
        val_partitions = max(int(num_partitions * validation_ratio), num_workers)
        if verbose >= 1:
            print('CEREBRO => Time: {}, Val Partitions: {}'.format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), val_partitions))

        val_resolver = FilesystemResolver(val_data_path,
                                          spark.sparkContext._jsc.hadoopConfiguration(),
                                          user=spark.sparkContext.sparkUser(),
                                          hdfs_driver=constants.PETASTORM_HDFS_DRIVER)
        with materialize_dataset(spark, val_data_path, petastorm_schema, parquet_row_group_size_mb,
                                 filesystem_factory=val_resolver.filesystem_factory()):
            val_rdd = val_df.rdd.map(lambda x: x.asDict()) \
                .map(lambda x: {k: np.array(x[k], dtype=spark_to_petastorm_type(metadata[k]['spark_data_type']))
                                for k in x}) \
                .map(lambda x: dict_to_spark_row(petastorm_schema, x))

            spark.createDataFrame(val_rdd, petastorm_schema.as_spark_schema()) \
                .coalesce(val_partitions) \
                .write \
                .mode('overwrite') \
                .parquet(val_data_path)

    train_rows, val_rows, pq_metadata, avg_row_size = get_simple_meta_from_parquet(store, df.columns, dataset_idx)

    if verbose:
        print('CEREBRO => Time: {}, Train Rows: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), train_rows))
    if val_df:
        if val_rows == 0:
            raise ValueError(
                'Validation DataFrame does not contain any samples with validation param {}'.format(validation))
        if verbose:
            print('CEREBRO => Time: {}, Val Rows: {}'.format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), val_rows))

    return train_rows, val_rows, pq_metadata, avg_row_size