Example #1
    def test_dict_to_spark_row_field_validation_ndarrays(self):
        """Test various validations done on data types when converting a dictionary to a spark row"""
        TestSchema = Unischema('TestSchema', [
            UnischemaField('tensor3d', np.float32,
                           (10, 20, 30), NdarrayCodec(), False),
        ])

        self.assertTrue(
            isinstance(
                dict_to_spark_row(
                    TestSchema,
                    {'tensor3d': np.zeros((10, 20, 30), dtype=np.float32)}),
                Row))

        # Null value into a non-nullable field
        with self.assertRaises(ValueError):
            isinstance(dict_to_spark_row(TestSchema, {'tensor3d': None}),
                       Row)

        # Wrong dimensions
        with self.assertRaises(ValueError):
            isinstance(
                dict_to_spark_row(
                    TestSchema,
                    {'tensor3d': np.zeros((1, 2, 3), dtype=np.float32)}),
                Row)
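
The test above only checks the return type. As a quick illustration of what dict_to_spark_row actually produces (a sketch, not part of the original example; it assumes petastorm and numpy are installed), the NdarrayCodec field comes back serialized, ready to be written to Parquet:

import numpy as np
from pyspark.sql import Row
from petastorm.codecs import NdarrayCodec
from petastorm.unischema import Unischema, UnischemaField, dict_to_spark_row

TestSchema = Unischema('TestSchema', [
    UnischemaField('tensor3d', np.float32, (10, 20, 30), NdarrayCodec(), False),
])

row = dict_to_spark_row(TestSchema, {'tensor3d': np.zeros((10, 20, 30), dtype=np.float32)})
assert isinstance(row, Row)
# The codec serializes the ndarray into a byte payload; make_reader/RowDecoder later
# decode it back into a numpy array of the declared shape and dtype.
print(type(row.tensor3d))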
Example #2
def generate_parquet(feature_path, mask_path, output_path):
    """[summary]
    Generate parquet file with two columns
        - First column: npG_array representing image
        - Second column: np_array representing mask

    Arguments:
        feature_path {[type]} -- path to all images
        mask_path {[type]} -- path to masks of images
        output_path {[type]} -- parquet path
    """

    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession, Row
    from pyspark.sql.functions import monotonically_increasing_id

    rowgroup_size_mb = 256
    spark_conf = SparkConf().setAppName('Image preprocess')
    sc = SparkContext(conf=spark_conf)
    session = SparkSession(sc)

    # Load images, pair each with an index, and convert them to a dataframe
    images_rdd = sc.binaryFiles(feature_path).values().zipWithIndex()
    image_flat_numpy_rdd = images_rdd.map(lambda pair_raw_image_id: (raw_image_to_numpy_array(pair_raw_image_id[0]), pair_raw_image_id[1])) \
                                     .map(lambda pair_np_array_id: {'features': pair_np_array_id[0], 'id': pair_np_array_id[1]}) \
                                     .map(lambda x: dict_to_spark_row(FeatureSchema, x))

    image_df = session.createDataFrame(image_flat_numpy_rdd,
                                       FeatureSchema.as_spark_schema())
    # .withColumn("id", monotonically_increasing_id()) # Generate table row id
    # Load masks, pair each with an index, and convert them to a dataframe
    mask_rdd = sc.binaryFiles(mask_path).values().zipWithIndex()

    # Convert mask RGB values to 0 (not a building) and 1 (building)
    mask_flat_numpy_rdd = mask_rdd.map(lambda pair_raw_image_id: (raw_image_to_numpy_array(pair_raw_image_id[0]), pair_raw_image_id[1])) \
                                  .map(lambda pair_np_array_id: ((pair_np_array_id[0] / 255).astype(np.uint8), pair_np_array_id[1])) \
                                  .map(lambda pair_std_np_array_id: {'masks': pair_std_np_array_id[0], 'id': pair_std_np_array_id[1]}) \
                                  .map(lambda x: dict_to_spark_row(MaskSchema, x))

    mask_df = session.createDataFrame(mask_flat_numpy_rdd,
                                      MaskSchema.as_spark_schema())
    #.withColumn("id", monotonically_increasing_id()) # Generate table row id
    mask_df.show(5, False)
    # Join image_df and mask_df row by row on the shared id column
    train_df = image_df.join(mask_df, "id", "inner").drop('id')

    #print("Summary =>>>>>>>>>>>>>>>>>>>>>>>....>>>")
    #print("Image count {} , mask count {}, train_count {}".format(image_df.count(), mask_df.count(), train_df.count()))
    #print("=======================================")
    with materialize_dataset(session, output_path, TrainSchema,
                             rowgroup_size_mb):
        train_df.write \
                 .mode('overwrite') \
                 .parquet(output_path)
Example #3
def generate_parquet(feature_path, mask_path, output_path):
    """[summary]
    Generate parquet file with two columns
        - First column: np_array representing image
        - Second column: np_array representing mask

    Arguments:
        feature_path {[type]} -- path to all images
        mask_path {[type]} -- path to masks of images
        output_path {[type]} -- parquet path
    """

    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession, Row
    from pyspark.sql.functions import monotonically_increasing_id

    rowgroup_size_mb = 256
    spark_conf = SparkConf().setAppName('Image preprocess')
    sc = SparkContext(conf=spark_conf)
    session = SparkSession(sc)

    # Load images and convert them to a dataframe
    images_rdd = sc.binaryFiles(feature_path)
    image_flat_numpy_rdd = images_rdd.values().map(raw_image_to_numpy_array) \
                                            .map(lambda x: {'features': x}) \
                                            .map(lambda x: dict_to_spark_row(FeatureSchema, x))
    image_df = session.createDataFrame(image_flat_numpy_rdd, FeatureSchema.as_spark_schema()) \
                        .withColumn("id", monotonically_increasing_id()) # Generate table row id

    # Load masks and convert them to a dataframe
    mask_rdd = sc.binaryFiles(mask_path)
    mask_flat_numpy_rdd = mask_rdd.values().map(raw_image_to_numpy_array) \
                                           .map(lambda image_np_array: (image_np_array / 255).astype(np.uint8)) \
                                           .map(lambda x: {'masks': x}) \
                                           .map(lambda x: dict_to_spark_row(MaskSchema, x))

    mask_df = session.createDataFrame(mask_flat_numpy_rdd, MaskSchema.as_spark_schema()) \
                        .withColumn("id", monotonically_increasing_id()) # Generate table row id

    # Join image_df and mask_df row by row on the generated id column
    train_df = image_df.join(mask_df, "id", "outer").drop("id")
    with materialize_dataset(session, output_path, TrainSchema,
                             rowgroup_size_mb):
        train_df.write \
                .mode('overwrite') \
                .parquet(output_path)
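
Both generate_parquet variants above (Examples #2 and #3) rely on FeatureSchema, MaskSchema, TrainSchema and raw_image_to_numpy_array, which are defined elsewhere in their project. A minimal sketch of what such definitions could look like (field names follow the dictionaries built above; the shapes, dtypes and the PIL-based decoder are assumptions, not the original code):

import io

import numpy as np
from PIL import Image
from pyspark.sql.types import LongType
from petastorm.codecs import NdarrayCodec, ScalarCodec
from petastorm.unischema import Unischema, UnischemaField


def raw_image_to_numpy_array(raw_bytes):
    # Decode the raw file content returned by sc.binaryFiles into an HxWxC uint8 array.
    return np.asarray(Image.open(io.BytesIO(raw_bytes)))


FeatureSchema = Unischema('FeatureSchema', [
    UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False),
    UnischemaField('features', np.uint8, (None, None, 3), NdarrayCodec(), False),
])

MaskSchema = Unischema('MaskSchema', [
    UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False),
    UnischemaField('masks', np.uint8, (None, None, 3), NdarrayCodec(), False),
])

# TrainSchema describes the joined dataframe (image plus mask, with the id column dropped).
TrainSchema = Unischema('TrainSchema', [
    UnischemaField('features', np.uint8, (None, None, 3), NdarrayCodec(), False),
    UnischemaField('masks', np.uint8, (None, None, 3), NdarrayCodec(), False),
])

Note that Example #3 adds the id column with monotonically_increasing_id after createDataFrame, so in that variant the id field would not appear in FeatureSchema and MaskSchema at all.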
Example #4
def test_dict_to_spark_row_field_validation_scalar_types():
    """Test various validations done on data types when converting a dictionary to a spark row"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
    ])

    assert isinstance(dict_to_spark_row(TestSchema, {'string_field': 'abc'}), Row)

    # Not a nullable field
    with pytest.raises(ValueError):
        isinstance(dict_to_spark_row(TestSchema, {'string_field': None}), Row)

    # Wrong field type
    with pytest.raises(TypeError):
        isinstance(dict_to_spark_row(TestSchema, {'string_field': []}), Row)
Example #5
def test_serialize_filesystem_factory(tmpdir):
    SimpleSchema = Unischema('SimpleSchema', [
        UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('foo', np.int32, (), ScalarCodec(IntegerType()), False),
    ])

    class BogusFS(pyarrow.LocalFileSystem):
        def __getstate__(self):
            raise RuntimeError("can not serialize")

    rows_count = 10
    output_url = "file://{0}/fs_factory_test".format(tmpdir)
    rowgroup_size_mb = 256
    spark = SparkSession.builder.config('spark.driver.memory',
                                        '2g').master('local[2]').getOrCreate()
    sc = spark.sparkContext
    with materialize_dataset(spark,
                             output_url,
                             SimpleSchema,
                             rowgroup_size_mb,
                             filesystem_factory=BogusFS):
        rows_rdd = sc.parallelize(range(rows_count))\
            .map(lambda x: {'id': x, 'foo': x})\
            .map(lambda x: dict_to_spark_row(SimpleSchema, x))

        spark.createDataFrame(rows_rdd, SimpleSchema.as_spark_schema()) \
            .write \
            .parquet(output_url)
Example #6
def test_predicate_on_dataset(tmpdir):
  TestSchema = Unischema('TestSchema', [
    UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
    UnischemaField('test_field', np.int32, (), ScalarCodec(IntegerType()), False),
  ])

  def test_row_generator(x):
    """Returns a single entry in the generated dataset."""
    return {'id': x,
            'test_field': x * x}

  blocklet_size_mb = 256
  dataset_url = "file://{0}/partitioned_test_dataset".format(tmpdir)

  spark = SparkSession.builder.config('spark.driver.memory', '2g').master('local[2]').getOrCreate()
  sc = spark.sparkContext

  rows_count = 10
  with materialize_dataset_carbon(spark, dataset_url, TestSchema, blocklet_size_mb):
    rows_rdd = sc.parallelize(range(rows_count)) \
      .map(test_row_generator) \
      .map(lambda x: dict_to_spark_row(TestSchema, x))

    spark.createDataFrame(rows_rdd, TestSchema.as_spark_schema()) \
      .write \
      .save(path=dataset_url, format='carbon')

  with make_carbon_reader(dataset_url, predicate=in_lambda(['id'], lambda x: x == 3)) as reader:
    assert next(reader).id == 3
  with make_carbon_reader(dataset_url, predicate=in_lambda(['id'], lambda x: x == '3')) as reader:
    with pytest.raises(StopIteration):
      # Predicate should have selected none, so a StopIteration should be raised.
      next(reader)
Example #7
def change_df_schema(spark, df, schema):
    rows_rdd = (
        df.rdd.map(row_generator).map(lambda x: dict_to_spark_row(schema, x)))

    df = spark.createDataFrame(rows_rdd, schema.as_spark_schema())

    return df
Example #8
    def write(self, table: DataFrameMetadata, rows: Batch):
        """
        Write rows into the dataframe.

        Arguments:
            table: table metadata object to write into
            rows : batch to be persisted in the storage.
        """

        if rows.empty():
            return
        # ToDo
        # Throw an error if the row schema doesn't match the table schema

        with materialize_dataset(self.spark_session, self._spark_url(table),
                                 table.schema.petastorm_schema):

            records = rows.frames
            columns = records.keys()
            rows_rdd = self.spark_context.parallelize(records.values) \
                .map(lambda x: dict(zip(columns, x))) \
                .map(lambda x: dict_to_spark_row(table.schema.petastorm_schema,
                                                 x))
            self.spark_session.createDataFrame(rows_rdd,
                                               table.schema.pyspark_schema) \
                .coalesce(1) \
                .write \
                .mode('append') \
                .parquet(self._spark_url(table))
Example #9
def test_dict_to_spark_row_field_validation_scalar_nullable():
    """Test various validations done on data types when converting a dictionary to a spark row"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), True),
        UnischemaField('nullable_implicitly_set', np.string_, (), ScalarCodec(StringType()), True),
    ])

    assert isinstance(dict_to_spark_row(TestSchema, {'string_field': None}), Row)
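
A small follow-up to the test above (a sketch of observed petastorm behaviour, stated as an assumption rather than documentation): nullable fields that are absent from the input dictionary are filled in as an explicit None in the resulting Row.

import numpy as np
from pyspark.sql.types import StringType
from petastorm.codecs import ScalarCodec
from petastorm.unischema import Unischema, UnischemaField, dict_to_spark_row

NullableSchema = Unischema('NullableSchema', [
    UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), True),
    UnischemaField('nullable_implicitly_set', np.string_, (), ScalarCodec(StringType()), True),
])

row = dict_to_spark_row(NullableSchema, {'string_field': None})
# Both schema fields appear in the encoded row; the one that was not supplied is None.
assert row.asDict() == {'string_field': None, 'nullable_implicitly_set': None}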
Example #10
def mnist_data_to_petastorm_dataset(download_dir, output_url, spark_master=None, parquet_files_count=1,
                                    mnist_data=None):
    """Converts a directory with MNIST data into a petastorm dataset.

    Data files are as specified in http://yann.lecun.com/exdb/mnist/:
        * train-images-idx3-ubyte.gz:  training set images (9912422 bytes)
        * train-labels-idx1-ubyte.gz:  training set labels (28881 bytes)
        * t10k-images-idx3-ubyte.gz:   test set images (1648877 bytes)
        * t10k-labels-idx1-ubyte.gz:   test set labels (4542 bytes)

    The images and labels are stored in the IDX file format for vectors and multidimensional matrices of
    various numerical types, as defined at the same URL.

    :param download_dir: the path to where the MNIST data will be downloaded.
    :param output_url: the location where your dataset will be written to. Should be a url: either
      file://... or hdfs://...
    :param spark_master: A master parameter used by spark session builder. Use default value (None) to use system
      environment configured spark cluster. Use 'local[*]' to run on a local box.
    :param mnist_data: A dictionary of MNIST data, with name of dataset as key, and the dataset object as value;
      if None is supplied, the data is downloaded.
    :return: None
    """
    session_builder = SparkSession \
        .builder \
        .appName('MNIST Dataset Creation')
    if spark_master:
        session_builder.master(spark_master)

    spark = session_builder.getOrCreate()

    # Get training and test data
    if mnist_data is None:
        mnist_data = {
            'train': download_mnist_data(download_dir, train=True),
            'test': download_mnist_data(download_dir, train=False)
        }

    # The MNIST data is small enough to do everything here in Python
    for dset, data in mnist_data.items():
        dset_output_url = '{}/{}'.format(output_url, dset)
        with materialize_dataset(spark, dset_output_url, MnistSchema):
            # List of [(idx, image, digit), ...]
            # where image is shaped as a 28x28 numpy matrix
            idx_image_digit_list = map(lambda idx_image_digit: {
                MnistSchema.idx.name: idx_image_digit[0],
                MnistSchema.digit.name: idx_image_digit[1][1],
                MnistSchema.image.name: np.array(list(idx_image_digit[1][0].getdata()), dtype=np.uint8).reshape(28, 28)
                }, enumerate(data))

            # Convert to pyspark.sql.Row
            sql_rows = map(lambda r: dict_to_spark_row(MnistSchema, r), idx_image_digit_list)

            # Write out the result
            spark.createDataFrame(sql_rows, MnistSchema.as_spark_schema()) \
                .coalesce(parquet_files_count) \
                .write \
                .option('compression', 'none') \
                .parquet(dset_output_url)
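
The MnistSchema referenced above is defined alongside this function in petastorm's MNIST example and is not shown here. It looks roughly like the sketch below (based on that example; treat the exact dtypes and codec parameters as assumptions):

import numpy as np
from pyspark.sql.types import IntegerType
from petastorm.codecs import CompressedImageCodec, ScalarCodec
from petastorm.unischema import Unischema, UnischemaField

MnistSchema = Unischema('MnistSchema', [
    UnischemaField('idx', np.int_, (), ScalarCodec(IntegerType()), False),
    UnischemaField('digit', np.int_, (), ScalarCodec(IntegerType()), False),
    UnischemaField('image', np.uint8, (28, 28), CompressedImageCodec('png'), False),
])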
Example #11
def test_row_decoding():
    expected_row = _rand_row()
    encoded_row = dict_to_spark_row(TestSchema, expected_row).asDict()

    decoder = RowDecoder(TestSchema, None)
    actual_row = decoder.decode(encoded_row)._asdict()

    # A plain `assert actual_row == expected_row` cannot compare dictionaries that contain numpy arrays
    np.testing.assert_equal(actual_row, expected_row)
Example #12
def test_dict_to_spark_row_order():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('float_col', np.float64, ()),
        UnischemaField('int_col', np.int64, ()),
    ])
    row_dict = {
        TestSchema.int_col.name: 3,
        TestSchema.float_col.name: 2.0,
    }
    spark_row = dict_to_spark_row(TestSchema, row_dict)
    schema_field_names = list(TestSchema.fields)
    assert spark_row[0] == row_dict[schema_field_names[0]]
    assert spark_row[1] == row_dict[schema_field_names[1]]
Example #13
def change_df_schema(spark: SparkSession, schema: Unischema, df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:
    rows_rdd = (
        df
            .rdd
            .map(row_generator)
            .map(lambda x: dict_to_spark_row(schema, x))
    )

    df = spark.createDataFrame(
        rows_rdd,
        schema.as_spark_schema()
    )

    return df
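
Both change_df_schema variants (Examples #7 and #13) assume a row_generator that turns each incoming pyspark Row into the plain {field_name: value} dictionary that dict_to_spark_row expects. A minimal hypothetical sketch (not part of the original code):

def row_generator(row):
    # Convert a pyspark.sql.Row into a plain dict keyed by the Unischema field names.
    # A real implementation would also coerce values into the numpy dtypes and shapes
    # declared by the target schema.
    return row.asDict()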
Example #14
def test_predicate_on_partitioned_dataset(tmpdir):
    """
    Generates a partitioned dataset and ensures that readers evaluate the type of the partition
    column according to the type given in the Unischema.
    """
    TestSchema = Unischema('TestSchema', [
        UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('id2', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('test_field', np.int32,
                       (), ScalarCodec(IntegerType()), False),
    ])

    def test_row_generator(x):
        """Returns a single entry in the generated dataset."""
        return {'id': x, 'id2': x + 1, 'test_field': x * x}

    rowgroup_size_mb = 256
    dataset_url = "file://{0}/partitioned_test_dataset".format(tmpdir)

    spark = SparkSession.builder.config('spark.driver.memory',
                                        '2g').master('local[2]').getOrCreate()
    sc = spark.sparkContext

    rows_count = 10
    with materialize_dataset(spark, dataset_url, TestSchema, rowgroup_size_mb):

        rows_rdd = sc.parallelize(range(rows_count))\
            .map(test_row_generator)\
            .map(lambda x: dict_to_spark_row(TestSchema, x))

        spark.createDataFrame(rows_rdd, TestSchema.as_spark_schema()) \
            .write \
            .partitionBy('id', 'id2') \
            .parquet(dataset_url)

    with make_reader(dataset_url,
                     predicate=in_lambda(['id'], lambda x: x == 3)) as reader:
        assert next(reader).id == 3
    with make_reader(dataset_url,
                     predicate=in_lambda(['id2'], lambda x: x == 5)) as reader:
        # id2 is generated as id + 1, so id2 == 5 selects the row with id == 4
        assert next(reader).id == 4
    with make_reader(dataset_url,
                     predicate=in_lambda(['id'],
                                         lambda x: x == '3')) as reader:
        with pytest.raises(StopIteration):
            # Predicate should have selected none, so a StopIteration should be raised.
            next(reader)
Example #15
def generate_dataset(data_directory,
                     sensor,
                     output_url,
                     year=2018,
                     max_files=100000,
                     dayofyear=None):
    """
    Write L1b patches to petastorm database for training
    Args:
        data_directory: directory of L1b data
        sensor: Select sensor from (G16,G17,H8)
        output_url: Directory to write petastorm database (file:///...)
        year: Integer (depending on directory, 2017-2020)
        max_files: Maximum number of files to iterate over
        dayofyear: 1-366
    Returns:
        None
    """
    rowgroup_size_mb = 256

    spark = SparkSession.builder.config('spark.driver.memory',
                                        '2g').master('local[4]').getOrCreate()
    sc = spark.sparkContext

    geo = geonexl1b.GeoNEXL1b(data_directory=data_directory, sensor=sensor)
    tiles = geo.tiles()
    files = geo.files(year=year, dayofyear=dayofyear)
    files['v'] = files['tile'].map(lambda t: int(t[4:6]))
    files['h'] = files['tile'].map(lambda t: int(t[1:3]))

    idxs = np.random.randint(0, files.shape[0], max_files)
    files = files.iloc[idxs]
    files = files.reset_index()

    with materialize_dataset(spark, output_url, L1bSchema, rowgroup_size_mb):
        filerdd = spark.createDataFrame(files)\
             .select("year", "dayofyear", "hour", "minute", "v", "h", "file")\
             .rdd.map(tuple)\
             .flatMap(sample_generator)\
             .map(lambda x: dict_to_spark_row(L1bSchema, x))

        spark.createDataFrame(filerdd, L1bSchema.as_spark_schema())\
            .coalesce(50) \
            .write \
            .mode('overwrite') \
            .parquet(output_url)
Example #16
def append_rows(df_metadata: DataFrameMetadata, rows):

    spark = Session().get_session()
    spark_context = Session().get_context()

    # Use petastorm to append the rows
    with materialize_dataset(spark, df_metadata.file_url,
                             df_metadata.schema.petastorm_schema):
        # Convert a list of rows to RDD
        rows_rdd = spark_context.parallelize(
            rows).map(lambda x: dict_to_spark_row(
                df_metadata.schema.petastorm_schema, x))

        spark.createDataFrame(rows_rdd,
                              df_metadata.schema.pyspark_schema) \
            .coalesce(1) \
            .write \
            .mode('append') \
            .parquet(df_metadata.file_url)
Example #17
def generate_benchmark_dataset(output_url='file:///tmp/benchmark_dataset'):
    # """Creates an example dataset at output_url in Carbon format"""
    blocklet_size_mb = 256

    spark = SparkSession.builder \
      .master('local[2]') \
      .getOrCreate()

    sc = spark.sparkContext

    rows_count = ROW_COUNT

    with materialize_dataset_carbon(spark, output_url, BenchmarkSchema,
                                    blocklet_size_mb):
        rows_rdd = sc.parallelize(range(rows_count)) \
          .map(row_generator) \
          .map(lambda x: dict_to_spark_row(BenchmarkSchema, x))

        spark.createDataFrame(rows_rdd, BenchmarkSchema.as_spark_schema()) \
          .write \
          .mode('overwrite') \
          .save(path=output_url, format='carbon')
Example #18
def generate_pycarbon_dataset(
        output_url='file:///tmp/carbon_pycarbon_dataset'):
    blocklet_size_mb = 256

    spark = SparkSession.builder.config('spark.driver.memory',
                                        '2g').master('local[2]').getOrCreate()
    sc = spark.sparkContext

    # Wrap the dataset materialization portion. This takes care of setting up the Spark environment
    # variables as well as saving pycarbon-specific metadata.
    rows_count = 10
    with materialize_dataset_carbon(spark, output_url, HelloWorldSchema,
                                    blocklet_size_mb):
        rows_rdd = sc.parallelize(range(rows_count)) \
          .map(row_generator) \
          .map(lambda x: dict_to_spark_row(HelloWorldSchema, x))

        spark.createDataFrame(rows_rdd, HelloWorldSchema.as_spark_schema()) \
          .coalesce(10) \
          .write \
          .mode('overwrite') \
          .save(path=output_url, format='carbon')
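
generate_pycarbon_dataset mirrors petastorm's hello-world example; its HelloWorldSchema and row_generator (and the analogous BenchmarkSchema used above) live in the surrounding example module. For orientation, the hello-world pieces look roughly like this sketch (field names and sizes follow the petastorm example and are assumptions here):

import numpy as np
from pyspark.sql.types import IntegerType
from petastorm.codecs import CompressedImageCodec, NdarrayCodec, ScalarCodec
from petastorm.unischema import Unischema, UnischemaField

HelloWorldSchema = Unischema('HelloWorldSchema', [
    UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
    UnischemaField('image1', np.uint8, (128, 256, 3), CompressedImageCodec('png'), False),
    UnischemaField('array_4d', np.uint8, (None, 128, 30, None), NdarrayCodec(), False),
])


def row_generator(x):
    """Return a single dataset row keyed by the integer id x."""
    return {'id': x,
            'image1': np.random.randint(0, 255, dtype=np.uint8, size=(128, 256, 3)),
            'array_4d': np.random.randint(0, 255, dtype=np.uint8, size=(4, 128, 30, 3))}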
Example #19
    def __init__(self, dataset_name: str, frame_metadata: FrameInfo):

        self.dataset_name = dataset_name
        self.H = frame_metadata.height
        self.W = frame_metadata.width
        self.C = frame_metadata.num_channels

        # The Unischema defines the structure of the dataset
        self.dataset_schema = Unischema(self.dataset_name, [
            UnischemaField('frame_id', np.int32,
                           (), ScalarCodec(IntegerType()), False),
            UnischemaField('frame_data', np.uint8, (self.H, self.W, self.C),
                           CompressedNdarrayCodec(), False),
        ])

        # Construct output location
        eva_dir = ConfigurationManager().get_value("core", "location")
        output_url = os.path.join(eva_dir, self.dataset_name)

        # Get session handle
        session = Session()
        spark = session.get_session()
        spark_context = session.get_context()

        # Wrap dataset materialization portion.
        rows_count = 10
        with materialize_dataset(spark, output_url, self.dataset_schema):

            rows_rdd = spark_context.parallelize(range(rows_count))\
                .map(lambda x: row_generator(x, self.H, self.W, self.C))\
                .map(lambda x: dict_to_spark_row(self.dataset_schema, x))

            spark.createDataFrame(rows_rdd,
                                  self.dataset_schema.as_spark_schema()) \
                .coalesce(10) \
                .write \
                .mode('overwrite') \
                .parquet(output_url)
Example #20
def test_ngram_decoding():
    N = 5
    ngram_spec = NGram(
        {
            -1: [TestSchema.some_number, TestSchema.some_matrix],
            0: [TestSchema.some_number],
            1: [TestSchema.some_number, TestSchema.some_matrix],
        }, 2, TestSchema.some_number)

    expected_rows = [_rand_row(n) for n in range(N)]
    encoded_rows = [
        dict_to_spark_row(TestSchema, row).asDict() for row in expected_rows
    ]
    encoded_ngrams = ngram_spec.form_ngram(encoded_rows, TestSchema)

    decoder = RowDecoder(TestSchema, ngram_spec)

    # decoded_ngrams is a list of 3 dictionaries, each with -1, 0, 1 keys.
    decoded_ngrams = [decoder.decode(encoded) for encoded in encoded_ngrams]

    # Verify we got 3 dictionaries
    assert 3 == len(decoded_ngrams)

    single_sample = decoded_ngrams[0]

    # A single decoded ngram looks like this:
    #   -1: some_number, some_matrix
    #    0: some_number
    #    1: some_number, some_matrix
    assert 2 == len(single_sample[-1])
    assert 0 == single_sample[-1].some_number

    assert 1 == len(single_sample[0])
    assert 1 == single_sample[0].some_number

    assert 2 == len(single_sample[1])
    assert 2 == single_sample[1].some_number
Example #21
def main():
    SPARK_MASTER_URL = 'spark://...' # Change the Spark master URL.
    H5_PRE_PROCESSED_DATA_DIR = 'file://...' # Change the pre-processed data input path. Should be accessible from all Spark workers.
    OUTPUT_PATH = 'file:///...' # Change the Petastorm output path. Should be accessible from all Spark workers.
    TRAIN_FRACTION = 0.7 # Fraction of training data. The remainder is validation data.

    ROW_GROUP_SIZE_MB = 512 # Parquet row group size in MB.
    NUM_PARTITIONS = 100 # Number of Parquet partitions for the train and val data each.
    
    spark = SparkSession \
            .builder \
            .master(SPARK_MASTER_URL) \
            .appName("Deep Postures Example - Petastorm Data Generation") \
            .getOrCreate()

    input_data = []
    if H5_PRE_PROCESSED_DATA_DIR.startswith('hdfs://'):
        args = "hdfs dfs -ls " + H5_PRE_PROCESSED_DATA_DIR + " | awk '{print $8}'"
        proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)

        s_output, s_err = proc.communicate()
        # subprocess returns bytes; decode each listed path before prefixing the scheme
        input_data = ['hdfs://' + path.decode() for path in s_output.split()]
    elif H5_PRE_PROCESSED_DATA_DIR.startswith('file://'):
        for dirname in os.listdir(H5_PRE_PROCESSED_DATA_DIR):
            if not dirname.startswith('.'):
                input_data.append(str(os.path.join(H5_PRE_PROCESSED_DATA_DIR, dirname)))
    else:
        raise Exception('Unsupported file system in: {}'.format(H5_PRE_PROCESSED_DATA_DIR))

    random.shuffle(input_data)
    n_train = int(len(input_data) * TRAIN_FRACTION)
    train_data = input_data[:n_train]
    val_data = input_data[n_train:]

    backend = SparkBackend(spark_context=spark.sparkContext)
    store = LocalStore(OUTPUT_PATH, train_path=os.path.join(OUTPUT_PATH, 'train_data'), val_path=os.path.join(OUTPUT_PATH, 'val_data'))
    
    schema = Unischema('schema', [
        UnischemaField('id', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('time', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('data', np.float32, (100, 3), NdarrayCodec(), False),
        UnischemaField('non_wear', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('sleeping', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('label', np.int32, (), ScalarCodec(IntegerType()), False)
    ])

    with materialize_dataset(spark, os.path.join(OUTPUT_PATH, 'train_data'), schema, ROW_GROUP_SIZE_MB):
        rdd = spark.sparkContext.parallelize(train_data)
        rdd = rdd.flatMap(lambda x: load_h5(x)).map(lambda item: {'id': item[0], 'time': item[1], 'data': item[2], 'non_wear': item[3], 'sleeping': item[4], 'label': item[5]})
        rdd = rdd.map(lambda x: dict_to_spark_row(schema, x))

        df = spark.createDataFrame(rdd, schema=schema.as_spark_schema())
        df.orderBy("id", "time").coalesce(NUM_PARTITIONS).write.mode('overwrite').parquet(os.path.join(OUTPUT_PATH, 'train_data'))


    with materialize_dataset(spark, os.path.join(OUTPUT_PATH, 'val_data'), schema, ROW_GROUP_SIZE_MB):
        rdd = spark.sparkContext.parallelize(val_data)
        rdd = rdd.flatMap(lambda x: load_h5(x)).map(lambda item: {'id': item[0], 'time': item[1], 'data': item[2], 'non_wear': item[3], 'sleeping': item[4], 'label': item[5]})
        rdd = rdd.map(lambda x: dict_to_spark_row(schema, x))

        df = spark.createDataFrame(rdd, schema=schema.as_spark_schema())
        df.orderBy("id", "time").coalesce(NUM_PARTITIONS).write.mode('overwrite').parquet(os.path.join(OUTPUT_PATH, 'val_data'))

if __name__ == "__main__":
    main()
Example #22
def imagenet_directory_to_petastorm_dataset(imagenet_path, output_url, spark_master=None, parquet_files_count=100,
                                            noun_id_to_text=None):
    """Converts a directory with imagenet data into a petastorm dataset.

    Expected directory format is:

    >>> nXXXXXXXX/
    >>>    *.JPEG

    >>> nZZZZZZZZ/
    >>>    *.JPEG

    :param imagenet_path: a path to the directory containing ``n*/`` subdirectories. If you are running this script on
      a Spark cluster, this directory should be mounted and accessible to the executors.
    :param output_url: the location where your dataset will be written to. Should be a url: either
      ``file://...`` or ``hdfs://...``
    :param spark_master: A master parameter used by spark session builder. Use default value (``None``) to use system
      environment configured spark cluster. Use ``local[*]`` to run on a local box.
    :param noun_id_to_text: A dictionary: ``{noun_id : text}``. If ``None``, this function will download the dictionary
      from the Internet.
    :return: ``None``
    """
    session_builder = SparkSession \
        .builder \
        .appName('Imagenet Dataset Creation') \
        .config('spark.executor.memory', '10g') \
        .config('spark.driver.memory', '10g')  # Increase the memory if running locally with high number of executors
    if spark_master:
        session_builder.master(spark_master)

    spark = session_builder.getOrCreate()
    sc = spark.sparkContext

    # Get a list of noun_ids
    noun_ids = os.listdir(imagenet_path)
    if not all(noun_id.startswith('n') for noun_id in noun_ids):
        raise RuntimeError('Directory {} expected to contain only subdirectories with name '
                           'starting with "n".'.format(imagenet_path))

    if not noun_id_to_text:
        noun_id_to_text = download_nouns_mapping()

    ROWGROUP_SIZE_MB = 256
    with materialize_dataset(spark, output_url, ImagenetSchema, ROWGROUP_SIZE_MB):
        # list of [(nXXXX, 'noun-text'), ...]
        noun_id_text_list = map(lambda noun_id: (noun_id, noun_id_to_text[noun_id]), noun_ids)

        # rdd of [(nXXXX, 'noun-text', path), ...]
        noun_id_text_image_path_rdd = sc.parallelize(noun_id_text_list, min(len(noun_ids) / 10 + 1, 10000)) \
            .flatMap(lambda word_id_label: [word_id_label + (image_path,) for image_path in
                                            glob.glob(os.path.join(imagenet_path, word_id_label[0], '*.JPEG'))])

        # rdd of [(nXXXX, 'noun-text', image), ...]
        noun_id_text_image_rdd = noun_id_text_image_path_rdd \
            .map(lambda id_word_image_path:
                 {ImagenetSchema.noun_id.name: id_word_image_path[0],
                  ImagenetSchema.text.name: id_word_image_path[1],
                  ImagenetSchema.image.name: cv2.imread(id_word_image_path[2])})

        # Convert to pyspark.sql.Row
        sql_rows_rdd = noun_id_text_image_rdd.map(lambda r: dict_to_spark_row(ImagenetSchema, r))

        # Write out the result
        spark.createDataFrame(sql_rows_rdd, ImagenetSchema.as_spark_schema()) \
            .coalesce(parquet_files_count) \
            .write \
            .mode('overwrite') \
            .option('compression', 'none') \
            .parquet(output_url)
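
The ImagenetSchema used above is defined next to this function in petastorm's imagenet example. Roughly (a sketch based on that example; the exact codec parameters are an assumption here):

import numpy as np
from pyspark.sql.types import StringType
from petastorm.codecs import CompressedImageCodec, ScalarCodec
from petastorm.unischema import Unischema, UnischemaField

ImagenetSchema = Unischema('ImagenetSchema', [
    UnischemaField('noun_id', np.string_, (), ScalarCodec(StringType()), False),
    UnischemaField('text', np.string_, (), ScalarCodec(StringType()), False),
    UnischemaField('image', np.uint8, (None, None, 3), CompressedImageCodec('png'), False),
])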
Example #23
def _create_dataset(store, df, validation, compress_sparse, num_partitions,
                    num_workers, dataset_idx, parquet_row_group_size_mb,
                    verbose):
    train_data_path = store.get_train_data_path(dataset_idx)
    val_data_path = store.get_val_data_path(dataset_idx)
    if verbose >= 1:
        print('CEREBRO => Time: {}, Writing DataFrames'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
        print('CEREBRO => Time: {}, Train Data Path: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            train_data_path))
        print('CEREBRO => Time: {}, Val Data Path: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            val_data_path))

    schema_cols = df.columns

    if isinstance(validation, str):
        schema_cols.append(validation)
    df = df[schema_cols]

    metadata = None
    if _has_vector_column(df):
        if compress_sparse:
            metadata = _get_metadata(df)
        to_petastorm = to_petastorm_fn(schema_cols, metadata)
        df = df.rdd.map(to_petastorm).toDF()

    train_df, val_df, validation_ratio = _train_val_split(df, validation)

    unischema_fields = []
    metadata = _get_metadata(train_df)
    for k in metadata.keys():
        type = spark_to_petastorm_type(metadata[k]['spark_data_type'])
        shape = petastorm_unischema_shape(metadata[k]['shape'])
        codec = petastorm_unischema_codec(metadata[k]['shape'],
                                          metadata[k]['spark_data_type'])
        unischema_fields.append(UnischemaField(k, type, shape, codec, False))

    petastorm_schema = Unischema('petastorm_schema', unischema_fields)

    train_partitions = max(int(num_partitions * (1.0 - validation_ratio)),
                           num_workers)
    if verbose >= 1:
        print('CEREBRO => Time: {}, Train Partitions: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            train_partitions))

    spark = SparkSession.builder.getOrCreate()
    # FIXME pass hdfs_driver from user interface instead of hardcoded PETASTORM_HDFS_DRIVER
    train_resolver = FilesystemResolver(
        train_data_path,
        spark.sparkContext._jsc.hadoopConfiguration(),
        user=spark.sparkContext.sparkUser(),
        hdfs_driver=constants.PETASTORM_HDFS_DRIVER)
    with materialize_dataset(
            spark,
            train_data_path,
            petastorm_schema,
            parquet_row_group_size_mb,
            filesystem_factory=train_resolver.filesystem_factory()):
        train_rdd = train_df.rdd.map(lambda x: x.asDict()).map(
            lambda x: {k: np.array(x[k], dtype=spark_to_petastorm_type(metadata[k]['spark_data_type'])) for k in x}) \
            .map(lambda x: dict_to_spark_row(petastorm_schema, x))

        spark.createDataFrame(train_rdd, petastorm_schema.as_spark_schema()) \
            .coalesce(train_partitions) \
            .write \
            .mode('overwrite') \
            .parquet(train_data_path)

    if val_df:
        val_partitions = max(int(num_partitions * validation_ratio),
                             num_workers)
        if verbose >= 1:
            print('CEREBRO => Time: {}, Val Partitions: {}'.format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                val_partitions))
        val_resolver = FilesystemResolver(
            val_data_path,
            spark.sparkContext._jsc.hadoopConfiguration(),
            user=spark.sparkContext.sparkUser(),
            hdfs_driver=constants.PETASTORM_HDFS_DRIVER)
        with materialize_dataset(
                spark,
                val_data_path,
                petastorm_schema,
                parquet_row_group_size_mb,
                filesystem_factory=val_resolver.filesystem_factory()):
            val_rdd = val_df.rdd.map(lambda x: x.asDict()).map(
                lambda x: {k: np.array(x[k], dtype=spark_to_petastorm_type(metadata[k]['spark_data_type'])) for k in x}) \
                .map(lambda x: dict_to_spark_row(petastorm_schema, x))

            spark.createDataFrame(val_rdd, petastorm_schema.as_spark_schema()) \
                .coalesce(val_partitions) \
                .write \
                .mode('overwrite') \
                .parquet(val_data_path)

    train_rows, val_rows, pq_metadata, avg_row_size = get_simple_meta_from_parquet(
        store, df.columns, dataset_idx)

    if verbose:
        print('CEREBRO => Time: {}, Train Rows: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), train_rows))
    if val_df:
        if val_rows == 0:
            raise ValueError(
                'Validation DataFrame does not contain any samples with validation param {}'
                .format(validation))
        if verbose:
            print('CEREBRO => Time: {}, Val Rows: {}'.format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                val_rows))

    return train_rows, val_rows, pq_metadata, avg_row_size