def test_dtype(test_ctx):
    df = test_ctx.spark.range(10)
    df = df.withColumn("float_col", df.id.cast(FloatType())) \
        .withColumn("double_col", df.id.cast(DoubleType()))

    converter1 = make_spark_converter(df)
    with converter1.make_tf_dataset() as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)
    assert np.float32 == ts.double_col.dtype.type

    converter2 = make_spark_converter(df, dtype='float64')
    with converter2.make_tf_dataset() as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)
    assert np.float64 == ts.float_col.dtype.type

    converter3 = make_spark_converter(df, dtype=None)
    with converter3.make_tf_dataset() as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)
    assert np.float32 == ts.float_col.dtype.type
    assert np.float64 == ts.double_col.dtype.type

    with pytest.raises(ValueError,
                       match="dtype float16 is not supported. \
            Use 'float32' or float64"):
        make_spark_converter(df, dtype="float16")
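
# A minimal sketch, not part of the original tests: under TF2 eager execution the
# dataset returned by make_tf_dataset() can be iterated directly, without the
# make_one_shot_iterator()/tf.Session pattern used above. The helper name is
# hypothetical.
def _peek_first_batch_eager(converter):
    with converter.make_tf_dataset(num_epochs=1) as dataset:
        for batch in dataset:
            # batch is a namedtuple of tensors, e.g. batch.float_col, batch.double_col
            return batch
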
def test_compression(test_ctx):
    df1 = test_ctx.spark.range(10)

    converter1 = make_spark_converter(df1)
    assert "uncompressed" == \
           _get_compression_type(converter1.cache_dir_url).lower()

    converter2 = make_spark_converter(df1, compression_codec="snappy")
    assert "snappy" == \
           _get_compression_type(converter2.cache_dir_url).lower()
def run(data_dir):
    # Get SparkSession
    spark = SparkSession.builder \
        .master("local[2]") \
        .appName("petastorm.spark tensorflow_example") \
        .getOrCreate()

    # Load and preprocess data using Spark
    df = spark.read.format("libsvm") \
        .option("numFeatures", "784") \
        .load(data_dir) \
        .select(col("features"), col("label").cast("long").alias("label"))

    # Randomly split data into train and test dataset
    df_train, df_test = df.randomSplit([0.9, 0.1], seed=12345)

    # Set a cache directory for intermediate data.
    # The path should be accessible by both Spark workers and driver.
    spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF,
                   "file:///tmp/petastorm/cache/tf-example")

    converter_train = make_spark_converter(df_train)
    converter_test = make_spark_converter(df_test)

    def train_and_evaluate(_=None):
        import tensorflow as tf

        with converter_train.make_tf_dataset() as dataset:
            dataset = dataset.map(
                lambda x: (tf.reshape(x.features, [-1, 28, 28]), x.label))
            model = train(dataset)

        with converter_test.make_tf_dataset(num_epochs=1) as dataset:
            dataset = dataset.map(
                lambda x: (tf.reshape(x.features, [-1, 28, 28]), x.label))
            hist = model.evaluate(dataset)

        return hist[1]

    # Train and evaluate the model on the local machine
    accuracy = train_and_evaluate()
    logging.info("Train and evaluate the model on the local machine.")
    logging.info("Accuracy: %.6f", accuracy)

    # Train and evaluate the model on a spark worker
    accuracy = spark.sparkContext.parallelize(
        range(1)).map(train_and_evaluate).collect()[0]
    logging.info("Train and evaluate the model remotely on a spark worker, "
                 "which can be used for distributed hyperparameter tuning.")
    logging.info("Accuracy: %.6f", accuracy)

    # Cleanup
    converter_train.delete()
    converter_test.delete()
    spark.stop()
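
# A minimal invocation sketch, not part of the original example: the command-line
# flag and its default data path below are hypothetical placeholders.
if __name__ == "__main__":
    import argparse
    import logging

    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(
        description="petastorm.spark TensorFlow example runner (sketch)")
    parser.add_argument("--data-dir", default="/tmp/mnist-libsvm",
                        help="directory containing the MNIST data in libsvm format")
    args = parser.parse_args()
    run(args.data_dir)
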
def test_df_delete_caching_meta(test_ctx):
    from petastorm.spark.spark_dataset_converter import _cache_df_meta_list
    df1 = test_ctx.spark.range(10)
    df2 = test_ctx.spark.range(20)
    converter1 = make_spark_converter(df1)
    converter2 = make_spark_converter(df2)
    converter1.delete()
    cached_list = set(map(lambda x: x.cache_dir_url, _cache_df_meta_list))
    assert converter1.cache_dir_url not in cached_list
    assert converter2.cache_dir_url in cached_list
    # test recreate converter1 after delete should work.
    make_spark_converter(df1)
def test_torch_unexpected_param(test_ctx):
    df = test_ctx.spark.range(8)
    conv = make_spark_converter(df)

    with pytest.raises(TypeError, match="unexpected keyword argument 'xyz'"):
        with conv.make_torch_dataloader(xyz=1) as _:
            pass
def test_torch_dataloader_advanced_params(mock_torch_make_batch_reader,
                                          test_ctx):
    SHARD_COUNT = 3
    df = test_ctx.spark.range(100).repartition(SHARD_COUNT)
    conv = make_spark_converter(df)

    mock_torch_make_batch_reader.return_value = \
        make_batch_reader(conv.cache_dir_url)

    with conv.make_torch_dataloader(reader_pool_type='dummy',
                                    cur_shard=1,
                                    shard_count=SHARD_COUNT) as _:
        pass
    peta_args = mock_torch_make_batch_reader.call_args.kwargs
    assert peta_args['reader_pool_type'] == 'dummy' and \
        peta_args['cur_shard'] == 1 and \
        peta_args['shard_count'] == SHARD_COUNT and \
        peta_args['num_epochs'] is None and \
        peta_args['workers_count'] == 4

    # Test default value overridden arguments.
    with conv.make_torch_dataloader(num_epochs=1, workers_count=2) as _:
        pass
    peta_args = mock_torch_make_batch_reader.call_args.kwargs
    assert peta_args['num_epochs'] == 1 and peta_args['workers_count'] == 2
def test_df_caching(test_ctx):
    df1 = test_ctx.spark.range(10)
    df2 = test_ctx.spark.range(10)
    df3 = test_ctx.spark.range(20)

    # Test caching for the dataframes with the same logical plan
    converter1 = make_spark_converter(df1)
    converter2 = make_spark_converter(df2)
    assert converter1.cache_dir_url == converter2.cache_dir_url

    # Test no caching for different dataframes
    converter3 = make_spark_converter(df3)
    assert converter1.cache_dir_url != converter3.cache_dir_url

    # Test no caching for the same dataframe with different row group size
    converter11 = make_spark_converter(df1,
                                       parquet_row_group_size_bytes=8 * 1024 *
                                       1024)
    converter21 = make_spark_converter(df1,
                                       parquet_row_group_size_bytes=16 * 1024 *
                                       1024)
    assert converter11.cache_dir_url != converter21.cache_dir_url

    # Test no caching for the same dataframe with different compression_codec
    converter12 = make_spark_converter(df1, compression_codec=None)
    converter22 = make_spark_converter(df1, compression_codec="snappy")
    assert converter12.cache_dir_url != converter22.cache_dir_url

    ori_temp_url = test_ctx.spark.conf.get(
        SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF)
    tempdir = tempfile.mkdtemp('_spark_converter_test1')
    new_temp_url = 'file://' + tempdir.replace(os.sep, '/')
    try:
        # Test no caching for the same dataframe with different parent cache dirs
        test_ctx.spark.conf.set(
            SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF, new_temp_url)
        assert ori_temp_url != new_temp_url
        converter13 = make_spark_converter(df1)
        assert converter1.cache_dir_url != converter13.cache_dir_url

        # Test caching for the same dataframe with different parent cache dirs
        # that could be normalized to the same parent cache dir
        new_temp_url_2 = new_temp_url + os.sep
        test_ctx.spark.conf.set(
            SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF, new_temp_url_2)
        assert new_temp_url != new_temp_url_2
        converter14 = make_spark_converter(df1)
        assert converter13.cache_dir_url == converter14.cache_dir_url
    finally:
        test_ctx.spark.conf.set(
            SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF, ori_temp_url)
def test_torch_batch_size(test_ctx):
    df = test_ctx.spark.range(8)
    conv = make_spark_converter(df)
    batch_size = 2
    with conv.make_torch_dataloader(batch_size=batch_size,
                                    num_epochs=1) as dataloader:
        for batch in dataloader:
            assert batch_size == batch['id'].shape[0]
def test_delete(test_ctx):
    df = test_ctx.spark.createDataFrame([(1, 2), (4, 5)], ["col1", "col2"])
    # TODO add test for hdfs url
    converter = make_spark_converter(df)
    local_path = urlparse(converter.cache_dir_url).path
    assert os.path.exists(local_path)
    converter.delete()
    assert not os.path.exists(local_path)
def test_torch_data_loader_fn(spark_test_ctx):
    from petastorm.pytorch import BatchedDataLoader

    df = spark_test_ctx.spark.range(8)
    conv = make_spark_converter(df)
    with conv.make_torch_dataloader(data_loader_fn=BatchedDataLoader,
                                    batch_size=2,
                                    num_epochs=1) as dataloader:
        assert isinstance(dataloader, BatchedDataLoader)
def test_tf_autograph(spark_test_ctx, caplog):
    caplog.clear()
    df1 = spark_test_ctx.spark.range(100)
    converter1 = make_spark_converter(df1)
    results = []
    with converter1.make_tf_dataset(num_epochs=1) as dataset:
        for batch in dataset:
            results.append(batch)
    assert "AutoGraph could not transform" not in " ".join(caplog.messages)
def test_primitive(test_ctx):
    schema = StructType([
        StructField("bool_col", BooleanType(), False),
        StructField("float_col", FloatType(), False),
        StructField("double_col", DoubleType(), False),
        StructField("short_col", ShortType(), False),
        StructField("int_col", IntegerType(), False),
        StructField("long_col", LongType(), False),
        StructField("str_col", StringType(), False),
        StructField("bin_col", BinaryType(), False),
        StructField("byte_col", ByteType(), False),
    ])
    df = test_ctx.spark.createDataFrame(
        [(True, 0.12, 432.1, 5, 5, 0, "hello", bytearray(b"spark\x01\x02"),
          -128),
         (False, 123.45, 0.987, 9, 908, 765, "petastorm",
          bytearray(b"\x0012345"), 127)],
        schema=schema).coalesce(1)
    # If we use numPartition > 1, the order of the loaded dataset would
    # be non-deterministic.
    expected_df = df.collect()

    converter = make_spark_converter(df)
    with converter.make_tf_dataset() as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)
            # TODO: we will improve the test once the batch_size argument
            #  added.
            # Now we only have one batch.
        for i in range(converter.dataset_size):
            for col in df.schema.names:
                actual_ele = getattr(ts, col)[i]
                expected_ele = expected_df[i][col]
                if col == "str_col":
                    actual_ele = actual_ele.decode()
                if col == "bin_col":
                    actual_ele = bytearray(actual_ele)
                if col == "float_col" or col == "double_col":
                    # Note that the default dtype is float32
                    assert pytest.approx(expected_ele, rel=1e-6) == actual_ele
                else:
                    assert expected_ele == actual_ele

        assert len(expected_df) == len(converter)

    assert np.bool_ == ts.bool_col.dtype.type
    assert np.float32 == ts.float_col.dtype.type
    # Default dtype float32
    assert np.float32 == ts.double_col.dtype.type
    assert np.int16 == ts.short_col.dtype.type
    assert np.int32 == ts.int_col.dtype.type
    assert np.int64 == ts.long_col.dtype.type
    assert np.object_ == ts.str_col.dtype.type
    assert np.object_ == ts.bin_col.dtype.type
def test_array(test_ctx):
    df = test_ctx.spark.createDataFrame(
        [([1., 2., 3.], ), ([4., 5., 6.], )],
        StructType([StructField(name='c1', dataType=ArrayType(DoubleType()))]))
    converter1 = make_spark_converter(df)
    with converter1.make_tf_dataset() as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)
    assert np.float32 == ts.c1.dtype.type
def test_tf_dataset_batch_size(test_ctx):
    df1 = test_ctx.spark.range(100)

    batch_size = 30
    converter1 = make_spark_converter(df1)

    with converter1.make_tf_dataset(batch_size=batch_size) as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)
    assert len(ts.id) == batch_size
def test_torch_pickling_remotely(test_ctx):
    df1 = test_ctx.spark.range(100, 101)
    converter1 = make_spark_converter(df1)

    def map_fn(_):
        with converter1.make_torch_dataloader(num_epochs=1) as dataloader:
            for batch in dataloader:
                ret = batch["id"][0]
        return ret

    result = test_ctx.spark.sparkContext.parallelize(range(1), 1) \
        .map(map_fn).collect()[0]
    assert result == 100
def test_array_field(spark_test_ctx):
    @pandas_udf('array<float>')
    def gen_array(v):
        return v.map(lambda x: np.random.rand(10))
    df1 = spark_test_ctx.spark.range(10).withColumn('v', gen_array('id')).repartition(2)
    cv1 = make_spark_converter(df1)
    # we can auto infer one-dim array shape
    with cv1.make_tf_dataset(batch_size=4, num_epochs=1) as dataset:
        tf_iter = dataset.make_one_shot_iterator()
        next_op = tf_iter.get_next()
        with tf.Session() as sess:
            batch1 = sess.run(next_op)
        assert batch1.v.shape == (4, 10)
def test_pickling_remotely(test_ctx):
    df1 = test_ctx.spark.range(100, 101)
    converter1 = make_spark_converter(df1)

    def map_fn(_):
        with converter1.make_tf_dataset() as dataset:
            iterator = dataset.make_one_shot_iterator()
            tensor = iterator.get_next()
            with tf.Session() as sess:
                ts = sess.run(tensor)
        return getattr(ts, 'id')[0]

    result = test_ctx.spark.sparkContext.parallelize(range(1), 1).map(map_fn).collect()[0]
    assert result == 100
def test_tf_dataset_preproc(test_ctx):
    df1 = test_ctx.spark.createDataFrame(
        [([1., 2., 3., 4., 5., 6.],),
         ([4., 5., 6., 7., 8., 9.],)],
        StructType([StructField(name='c1', dataType=ArrayType(DoubleType()))]))

    converter1 = make_spark_converter(df1)

    def preproc_fn(x):
        return tf.reshape(x.c1, [-1, 3, 2]),

    with converter1.make_tf_dataset(batch_size=2, preproc_fn=preproc_fn) as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)

    assert ts[0].shape == (2, 3, 2)
def test_torch_transform_spec(test_ctx):
    df = test_ctx.spark.range(8)
    conv = make_spark_converter(df)

    from torchvision import transforms
    from petastorm import TransformSpec

    def _transform_row(df_row):
        scale_transform = transforms.Compose([
            transforms.Lambda(lambda x: x * 0.1),
        ])
        return scale_transform(df_row)

    transform = TransformSpec(_transform_row)
    with conv.make_torch_dataloader(transform_spec=transform,
                                    num_epochs=1) as dataloader:
        for batch in dataloader:
            assert min(batch['id']) >= 0 and max(batch['id']) < 1
def test_vector_to_array(test_ctx):
    from pyspark.ml.linalg import Vectors
    from pyspark.mllib.linalg import Vectors as OldVectors
    df = test_ctx.spark.createDataFrame([
        (Vectors.dense(1.0, 2.0, 3.0), OldVectors.dense(10.0, 20.0, 30.0)),
        (Vectors.dense(5.0, 6.0, 7.0), OldVectors.dense(50.0, 60.0, 70.0))],
                                        ["vec", "oldVec"])
    converter1 = make_spark_converter(df)
    with converter1.make_tf_dataset() as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)
    assert np.float32 == ts.vec.dtype.type
    assert np.float32 == ts.oldVec.dtype.type
    assert (2, 3) == ts.vec.shape
    assert (2, 3) == ts.oldVec.shape
    # Comparing a list against an ndarray yields an element-wise boolean array,
    # so reduce with .all() before asserting.
    assert ([[1., 2., 3.], [5., 6., 7.]] == ts.vec).all()
    assert ([[10., 20., 30.], [50., 60., 70.]] == ts.oldVec).all()
def test_tf_dataset_petastorm_args(mock_make_batch_reader, spark_test_ctx):
    df1 = spark_test_ctx.spark.range(100).repartition(4)
    conv1 = make_spark_converter(df1)

    mock_make_batch_reader.return_value = make_batch_reader(conv1.cache_dir_url)

    with conv1.make_tf_dataset(reader_pool_type='dummy', cur_shard=1, shard_count=4):
        pass
    peta_args = mock_make_batch_reader.call_args[1]
    assert peta_args['reader_pool_type'] == 'dummy' and \
        peta_args['cur_shard'] == 1 and \
        peta_args['shard_count'] == 4 and \
        peta_args['num_epochs'] is None and \
        peta_args['workers_count'] == 4

    with conv1.make_tf_dataset(num_epochs=1, workers_count=2):
        pass
    peta_args = mock_make_batch_reader.call_args[1]
    assert peta_args['num_epochs'] == 1 and peta_args['workers_count'] == 2
def test_torch_primitive(test_ctx):
    import torch

    schema = StructType([
        StructField("bool_col", BooleanType(), False),
        StructField("float_col", FloatType(), False),
        StructField("double_col", DoubleType(), False),
        StructField("short_col", ShortType(), False),
        StructField("int_col", IntegerType(), False),
        StructField("long_col", LongType(), False),
        StructField("byte_col", ByteType(), False),
    ])
    df = test_ctx.spark.createDataFrame(
        [(True, 0.12, 432.1, 5, 5, 0, -128),
         (False, 123.45, 0.987, 9, 908, 765, 127)],
        schema=schema).coalesce(1)
    # If we use numPartition > 1, the order of the loaded dataset would
    # be non-deterministic.
    expected_df = df.collect()

    converter = make_spark_converter(df)
    batch = None
    with converter.make_torch_dataloader(num_epochs=1) as dataloader:
        for i, batch in enumerate(dataloader):
            # default batch_size = 1
            for col in df.schema.names:
                actual_ele = batch[col][0]
                expected_ele = expected_df[i][col]
                if col == "float_col" or col == "double_col":
                    # Note that the default dtype is float32
                    assert pytest.approx(expected_ele, rel=1e-6) == actual_ele
                else:
                    assert expected_ele == actual_ele

        assert len(expected_df) == len(converter)
    assert torch.uint8 == batch["bool_col"].dtype
    assert torch.int8 == batch["byte_col"].dtype
    assert torch.float32 == batch["double_col"].dtype
    assert torch.float32 == batch["float_col"].dtype
    assert torch.int32 == batch["int_col"].dtype
    assert torch.int64 == batch["long_col"].dtype
    assert torch.int16 == batch["short_col"].dtype
def test_df_caching(test_ctx):
    df1 = test_ctx.spark.range(10)
    df2 = test_ctx.spark.range(10)
    df3 = test_ctx.spark.range(20)

    converter1 = make_spark_converter(df1)
    converter2 = make_spark_converter(df2)
    assert converter1.cache_dir_url == converter2.cache_dir_url

    converter3 = make_spark_converter(df3)
    assert converter1.cache_dir_url != converter3.cache_dir_url

    converter11 = make_spark_converter(
        df1, parquet_row_group_size_bytes=8 * 1024 * 1024)
    converter21 = make_spark_converter(
        df1, parquet_row_group_size_bytes=16 * 1024 * 1024)
    assert converter11.cache_dir_url != converter21.cache_dir_url

    converter12 = make_spark_converter(df1, compression_codec=None)
    converter22 = make_spark_converter(df1, compression_codec="snappy")
    assert converter12.cache_dir_url != converter22.cache_dir_url
def test_vector_to_array(spark_test_ctx):
    from pyspark.ml.linalg import Vectors
    from pyspark.mllib.linalg import Vectors as OldVectors
    df = spark_test_ctx.spark.createDataFrame(
        [(Vectors.dense(1.0, 2.0, 3.0), OldVectors.dense(10.0, 20.0, 30.0)),
         (Vectors.dense(5.0, 6.0, 7.0), OldVectors.dense(50.0, 60.0, 70.0))],
        ["vec", "oldVec"])
    converter1 = make_spark_converter(df)
    with converter1.make_tf_dataset(num_epochs=1) as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)
    assert np.float32 == ts.vec.dtype.type
    assert np.float32 == ts.oldVec.dtype.type
    vec_col = ts.vec[ts.vec[:, 0].argsort()]
    old_vec_col = ts.oldVec[ts.oldVec[:, 0].argsort()]
    assert (2, 3) == ts.vec.shape
    assert (2, 3) == ts.oldVec.shape
    assert ([1., 2., 3.] == vec_col[0]).all() and \
           ([5., 6., 7.] == vec_col[1]).all()
    assert ([10., 20., 30.] == old_vec_col[0]).all() and \
           ([50., 60., 70] == old_vec_col[1]).all()
def test_advanced_params(test_ctx):
    df = test_ctx.spark.range(8)
    conv = make_spark_converter(df)
    batch_size = 2
    with conv.make_torch_dataloader(batch_size=batch_size,
                                    num_epochs=1) as dataloader:
        for batch in dataloader:
            assert batch_size == batch['id'].shape[0]

    from torchvision import transforms
    from petastorm import TransformSpec

    def _transform_row(df_row):
        scale_transform = transforms.Compose([
            transforms.Lambda(lambda x: x * 0.1),
        ])
        return scale_transform(df_row)

    transform = TransformSpec(_transform_row)
    with conv.make_torch_dataloader(transform_spec=transform,
                                    num_epochs=1) as dataloader:
        for batch in dataloader:
            assert min(batch['id']) >= 0 and max(batch['id']) < 1

    with pytest.raises(TypeError, match="unexpected keyword argument 'xyz'"):
        conv.make_torch_dataloader(xyz=1)

    def mock_make_batch_reader(dataset_url,
                               schema_fields=None,
                               reader_pool_type='thread', workers_count=10,
                               shuffle_row_groups=True, shuffle_row_drop_partitions=1,
                               predicate=None,
                               rowgroup_selector=None,
                               num_epochs=1,
                               cur_shard=None, shard_count=None,
                               cache_type='null', cache_location=None, cache_size_limit=None,
                               cache_row_size_estimate=None, cache_extra_settings=None,
                               hdfs_driver='libhdfs3',
                               transform_spec=None):
        return {
            "dataset_url": dataset_url,
            "schema_fields": schema_fields,
            "reader_pool_type": reader_pool_type,
            "workers_count": workers_count,
            "shuffle_row_groups": shuffle_row_groups,
            "shuffle_row_drop_partitions": shuffle_row_drop_partitions,
            "predicate": predicate,
            "rowgroup_selector": rowgroup_selector,
            "num_epochs": num_epochs,
            "cur_shard": cur_shard,
            "shard_count": shard_count,
            "cache_type": cache_type,
            "cache_location": cache_location,
            "cache_size_limit": cache_size_limit,
            "cache_row_size_estimate": cache_row_size_estimate,
            "cache_extra_settings": cache_extra_settings,
            "hdfs_driver": hdfs_driver,
            "transform_spec": transform_spec,
        }

    original_fn = petastorm.make_batch_reader
    petastorm.make_batch_reader = mock_make_batch_reader
    ctm = conv.make_torch_dataloader(schema_fields="schema_1",
                                     reader_pool_type='type_1',
                                     workers_count="count_1",
                                     shuffle_row_groups="row_group_1",
                                     shuffle_row_drop_partitions="drop_1",
                                     predicate="predicate_1",
                                     rowgroup_selector="selector_1",
                                     num_epochs="num_1",
                                     cur_shard="shard_1",
                                     shard_count="total_shard",
                                     cache_type="cache_1",
                                     cache_location="location_1",
                                     cache_size_limit="limit_1",
                                     cache_extra_settings="extra_1",
                                     hdfs_driver="driver_1",
                                     transform_spec="transform_spec_1")
    assert ctm.reader["schema_fields"] == "schema_1"
    assert ctm.reader["reader_pool_type"] == "type_1"
    assert ctm.reader["workers_count"] == "count_1"
    assert ctm.reader["shuffle_row_groups"] == "row_group_1"
    assert ctm.reader["shuffle_row_drop_partitions"] == "drop_1"
    assert ctm.reader["predicate"] == "predicate_1"
    assert ctm.reader["rowgroup_selector"] == "selector_1"
    assert ctm.reader["num_epochs"] == "num_1"
    assert ctm.reader["cur_shard"] == "shard_1"
    assert ctm.reader["shard_count"] == "total_shard"
    assert ctm.reader["cache_type"] == "cache_1"
    assert ctm.reader["cache_location"] == "location_1"
    assert ctm.reader["cache_size_limit"] == "limit_1"
    assert ctm.reader["cache_extra_settings"] == "extra_1"
    assert ctm.reader["hdfs_driver"] == "driver_1"
    assert ctm.reader["transform_spec"] == "transform_spec_1"

    petastorm.make_batch_reader = original_fn
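
# A hedged aside, not part of the original test: the manual swap of
# petastorm.make_batch_reader above is never restored if an assertion fails before
# the final line. unittest.mock.patch restores the original automatically, even on
# error; the helper below and its arguments are hypothetical.
def _make_dataloader_with_patched_reader(conv, fake_make_batch_reader, **dataloader_kwargs):
    from unittest import mock
    with mock.patch("petastorm.make_batch_reader", new=fake_make_batch_reader):
        return conv.make_torch_dataloader(**dataloader_kwargs)
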
def basic_transform(image):
    # Hedged reconstruction: the body of this helper was cut off in the excerpt;
    # only the final per-image standardization step survives. The function name
    # is taken from its use in the evaluation dataset below.
    return tf.image.per_image_standardization(image)


# COMMAND ----------

from petastorm.spark import SparkDatasetConverter, make_spark_converter

spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF,
               "file:///dbfs/ml/petastormcache")
data = spark.read.format("delta").load("/Users/msh/nihxray/nih_xray.delta")
_, testDf = data.select("image", "labels").randomSplit([0.9, 0.01], seed=42)

test_rows = testDf.count()
print(test_rows)

converter_test = make_spark_converter(testDf)

# COMMAND ----------

# DBTITLE 1,Prepare evaluation dataset
with converter_test.make_tf_dataset() as test_dataset:

    test_dataset = (test_dataset.unbatch().batch(test_rows).map(
        lambda x: (basic_transform(x.image),
                   tf.reshape(tf.cast(x.labels, dtype=tf.uint8), (-1, 14)))))

    x_test, y_test = next(iter(test_dataset))

# COMMAND ----------

# DBTITLE 1,We can access MLflow experiments using native Spark
# MAGIC ## Test / Train Split with Petastorm

# COMMAND ----------

from petastorm.spark import SparkDatasetConverter, make_spark_converter

spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF,
               "file:///dbfs/ml/petastormcache")

trainDf, testDf = data.select("image", "labels").randomSplit([0.9, 0.1],
                                                             seed=42)

train_rows = trainDf.count()
test_rows = testDf.count()

converter_train = make_spark_converter(trainDf)
converter_test = make_spark_converter(testDf)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Model using DenseNet architecture

# COMMAND ----------

import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Concatenate, Activation, Input, Dense, Dropout, Flatten, Lambda, Conv2D, MaxPooling2D, BatchNormalization
from tensorflow.keras.optimizers import Nadam, SGD, Adam
from tensorflow.keras.regularizers import l2
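
# COMMAND ----------

# A minimal sketch only, not the notebook's actual model (the DenseNet definition
# is not included in this excerpt): one densely connected block in the DenseNet
# spirit, built from the layers imported above. Filter counts and depths are
# illustrative placeholders.
def dense_block_sketch(x, num_layers=4, growth_rate=12):
    for _ in range(num_layers):
        y = BatchNormalization()(x)
        y = Activation("relu")(y)
        y = Conv2D(growth_rate, kernel_size=3, padding="same")(y)
        # Dense connectivity: concatenate the new feature maps with all previous ones.
        x = Concatenate()([x, y])
    return x
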
def train(data_conf, model_conf, **kwargs):

    try:
        print("-----------------------------------")
        print("Starting Cashflow DL Model Training")
        print("-----------------------------------")
        print()

        # ==============================
        # 0. Main parameters definitions
        # ==============================

        # Size of X and y arrays definition
        N_days_X, N_days_y = int(data_conf['number_of_historical_days']), int(
            data_conf['number_of_predicted_days'])  #365, 92
        print('Number of days used for prediction (X): ', N_days_X)
        print('Number of days predicted (y): ', N_days_y)
        print()

        # Date range definition
        start_date, end_date = data_conf['start_date'], data_conf['end_date']
        start_date_dt, end_date_dt, start_date_prediction, end_date_prediction, end_date_plusOneDay, end_date_minus_6month = dates_definitions(
            start_date, end_date, N_days_X, N_days_y)
        print('Date range: ', start_date, end_date)
        print()

        model_name = model_conf['model_name']

    except Exception as e:
        print("Errored on initialization")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ========================================
        # T.1 Pre-processing before model training
        # ========================================

        # Loading dataset
        table_in = data_conf[environment]['table_to_train_on']
        #ts_balance = spark.read.parquet("/mnt/test/{0}.parquet".format(table_in)).cache()
        ts_balance = spark.read.format("delta").load(
            "/mnt/delta/{0}".format(table_in))

        # Cleaning of the time series
        ts_balance = ts_balance.withColumn(
            'balance', ts_balance.balance.cast("array<float>"))

        ts_balance = ts_balance.withColumn(
            'keep_ts',
            F.udf(lambda x, y: time_series_cleaning(x, y), "int")('balance',
                                                                  F.lit(20))
        )  # keep only time series with at least a minimum number of transactions (F.lit(20) here) for training

        ts_balance = ts_balance.where('keep_ts == 1')

        # Creating the dataset on which we train (and test and validate) the model
        ts_balance_model = ts_balance.sample(
            False, 0.7,
            seed=0)  #now 0.7, but in real case would be 0.1 at best... or 0.05
        print('ts_balance_model.count()', ts_balance_model.count())

        # Pre-processing before model training
        ts_balance_model = pre_processing(ts_balance_model,
                                          end_date,
                                          spark,
                                          serving=False)
        ts_balance_model.show(3)

        print('ts_balance_model.rdd.getNumPartitions()',
              ts_balance_model.rdd.getNumPartitions())
        ts_balance_model.show(3)

        # Saving prepared dataset
        table_out = 'cashflow_training_step1'
        #ts_balance_model.write.format("parquet").mode("overwrite").save("/mnt/test/{0}.parquet".format(table_out))
        ts_balance_model.write.format("delta").mode("overwrite").save(
            "/mnt/delta/{0}".format(table_out))

    except Exception as e:
        print("Errored on step T.1: pre-processing before model training")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ========================================
        # T.2 Generating TRAIN, VAL, TEST datasets
        # ========================================

        # Loading datasets
        table_model = 'cashflow_training_step1'
        #ts_balance_model = spark.read.parquet("/mnt/test/{0}.parquet".format(table_model)).cache()
        ts_balance_model = spark.read.format("delta").load(
            "/mnt/delta/{0}".format(table_model)).cache()
        ts_balance_model.show(3)

        print('ts_balance_model.count()', ts_balance_model.count())
        print('ts_balance_model.rdd.getNumPartitions()',
              ts_balance_model.rdd.getNumPartitions())

        train_set, val_set, test_set = ts_balance_model.randomSplit(
            [0.6, 0.2, 0.2], seed=12345)
        train_set.show(3)
        print(
            'train_set.rdd.getNumPartitions(), val_set.rdd.getNumPartitions(), test_set.rdd.getNumPartitions()',
            train_set.rdd.getNumPartitions(), val_set.rdd.getNumPartitions(),
            test_set.rdd.getNumPartitions())

        # Saving prepared datasets (train, val, test sets to parquet)
        table_train = 'cashflow_train'
        table_val = 'cashflow_val'
        table_test = data_conf[environment][
            'table_test_for_performance']  #'cashflow_test'

        train_set.select('X',
                         'y').write.format("delta").mode("overwrite").save(
                             "/mnt/delta/{0}".format(table_train))
        val_set.select('X', 'y').write.format("delta").mode("overwrite").save(
            "/mnt/delta/{0}".format(table_val))
        test_set.select('primaryaccountholder', 'transactiondate', 'balance') \
            .write.format("delta").mode("overwrite").save(
                "/mnt/delta/{0}".format(table_test))

    except Exception as e:
        print("Errored on step T.2: pre-processings")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ==============================
        # T.3 MODEL DEFINITION AND TRAIN
        # ==============================

        table_train = 'cashflow_train'
        table_val = 'cashflow_val'
        #table_train = spark.read.parquet("/mnt/test/{0}.parquet".format(table_train))
        table_train = spark.read.format("delta").load(
            "/mnt/delta/{0}".format(table_train))
        #table_val = spark.read.parquet("/mnt/test/{0}.parquet".format(table_val))
        table_val = spark.read.format("delta").load(
            "/mnt/delta/{0}".format(table_val))
        table_train_count = table_train.count()
        table_val_count = table_val.count()
        #table_train_count, table_val_count

        from pyspark.sql.functions import col
        from petastorm.spark import SparkDatasetConverter, make_spark_converter

        # Set a cache directory on DBFS FUSE for intermediate data.
        spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF,
                       "file:///dbfs/tmp/petastorm/cache")
        converter_train = make_spark_converter(table_train)
        converter_val = make_spark_converter(table_val)

        print(f"train: {len(converter_train)}, val: {len(converter_val)}")

        def get_compiled_model(N_days_X, N_days_y, model_conf):  #lr=0.001
            #model = get_model(lr=lr)
            model = define_1dcnn_model(N_days_X, N_days_y, model_conf)

            hyperparameters = model_conf['hyperParameters']

            opt = tf.keras.optimizers.Adam()

            # Model compilation
            model.compile(optimizer=opt, loss=hyperparameters['loss'])

            return model

        # Enable auto-logging to MLflow to capture TensorBoard metrics.
        mlflow.tensorflow.autolog(every_n_iter=1)

        model_name = model_conf['model_name']
        mlflow_model_name = model_name
        model_dir = "/tmp/" + model_name
        try:
            dbutils.fs.rm(model_dir, recurse=True)
        except OSError:
            pass

        with mlflow.start_run():

            NUM_EPOCHS = model_conf['hyperParameters']['epochs']  #5
            BATCH_SIZE = model_conf['hyperParameters']['batch_size']  #500

            def train_and_evaluate(N_days_X, N_days_y, model_conf):  #lr=0.001
                model = get_compiled_model(N_days_X, N_days_y, model_conf)  #lr

                with converter_train.make_tf_dataset(batch_size=BATCH_SIZE) as train_dataset, \
                     converter_val.make_tf_dataset(batch_size=BATCH_SIZE) as val_dataset:

                    #train_dataset = train_dataset.map(lambda x: (x.features, x.label_index))
                    train_dataset = train_dataset.map(
                        lambda x: (tf.reshape(x.X, [-1, N_days_X, 1]),
                                   tf.reshape(x.y, [-1, N_days_y])))
                    steps_per_epoch = len(converter_train) // BATCH_SIZE

                    #val_dataset = val_dataset.map(lambda x: (x.features, x.label_index))
                    val_dataset = val_dataset.map(
                        lambda x: (tf.reshape(x.X, [-1, N_days_X, 1]),
                                   tf.reshape(x.y, [-1, N_days_y])))
                    validation_steps = max(1, len(converter_val) // BATCH_SIZE)

                    print(
                        f"steps_per_epoch: {steps_per_epoch}, validation_steps: {validation_steps}"
                    )

                    hist = model.fit(train_dataset,
                                     steps_per_epoch=steps_per_epoch,
                                     epochs=NUM_EPOCHS,
                                     validation_data=val_dataset,
                                     validation_steps=validation_steps,
                                     verbose=2)
                    return model, hist

            model, hist = train_and_evaluate(N_days_X, N_days_y, model_conf)
            print(hist.history['val_loss'][-1])

            #MLflow logging
            #mlflow.log_artifact(cwd + "data.json")
            #mlflow.log_artifact(cwd + "config.json")
            mlflow.log_param("model_name", str(model_name))
            mlflow.log_param("N_days_X", N_days_X)
            mlflow.log_param("N_days_y", N_days_y)
            mlflow.log_param("start_date", start_date)
            mlflow.log_param("end_date", end_date)
            mlflow.log_param("num_epochs", str(NUM_EPOCHS))
            mlflow.log_param("batch_size", str(BATCH_SIZE))
            #mlflow.log_param("steps_per_epoch", str(steps_per_epoch)) #validation_steps

            # saving using tf.keras.models.save_model
            tf.keras.models.save_model(model, filepath=model_dir +
                                       '/model')  #SavedModel format
            #model.save(filepath=model_dir+'model', save_format="h5")      #H5 format (todo, and look how to register that)

            # saving using mlflow.tensorflow.save_model (this does NOT log nor register the model) does not overwrites...
            #mlflow.tensorflow.save_model(tf_saved_model_dir=model_dir+'/model',
            #                             tf_meta_graph_tags=[tf.compat.v1.saved_model.tag_constants.SERVING],
            #                             tf_signature_def_key='serving_default',
            #                             path = 'model')

            # logging already saved model
            mlflow.tensorflow.log_model(
                tf_saved_model_dir=model_dir + '/model',
                tf_meta_graph_tags=[
                    tf.compat.v1.saved_model.tag_constants.SERVING
                ],
                tf_signature_def_key='serving_default',
                registered_model_name=model_name,
                artifact_path='model')

            # Getting the version number of the newly registered MLflow model (useful for next steps)
            mlflow_model_version = 0
            client_current_model = MlflowClient()
            for mv in client_current_model.search_model_versions(
                    "name='{0}'".format(mlflow_model_name)):
                #if int(dict(mv)['version']) == mlflow_model_version:
                # finding the last (highest) version registered
                if int(dict(mv)['version']) >= mlflow_model_version:
                    mlflow_model_version = int(dict(mv)['version'])
                    model_dict = dict(mv)

            #update 2020-07-17: to grab the latest model version, we could also do it like this: (TO BE TESTED!!!)
            #model_version_infos = client_current_model.search_model_versions(f"name = '{model_name}'")
            #mlflow_model_version = max([model_version_info.version for model_version_info in model_version_infos])

            # Wait until the model is ready
            def wait_until_model_ready(model_name, model_version):
                client = MlflowClient()
                for _ in range(20):
                    model_version_details = client.get_model_version(
                        name=model_name,
                        version=model_version,
                    )
                    status = ModelVersionStatus.from_string(
                        model_version_details.status)
                    print("Model status: %s" %
                          ModelVersionStatus.to_string(status))
                    if status == ModelVersionStatus.READY:
                        break
                    tm.sleep(5)

            wait_until_model_ready(mlflow_model_name, mlflow_model_version)

            # Transition the registered model stage from "None" to "Staging"
            client_current_model.transition_model_version_stage(
                name=mlflow_model_name,
                version=mlflow_model_version,
                stage="Staging",
            )

            # Copy the file from the driver node and save it to DBFS (so that they can be accessed e.g. after the current cluster terminates.):
            dbutils.fs.cp("file:/tmp/{0}/model".format(model_name),
                          "dbfs:/mnt/test/{0}/model".format(model_name),
                          recurse=True)
            print('Model copied here: ',
                  "dbfs:/mnt/test/{0}/model/".format(model_name))

        #mlflow.end_run()

    except Exception as e:
        print("Errored on step T.3: model definition and train")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e
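
# A hedged sketch, not from the source: the shape of the configuration dictionaries
# this train() function reads, reconstructed from the keys referenced above. The
# values and the 'dev' environment key are illustrative placeholders only.
example_data_conf = {
    'number_of_historical_days': 365,   # N_days_X
    'number_of_predicted_days': 92,     # N_days_y
    'start_date': '2019-01-01',
    'end_date': '2020-01-01',
    'dev': {                            # keyed by the global `environment`
        'table_to_train_on': 'cashflow_input',
        'table_test_for_performance': 'cashflow_test',
    },
}

example_model_conf = {
    'model_name': 'cashflow_1dcnn',
    'hyperParameters': {
        'loss': 'mse',
        'epochs': 5,
        'batch_size': 500,
    },
}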