def test_dtype(test_ctx):
    df = test_ctx.spark.range(10)
    df = df.withColumn("float_col", df.id.cast(FloatType())) \
        .withColumn("double_col", df.id.cast(DoubleType()))

    converter1 = make_spark_converter(df)
    with converter1.make_tf_dataset() as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)
    assert np.float32 == ts.double_col.dtype.type

    converter2 = make_spark_converter(df, dtype='float64')
    with converter2.make_tf_dataset() as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)
    assert np.float64 == ts.float_col.dtype.type

    converter3 = make_spark_converter(df, dtype=None)
    with converter3.make_tf_dataset() as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)
    assert np.float32 == ts.float_col.dtype.type
    assert np.float64 == ts.double_col.dtype.type

    with pytest.raises(ValueError,
                       match="dtype float16 is not supported. "
                             "Use 'float32' or float64"):
        make_spark_converter(df, dtype="float16")
def test_compression(test_ctx):
    df1 = test_ctx.spark.range(10)

    converter1 = make_spark_converter(df1)
    assert "uncompressed" == \
        _get_compression_type(converter1.cache_dir_url).lower()

    converter2 = make_spark_converter(df1, compression_codec="snappy")
    assert "snappy" == \
        _get_compression_type(converter2.cache_dir_url).lower()
def run(data_dir):
    # Get SparkSession
    spark = SparkSession.builder \
        .master("local[2]") \
        .appName("petastorm.spark tensorflow_example") \
        .getOrCreate()

    # Load and preprocess data using Spark
    df = spark.read.format("libsvm") \
        .option("numFeatures", "784") \
        .load(data_dir) \
        .select(col("features"), col("label").cast("long").alias("label"))

    # Randomly split data into train and test datasets
    df_train, df_test = df.randomSplit([0.9, 0.1], seed=12345)

    # Set a cache directory for intermediate data.
    # The path should be accessible by both Spark workers and the driver.
    spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF,
                   "file:///tmp/petastorm/cache/tf-example")
    converter_train = make_spark_converter(df_train)
    converter_test = make_spark_converter(df_test)

    def train_and_evaluate(_=None):
        import tensorflow as tf

        with converter_train.make_tf_dataset() as dataset:
            dataset = dataset.map(
                lambda x: (tf.reshape(x.features, [-1, 28, 28]), x.label))
            model = train(dataset)

        with converter_test.make_tf_dataset(num_epochs=1) as dataset:
            dataset = dataset.map(
                lambda x: (tf.reshape(x.features, [-1, 28, 28]), x.label))
            hist = model.evaluate(dataset)

        return hist[1]

    # Train and evaluate the model on the local machine
    accuracy = train_and_evaluate()
    logging.info("Train and evaluate the model on the local machine.")
    logging.info("Accuracy: %.6f", accuracy)

    # Train and evaluate the model on a Spark worker
    accuracy = spark.sparkContext.parallelize(
        range(1)).map(train_and_evaluate).collect()[0]
    logging.info("Train and evaluate the model remotely on a spark worker, "
                 "which can be used for distributed hyperparameter tuning.")
    logging.info("Accuracy: %.6f", accuracy)

    # Cleanup
    converter_train.delete()
    converter_test.delete()
    spark.stop()
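
# NOTE: The train() helper called above is defined elsewhere in this example
# and is not shown in this excerpt. The sketch below is a minimal, hypothetical
# stand-in for it; the layer sizes and steps_per_epoch are illustrative
# assumptions, not the example's actual values.
def train(dataset):
    import tensorflow as tf  # assumed available, as in train_and_evaluate()

    # A small feed-forward classifier over the 28x28 reshaped features.
    model = tf.keras.models.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax'),
    ])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    # make_tf_dataset() yields an unbounded stream by default, so bound the
    # fit loop with steps_per_epoch (value chosen arbitrarily here).
    model.fit(dataset, steps_per_epoch=100, epochs=1, verbose=1)
    return model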
def test_df_delete_caching_meta(test_ctx):
    from petastorm.spark.spark_dataset_converter import _cache_df_meta_list
    df1 = test_ctx.spark.range(10)
    df2 = test_ctx.spark.range(20)
    converter1 = make_spark_converter(df1)
    converter2 = make_spark_converter(df2)
    converter1.delete()
    cached_list = set(map(lambda x: x.cache_dir_url, _cache_df_meta_list))
    assert converter1.cache_dir_url not in cached_list
    assert converter2.cache_dir_url in cached_list
    # test recreating converter1 after delete should work.
    make_spark_converter(df1)
def test_torch_unexpected_param(test_ctx):
    df = test_ctx.spark.range(8)
    conv = make_spark_converter(df)

    with pytest.raises(TypeError, match="unexpected keyword argument 'xyz'"):
        with conv.make_torch_dataloader(xyz=1) as _:
            pass
def test_torch_dataloader_advanced_params(mock_torch_make_batch_reader,
                                          test_ctx):
    SHARD_COUNT = 3
    df = test_ctx.spark.range(100).repartition(SHARD_COUNT)
    conv = make_spark_converter(df)
    mock_torch_make_batch_reader.return_value = \
        make_batch_reader(conv.cache_dir_url)

    with conv.make_torch_dataloader(reader_pool_type='dummy',
                                    cur_shard=1,
                                    shard_count=SHARD_COUNT) as _:
        pass
    peta_args = mock_torch_make_batch_reader.call_args.kwargs
    assert peta_args['reader_pool_type'] == 'dummy' and \
        peta_args['cur_shard'] == 1 and \
        peta_args['shard_count'] == SHARD_COUNT and \
        peta_args['num_epochs'] is None and \
        peta_args['workers_count'] == 4

    # Test default value overridden arguments.
    with conv.make_torch_dataloader(num_epochs=1, workers_count=2) as _:
        pass
    peta_args = mock_torch_make_batch_reader.call_args.kwargs
    assert peta_args['num_epochs'] == 1 and peta_args['workers_count'] == 2
def test_df_caching(test_ctx):
    df1 = test_ctx.spark.range(10)
    df2 = test_ctx.spark.range(10)
    df3 = test_ctx.spark.range(20)

    # Test caching for the dataframes with the same logical plan
    converter1 = make_spark_converter(df1)
    converter2 = make_spark_converter(df2)
    assert converter1.cache_dir_url == converter2.cache_dir_url

    # Test no caching for different dataframes
    converter3 = make_spark_converter(df3)
    assert converter1.cache_dir_url != converter3.cache_dir_url

    # Test no caching for the same dataframe with different row group size
    converter11 = make_spark_converter(
        df1, parquet_row_group_size_bytes=8 * 1024 * 1024)
    converter21 = make_spark_converter(
        df1, parquet_row_group_size_bytes=16 * 1024 * 1024)
    assert converter11.cache_dir_url != converter21.cache_dir_url

    # Test no caching for the same dataframe with different compression_codec
    converter12 = make_spark_converter(df1, compression_codec=None)
    converter22 = make_spark_converter(df1, compression_codec="snappy")
    assert converter12.cache_dir_url != converter22.cache_dir_url

    ori_temp_url = test_ctx.spark.conf.get(
        SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF)
    tempdir = tempfile.mkdtemp('_spark_converter_test1')
    new_temp_url = 'file://' + tempdir.replace(os.sep, '/')
    try:
        # Test no caching for the same dataframe with different parent cache dirs
        test_ctx.spark.conf.set(
            SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF, new_temp_url)
        assert ori_temp_url != new_temp_url
        converter13 = make_spark_converter(df1)
        assert converter1.cache_dir_url != converter13.cache_dir_url

        # Test caching for the same dataframe with different parent cache dirs
        # that could be normalized to the same parent cache dir
        new_temp_url_2 = new_temp_url + os.sep
        test_ctx.spark.conf.set(
            SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF, new_temp_url_2)
        assert new_temp_url != new_temp_url_2
        converter14 = make_spark_converter(df1)
        assert converter13.cache_dir_url == converter14.cache_dir_url
    finally:
        test_ctx.spark.conf.set(
            SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF, ori_temp_url)
def test_torch_batch_size(test_ctx):
    df = test_ctx.spark.range(8)
    conv = make_spark_converter(df)
    batch_size = 2
    with conv.make_torch_dataloader(batch_size=batch_size,
                                    num_epochs=1) as dataloader:
        for batch in dataloader:
            assert batch_size == batch['id'].shape[0]
def test_delete(test_ctx):
    df = test_ctx.spark.createDataFrame([(1, 2), (4, 5)], ["col1", "col2"])
    # TODO add test for hdfs url
    converter = make_spark_converter(df)
    local_path = urlparse(converter.cache_dir_url).path
    assert os.path.exists(local_path)
    converter.delete()
    assert not os.path.exists(local_path)
def test_torch_data_loader_fn(spark_test_ctx):
    from petastorm.pytorch import BatchedDataLoader
    df = spark_test_ctx.spark.range(8)
    conv = make_spark_converter(df)
    with conv.make_torch_dataloader(data_loader_fn=BatchedDataLoader,
                                    batch_size=2,
                                    num_epochs=1) as dataloader:
        assert isinstance(dataloader, BatchedDataLoader)
def test_tf_autograph(spark_test_ctx, caplog):
    caplog.clear()
    df1 = spark_test_ctx.spark.range(100)
    converter1 = make_spark_converter(df1)
    results = []
    with converter1.make_tf_dataset(num_epochs=1) as dataset:
        for batch in dataset:
            results.append(batch)
    assert "AutoGraph could not transform" not in " ".join(caplog.messages)
def test_primitive(test_ctx):
    schema = StructType([
        StructField("bool_col", BooleanType(), False),
        StructField("float_col", FloatType(), False),
        StructField("double_col", DoubleType(), False),
        StructField("short_col", ShortType(), False),
        StructField("int_col", IntegerType(), False),
        StructField("long_col", LongType(), False),
        StructField("str_col", StringType(), False),
        StructField("bin_col", BinaryType(), False),
        StructField("byte_col", ByteType(), False),
    ])

    df = test_ctx.spark.createDataFrame(
        [(True, 0.12, 432.1, 5, 5, 0, "hello",
          bytearray(b"spark\x01\x02"), -128),
         (False, 123.45, 0.987, 9, 908, 765, "petastorm",
          bytearray(b"\x0012345"), 127)],
        schema=schema).coalesce(1)
    # If we use numPartition > 1, the order of the loaded dataset would
    # be non-deterministic.
    expected_df = df.collect()

    converter = make_spark_converter(df)
    with converter.make_tf_dataset() as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)
            # TODO: we will improve the test once the batch_size argument
            #  is added. For now we only have one batch.
            for i in range(converter.dataset_size):
                for col in df.schema.names:
                    actual_ele = getattr(ts, col)[i]
                    expected_ele = expected_df[i][col]
                    if col == "str_col":
                        actual_ele = actual_ele.decode()
                    if col == "bin_col":
                        actual_ele = bytearray(actual_ele)
                    if col == "float_col" or col == "double_col":
                        # Note that the default dtype is float32
                        assert pytest.approx(expected_ele, rel=1e-6) == actual_ele
                    else:
                        assert expected_ele == actual_ele
            assert len(expected_df) == len(converter)

    assert np.bool_ == ts.bool_col.dtype.type
    assert np.float32 == ts.float_col.dtype.type  # Default dtype float32
    assert np.float32 == ts.double_col.dtype.type
    assert np.int16 == ts.short_col.dtype.type
    assert np.int32 == ts.int_col.dtype.type
    assert np.int64 == ts.long_col.dtype.type
    assert np.object_ == ts.str_col.dtype.type
    assert np.object_ == ts.bin_col.dtype.type
def test_array(test_ctx):
    df = test_ctx.spark.createDataFrame(
        [([1., 2., 3.],), ([4., 5., 6.],)],
        StructType([
            StructField(name='c1', dataType=ArrayType(DoubleType()))
        ]))
    converter1 = make_spark_converter(df)
    with converter1.make_tf_dataset() as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)
    assert np.float32 == ts.c1.dtype.type
def test_tf_dataset_batch_size(test_ctx):
    df1 = test_ctx.spark.range(100)
    batch_size = 30
    converter1 = make_spark_converter(df1)
    with converter1.make_tf_dataset(batch_size=batch_size) as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)
    assert len(ts.id) == batch_size
def test_torch_pickling_remotely(test_ctx):
    df1 = test_ctx.spark.range(100, 101)
    converter1 = make_spark_converter(df1)

    def map_fn(_):
        with converter1.make_torch_dataloader(num_epochs=1) as dataloader:
            for batch in dataloader:
                ret = batch["id"][0]
        return ret

    result = test_ctx.spark.sparkContext.parallelize(range(1), 1) \
        .map(map_fn).collect()[0]
    assert result == 100
def test_array_field(spark_test_ctx):

    @pandas_udf('array<float>')
    def gen_array(v):
        return v.map(lambda x: np.random.rand(10))

    df1 = spark_test_ctx.spark.range(10) \
        .withColumn('v', gen_array('id')).repartition(2)
    cv1 = make_spark_converter(df1)
    # we can auto infer one-dim array shape
    with cv1.make_tf_dataset(batch_size=4, num_epochs=1) as dataset:
        tf_iter = dataset.make_one_shot_iterator()
        next_op = tf_iter.get_next()
        with tf.Session() as sess:
            batch1 = sess.run(next_op)
    assert batch1.v.shape == (4, 10)
def test_pickling_remotely(test_ctx):
    df1 = test_ctx.spark.range(100, 101)
    converter1 = make_spark_converter(df1)

    def map_fn(_):
        with converter1.make_tf_dataset() as dataset:
            iterator = dataset.make_one_shot_iterator()
            tensor = iterator.get_next()
            with tf.Session() as sess:
                ts = sess.run(tensor)
        return getattr(ts, 'id')[0]

    result = test_ctx.spark.sparkContext.parallelize(range(1), 1) \
        .map(map_fn).collect()[0]
    assert result == 100
def test_tf_dataset_preproc(test_ctx):
    df1 = test_ctx.spark.createDataFrame(
        [([1., 2., 3., 4., 5., 6.],), ([4., 5., 6., 7., 8., 9.],)],
        StructType([
            StructField(name='c1', dataType=ArrayType(DoubleType()))
        ]))
    converter1 = make_spark_converter(df1)

    def preproc_fn(x):
        return tf.reshape(x.c1, [-1, 3, 2]),

    with converter1.make_tf_dataset(batch_size=2,
                                    preproc_fn=preproc_fn) as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)
    assert ts[0].shape == (2, 3, 2)
def test_torch_transform_spec(test_ctx):
    df = test_ctx.spark.range(8)
    conv = make_spark_converter(df)

    from torchvision import transforms
    from petastorm import TransformSpec

    def _transform_row(df_row):
        scale_transform = transforms.Compose([
            transforms.Lambda(lambda x: x * 0.1),
        ])
        return scale_transform(df_row)

    transform = TransformSpec(_transform_row)

    with conv.make_torch_dataloader(transform_spec=transform,
                                    num_epochs=1) as dataloader:
        for batch in dataloader:
            assert min(batch['id']) >= 0 and max(batch['id']) < 1
def test_vector_to_array(test_ctx):
    from pyspark.ml.linalg import Vectors
    from pyspark.mllib.linalg import Vectors as OldVectors
    df = test_ctx.spark.createDataFrame(
        [(Vectors.dense(1.0, 2.0, 3.0), OldVectors.dense(10.0, 20.0, 30.0)),
         (Vectors.dense(5.0, 6.0, 7.0), OldVectors.dense(50.0, 60.0, 70.0))],
        ["vec", "oldVec"])
    converter1 = make_spark_converter(df)
    with converter1.make_tf_dataset() as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)
    assert np.float32 == ts.vec.dtype.type
    assert np.float32 == ts.oldVec.dtype.type
    assert (2, 3) == ts.vec.shape
    assert (2, 3) == ts.oldVec.shape
    # Compare element-wise; a bare `list == ndarray` comparison would yield a
    # boolean array that cannot be used directly in an assert.
    assert ([[1., 2., 3.], [5., 6., 7.]] == ts.vec).all()
    assert ([[10., 20., 30.], [50., 60., 70.]] == ts.oldVec).all()
def test_tf_dataset_petastorm_args(mock_make_batch_reader, spark_test_ctx):
    df1 = spark_test_ctx.spark.range(100).repartition(4)
    conv1 = make_spark_converter(df1)
    mock_make_batch_reader.return_value = make_batch_reader(conv1.cache_dir_url)

    with conv1.make_tf_dataset(reader_pool_type='dummy', cur_shard=1,
                               shard_count=4):
        pass
    peta_args = mock_make_batch_reader.call_args[1]
    assert peta_args['reader_pool_type'] == 'dummy' and \
        peta_args['cur_shard'] == 1 and \
        peta_args['shard_count'] == 4 and \
        peta_args['num_epochs'] is None and \
        peta_args['workers_count'] == 4

    with conv1.make_tf_dataset(num_epochs=1, workers_count=2):
        pass
    peta_args = mock_make_batch_reader.call_args[1]
    assert peta_args['num_epochs'] == 1 and peta_args['workers_count'] == 2
def test_torch_primitive(test_ctx):
    import torch

    schema = StructType([
        StructField("bool_col", BooleanType(), False),
        StructField("float_col", FloatType(), False),
        StructField("double_col", DoubleType(), False),
        StructField("short_col", ShortType(), False),
        StructField("int_col", IntegerType(), False),
        StructField("long_col", LongType(), False),
        StructField("byte_col", ByteType(), False),
    ])

    df = test_ctx.spark.createDataFrame(
        [(True, 0.12, 432.1, 5, 5, 0, -128),
         (False, 123.45, 0.987, 9, 908, 765, 127)],
        schema=schema).coalesce(1)
    # If we use numPartition > 1, the order of the loaded dataset would
    # be non-deterministic.
    expected_df = df.collect()

    converter = make_spark_converter(df)
    batch = None
    with converter.make_torch_dataloader(num_epochs=1) as dataloader:
        for i, batch in enumerate(dataloader):
            # default batch_size = 1
            for col in df.schema.names:
                actual_ele = batch[col][0]
                expected_ele = expected_df[i][col]
                if col == "float_col" or col == "double_col":
                    # Note that the default dtype is float32
                    assert pytest.approx(expected_ele, rel=1e-6) == actual_ele
                else:
                    assert expected_ele == actual_ele
        assert len(expected_df) == len(converter)

    assert torch.uint8 == batch["bool_col"].dtype
    assert torch.int8 == batch["byte_col"].dtype
    assert torch.float32 == batch["double_col"].dtype
    assert torch.float32 == batch["float_col"].dtype
    assert torch.int32 == batch["int_col"].dtype
    assert torch.int64 == batch["long_col"].dtype
    assert torch.int16 == batch["short_col"].dtype
def test_df_caching(test_ctx):
    df1 = test_ctx.spark.range(10)
    df2 = test_ctx.spark.range(10)
    df3 = test_ctx.spark.range(20)

    converter1 = make_spark_converter(df1)
    converter2 = make_spark_converter(df2)
    assert converter1.cache_dir_url == converter2.cache_dir_url

    converter3 = make_spark_converter(df3)
    assert converter1.cache_dir_url != converter3.cache_dir_url

    converter11 = make_spark_converter(
        df1, parquet_row_group_size_bytes=8 * 1024 * 1024)
    converter21 = make_spark_converter(
        df1, parquet_row_group_size_bytes=16 * 1024 * 1024)
    assert converter11.cache_dir_url != converter21.cache_dir_url

    converter12 = make_spark_converter(df1, compression_codec=None)
    converter22 = make_spark_converter(df1, compression_codec="snappy")
    assert converter12.cache_dir_url != converter22.cache_dir_url
def test_vector_to_array(spark_test_ctx):
    from pyspark.ml.linalg import Vectors
    from pyspark.mllib.linalg import Vectors as OldVectors
    df = spark_test_ctx.spark.createDataFrame(
        [(Vectors.dense(1.0, 2.0, 3.0), OldVectors.dense(10.0, 20.0, 30.0)),
         (Vectors.dense(5.0, 6.0, 7.0), OldVectors.dense(50.0, 60.0, 70.0))],
        ["vec", "oldVec"])
    converter1 = make_spark_converter(df)
    with converter1.make_tf_dataset(num_epochs=1) as dataset:
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            ts = sess.run(tensor)
    assert np.float32 == ts.vec.dtype.type
    assert np.float32 == ts.oldVec.dtype.type
    vec_col = ts.vec[ts.vec[:, 0].argsort()]
    old_vec_col = ts.oldVec[ts.oldVec[:, 0].argsort()]
    assert (2, 3) == ts.vec.shape
    assert (2, 3) == ts.oldVec.shape
    assert ([1., 2., 3.] == vec_col[0]).all() and \
        ([5., 6., 7.] == vec_col[1]).all()
    assert ([10., 20., 30.] == old_vec_col[0]).all() and \
        ([50., 60., 70.] == old_vec_col[1]).all()
def test_advanced_params(test_ctx):
    df = test_ctx.spark.range(8)
    conv = make_spark_converter(df)
    batch_size = 2
    with conv.make_torch_dataloader(batch_size=batch_size,
                                    num_epochs=1) as dataloader:
        for batch in dataloader:
            assert batch_size == batch['id'].shape[0]

    from torchvision import transforms
    from petastorm import TransformSpec

    def _transform_row(df_row):
        scale_transform = transforms.Compose([
            transforms.Lambda(lambda x: x * 0.1),
        ])
        return scale_transform(df_row)

    transform = TransformSpec(_transform_row)
    with conv.make_torch_dataloader(transform_spec=transform,
                                    num_epochs=1) as dataloader:
        for batch in dataloader:
            assert min(batch['id']) >= 0 and max(batch['id']) < 1

    with pytest.raises(TypeError, match="unexpected keyword argument 'xyz'"):
        conv.make_torch_dataloader(xyz=1)

    def mock_make_batch_reader(dataset_url,
                               schema_fields=None,
                               reader_pool_type='thread', workers_count=10,
                               shuffle_row_groups=True,
                               shuffle_row_drop_partitions=1,
                               predicate=None,
                               rowgroup_selector=None,
                               num_epochs=1,
                               cur_shard=None, shard_count=None,
                               cache_type='null', cache_location=None,
                               cache_size_limit=None,
                               cache_row_size_estimate=None,
                               cache_extra_settings=None,
                               hdfs_driver='libhdfs3',
                               transform_spec=None):
        return {
            "dataset_url": dataset_url,
            "schema_fields": schema_fields,
            "reader_pool_type": reader_pool_type,
            "workers_count": workers_count,
            "shuffle_row_groups": shuffle_row_groups,
            "shuffle_row_drop_partitions": shuffle_row_drop_partitions,
            "predicate": predicate,
            "rowgroup_selector": rowgroup_selector,
            "num_epochs": num_epochs,
            "cur_shard": cur_shard,
            "shard_count": shard_count,
            "cache_type": cache_type,
            "cache_location": cache_location,
            "cache_size_limit": cache_size_limit,
            "cache_row_size_estimate": cache_row_size_estimate,
            "cache_extra_settings": cache_extra_settings,
            "hdfs_driver": hdfs_driver,
            "transform_spec": transform_spec,
        }

    original_fn = petastorm.make_batch_reader
    petastorm.make_batch_reader = mock_make_batch_reader

    ctm = conv.make_torch_dataloader(schema_fields="schema_1",
                                     reader_pool_type='type_1',
                                     workers_count="count_1",
                                     shuffle_row_groups="row_group_1",
                                     shuffle_row_drop_partitions="drop_1",
                                     predicate="predicate_1",
                                     rowgroup_selector="selector_1",
                                     num_epochs="num_1",
                                     cur_shard="shard_1",
                                     shard_count="total_shard",
                                     cache_type="cache_1",
                                     cache_location="location_1",
                                     cache_size_limit="limit_1",
                                     cache_extra_settings="extra_1",
                                     hdfs_driver="driver_1",
                                     transform_spec="transform_spec_1")
    assert ctm.reader["schema_fields"] == "schema_1"
    assert ctm.reader["reader_pool_type"] == "type_1"
    assert ctm.reader["workers_count"] == "count_1"
    assert ctm.reader["shuffle_row_groups"] == "row_group_1"
    assert ctm.reader["shuffle_row_drop_partitions"] == "drop_1"
    assert ctm.reader["predicate"] == "predicate_1"
    assert ctm.reader["rowgroup_selector"] == "selector_1"
    assert ctm.reader["num_epochs"] == "num_1"
    assert ctm.reader["cur_shard"] == "shard_1"
    assert ctm.reader["shard_count"] == "total_shard"
    assert ctm.reader["cache_type"] == "cache_1"
    assert ctm.reader["cache_location"] == "location_1"
    assert ctm.reader["cache_size_limit"] == "limit_1"
    assert ctm.reader["cache_extra_settings"] == "extra_1"
    assert ctm.reader["hdfs_driver"] == "driver_1"
    assert ctm.reader["transform_spec"] == "transform_spec_1"

    petastorm.make_batch_reader = original_fn
    return tf.image.per_image_standardization(image)

# COMMAND ----------

from petastorm.spark import SparkDatasetConverter, make_spark_converter

spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF,
               "file:///dbfs/ml/petastormcache")

data = spark.read.format("delta").load("/Users/msh/nihxray/nih_xray.delta")
_, testDf = data.select("image", "labels").randomSplit([0.9, 0.01], seed=42)
test_rows = testDf.count()
print(test_rows)
converter_test = make_spark_converter(testDf)

# COMMAND ----------

# DBTITLE 1,Prepare evaluation dataset
with converter_test.make_tf_dataset() as test_dataset:
    test_dataset = (test_dataset.unbatch().batch(test_rows).map(
        lambda x: (basic_transform(x.image),
                   tf.reshape(tf.cast(x.labels, dtype=tf.uint8), (-1, 14)))))
    x_test, y_test = next(iter(test_dataset))

# COMMAND ----------

# DBTITLE 1,We can access MLflow experiments using native Spark
# MAGIC ## Test / Train Split with Petastorm

# COMMAND ----------

from petastorm.spark import SparkDatasetConverter, make_spark_converter

spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF,
               "file:///dbfs/ml/petastormcache")

trainDf, testDf = data.select("image", "labels").randomSplit([0.9, 0.1],
                                                             seed=42)
train_rows = trainDf.count()
test_rows = testDf.count()

converter_train = make_spark_converter(trainDf)
converter_test = make_spark_converter(testDf)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Model using DenseNet architecture

# COMMAND ----------

import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (Concatenate, Activation, Input, Dense,
                                     Dropout, Flatten, Lambda, Conv2D,
                                     MaxPooling2D, BatchNormalization)
from tensorflow.keras.optimizers import Nadam, SGD, Adam
from tensorflow.keras.regularizers import l2
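
# COMMAND ----------

# NOTE: The notebook's actual DenseNet model definition is not included in this
# excerpt. The function below is a minimal, hypothetical sketch of a comparable
# model; the name build_densenet_model, the 224x224x3 input shape, and training
# the backbone from scratch are illustrative assumptions. Only the 14-label
# sigmoid head follows from the label reshape to (-1, 14) seen earlier.
def build_densenet_model(input_shape=(224, 224, 3), num_labels=14):
    # Keras' built-in DenseNet121 backbone with global average pooling.
    base = tf.keras.applications.DenseNet121(include_top=False,
                                             weights=None,
                                             input_shape=input_shape,
                                             pooling="avg")
    # Multi-label head: one sigmoid per pathology label.
    outputs = Dense(num_labels, activation="sigmoid")(base.output)
    model = Model(inputs=base.input, outputs=outputs)
    model.compile(optimizer=Adam(),
                  loss="binary_crossentropy",
                  metrics=["accuracy"])
    return model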
def train(data_conf, model_conf, **kwargs):

    try:
        print("-----------------------------------")
        print("Starting Cashflow DL Model Training")
        print("-----------------------------------")
        print()

        # ==============================
        # 0. Main parameters definitions
        # ==============================

        # Size of X and y arrays definition
        N_days_X, N_days_y = int(data_conf['number_of_historical_days']), int(
            data_conf['number_of_predicted_days'])  # 365, 92
        print('Number of days used for prediction (X): ', N_days_X)
        print('Number of days predicted (y): ', N_days_y)
        print()

        # Date range definition
        start_date, end_date = data_conf['start_date'], data_conf['end_date']
        start_date_dt, end_date_dt, start_date_prediction, \
            end_date_prediction, end_date_plusOneDay, end_date_minus_6month = \
            dates_definitions(start_date, end_date, N_days_X, N_days_y)
        print('Date range: ', start_date, end_date)
        print()

        model_name = model_conf['model_name']

    except Exception as e:
        print("Errored on initialization")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ========================================
        # T.1 Pre-processing before model training
        # ========================================

        # Loading dataset
        table_in = data_conf[environment]['table_to_train_on']
        # ts_balance = spark.read.parquet("/mnt/test/{0}.parquet".format(table_in)).cache()
        ts_balance = spark.read.format("delta").load(
            "/mnt/delta/{0}".format(table_in))

        # Cleaning of the time series
        ts_balance = ts_balance.withColumn(
            'balance', ts_balance.balance.cast("array<float>"))

        ts_balance = ts_balance.withColumn(
            'keep_ts',
            F.udf(lambda x, y: time_series_cleaning(x, y), "int")('balance',
                                                                  F.lit(20))
        )  # at least 10 transactions in the ts, to be used in the training

        ts_balance = ts_balance.where('keep_ts == 1')

        # Creating the dataset on which we train (and test and validate) the model
        ts_balance_model = ts_balance.sample(
            False, 0.7, seed=0)  # now 0.7, but in real case would be 0.1 at best... or 0.05
        print('ts_balance_model.count()', ts_balance_model.count())

        # Pre-processing before model training
        ts_balance_model = pre_processing(ts_balance_model,
                                          end_date,
                                          spark,
                                          serving=False)
        ts_balance_model.show(3)
        print('ts_balance_model.rdd.getNumPartitions()',
              ts_balance_model.rdd.getNumPartitions())
        ts_balance_model.show(3)

        # Saving prepared dataset
        table_out = 'cashflow_training_step1'
        # ts_balance_model.write.format("parquet").mode("overwrite").save("/mnt/test/{0}.parquet".format(table_out))
        ts_balance_model.write.format("delta").mode("overwrite").save(
            "/mnt/delta/{0}".format(table_out))

    except Exception as e:
        print("Errored on step T.1: pre-processing before model training")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ========================================
        # T.2 Generating TRAIN, VAL, TEST datasets
        # ========================================

        # Loading datasets
        table_model = 'cashflow_training_step1'
        # ts_balance_model = spark.read.parquet("/mnt/test/{0}.parquet".format(table_model)).cache()
        ts_balance_model = spark.read.format("delta").load(
            "/mnt/delta/{0}".format(table_model)).cache()
        ts_balance_model.show(3)
        print('ts_balance_model.count()', ts_balance_model.count())
        print('ts_balance_model.rdd.getNumPartitions()',
              ts_balance_model.rdd.getNumPartitions())

        train_set, val_set, test_set = ts_balance_model.randomSplit(
            [0.6, 0.2, 0.2], seed=12345)
        train_set.show(3)
        print('train_set.rdd.getNumPartitions(), '
              'val_set.rdd.getNumPartitions(), '
              'test_set.rdd.getNumPartitions()',
              train_set.rdd.getNumPartitions(),
              val_set.rdd.getNumPartitions(),
              test_set.rdd.getNumPartitions())

        # Saving prepared datasets (train, val, test sets to parquet)
        table_train = 'cashflow_train'
        table_val = 'cashflow_val'
        table_test = data_conf[environment]['table_test_for_performance']  # 'cashflow_test'

        train_set.select('X', 'y').write.format("delta").mode("overwrite").save(
            "/mnt/delta/{0}".format(table_train))
        val_set.select('X', 'y').write.format("delta").mode("overwrite").save(
            "/mnt/delta/{0}".format(table_val))
        test_set.select('primaryaccountholder', 'transactiondate', 'balance') \
            .write.format("delta").mode("overwrite").save(
                "/mnt/delta/{0}".format(table_test))

    except Exception as e:
        print("Errored on step T.2: pre-processings")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ==============================
        # T.3 MODEL DEFINITION AND TRAIN
        # ==============================
        table_train = 'cashflow_train'
        table_val = 'cashflow_val'

        # table_train = spark.read.parquet("/mnt/test/{0}.parquet".format(table_train))
        table_train = spark.read.format("delta").load(
            "/mnt/delta/{0}".format(table_train))
        # table_val = spark.read.parquet("/mnt/test/{0}.parquet".format(table_val))
        table_val = spark.read.format("delta").load(
            "/mnt/delta/{0}".format(table_val))

        table_train_count = table_train.count()
        table_val_count = table_val.count()
        # table_train_count, table_val_count

        from pyspark.sql.functions import col
        from petastorm.spark import SparkDatasetConverter, make_spark_converter

        # Set a cache directory on DBFS FUSE for intermediate data.
        spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF,
                       "file:///dbfs/tmp/petastorm/cache")

        converter_train = make_spark_converter(table_train)
        converter_val = make_spark_converter(table_val)

        print(f"train: {len(converter_train)}, val: {len(converter_val)}")

        def get_compiled_model(N_days_X, N_days_y, model_conf):  # lr=0.001
            # model = get_model(lr=lr)
            model = define_1dcnn_model(N_days_X, N_days_y, model_conf)

            hyperparameters = model_conf['hyperParameters']
            opt = tf.keras.optimizers.Adam()

            # Model compilation
            model.compile(optimizer=opt, loss=hyperparameters['loss'])

            return model

        # Enable auto-logging to MLflow to capture TensorBoard metrics.
        mlflow.tensorflow.autolog(every_n_iter=1)

        model_name = model_conf['model_name']
        mlflow_model_name = model_name
        model_dir = "/tmp/" + model_name
        try:
            dbutils.fs.rm(model_dir, recurse=True)
        except OSError:
            pass

        with mlflow.start_run():

            NUM_EPOCHS = model_conf['hyperParameters']['epochs']  # 5
            BATCH_SIZE = model_conf['hyperParameters']['batch_size']  # 500

            def train_and_evaluate(N_days_X, N_days_y, model_conf):  # lr=0.001
                model = get_compiled_model(N_days_X, N_days_y, model_conf)  # lr

                with converter_train.make_tf_dataset(batch_size=BATCH_SIZE) as train_dataset, \
                        converter_val.make_tf_dataset(batch_size=BATCH_SIZE) as val_dataset:

                    # train_dataset = train_dataset.map(lambda x: (x.features, x.label_index))
                    train_dataset = train_dataset.map(
                        lambda x: (tf.reshape(x.X, [-1, N_days_X, 1]),
                                   tf.reshape(x.y, [-1, N_days_y])))
                    steps_per_epoch = len(converter_train) // BATCH_SIZE

                    # val_dataset = val_dataset.map(lambda x: (x.features, x.label_index))
                    val_dataset = val_dataset.map(
                        lambda x: (tf.reshape(x.X, [-1, N_days_X, 1]),
                                   tf.reshape(x.y, [-1, N_days_y])))
                    validation_steps = max(1, len(converter_val) // BATCH_SIZE)

                    print(f"steps_per_epoch: {steps_per_epoch}, "
                          f"validation_steps: {validation_steps}")

                    hist = model.fit(train_dataset,
                                     steps_per_epoch=steps_per_epoch,
                                     epochs=NUM_EPOCHS,
                                     validation_data=val_dataset,
                                     validation_steps=validation_steps,
                                     verbose=2)
                return model, hist

            model, hist = train_and_evaluate(N_days_X, N_days_y, model_conf)
            print(hist.history['val_loss'][-1])

            # MLflow logging
            # mlflow.log_artifact(cwd + "data.json")
            # mlflow.log_artifact(cwd + "config.json")
            mlflow.log_param("model_name", str(model_name))
            mlflow.log_param("N_days_X", N_days_X)
            mlflow.log_param("N_days_y", N_days_y)
            mlflow.log_param("start_date", start_date)
            mlflow.log_param("end_date", end_date)
            mlflow.log_param("num_epochs", str(NUM_EPOCHS))
            mlflow.log_param("batch_size", str(BATCH_SIZE))
            # mlflow.log_param("steps_per_epoch", str(steps_per_epoch))  # validation_steps

            # Saving using tf.keras.models.save_model
            tf.keras.models.save_model(model,
                                       filepath=model_dir + '/model')  # SavedModel format
            # model.save(filepath=model_dir + 'model', save_format="h5")  # H5 format
            # (TODO: look at how to register that)

            # Saving using mlflow.tensorflow.save_model (this does NOT log nor
            # register the model) and it does not overwrite existing files...
            # mlflow.tensorflow.save_model(tf_saved_model_dir=model_dir + '/model',
            #                              tf_meta_graph_tags=[tf.compat.v1.saved_model.tag_constants.SERVING],
            #                              tf_signature_def_key='serving_default',
            #                              path='model')

            # Logging the already saved model
            mlflow.tensorflow.log_model(
                tf_saved_model_dir=model_dir + '/model',
                tf_meta_graph_tags=[
                    tf.compat.v1.saved_model.tag_constants.SERVING
                ],
                tf_signature_def_key='serving_default',
                registered_model_name=model_name,
                artifact_path='model')

            # Getting the version number of the newly registered MLflow model
            # (useful for next steps)
            mlflow_model_version = 0
            client_current_model = MlflowClient()
            for mv in client_current_model.search_model_versions(
                    "name='{0}'".format(mlflow_model_name)):
                # if int(dict(mv)['version']) == mlflow_model_version:
                if int(dict(mv)['version']) >= mlflow_model_version:
                    # finding the last version registered
                    mlflow_model_version = int(dict(mv)['version'])
                    model_dict = dict(mv)

            # Update 2020-07-17: to grab the latest model version, we can also do it like this (TO BE TESTED!):
            # model_version_infos = client_current_model.search_model_versions(f"name = '{model_name}'")
            # mlflow_model_version = max([model_version_info.version for model_version_info in model_version_infos])

            # Wait until the model is ready
            def wait_until_model_ready(model_name, model_version):
                client = MlflowClient()
                for _ in range(20):
                    model_version_details = client.get_model_version(
                        name=model_name,
                        version=model_version,
                    )
                    status = ModelVersionStatus.from_string(
                        model_version_details.status)
                    print("Model status: %s" % ModelVersionStatus.to_string(status))
                    if status == ModelVersionStatus.READY:
                        break
                    tm.sleep(5)

            wait_until_model_ready(mlflow_model_name, mlflow_model_version)

            # Transition the registered model stage from "None" to "Staging"
            client_current_model.transition_model_version_stage(
                name=mlflow_model_name,
                version=mlflow_model_version,
                stage="Staging",
            )

            # Copy the model files from the driver node to DBFS so that they
            # can be accessed, e.g. after the current cluster terminates.
            dbutils.fs.cp("file:/tmp/{0}/model".format(model_name),
                          "dbfs:/mnt/test/{0}/model".format(model_name),
                          recurse=True)
            print('Model copied here: ',
                  "dbfs:/mnt/test/{0}/model/".format(model_name))

        # mlflow.end_run()

    except Exception as e:
        print("Errored on step T.3: model definition and train")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e
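
# NOTE: define_1dcnn_model(), used in get_compiled_model() above, is part of
# the project and is not shown in this excerpt. The function below is a
# minimal, hypothetical sketch that only matches the shapes used above
# ((N_days_X, 1) inputs, N_days_y outputs); the layer choices are illustrative
# assumptions and model_conf is left unused here.
def define_1dcnn_model(N_days_X, N_days_y, model_conf):
    # 1D convolutions over the historical balance series.
    inputs = tf.keras.Input(shape=(N_days_X, 1))
    x = tf.keras.layers.Conv1D(32, kernel_size=7, activation="relu")(inputs)
    x = tf.keras.layers.MaxPooling1D(pool_size=2)(x)
    x = tf.keras.layers.Conv1D(64, kernel_size=5, activation="relu")(x)
    x = tf.keras.layers.GlobalAveragePooling1D()(x)
    # Regression head: one output per predicted day.
    outputs = tf.keras.layers.Dense(N_days_y)(x)
    return tf.keras.Model(inputs, outputs)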