def test_multi_partition_shuffle():
    dat = [(1.0, Vectors.dense(np.random.normal(0, 1, 10))) for _ in range(0, 200)]
    dat2 = [(0.0, Vectors.dense(np.random.normal(2, 1, 10))) for _ in range(0, 200)]
    dat.extend(dat2)
    random.shuffle(dat)
    processed = spark.createDataFrame(dat, ["label", "features"])
    mg = build_graph(create_random_model)
    spark_model = SparkAsyncDL(
        inputCol='features', tensorflowGraph=mg, tfInput='x:0', tfLabel='y:0',
        tfOutput='outer/Sigmoid:0', tfOptimizer='adam', tfLearningRate=.1,
        iters=20, partitions=4, predictionCol='predicted', labelCol='label',
        partitionShuffles=2)
    data = spark_model.fit(processed).transform(processed).take(10)
    nb_errors = 0
    for d in data:
        lab = d['label']
        predicted = d['predicted'][0]
        if predicted != lab:
            nb_errors += 1
    assert nb_errors < len(data)
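# These tests assume a create_random_model graph builder. A minimal
# hypothetical sketch (the real fixture may differ); the tensor names must
# match tfInput='x:0', tfLabel='y:0', tfOutput='outer/Sigmoid:0':
import tensorflow as tf

def create_random_model():
    x = tf.placeholder(tf.float32, shape=[None, 10], name='x')  # 10-dim features
    y = tf.placeholder(tf.float32, shape=[None, 1], name='y')   # binary label
    hidden = tf.layers.dense(x, 10, activation=tf.nn.relu)
    # With name='outer' and a sigmoid activation, the output tensor is 'outer/Sigmoid:0'.
    out = tf.layers.dense(hidden, 1, activation=tf.nn.sigmoid, name='outer')
    loss = tf.losses.mean_squared_error(y, out)
    return loss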
def test_overlapping_gaussians():
    dat = [(1.0, Vectors.dense(np.random.normal(0, 1, 10))) for _ in range(0, 200)]
    dat2 = [(0.0, Vectors.dense(np.random.normal(2, 1, 10))) for _ in range(0, 200)]
    dat.extend(dat2)
    random.shuffle(dat)
    processed = spark.createDataFrame(dat, ["label", "features"])
    # Serialize the graph by hand; export_meta_graph must run while
    # first_graph is still the default graph.
    first_graph = tf.Graph()
    with first_graph.as_default():
        create_random_model()
        mg = json_format.MessageToJson(tf.train.export_meta_graph())
    spark_model = SparkAsyncDL(
        inputCol='features', tensorflowGraph=mg, tfInput='x:0', tfLabel='y:0',
        tfOutput='outer/Sigmoid:0', tfOptimizer='adam', tfLearningRate=.1,
        iters=35, partitions=4, predictionCol='predicted', labelCol='label')
    data = spark_model.fit(processed).transform(processed).take(10)
    nb_errors = 0
    for d in data:
        lab = d['label']
        predicted = d['predicted'][0]
        if predicted != lab:
            nb_errors += 1
    assert nb_errors < len(data)
def test_auto_encoder(self):
    processed = self.generate_random_data()
    mg = build_graph(SparkFlowTests.create_autoencoder)
    spark_model = SparkAsyncDL(
        inputCol='features', tensorflowGraph=mg, tfInput='x:0', tfLabel=None,
        tfOutput='out/Sigmoid:0', tfOptimizer='adam', tfLearningRate=.001,
        iters=10, predictionCol='predicted', partitions=4, miniBatchSize=10,
        verbose=1)
    encoded = spark_model.fit(processed).transform(processed).take(10)
    print(encoded[0]['predicted'])
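# SparkFlowTests.create_autoencoder is assumed to reconstruct its input and
# expose 'x:0' and 'out/Sigmoid:0'; tfLabel=None above means the loss is
# computed against the input itself. A hypothetical sketch:
def create_autoencoder():
    x = tf.placeholder(tf.float32, shape=[None, 10], name='x')
    encoded = tf.layers.dense(x, 5, activation=tf.nn.relu)
    out = tf.layers.dense(encoded, 10, activation=tf.nn.sigmoid, name='out')  # 'out/Sigmoid:0'
    loss = tf.losses.mean_squared_error(x, out)
    return loss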
def test_save_model(self):
    processed = self.generate_random_data()
    mg = build_graph(SparkFlowTests.create_random_model)
    spark_model = SparkAsyncDL(
        inputCol='features', tensorflowGraph=mg, tfInput='x:0', tfLabel='y:0',
        tfOutput='outer/Sigmoid:0', tfOptimizer='adam', tfLearningRate=.1,
        iters=20, partitions=2, predictionCol='predicted', labelCol='label')
    fitted = spark_model.fit(processed)
    fitted.save('saved_model')
    model = SparkAsyncDLModel.load("saved_model")
    data = model.transform(processed).take(10)
    nb_errors = SparkFlowTests.calculate_errors(data)
    self.assertTrue(nb_errors < len(data))
def test_small_sparse(self):
    xor = [(0.0, Vectors.sparse(2, [0, 1], [0.0, 0.0])),
           (0.0, Vectors.sparse(2, [0, 1], [1.0, 1.0])),
           (1.0, Vectors.sparse(2, [0], [1.0])),
           (1.0, Vectors.sparse(2, [1], [1.0]))]
    processed = self.spark.createDataFrame(xor, ["label", "features"])
    mg = build_graph(SparkFlowTests.create_model)
    spark_model = SparkAsyncDL(
        inputCol='features', tensorflowGraph=mg, tfInput='x:0', tfLabel='y:0',
        tfOutput='outer/Sigmoid:0', tfOptimizer='adam', tfLearningRate=.1,
        iters=35, partitions=2, predictionCol='predicted', labelCol='label')
    assert spark_model.fit(processed).transform(processed).collect() is not None
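# The XOR test assumes a two-input create_model fixture; a hypothetical
# sketch with the tensor names the test references:
def create_model():
    x = tf.placeholder(tf.float32, shape=[None, 2], name='x')
    y = tf.placeholder(tf.float32, shape=[None, 1], name='y')
    hidden = tf.layers.dense(x, 4, activation=tf.nn.relu)
    out = tf.layers.dense(hidden, 1, activation=tf.nn.sigmoid, name='outer')  # 'outer/Sigmoid:0'
    loss = tf.losses.mean_squared_error(y, out)
    return loss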
def stage_result(data_train, message):
    """
    Input: training DataFrame and a message dict with an 'output_col' key.
    Output: fitted pipeline model and the selected prediction columns.
    """
    from pyspark.ml import Pipeline
    from sparkflow.graph_utils import build_graph
    from sparkflow.tensorflow_async import SparkAsyncDL

    # graph_model is assumed to be defined elsewhere (a matching definition
    # appears in the stage_result() variant below); its tensor names must
    # line up with the tf* parameters here.
    mg = build_graph(graph_model)
    params_dict_DNN = {
        'inputCol': 'features',
        'tensorflowGraph': mg,
        'tfInput': 'x:0',
        'tfLabel': 'y:0',
        'tfOutput': 'out:0',
        'tfLearningRate': .001,
        'iters': 10,
        'predictionCol': 'predicted',
        'labelCol': 'labels',
        'verbose': 1,
    }
    modul_DNN = SparkAsyncDL(**params_dict_DNN)

    pipeline = Pipeline(stages=[modul_DNN])
    model = pipeline.fit(data_train)
    prediction = model.transform(data_train)

    if message['output_col'] == 'all':
        data_prediction = prediction
    else:
        output_col = set(message['output_col']).intersection(prediction.columns)
        data_prediction = prediction.select(
            [col for col in prediction.columns if col in output_col])
    return model, data_prediction
def test_overlapping_gaussians():
    processed = generate_random_data()
    mg = build_graph(create_random_model)
    spark_model = SparkAsyncDL(
        inputCol='features', tensorflowGraph=mg, tfInput='x:0', tfLabel='y:0',
        tfOutput='outer/Sigmoid:0', tfOptimizer='adam', tfLearningRate=.1,
        iters=35, partitions=4, predictionCol='predicted', labelCol='label')
    handle_test(spark_model, processed)
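# generate_random_data and handle_test are assumed helpers that factor out
# the inline logic of the earlier tests; sketches based on that inline code:
def generate_random_data():
    dat = [(1.0, Vectors.dense(np.random.normal(0, 1, 10))) for _ in range(0, 200)]
    dat2 = [(0.0, Vectors.dense(np.random.normal(2, 1, 10))) for _ in range(0, 200)]
    dat.extend(dat2)
    random.shuffle(dat)
    return spark.createDataFrame(dat, ["label", "features"])

def handle_test(spark_model, processed):
    data = spark_model.fit(processed).transform(processed).take(10)
    nb_errors = sum(1 for d in data if d['predicted'][0] != d['label'])
    assert nb_errors < len(data)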
def test_multi_partition_shuffle(self):
    processed = self.generate_random_data()
    mg = build_graph(SparkFlowTests.create_random_model)
    spark_model = SparkAsyncDL(
        inputCol='features', tensorflowGraph=mg, tfInput='x:0', tfLabel='y:0',
        tfOutput='outer/Sigmoid:0', tfOptimizer='adam', tfLearningRate=.1,
        iters=20, partitions=2, predictionCol='predicted', labelCol='label',
        partitionShuffles=2)
    self.handle_assertions(spark_model, processed)
def stage_result():
    """
    Input: none.
    Output: unfitted pipeline wrapping a SparkAsyncDL stage.
    """
    def graph_model():
        x = tf.placeholder(tf.float32, shape=[None, 784], name='x')
        y = tf.placeholder(tf.float32, shape=[None, 10], name='y')
        layer0 = tf.layers.dense(x, 128, activation=tf.nn.relu)
        layer1 = tf.layers.dense(layer0, 128, activation=tf.nn.relu)
        layer2 = tf.layers.dense(layer1, 64, activation=tf.nn.relu)
        layer3 = tf.layers.dense(layer2, 32, activation=tf.nn.relu)
        out = tf.layers.dense(layer3, 10)
        z = tf.argmax(out, 1, name='out')  # names the prediction tensor 'out:0'
        loss = tf.losses.softmax_cross_entropy(y, out)
        return loss

    # inputCol and the tf* tensor names are assumed to match graph_model above.
    params_dict_DNN = {
        'inputCol': 'features',
        'tfInput': 'x:0',
        'tfLabel': 'y:0',
        'tfOutput': 'out:0',
        'tfLearningRate': 0.001,
        'iters': 10,
        'predictionCol': 'predicted',
        'labelCol': 'labels',
        'verbose': 1,
    }
    params_dict_DNN["tensorflowGraph"] = build_graph(graph_model)
    print("tensorflowGraph type:", type(params_dict_DNN["tensorflowGraph"]))
    modul_DNN = SparkAsyncDL(**params_dict_DNN)

    stages_pipeline = [modul_DNN]
    pipeline = Pipeline(stages=stages_pipeline)
    # Fitting and output-column selection mirror stage_result(data_train, message) above.
    return pipeline
def test_rmsprop():
    processed = generate_random_data()
    mg = build_graph(create_random_model)
    options = build_rmsprop_config(learning_rate=0.1, decay=0.95, momentum=0.1, centered=False)
    spark_model = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=mg,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='outer/Sigmoid:0',
        tfOptimizer='rmsprop',
        tfLearningRate=.1,
        iters=25,
        partitions=4,
        predictionCol='predicted',
        labelCol='label',
        optimizerOptions=options
    )
    handle_assertions(spark_model, processed)
def test_adam_optimizer_options():
    processed = generate_random_data()
    mg = build_graph(create_random_model)
    options = build_adam_config(learning_rate=0.1, beta1=0.85, beta2=0.98, epsilon=1e-8)
    spark_model = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=mg,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='outer/Sigmoid:0',
        tfOptimizer='adam',
        tfLearningRate=.1,
        iters=25,
        partitions=4,
        predictionCol='predicted',
        labelCol='label',
        optimizerOptions=options
    )
    handle_assertions(spark_model, processed)
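# build_adam_config and build_rmsprop_config serialize optimizer
# hyperparameters for the training workers; assuming the usual sparkflow
# module layout, they are imported alongside build_graph:
from sparkflow.graph_utils import build_graph, build_adam_config, build_rmsprop_config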
def test_save_pipeline(self):
    processed = self.generate_random_data()
    mg = build_graph(SparkFlowTests.create_random_model)
    spark_model = SparkAsyncDL(
        inputCol='features', tensorflowGraph=mg, tfInput='x:0', tfLabel='y:0',
        tfOutput='outer/Sigmoid:0', tfOptimizer='adam', tfLearningRate=.1,
        iters=20, partitions=2, predictionCol='predicted', labelCol='label')
    p = Pipeline(stages=[spark_model]).fit(processed)
    p.write().overwrite().save('example_pipeline')
    p = PysparkPipelineWrapper.unwrap(PipelineModel.load('example_pipeline'))
    data = p.transform(processed).take(10)
    nb_errors = SparkFlowTests.calculate_errors(data)
    self.assertTrue(nb_errors < len(data))
mg = build_graph(small_model)

# Assemble features and one-hot encode the labels
va = VectorAssembler(inputCols=final_df.columns[1:151], outputCol='features')
encoded = OneHotEncoder(inputCol='result', outputCol='labels', dropLast=False)
adam_config = build_adam_config(learning_rate=0.001, beta1=0.9, beta2=0.999)

spark_model = SparkAsyncDL(
    inputCol='features',
    tensorflowGraph=mg,
    tfInput='x:0',
    tfLabel='y:0',
    tfOutput='out:0',
    tfLearningRate=.001,
    iters=20,
    predictionCol='predicted',
    labelCol='labels',
    verbose=1,
    optimizerOptions=adam_config
)

ckptpath = os.path.join(ckptdir, task)
print('saving model to', ckptpath)
p = Pipeline(stages=[va, encoded, spark_model]).fit(final_df)
p.write().overwrite().save(ckptpath)
print('=== task all done ===')
    out = tf.layers.dense(fc1, 10)
    z = tf.argmax(out, 1, name='out')
    loss = tf.losses.softmax_cross_entropy(y, out)
    return loss

# Build the graph
mg = build_graph(cnn_model)

spark_model = SparkAsyncDL(
    inputCol='features',
    tensorflowGraph=mg,
    tfInput='x:0',
    tfLabel='y:0',
    tfOutput='out:0',
    tfOptimizer='adam',
    miniBatchSize=300,
    miniStochasticIters=-1,
    shufflePerIter=True,
    iters=10,
    tfLearningRate=.001,
    predictionCol='predicted',
    labelCol='labels',
    verbose=1
)

if __name__ == '__main__':
    from pyspark.ml.pipeline import Pipeline
    try:
        import time
        # Pipeline definition
df = spark.read.option("inferSchema", "true").csv('mnist_train.csv').orderBy(rand())
mg = build_graph(small_model)
adam_config = build_adam_config(learning_rate=0.001, beta1=0.9, beta2=0.999)

va = VectorAssembler(inputCols=df.columns[1:785], outputCol='features').transform(df)
encoded = OneHotEncoder(inputCol='_c0', outputCol='labels', dropLast=False) \
    .transform(va).select(['features', 'labels'])

# Demonstration of options; not all are required.
spark_model = SparkAsyncDL(
    inputCol='features',
    tensorflowGraph=mg,
    tfInput='x:0',
    tfLabel='y:0',
    tfOutput='out:0',
    tfOptimizer='adam',
    miniBatchSize=300,
    miniStochasticIters=-1,
    shufflePerIter=True,
    iters=20,
    predictionCol='predicted',
    labelCol='labels',
    partitions=4,
    verbose=1,
    optimizerOptions=adam_config
)

spark_model.fit(encoded).save('simple_dnn')
x = SparkAsyncDLModel.load("simple_dnn").transform(encoded).take(10)
print(x)
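# The MNIST snippets assume a small_model graph builder over the 784 pixel
# columns with a 10-way softmax; a hypothetical sketch consistent with
# tfInput='x:0', tfLabel='y:0', tfOutput='out:0':
def small_model():
    x = tf.placeholder(tf.float32, shape=[None, 784], name='x')
    y = tf.placeholder(tf.float32, shape=[None, 10], name='y')
    layer1 = tf.layers.dense(x, 256, activation=tf.nn.relu)
    layer2 = tf.layers.dense(layer1, 256, activation=tf.nn.relu)
    out = tf.layers.dense(layer2, 10)
    z = tf.argmax(out, 1, name='out')  # tensor 'out:0' holds the predicted class
    loss = tf.losses.softmax_cross_entropy(y, out)
    return loss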
spark = SparkSession.builder \
    .appName("examples") \
    .master('local[8]').config('spark.driver.memory', '4g') \
    .getOrCreate()

df = spark.read.option("inferSchema", "true").csv('mnist_train.csv').orderBy(rand())
mg = build_graph(small_model)

va = VectorAssembler(inputCols=df.columns[1:785], outputCol='feats').transform(df).select(['feats'])
na = Normalizer(inputCol='feats', outputCol='features', p=1.0).transform(va).select(['features'])

# Demonstration of options; not all are required.
spark_model = SparkAsyncDL(
    inputCol='features',
    tensorflowGraph=mg,
    tfInput='x:0',
    tfLabel=None,
    tfOutput='out:0',
    tfOptimizer='adam',
    tfLearningRate=.001,
    iters=10,
    predictionCol='predicted',
    partitions=3,
    miniBatchSize=256,
    verbose=1
)

spark_model.fit(na).save('auto_encoded')
x = SparkAsyncDLModel.load("auto_encoded").transform(na).take(10)
print(x)
    layer1 = tf.layers.dense(x, 10, activation=tf.nn.relu)
    layer2 = tf.layers.dense(layer1, 5, activation=tf.nn.relu)
    out = tf.layers.dense(layer2, 1, activation=tf.nn.sigmoid, name='out')
    # mean_squared_error takes the true labels first, the network output second
    loss = tf.losses.mean_squared_error(y, out)
    return loss

mg = build_graph(small_model)  # build the computation graph

spark_model = SparkAsyncDL(
    inputCol='features',            # input column
    labelCol='label',               # label column
    tensorflowGraph=mg,
    tfInput='x:0',
    tfLabel='y:0',
    tfOutput='out/Sigmoid:0',
    tfLearningRate=0.001,           # learning rate of 0.001
    iters=50,                       # train for 50 iterations
    predictionCol='probability',
    miniBatchSize=200,              # 200 rows per mini-batch
    verbose=1,
    tfOptimizer='adam'              # use the Adam optimizer
)

# Fit the model; featuresCreator is assumed defined earlier (see the sketch below)
pipeline = Pipeline(stages=[featuresCreator])
data_transformer = pipeline.fit(data_train)
ANN_model = spark_model.fit(data_transformer.transform(data_train))
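# featuresCreator and data_train are assumed to be defined elsewhere; the
# pipeline suggests featuresCreator is a feature-assembly stage, e.g. a
# VectorAssembler over a hypothetical list of numeric columns:
from pyspark.ml.feature import VectorAssembler

featuresCreator = VectorAssembler(inputCols=numeric_cols, outputCol='features')  # numeric_cols: hypothetical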
if __name__ == '__main__':
    spark = SparkSession.builder \
        .appName("examples") \
        .master('local[4]').config('spark.driver.memory', '2g') \
        .getOrCreate()

    df = spark.read.option("inferSchema", "true").csv('examples/mnist_train.csv').orderBy(rand())
    mg = build_graph(small_model)

    va = VectorAssembler(inputCols=df.columns[1:785], outputCol='feats').transform(df).select(['feats'])
    na = Normalizer(inputCol='feats', outputCol='features', p=1.0).transform(va).select(['features'])

    # Demonstration of options; not all are required.
    spark_model = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=mg,
        tfInput='x:0',
        tfLabel=None,
        tfOutput='out/Sigmoid:0',
        tfOptimizer='adam',
        tfLearningRate=.001,
        iters=10,
        predictionCol='predicted',
        partitions=4,
        miniBatchSize=256,
        verbose=1
    ).fit(na)

    t = spark_model.transform(na).take(1)
    print(t[0]['predicted'])
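# The autoencoder examples assume an autoencoder variant of small_model over
# the 784 normalized pixels. A hypothetical sketch matching this snippet's
# tfLabel=None and 'out/Sigmoid:0' (the earlier 'auto_encoded' example
# expects 'out:0' and would need a differently named output op):
def small_model():
    x = tf.placeholder(tf.float32, shape=[None, 784], name='x')
    encoded = tf.layers.dense(x, 64, activation=tf.nn.relu)
    out = tf.layers.dense(encoded, 784, activation=tf.nn.sigmoid, name='out')  # 'out/Sigmoid:0'
    loss = tf.losses.mean_squared_error(x, out)
    return loss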