def test_multi_partition_shuffle():
    """Train across 4 partitions with 2 partition shuffles and sanity-check predictions.

    Two groups of 200 ten-dimensional samples are drawn from normal
    distributions (mean 0 labelled 1.0, mean 2 labelled 0.0), shuffled and
    turned into a DataFrame. The test passes when at least one of the first
    10 transformed rows is predicted correctly.
    """
    # Draw the label-1.0 group first so the random stream is consumed in the
    # same order as before.
    ones = [(1.0, Vectors.dense(np.random.normal(0, 1, 10))) for _ in range(200)]
    zeros = [(0.0, Vectors.dense(np.random.normal(2, 1, 10))) for _ in range(200)]
    samples = ones + zeros
    random.shuffle(samples)
    frame = spark.createDataFrame(samples, ["label", "features"])

    graph = build_graph(create_random_model)
    estimator = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=graph,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='outer/Sigmoid:0',
        tfOptimizer='adam',
        tfLearningRate=.1,
        iters=20,
        partitions=4,
        predictionCol='predicted',
        labelCol='label',
        partitionShuffles=2,
    )

    rows = estimator.fit(frame).transform(frame).take(10)
    # Only the first component of the prediction is compared with the label.
    wrong = sum(1 for row in rows if row['predicted'][0] != row['label'])
    assert wrong < len(rows)
def stage_result(data_train, message):
    """Fit a one-stage SparkAsyncDL pipeline and return it with its predictions.

    Args:
        data_train: Dataset handed to ``Pipeline.fit`` and then to
            ``transform`` (presumably a Spark DataFrame with a ``features``
            column -- confirm against the caller).
        message: Mapping read through ``message['output_col']``. The value is
            either the string ``'all'`` or an iterable of column names to keep.

    Returns:
        A ``(model, data_prediction)`` tuple: the fitted pipeline model and the
        transformed data, restricted to the requested columns unless
        ``'all'`` was asked for. Requested names that are not prediction
        columns are ignored; kept columns follow the prediction's own order.
    """
    from sparkflow.graph_utils import build_graph
    from sparkflow.tensorflow_async import SparkAsyncDL

    # NOTE(review): the estimator's input-column parameter is spelled
    # ``inputCol`` in every other SparkAsyncDL call; the former ``featuresCol``
    # key does not match that parameter name. All remaining parameters are
    # left at the library defaults.
    params_dict_DNN = {
        "inputCol": "features",
        "tensorflowGraph": build_graph(graph_model),
    }
    modul_DNN = SparkAsyncDL(**params_dict_DNN)

    pipeline = Pipeline(stages=[modul_DNN])
    model = pipeline.fit(data_train)
    prediction = model.transform(data_train)

    requested = message['output_col']
    if requested == 'all':
        return model, prediction

    # A single membership filter over the prediction columns is equivalent to
    # intersecting first and filtering afterwards.
    wanted = set(requested)
    kept = [name for name in prediction.columns if name in wanted]
    return model, prediction.select(kept)
def test_multi_partition_shuffle(self):
    """Train with 2 partitions and 2 partition shuffles, then run the shared assertions."""
    frame = self.generate_random_data()
    graph = build_graph(SparkFlowTests.create_random_model)

    settings = dict(
        inputCol='features',
        tensorflowGraph=graph,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='outer/Sigmoid:0',
        tfOptimizer='adam',
        tfLearningRate=.1,
        iters=20,
        partitions=2,
        predictionCol='predicted',
        labelCol='label',
        partitionShuffles=2,
    )
    estimator = SparkAsyncDL(**settings)

    # Fitting and checking are delegated to the class-level helper.
    self.handle_assertions(estimator, frame)
def test_overlapping_guassians():
    """Train for 35 iterations over 4 partitions and hand the result to ``handle_test``."""
    frame = generate_random_data()
    graph = build_graph(create_random_model)

    estimator = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=graph,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='outer/Sigmoid:0',
        tfOptimizer='adam',
        tfLearningRate=.1,
        iters=35,
        partitions=4,
        predictionCol='predicted',
        labelCol='label',
    )

    # The helper owns fitting and the actual checks.
    handle_test(estimator, frame)
def test_auto_encoder(self):
    """Fit the autoencoder graph without a label tensor and print one encoded row.

    No assertion is made; the test only checks that fit/transform run and
    prints the ``predicted`` value of the first of 10 fetched rows.
    """
    frame = self.generate_random_data()
    graph = build_graph(SparkFlowTests.create_autoencoder)

    settings = dict(
        inputCol='features',
        tensorflowGraph=graph,
        tfInput='x:0',
        tfLabel=None,  # no label tensor is supplied for this graph
        tfOutput='out/Sigmoid:0',
        tfOptimizer='adam',
        tfLearningRate=.001,
        iters=10,
        predictionCol='predicted',
        partitions=4,
        miniBatchSize=10,
        verbose=1,
    )
    fitted = SparkAsyncDL(**settings).fit(frame)

    rows = fitted.transform(frame).take(10)
    first_row = rows[0]
    print(first_row['predicted'])
def stage_result():
    """Build an unfitted one-stage pipeline around a SparkAsyncDL estimator.

    Returns:
        A ``Pipeline`` whose only stage is the configured estimator. Nothing
        is fitted or transformed here.
    """

    def graph_model():
        """Define a dense network (784 inputs, 10 outputs) and return its loss."""
        x = tf.placeholder(tf.float32, shape=[None, 784], name='x')
        y = tf.placeholder(tf.float32, shape=[None, 10], name='y')
        layer0 = tf.layers.dense(x, 128, activation=tf.nn.relu)
        layer1 = tf.layers.dense(layer0, 128, activation=tf.nn.relu)
        layer2 = tf.layers.dense(layer1, 64, activation=tf.nn.relu)
        layer3 = tf.layers.dense(layer2, 32, activation=tf.nn.relu)
        out = tf.layers.dense(layer3, 10)
        # The result is unused in Python; the call is kept because it
        # registers an op named 'out' in the graph.
        tf.argmax(out, 1, name='out')
        return tf.losses.softmax_cross_entropy(y, out)

    # 'labelCol' used to appear twice in this literal ('price', then
    # 'labels'); only the last value ever took effect, so just that one is kept.
    # NOTE(review): neither the input column nor the output tensor name is set
    # here -- confirm the library defaults match this graph before fitting.
    params_dict_DNN = {
        'labelCol': 'labels',
        'tfLearningRate': 0.001,
        'iters': 10,
        'predictionCol': 'predicted',
        'verbose': 1,
    }
    params_dict_DNN["tensorflowGraph"] = build_graph(graph_model)
    print("tensorflowGraph type : ", params_dict_DNN["tensorflowGraph"])

    modul_DNN = SparkAsyncDL(**params_dict_DNN)
    return Pipeline(stages=[modul_DNN])
def test_rmsprop():
    """Train with the 'rmsprop' optimizer and explicit optimizer options."""
    frame = generate_random_data()
    graph = build_graph(create_random_model)
    rmsprop_options = build_rmsprop_config(
        learning_rate=0.1,
        decay=0.95,
        momentum=0.1,
        centered=False,
    )

    settings = dict(
        inputCol='features',
        tensorflowGraph=graph,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='outer/Sigmoid:0',
        tfOptimizer='rmsprop',
        tfLearningRate=.1,
        iters=25,
        partitions=4,
        predictionCol='predicted',
        labelCol='label',
        optimizerOptions=rmsprop_options,
    )

    # Fitting and checking are delegated to the shared helper.
    handle_assertions(SparkAsyncDL(**settings), frame)
def test_adam_optimizer_options():
    """Train with the 'adam' optimizer and explicit optimizer options."""
    frame = generate_random_data()
    graph = build_graph(create_random_model)
    adam_options = build_adam_config(
        learning_rate=0.1,
        beta1=0.85,
        beta2=0.98,
        epsilon=1e-8,
    )

    settings = dict(
        inputCol='features',
        tensorflowGraph=graph,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='outer/Sigmoid:0',
        tfOptimizer='adam',
        tfLearningRate=.1,
        iters=25,
        partitions=4,
        predictionCol='predicted',
        labelCol='label',
        optimizerOptions=adam_options,
    )

    # Fitting and checking are delegated to the shared helper.
    handle_assertions(SparkAsyncDL(**settings), frame)
def test_save_model(self):
    """Save a fitted model, load it back, and check the reloaded model's predictions.

    Passes when the error count over the first 10 transformed rows is below
    the number of rows fetched.
    """
    frame = self.generate_random_data()
    graph = build_graph(SparkFlowTests.create_random_model)

    estimator = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=graph,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='outer/Sigmoid:0',
        tfOptimizer='adam',
        tfLearningRate=.1,
        iters=20,
        partitions=2,
        predictionCol='predicted',
        labelCol='label',
    )

    # Persist the fitted model, then read it back from the same location.
    estimator.fit(frame).save('saved_model')
    reloaded = SparkAsyncDLModel.load("saved_model")

    rows = reloaded.transform(frame).take(10)
    error_count = SparkFlowTests.calculate_errors(rows)
    self.assertTrue(error_count < len(rows))
def test_small_sparse(self):
    """Check that fit/transform runs on a four-row DataFrame of sparse vectors."""
    # (label, sparse feature vector of size 2) pairs.
    rows = [
        (0.0, Vectors.sparse(2, [0, 1], [0.0, 0.0])),
        (0.0, Vectors.sparse(2, [0, 1], [1.0, 1.0])),
        (1.0, Vectors.sparse(2, [0], [1.0])),
        (1.0, Vectors.sparse(2, [1], [1.0])),
    ]
    frame = self.spark.createDataFrame(rows, ["label", "features"])
    graph = build_graph(SparkFlowTests.create_model)

    estimator = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=graph,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='outer/Sigmoid:0',
        tfOptimizer='adam',
        tfLearningRate=.1,
        iters=35,
        partitions=2,
        predictionCol='predicted',
        labelCol='label',
    )

    # Only the presence of a collected result is checked, not its content.
    collected = estimator.fit(frame).transform(frame).collect()
    assert collected is not None
def test_save_pipeline(self):
    """Save a fitted pipeline, load and unwrap it, and check its predictions.

    Passes when the error count over the first 10 transformed rows is below
    the number of rows fetched.
    """
    frame = self.generate_random_data()
    graph = build_graph(SparkFlowTests.create_random_model)

    estimator = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=graph,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='outer/Sigmoid:0',
        tfOptimizer='adam',
        tfLearningRate=.1,
        iters=20,
        partitions=2,
        predictionCol='predicted',
        labelCol='label',
    )

    fitted_pipeline = Pipeline(stages=[estimator]).fit(frame)
    fitted_pipeline.write().overwrite().save('example_pipeline')

    # The loaded PipelineModel is passed through the sparkflow wrapper before use.
    loaded = PipelineModel.load('example_pipeline')
    restored = PysparkPipelineWrapper.unwrap(loaded)

    rows = restored.transform(frame).take(10)
    error_count = SparkFlowTests.calculate_errors(rows)
    self.assertTrue(error_count < len(rows))
final_schema.append(StructField('result', IntegerType(), True)) final_schema = StructType(final_schema) final_rdd = sc.parallelize(process_dic) final_df = sqlContext.createDataFrame(final_rdd, final_schema) print('== preprocess finished, final_df created ==') # create spark session and train with final_df spark = SparkSession.builder \ .appName(task+'flow') \ .getOrCreate() # sc.stop() ## stop? mg = build_graph(small_model) #Assemble and one hot encode va = VectorAssembler(inputCols=final_df.columns[1:151], outputCol='features') encoded = OneHotEncoder(inputCol='result', outputCol='labels', dropLast=False) adam_config = build_adam_config(learning_rate=0.001, beta1=0.9, beta2=0.999) spark_model = SparkAsyncDL(inputCol='features', tensorflowGraph=mg, tfInput='x:0', tfLabel='y:0', tfOutput='out:0',
def cnn_model(): x = tf.placeholder(tf.float32, shape=[None, 784], name='x') y = tf.placeholder(tf.float32, shape=[None, 10], name='y') x = tf.reshape(x, shape=[-1, 28, 28, 1]) conv1 = tf.layers.conv2d(x, 32, 5, activation=tf.nn.relu) conv1 = tf.layers.max_pooling2d(conv1, 2, 2) conv2 = tf.layers.conv2d(conv1, 64, 3, activation=tf.nn.relu) conv2 = tf.layers.max_pooling2d(conv2, 2, 2) fc1 = tf.layers.flatten(conv2) out = tf.layers.dense(fc1, 10) z = tf.argmax(out, 1, name='out') loss = tf.losses.softmax_cross_entropy(y, out) return loss # Build the graph mg = build_graph(cnn_model) spark_model = SparkAsyncDL( inputCol='features', tensorflowGraph=mg, tfInput='x:0', tfLabel='y:0', tfOutput='out:0', tfOptimizer='adam', miniBatchSize=300, miniStochasticIters=-1, shufflePerIter=True, iters=10, tfLearningRate=.001, predictionCol='predicted', labelCol='labels',