def test_spark_hogwild(self):
    # Assumes module-level imports: numpy as np, pyspark.ml.linalg.Vectors,
    # tensorflow as tf, and google.protobuf.json_format.
    # XOR dataset as (label, feature-vector) pairs
    xor = [(0.0, Vectors.dense(np.array([0.0, 0.0]))),
           (0.0, Vectors.dense(np.array([1.0, 1.0]))),
           (1.0, Vectors.dense(np.array([1.0, 0.0]))),
           (1.0, Vectors.dense(np.array([0.0, 1.0])))]
    processed = self.spark.createDataFrame(xor, ["label", "features"]) \
        .coalesce(1).rdd.map(lambda x: (np.asarray(x["features"]), x["label"]))

    # Build the model graph and serialize its meta graph to JSON
    first_graph = tf.Graph()
    with first_graph.as_default():
        v = SparkFlowTests.create_model()
        mg = json_format.MessageToJson(tf.train.export_meta_graph())

    spark_model = HogwildSparkModel(
        tensorflowGraph=mg,
        iters=10,
        tfInput='x:0',
        tfLabel='y:0',
        optimizer=tf.train.AdamOptimizer(learning_rate=.1),
        master_url='localhost:5000')

    try:
        weights = spark_model.train(processed)
        self.assertTrue(len(weights) > 0)
    except Exception as e:
        spark_model.stop_server()
        raise Exception(str(e))  # Exception.message does not exist in Python 3
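# The test above calls SparkFlowTests.create_model(), which is not shown in
# this section. A minimal sketch of what such a helper could look like is
# below, assuming only what the test requires: an input tensor named 'x:0',
# a label tensor named 'y:0', and a scalar loss registered where the Hogwild
# trainer can find it. The layer sizes and activations are illustrative, not
# the actual helper.
@staticmethod
def create_model():
    # 2-feature input and 1-dimensional label, matching the XOR data above
    x = tf.placeholder(tf.float32, shape=[None, 2], name='x')
    y = tf.placeholder(tf.float32, shape=[None, 1], name='y')
    hidden = tf.layers.dense(x, 8, activation=tf.nn.relu)
    out = tf.layers.dense(hidden, 1, name='out')
    # tf.losses.* also adds the loss to the GraphKeys.LOSSES collection
    return tf.losses.mean_squared_error(y, out)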
def _fit(self, dataset):
    # Read the estimator params set on this instance
    inp_col = self.getInputCol()
    graph_json = self.getTensorflowGraph()
    iters = self.getIters()
    label = self.getLabelCol()
    prediction = self.getPredictionCol()
    tf_input = self.getTfInput()
    tf_label = self.getTfLabel()
    tf_output = self.getTfOutput()
    optimizer_options = self.getOptimizerOptions()
    if optimizer_options is not None:
        optimizer_options = json.loads(optimizer_options)
    tf_optimizer = build_optimizer(self.getTfOptimizer(),
                                   self.getTfLearningRate(),
                                   optimizer_options)
    partitions = self.getPartitions()
    acquire_lock = self.getAcquireLock()
    mbs = self.getMiniBatchSize()
    msi = self.getMiniStochasticIters()
    verbose = self.getVerbose()
    spi = self.getShufflePerIter()
    tf_dropout = self.getTfDropout()
    to_keep_dropout = self.getToKeepDropout()
    partition_shuffles = self.getPartitionShuffles()
    port = self.getPort()

    # Convert each row to a (features, label) pair for training
    df = dataset.rdd.map(lambda x: handle_data(x, inp_col, label))
    # Only coalesce when it would actually reduce the partition count
    df = df.coalesce(partitions) if partitions < df.getNumPartitions() else df

    # The parameter server runs on the Spark driver host
    master_url = str(SparkContext._active_spark_context.getConf()
                     .get("spark.driver.host")) + ":" + str(port)
    spark_model = HogwildSparkModel(
        tensorflowGraph=graph_json,
        iters=iters,
        tfInput=tf_input,
        tfLabel=tf_label,
        optimizer=tf_optimizer,
        master_url=master_url,
        acquire_lock=acquire_lock,
        mini_batch=mbs,
        mini_stochastic_iters=msi,
        shuffle=spi,
        verbose=verbose,
        partition_shuffles=partition_shuffles,
        port=port)

    weights = spark_model.train(df)
    json_weights = convert_weights_to_json(weights)
    return SparkAsyncDLModel(inputCol=inp_col,
                             modelJson=graph_json,
                             modelWeights=json_weights,
                             tfOutput=tf_output,
                             tfInput=tf_input,
                             tfDropout=tf_dropout,
                             toKeepDropout=to_keep_dropout,
                             predictionCol=prediction)
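# A minimal usage sketch showing how _fit above is normally reached through
# the public Estimator API rather than called directly. The estimator class
# name SparkAsyncDL is inferred from the SparkAsyncDLModel it returns; the
# column and tensor names here are assumptions for illustration. The param
# names mirror the getters used in _fit.
def example_fit_usage(spark_df, graph_json):
    estimator = SparkAsyncDL(
        inputCol='features',         # column of feature vectors
        tensorflowGraph=graph_json,  # JSON-serialized meta graph, as above
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='out:0',
        tfOptimizer='adam',
        tfLearningRate=.1,
        iters=10,
        predictionCol='predicted')
    # fit() dispatches to _fit, which trains via HogwildSparkModel and
    # returns a SparkAsyncDLModel carrying the JSON-serialized weights
    return estimator.fit(spark_df)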