Example #1
    # Imports this test relies on (module paths for the sparkflow pieces are
    # assumed from the project layout):
    #   import numpy as np
    #   import tensorflow as tf
    #   from pyspark.ml.linalg import Vectors
    #   from google.protobuf import json_format
    #   from sparkflow.HogwildSparkModel import HogwildSparkModel
    def test_spark_hogwild(self):
        # XOR truth table as (label, features) rows.
        xor = [(0.0, Vectors.dense(np.array([0.0, 0.0]))),
               (0.0, Vectors.dense(np.array([1.0, 1.0]))),
               (1.0, Vectors.dense(np.array([1.0, 0.0]))),
               (1.0, Vectors.dense(np.array([0.0, 1.0])))]
        # Keep the tiny dataset on a single partition and map each row to a
        # (features, label) training tuple.
        processed = self.spark.createDataFrame(xor, ["label", "features"]) \
            .coalesce(1).rdd.map(lambda x: (np.asarray(x["features"]), x["label"]))

        first_graph = tf.Graph()
        with first_graph.as_default() as g:
            v = SparkFlowTests.create_model()
            # Serialize the meta graph to JSON so it can be shipped to executors.
            mg = json_format.MessageToJson(tf.train.export_meta_graph())

        spark_model = HogwildSparkModel(
            tensorflowGraph=mg,
            iters=10,
            tfInput='x:0',
            tfLabel='y:0',
            optimizer=tf.train.AdamOptimizer(learning_rate=.1),
            # The test runs the parameter server locally on port 5000.
            master_url='localhost:5000')

        try:
            weights = spark_model.train(processed)
            self.assertTrue(len(weights) > 0)
        except Exception as e:
            # Shut the parameter server down before surfacing the failure.
            spark_model.stop_server()
            raise Exception(str(e))
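
The test references a SparkFlowTests.create_model helper that is not shown on
this page. Below is a minimal hypothetical sketch of what such a helper might
look like: the placeholder names must line up with the tfInput='x:0' and
tfLabel='y:0' arguments above, while the layer sizes, activation, and loss are
illustrative assumptions rather than the project's actual code.

    @staticmethod
    def create_model():
        # Placeholders named to match tfInput='x:0' and tfLabel='y:0'.
        x = tf.placeholder(tf.float32, shape=[None, 2], name='x')
        y = tf.placeholder(tf.float32, shape=[None, 1], name='y')
        hidden = tf.layers.dense(x, 8, activation=tf.nn.relu)
        logits = tf.layers.dense(hidden, 1)
        # Expose the prediction tensor as 'out:0' for later inference.
        out = tf.sigmoid(logits, name='out')
        # Return the loss node; the optimizer passed to HogwildSparkModel
        # minimizes it during training.
        loss = tf.losses.log_loss(y, out)
        return loss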

Example #2

    def _fit(self, dataset):
        # Gather the estimator's params, train the graph asynchronously with
        # Hogwild, and return a fitted SparkAsyncDLModel transformer.
        inp_col = self.getInputCol()
        graph_json = self.getTensorflowGraph()
        iters = self.getIters()
        label = self.getLabelCol()
        prediction = self.getPredictionCol()
        tf_input = self.getTfInput()
        tf_label = self.getTfLabel()
        tf_output = self.getTfOutput()
        optimizer_options = self.getOptimizerOptions()
        if optimizer_options is not None:
            optimizer_options = json.loads(optimizer_options)
        tf_optimizer = build_optimizer(self.getTfOptimizer(),
                                       self.getTfLearningRate(),
                                       optimizer_options)
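        # Hypothetical example of the optimizerOptions payload decoded above;
        # the keys are presumably forwarded to the chosen tf.train optimizer:
        #   '{"beta1": 0.9, "beta2": 0.999, "epsilon": 1e-08}'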
        partitions = self.getPartitions()
        acquire_lock = self.getAcquireLock()
        mbs = self.getMiniBatchSize()
        msi = self.getMiniStochasticIters()
        verbose = self.getVerbose()
        spi = self.getShufflePerIter()
        tf_dropout = self.getTfDropout()
        to_keep_dropout = self.getToKeepDropout()
        partition_shuffles = self.getPartitionShuffles()
        port = self.getPort()

        # Map each row to a (features, label) training tuple.
        df = dataset.rdd.map(lambda x: handle_data(x, inp_col, label))
        # Coalesce only when it lowers the partition count; coalesce cannot
        # raise it.
        if partitions < df.getNumPartitions():
            df = df.coalesce(partitions)

        spark_model = HogwildSparkModel(
            tensorflowGraph=graph_json,
            iters=iters,
            tfInput=tf_input,
            tfLabel=tf_label,
            optimizer=tf_optimizer,
            # The parameter server lives on the driver, so its URL is the
            # driver host plus the configured port.
            master_url=str(SparkContext._active_spark_context.getConf().get(
                "spark.driver.host")) + ":" + str(port),
            acquire_lock=acquire_lock,
            mini_batch=mbs,
            mini_stochastic_iters=msi,
            shuffle=spi,
            verbose=verbose,
            partition_shuffles=partition_shuffles,
            port=port)

        weights = spark_model.train(df)
        # Serialize the trained weights so the fitted model can carry them.
        json_weights = convert_weights_to_json(weights)

        return SparkAsyncDLModel(inputCol=inp_col,
                                 modelJson=graph_json,
                                 modelWeights=json_weights,
                                 tfOutput=tf_output,
                                 tfInput=tf_input,
                                 tfDropout=tf_dropout,
                                 toKeepDropout=to_keep_dropout,
                                 predictionCol=prediction)
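
_fit is not called directly; it runs when fit() is invoked on the public
estimator. A minimal usage sketch follows, assuming the SparkAsyncDL estimator
and build_graph helper that sparkflow exposes (the import paths, parameter
values, and the df DataFrame here are assumptions for illustration):

    from sparkflow.graph_utils import build_graph
    from sparkflow.tensorflow_async import SparkAsyncDL

    # Serialize a graph-building function (e.g. create_model above) to JSON.
    mg = build_graph(create_model)

    estimator = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=mg,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='out:0',
        tfOptimizer='adam',
        tfLearningRate=.001,
        iters=10,
        predictionCol='predicted')

    # fit() dispatches to _fit(dataset) and returns a SparkAsyncDLModel.
    fitted_model = estimator.fit(df)
    predictions = fitted_model.transform(df)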