Beispiel #1
0
    def test_nnEstimator_fit_with_train_val_summary(self):
        model = Sequential().add(Linear(2, 2))
        criterion = MSECriterion()
        df, val_df = self.get_estimator_df()
        from bigdl.orca.learn.metrics import MAE
        est = Estimator.from_bigdl(model=model,
                                   loss=criterion,
                                   optimizer=Adam(),
                                   metrics=[MAE()],
                                   feature_preprocessing=SeqToTensor([2]),
                                   label_preprocessing=SeqToTensor([2]))
        tmp_dir = tempfile.mkdtemp()
        est.set_tensorboard(log_dir=tmp_dir, app_name="estTest")

        est.fit(df,
                epochs=5,
                batch_size=4,
                validation_data=val_df,
                validation_trigger=EveryEpoch(),
                checkpoint_trigger=SeveralIteration(1))

        res = est.predict(df)
        loss_result = est.get_train_summary("Loss")
        mae_result = est.get_validation_summary("MAE")
        assert type(res).__name__ == 'DataFrame'
        assert len(loss_result) == 5
        assert len(mae_result) == 4
    def test_control_inputs(self):

        features = np.random.randn(20, 10)
        labels = np.random.randint(0, 10, size=[20])
        with tf.Graph().as_default():
            dataset = TFDataset.from_ndarrays((features, labels),
                                              batch_size=4,
                                              val_tensors=(features, labels))
            is_training = tf.placeholder(dtype=tf.bool, shape=())
            feature_tensor, label_tensor = dataset.tensors
            features = tf.layers.dense(feature_tensor, 8)
            features = tf.layers.dropout(features, training=is_training)
            output = tf.layers.dense(features, 10)
            loss = tf.reduce_mean(
                tf.losses.sparse_softmax_cross_entropy(logits=output,
                                                       labels=label_tensor))
            optimizer = TFOptimizer.from_loss(
                loss,
                Adam(),
                val_outputs=[output],
                val_labels=[label_tensor],
                val_method=Accuracy(),
                tensor_with_value={is_training: (True, False)},
                metrics={"loss": loss})
            optimizer.optimize(end_trigger=MaxEpoch(1))
            optimizer.sess.close()
Beispiel #3
0
    def test_nnEstimator_multiOutput_cols(self):
        from pyspark.ml.linalg import Vectors
        from pyspark.sql import SparkSession

        spark = SparkSession \
            .builder \
            .getOrCreate()

        df = spark.createDataFrame([(1.0, 2.0, 1.0, 2.0), (2.0, 2.0, 2.0, 1.0),
                                    (3.0, 2.0, 1.0, 2.0),
                                    (4.0, 1.0, 1.0, 2.0)],
                                   ["user", "age", "label1", "label2"])
        linear_model = Sequential().add(Linear(2, 2))
        mse_criterion = MSECriterion()
        est = Estimator.from_bigdl(model=linear_model,
                                   loss=mse_criterion,
                                   optimizer=Adam(),
                                   feature_preprocessing=SeqToTensor([2]),
                                   label_preprocessing=SeqToTensor([2]))
        est.fit(df,
                1,
                batch_size=4,
                feature_cols=["user", "age"],
                label_cols=["label1", "label2"])
        result = est.predict(df, feature_cols=["user", "age"])
        result_c = result.collect()
        assert type(result).__name__ == 'DataFrame'
Beispiel #4
0
    def test_nnEstimator_multiInput_cols(self):
        from pyspark.ml.linalg import Vectors
        from pyspark.sql import SparkSession

        spark = SparkSession \
            .builder \
            .getOrCreate()

        df = spark.createDataFrame(
            [(1, 35, 109.0, Vectors.dense([2.0, 5.0, 0.5, 0.5]), 1.0),
             (2, 58, 2998.0, Vectors.dense([4.0, 10.0, 0.5, 0.5]), 2.0),
             (3, 18, 123.0, Vectors.dense([3.0, 15.0, 0.5, 0.5]), 1.0),
             (4, 18, 123.0, Vectors.dense([3.0, 15.0, 0.5, 0.5]), 1.0)],
            ["user", "age", "income", "history", "label"])

        x1 = ZLayer.Input(shape=(1, ))
        x2 = ZLayer.Input(shape=(2, ))
        x3 = ZLayer.Input(shape=(
            2,
            2,
        ))

        user_embedding = ZLayer.Embedding(5, 10)(x1)
        flatten = ZLayer.Flatten()(user_embedding)
        dense1 = ZLayer.Dense(2)(x2)
        gru = ZLayer.LSTM(4, input_shape=(2, 2))(x3)

        merged = ZLayer.merge([flatten, dense1, gru], mode="concat")
        zy = ZLayer.Dense(2)(merged)

        zmodel = ZModel([x1, x2, x3], zy)
        criterion = ClassNLLCriterion()
        est = Estimator.from_bigdl(model=zmodel,
                                   loss=criterion,
                                   optimizer=Adam(learningrate=0.1),
                                   feature_preprocessing=[[1], [2], [2, 2]])
        est.fit(df,
                epochs=1,
                batch_size=4,
                feature_cols=["user", "age", "income", "history"])

        res = est.predict(df,
                          feature_cols=["user", "age", "income", "history"])
        res_c = res.collect()
        assert type(res).__name__ == 'DataFrame'
Beispiel #5
0
    def test_nnEstimator_evaluation(self):
        df = self.get_estimator_df2()
        linear_model = Sequential().add(Linear(2, 2)).add(LogSoftMax())

        est = Estimator.from_bigdl(model=linear_model,
                                   loss=ClassNLLCriterion(),
                                   optimizer=Adam(),
                                   feature_preprocessing=SeqToTensor([2]),
                                   label_preprocessing=SeqToTensor([1]),
                                   metrics=Accuracy())
        est.fit(data=df, epochs=10, batch_size=8)
        result = est.evaluate(df, batch_size=8)

        shift = udf(lambda p: float(p.index(max(p))), DoubleType())
        pred = est.predict(df).withColumn("prediction",
                                          shift(col('prediction'))).cache()

        correct = pred.filter("label=prediction").count()
        overall = pred.count()
        accuracy = correct * 1.0 / overall
        assert accuracy == round(result['Top1Accuracy'], 2)
    def test_tf_optimizer_with_sparse_gradient(self):
        ids = np.random.randint(0, 10, size=[40])
        labels = np.random.randint(0, 5, size=[40])
        id_rdd = self.sc.parallelize(ids)
        label_rdd = self.sc.parallelize(labels)
        training_rdd = id_rdd.zip(label_rdd).map(lambda x: [x[0], x[1]])
        with tf.Graph().as_default():
            dataset = TFDataset.from_rdd(training_rdd,
                                         names=["ids", "labels"],
                                         shapes=[[], []],
                                         types=[tf.int32, tf.int32],
                                         batch_size=8)
            id_tensor, label_tensor = dataset.tensors
            embedding_table = tf.get_variable(name="word_embedding",
                                              shape=[10, 5])

            embedding = tf.nn.embedding_lookup(embedding_table, id_tensor)
            loss = tf.reduce_mean(
                tf.losses.sparse_softmax_cross_entropy(logits=embedding,
                                                       labels=label_tensor))
            optimizer = TFOptimizer.from_loss(loss, Adam(1e-3))
            optimizer.optimize(end_trigger=MaxEpoch(1))
            optimizer.sess.close()
    def test_tf_optimizer_metrics(self):

        features = np.random.randn(20, 10)
        labels = np.random.randint(0, 10, size=[20])
        with tf.Graph().as_default():
            dataset = TFDataset.from_ndarrays((features, labels),
                                              batch_size=4,
                                              val_tensors=(features, labels))
            feature_tensor, label_tensor = dataset.tensors
            features = tf.layers.dense(feature_tensor, 8)
            output = tf.layers.dense(features, 10)
            loss = tf.reduce_mean(
                tf.losses.sparse_softmax_cross_entropy(logits=output,
                                                       labels=label_tensor))
            optimizer = TFOptimizer.from_loss(loss, {
                "dense/": Adam(1e-3),
                "dense_1/": SGD(0.0)
            },
                                              val_outputs=[output],
                                              val_labels=[label_tensor],
                                              val_method=Accuracy(),
                                              metrics={"loss": loss})
            initial_weights = optimizer.tf_model.training_helper_layer.get_weights(
            )
            optimizer.optimize(end_trigger=MaxEpoch(1))
            updated_weights = optimizer.tf_model.training_helper_layer.get_weights(
            )
            for i in [
                    0, 1
            ]:  # weights and bias combined with "dense/" should be updated
                assert not np.allclose(initial_weights[i], updated_weights[i])
            for i in [
                    2, 3
            ]:  # weights and bias combined with "dense_1" should be unchanged
                assert np.allclose(initial_weights[i], updated_weights[i])
            optimizer.sess.close()
    def test_checkpoint(self):

        features = np.random.randn(20, 10)
        labels = np.random.randint(0, 10, size=[20])
        with tf.Graph().as_default():
            dataset = TFDataset.from_ndarrays((features, labels),
                                              batch_size=4,
                                              val_tensors=(features, labels))
            feature_tensor, label_tensor = dataset.tensors
            features = tf.layers.dense(feature_tensor, 8)
            output = tf.layers.dense(features, 10)
            loss = tf.reduce_mean(
                tf.losses.sparse_softmax_cross_entropy(logits=output,
                                                       labels=label_tensor))
            model_dir = tempfile.mkdtemp()
            try:
                optimizer = TFOptimizer.from_loss(loss,
                                                  Adam(),
                                                  val_outputs=[output],
                                                  val_labels=[label_tensor],
                                                  val_method=Accuracy(),
                                                  metrics={"loss": loss},
                                                  model_dir=model_dir)
                optimizer.optimize(end_trigger=MaxEpoch(1))

                first_weights = optimizer.sess.run(tf.trainable_variables()[0])
                import re
                ckpt_path = None
                versions = []
                for (root, dirs, files) in os.walk(model_dir, topdown=True):
                    temp_versions = []
                    for file_name in files:
                        if re.match("^optimMethod-TFParkTraining\.[0-9]+$",
                                    file_name) is not None:
                            version = int(file_name.split(".")[1])
                            temp_versions.append(version)
                    if temp_versions:
                        ckpt_path = root
                        versions = temp_versions
                        break

                assert ckpt_path is not None, "Cannot fine checkpoint file"
                optimizer.sess.run(
                    tf.global_variables_initializer())  # reset variable
                optimizer_load = TFOptimizer.from_loss(
                    loss,
                    Adam(),
                    session=optimizer.sess,
                    val_outputs=[output],
                    val_labels=[label_tensor],
                    val_method=Accuracy(),
                    metrics={"loss": loss},
                    model_dir=model_dir)
                optimizer_load.load_checkpoint(ckpt_path, max(versions))
                loaded_first_weights_before_train = optimizer.sess.run(
                    tf.trainable_variables()[0])
                assert np.allclose(first_weights,
                                   loaded_first_weights_before_train)
                # max epoch still 1, should not train
                optimizer_load.optimize(end_trigger=MaxEpoch(1))
                loaded_first_weights = optimizer.sess.run(
                    tf.trainable_variables()[0])
                assert np.allclose(first_weights, loaded_first_weights)

                # max epoch increase 1, should train 1 epoch
                optimizer_load.optimize(end_trigger=MaxEpoch(2))
                loaded_first_weights_2 = optimizer.sess.run(
                    tf.trainable_variables()[0])
                assert not np.allclose(first_weights, loaded_first_weights_2)
                optimizer_load.sess.close()
            finally:
                import shutil
                shutil.rmtree(model_dir)
Beispiel #9
0
    def test_nnEstimator(self):
        from bigdl.dllib.nnframes import NNModel
        linear_model = Sequential().add(Linear(2, 2))
        mse_criterion = MSECriterion()
        df, _ = self.get_estimator_df()
        est = Estimator.from_bigdl(model=linear_model,
                                   loss=mse_criterion,
                                   optimizer=Adam(),
                                   feature_preprocessing=SeqToTensor([2]),
                                   label_preprocessing=SeqToTensor([2]))
        res0 = est.predict(df)
        res0_c = res0.collect()
        est.fit(df, 2, batch_size=4)
        nn_model = NNModel(est.get_model(),
                           feature_preprocessing=SeqToTensor([2]))
        res1 = nn_model.transform(df)
        res2 = est.predict(df)
        res1_c = res1.collect()
        res2_c = res2.collect()
        assert type(res1).__name__ == 'DataFrame'
        assert type(res2).__name__ == 'DataFrame'
        assert len(res1_c) == len(res2_c)
        for idx in range(len(res1_c)):
            assert res1_c[idx]["prediction"] == res2_c[idx]["prediction"]
        with tempfile.TemporaryDirectory() as tempdirname:
            temp_path = os.path.join(tempdirname, "model")
            est.save(temp_path)
            est2 = Estimator.from_bigdl(model=linear_model, loss=mse_criterion)
            est2.load(temp_path,
                      optimizer=Adam(),
                      loss=mse_criterion,
                      feature_preprocessing=SeqToTensor([2]),
                      label_preprocessing=SeqToTensor([2]))
            est2.set_constant_gradient_clipping(0.1, 1.2)
            est2.clear_gradient_clipping()
            res3 = est2.predict(df)
            res3_c = res3.collect()
            assert type(res3).__name__ == 'DataFrame'
            assert len(res1_c) == len(res3_c)
            for idx in range(len(res1_c)):
                assert res1_c[idx]["prediction"] == res3_c[idx]["prediction"]
            est2.fit(df, 4, batch_size=4)

        data = self.sc.parallelize([((2.0, 1.0), (1.0, 2.0)),
                                    ((1.0, 2.0), (2.0, 1.0)),
                                    ((2.0, 1.0), (1.0, 2.0)),
                                    ((1.0, 2.0), (2.0, 1.0))])
        data_shard = SparkXShards(data)
        data_shard = data_shard.transform_shard(
            lambda feature_label_tuple: {
                "x":
                np.stack([
                    np.expand_dims(np.array(feature_label_tuple[0][0]), axis=0
                                   ),
                    np.expand_dims(np.array(feature_label_tuple[0][1]), axis=0)
                ],
                         axis=1),
                "y":
                np.stack([
                    np.expand_dims(np.array(feature_label_tuple[1][0]), axis=0
                                   ),
                    np.expand_dims(np.array(feature_label_tuple[1][1]), axis=0)
                ],
                         axis=1)
            })
        res4 = est.predict(data_shard)
        res4_c = res4.collect()
        assert type(res4).__name__ == 'SparkXShards'
        for idx in range(len(res4_c)):
            assert abs(res4_c[idx]["prediction"][0][0] -
                       res3_c[idx]["prediction"][0]) == 0
            assert abs(res4_c[idx]["prediction"][0][1] -
                       res3_c[idx]["prediction"][1]) == 0
        est.fit(data_shard, 1, batch_size=4)
        res5 = est.predict(data_shard)
        res5_c = res5.collect()
        res6 = est.predict(df)
        res6_c = res6.collect()
        for idx in range(len(res5_c)):
            assert abs(res5_c[idx]["prediction"][0][0] -
                       res6_c[idx]["prediction"][0]) == 0
            assert abs(res5_c[idx]["prediction"][0][1] -
                       res6_c[idx]["prediction"][1]) == 0