    def evaluate(self,
                 x=None,
                 y=None,
                 batch_per_thread=None,
                 distributed=False):
        if isinstance(x, TFDataset):
            x = _standarize_feature_label_dataset(x, self.model)
            # todo check arguments
            return self._evaluate_distributed(x)
        else:
            if distributed:
                sc = getOrCreateSparkContext()
                rdd, types, shapes = _create_rdd_x_y(
                    x, y, self.model._feed_input_names,
                    self.model._feed_output_names, sc)
                names = self.model._feed_input_names + self.model._feed_output_names
                dataset = TFDataset.from_rdd(
                    rdd,
                    names=names,
                    types=types,
                    shapes=shapes,
                    batch_per_thread=-1
                    if batch_per_thread is None else batch_per_thread)
                return self._evaluate_distributed(dataset)
            else:
                return self.model.evaluate(x=x,
                                           y=y,
                                           batch_size=batch_per_thread)
    def predict(self, x, batch_per_thread=None, distributed=False):

        if isinstance(x, TFDataset):
            # todo check arguments
            x = _standarize_feature_dataset(x, self.model)
            return self._predict_distributed(x)
        else:
            if distributed:
                sc = getOrCreateSparkContext()
                rdd, types, shapes = _create_rdd_x(
                    x, self.model._feed_input_names, sc)

                dataset = TFDataset.from_rdd(
                    rdd,
                    names=self.model._feed_input_names,
                    types=types,
                    shapes=shapes,
                    batch_per_thread=-1
                    if batch_per_thread is None else batch_per_thread)
                results = self._predict_distributed(dataset).collect()
                output_num = len(self.model.outputs)
                if output_num == 1:
                    return np.stack(results)
                else:
                    predictions = []
                    for i in range(0, output_num):
                        predictions.append(
                            np.stack([res[i] for res in results]))
                    return predictions
            else:
                return self.model.predict(x=x, batch_size=batch_per_thread)
Example #3
    def evaluate(self,
                 x=None,
                 y=None,
                 batch_per_thread=None,
                 distributed=False):
        if isinstance(x, TFDataset):
            if not x.has_batch:
                raise ValueError("The batch_per_thread of TFDataset must be " +
                                 "specified when used in KerasModel evaluate.")
            x = _standarize_feature_label_dataset(x, self.model)
            # todo check arguments
            return self._evaluate_distributed(x)
        else:
            if distributed:
                sc = getOrCreateSparkContext()
                rdd, types, shapes = _create_rdd_x_y(
                    x, y, self.model._feed_input_names,
                    self.model._feed_output_names, sc)
                names = self.model._feed_input_names + self.model._feed_output_names
                dataset = TFDataset.from_rdd(
                    rdd,
                    names=names,
                    types=types,
                    shapes=shapes,
                    batch_per_thread=-1
                    if batch_per_thread is None else batch_per_thread)
                return self._evaluate_distributed(dataset)
            else:
                return self.model.evaluate(x=x,
                                           y=y,
                                           batch_size=batch_per_thread)
Example #4
    def test_tf_optimizer_with_sparse_gradient(self):
        import tensorflow as tf

        ids = np.random.randint(0, 10, size=[40])
        labels = np.random.randint(0, 5, size=[40])
        id_rdd = self.sc.parallelize(ids)
        label_rdd = self.sc.parallelize(labels)
        training_rdd = id_rdd.zip(label_rdd).map(lambda x: [x[0], x[1]])
        with tf.Graph().as_default():
            dataset = TFDataset.from_rdd(training_rdd,
                                         names=["ids", "labels"],
                                         shapes=[[], []],
                                         types=[tf.int32, tf.int32],
                                         batch_size=8)
            id_tensor, label_tensor = dataset.tensors
            embedding_table = tf.get_variable(name="word_embedding",
                                              shape=[10, 5])

            embedding = tf.nn.embedding_lookup(embedding_table, id_tensor)
            loss = tf.reduce_mean(
                tf.losses.sparse_softmax_cross_entropy(logits=embedding,
                                                       labels=label_tensor))
            optimizer = TFOptimizer(loss, Adam(1e-3))
            optimizer.optimize(end_trigger=MaxEpoch(1))
            optimizer.sess.close()
Example #5
    def predict(self, x, batch_per_thread=None, distributed=False):
        """
        Use a model to do prediction.

        :param x: Input data. It could be:
            - a TFDataset object
            - A Numpy array (or array-like), or a list of arrays
               (in case the model has multiple inputs).
            - A dict mapping input names to the corresponding array/tensors,
            if the model has named inputs.
        :param batch_per_thread:
          The default value is 1.
          When distributed is True, the total batch size is batch_per_thread * rdd.getNumPartitions.
          When distributed is False, the total batch size is batch_per_thread * numOfCores.
        :param distributed: Boolean. Whether to do prediction in distributed mode or local mode.
                     Default is False. In local mode, x must be a Numpy array.
        """

        if isinstance(x, TFDataset):
            # todo check arguments
            if not x.has_batch:
                raise ValueError(
                    "The batch_per_thread of TFDataset" +
                    " must be specified when used in KerasModel predict.")
            if isinstance(x, TFNdarrayDataset):
                x = _standarize_feature_dataset(x, self.model)
            return self._predict_distributed(x)
        else:
            if distributed:
                sc = getOrCreateSparkContext()
                rdd, types, shapes = _create_rdd_x(
                    x, self.model._feed_input_names, sc)

                dataset = TFDataset.from_rdd(
                    rdd,
                    names=self.model._feed_input_names,
                    types=types,
                    shapes=shapes,
                    batch_per_thread=-1
                    if batch_per_thread is None else batch_per_thread)
                results = self._predict_distributed(dataset).collect()
                output_num = len(self.model.outputs)
                if output_num == 1:
                    return np.stack(results)
                else:
                    predictions = []
                    for i in range(0, output_num):
                        predictions.append(
                            np.stack([res[i] for res in results]))
                    return predictions
            else:
                return self.model.predict(x=x, batch_size=batch_per_thread)
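A brief usage sketch of the predict method above. This is hypothetical, not taken from the library source: `keras_model` stands in for a KerasModel instance and the input array is synthetic. Per the docstring, the effective global batch size is batch_per_thread * rdd.getNumPartitions in distributed mode and batch_per_thread * numOfCores in local mode.

import numpy as np

x = np.random.rand(100, 28, 28, 1).astype(np.float32)  # hypothetical input

# Local mode: delegates to self.model.predict with batch_size=batch_per_thread.
local_preds = keras_model.predict(x, batch_per_thread=16, distributed=False)

# Distributed mode: x is converted into an RDD-backed TFDataset, so the
# global batch size grows with the number of RDD partitions.
dist_preds = keras_model.predict(x, batch_per_thread=16, distributed=True)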
Example #6
    def fit(self,
            x=None,
            y=None,
            batch_size=None,
            epochs=1,
            validation_split=0.,
            validation_data=None,
            distributed=False,
            **kwargs):
        if isinstance(x, TFDataset):
            # todo check arguments
            if not x.has_batch:
                raise ValueError("The batch_size of TFDataset must be " +
                                 "specified when used in KerasModel fit.")
            x = _standarize_feature_label_dataset(x, self.model)
            self._fit_distributed(x, validation_split, epochs, **kwargs)

        elif distributed:
            sc = getOrCreateSparkContext()
            train_rdd, types, shapes = _create_rdd_x_y(
                x, y, self.model._feed_input_names,
                self.model._feed_output_names, sc)

            val_rdd = None
            if validation_data is not None:
                val_rdd, _, _ = _create_rdd_x_y(validation_data[0],
                                                validation_data[1],
                                                self.model._feed_input_names,
                                                self.model._feed_output_names,
                                                sc)
            names = self.model._feed_input_names + self.model._feed_output_names
            dataset = TFDataset.from_rdd(
                train_rdd,
                names=names,
                shapes=shapes,
                types=types,
                batch_size=batch_size if batch_size is not None else 32,
                val_rdd=val_rdd)
            self._fit_distributed(dataset, validation_split, epochs, **kwargs)

        else:
            self.model.fit(x=x,
                           y=y,
                           batch_size=batch_size,
                           epochs=epochs,
                           validation_split=validation_split,
                           validation_data=validation_data,
                           **kwargs)
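A matching sketch for the three branches of the fit method above (again hypothetical: `keras_model`, `x_train`, `y_train` and `train_dataset` are stand-ins, not names from the example):

# TFDataset branch: the dataset must carry a batch_size, otherwise fit
# raises the ValueError shown above.
keras_model.fit(train_dataset, epochs=2)

# Distributed branch: Numpy arrays are converted into an RDD-backed
# TFDataset; batch_size falls back to 32 when left unset.
keras_model.fit(x_train, y_train, batch_size=64, epochs=2, distributed=True)

# Local branch: plain in-memory Keras training.
keras_model.fit(x_train, y_train, batch_size=32, epochs=2)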
Example #7
def main(max_epoch, data_num):
    sc = init_nncontext()

    # get data, pre-process and create TFDataset
    def get_data_rdd(dataset):
        (images_data,
         labels_data) = mnist.read_data_sets("/tmp/mnist", dataset)
        image_rdd = sc.parallelize(images_data[:data_num])
        labels_rdd = sc.parallelize(labels_data[:data_num])
        rdd = image_rdd.zip(labels_rdd) \
            .map(lambda rec_tuple: [normalizer(rec_tuple[0], mnist.TRAIN_MEAN, mnist.TRAIN_STD),
                                    np.array(rec_tuple[1])])
        return rdd

    training_rdd = get_data_rdd("train")
    testing_rdd = get_data_rdd("test")
    dataset = TFDataset.from_rdd(training_rdd,
                                 names=["features", "labels"],
                                 shapes=[[28, 28, 1], []],
                                 types=[tf.float32, tf.int32],
                                 batch_size=280,
                                 val_rdd=testing_rdd)

    # construct the model from TFDataset
    images, labels = dataset.tensors

    with slim.arg_scope(lenet.lenet_arg_scope()):
        logits, end_points = lenet.lenet(images,
                                         num_classes=10,
                                         is_training=True)

    loss = tf.reduce_mean(
        tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels))

    # create an optimizer
    optimizer = TFOptimizer(loss,
                            Adam(1e-3),
                            val_outputs=[logits],
                            val_labels=[labels],
                            val_method=Top1Accuracy())
    optimizer.set_train_summary(TrainSummary("/tmp/az_lenet", "lenet"))
    optimizer.set_val_summary(ValidationSummary("/tmp/az_lenet", "lenet"))
    # kick off training
    optimizer.optimize(end_trigger=MaxEpoch(max_epoch))

    saver = tf.train.Saver()
    saver.save(optimizer.sess, "/tmp/lenet/")
Example #8
def main(data_num):

    data = Input(shape=[28, 28, 1])

    x = Flatten()(data)
    x = Dense(64, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    predictions = Dense(10, activation='softmax')(x)

    model = Model(inputs=data, outputs=predictions)

    model.load_weights("/tmp/mnist_keras.h5")

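    # DISTRIBUTED is assumed to be a module-level flag (e.g. DISTRIBUTED = True)
    # defined elsewhere in the original script.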
    if DISTRIBUTED:
        # using RDD api to do distributed evaluation
        sc = init_nncontext()
        # get data, pre-process and create TFDataset
        (images_data, labels_data) = mnist.read_data_sets("/tmp/mnist", "test")
        image_rdd = sc.parallelize(images_data[:data_num])
        labels_rdd = sc.parallelize(labels_data[:data_num])
        rdd = image_rdd.zip(labels_rdd) \
            .map(lambda rec_tuple: [normalizer(rec_tuple[0], mnist.TRAIN_MEAN, mnist.TRAIN_STD)])

        dataset = TFDataset.from_rdd(rdd,
                                     names=["features"],
                                     shapes=[[28, 28, 1]],
                                     types=[tf.float32],
                                     batch_per_thread=20)
        predictor = TFPredictor.from_keras(model, dataset)

        accuracy = predictor.predict().zip(labels_rdd).map(
            lambda x: np.argmax(x[0]) == x[1]).mean()

        print("predict accuracy is %s" % accuracy)

    else:
        # using keras api for local evaluation
        model.compile(optimizer='rmsprop',
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])

        (images_data, labels_data) = mnist.read_data_sets("/tmp/mnist", "test")
        images_data = normalizer(images_data, mnist.TRAIN_MEAN,
                                 mnist.TRAIN_STD)
        result = model.evaluate(images_data, labels_data)
        print(model.metrics_names)
        print(result)
Example #9
def main(max_epoch, data_num):
    sc = init_nncontext()

    # get data, pre-process and create TFDataset
    def get_data_rdd(dataset):
        (images_data, labels_data) = mnist.read_data_sets("/tmp/mnist", dataset)
        image_rdd = sc.parallelize(images_data[:data_num])
        labels_rdd = sc.parallelize(labels_data[:data_num])
        rdd = image_rdd.zip(labels_rdd) \
            .map(lambda rec_tuple: [normalizer(rec_tuple[0], mnist.TRAIN_MEAN, mnist.TRAIN_STD),
                                    np.array(rec_tuple[1])])
        return rdd

    training_rdd = get_data_rdd("train")
    testing_rdd = get_data_rdd("test")
    dataset = TFDataset.from_rdd(training_rdd,
                                 names=["features", "labels"],
                                 shapes=[[28, 28, 1], []],
                                 types=[tf.float32, tf.int32],
                                 batch_size=280,
                                 val_rdd=testing_rdd
                                 )

    data = Input(shape=[28, 28, 1])

    x = Flatten()(data)
    x = Dense(64, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    predictions = Dense(10, activation='softmax')(x)

    model = Model(inputs=data, outputs=predictions)

    model.compile(optimizer='rmsprop',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    optimizer = TFOptimizer.from_keras(model, dataset)

    optimizer.set_train_summary(TrainSummary("/tmp/az_lenet", "lenet"))
    optimizer.set_val_summary(ValidationSummary("/tmp/az_lenet", "lenet"))
    # kick off training
    optimizer.optimize(end_trigger=MaxEpoch(max_epoch))

    saver = tf.train.Saver()
    saver.save(optimizer.sess, "/tmp/lenet/")
Example #10
def main():

    sc = init_nncontext()

    # get data, pre-process and create TFDataset
    (images_data, labels_data) = mnist.read_data_sets("/tmp/mnist", "test")
    image_rdd = sc.parallelize(images_data)
    labels_rdd = sc.parallelize(labels_data)
    rdd = image_rdd.zip(labels_rdd) \
        .map(lambda rec_tuple: [normalizer(rec_tuple[0], mnist.TRAIN_MEAN, mnist.TRAIN_STD),
                                np.array(rec_tuple[1])])

    dataset = TFDataset.from_rdd(rdd,
                                 names=["features", "labels"],
                                 shapes=[[28, 28, 1], [1]],
                                 types=[tf.float32, tf.int32],
                                 batch_per_thread=20)

    # construct the model from TFDataset
    images, labels = dataset.tensors

    labels = tf.squeeze(labels)

    with slim.arg_scope(lenet.lenet_arg_scope()):
        logits, end_points = lenet.lenet(images,
                                         num_classes=10,
                                         is_training=False)

    predictions = tf.to_int32(tf.argmax(logits, axis=1))
    correct = tf.expand_dims(tf.to_int32(tf.equal(predictions, labels)),
                             axis=1)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, "/tmp/lenet/")

        predictor = TFPredictor(sess, [correct])

        accuracy = predictor.predict().mean()

        print("predict accuracy is %s" % accuracy)
def main():
    sc = init_nncontext()

    # get data, pre-process and create TFDataset
    (images_data, labels_data) = mnist.read_data_sets("/tmp/mnist", "train")
    image_rdd = sc.parallelize(images_data)
    labels_rdd = sc.parallelize(labels_data)
    rdd = image_rdd.zip(labels_rdd) \
        .map(lambda rec_tuple: [normalizer(rec_tuple[0], mnist.TRAIN_MEAN, mnist.TRAIN_STD),
                                np.array(rec_tuple[1])])

    dataset = TFDataset.from_rdd(rdd,
                                 names=["features", "labels"],
                                 shapes=[(None, 28, 28, 1), (None, 1)],
                                 types=[tf.float32, tf.int32]
                                 )

    # construct the model from TFDataset
    images, labels = dataset.inputs

    labels = tf.squeeze(labels)

    with slim.arg_scope(lenet.lenet_arg_scope()):
        logits, end_points = lenet.lenet(images, num_classes=10, is_training=True)

    loss = tf.reduce_mean(tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels))

    # create an optimizer
    optimizer = TFOptimizer(loss, Adam(1e-3))
    # kick off training
    # you may change MaxIteration(20) below to MaxEpoch(5) to let the model converge
    optimizer.optimize(end_trigger=MaxIteration(20), batch_size=280)

    # evaluate
    (images_data, labels_data) = mnist.read_data_sets("/tmp/mnist", "test")
    images_data = normalizer(images_data, mnist.TRAIN_MEAN, mnist.TRAIN_STD)
    predictions = tf.argmax(logits, axis=1)
    predictions_data, loss_value = optimizer.sess.run([predictions, loss],
                                                      feed_dict={images: images_data,
                                                                 labels: labels_data})
    print(np.mean(np.equal(predictions_data, labels_data)))
    print(loss_value)
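As the comment above suggests, training for full epochs instead of a fixed iteration count is a one-line change (a sketch; MaxEpoch comes from the same optimizer package as MaxIteration):

# Train for 5 full epochs instead of 20 iterations.
optimizer.optimize(end_trigger=MaxEpoch(5), batch_size=280)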
Example #12
def main():
    sc = init_nncontext()

    # get data, pre-process and create TFDataset
    (images_data, labels_data) = mnist.read_data_sets("/tmp/mnist", "train")
    image_rdd = sc.parallelize(images_data)
    labels_rdd = sc.parallelize(labels_data)
    rdd = image_rdd.zip(labels_rdd) \
        .map(lambda rec_tuple: [normalizer(rec_tuple[0], mnist.TRAIN_MEAN, mnist.TRAIN_STD),
                                np.array(rec_tuple[1])])

    dataset = TFDataset.from_rdd(rdd,
                                 names=["features", "labels"],
                                 shapes=[[28, 28, 1], [1]],
                                 types=[tf.float32, tf.int32],
                                 batch_size=280)

    # construct the model from TFDataset
    images, labels = dataset.tensors

    labels = tf.squeeze(labels)

    with slim.arg_scope(lenet.lenet_arg_scope()):
        logits, end_points = lenet.lenet(images,
                                         num_classes=10,
                                         is_training=True)

    loss = tf.reduce_mean(
        tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels))

    # create an optimizer
    optimizer = TFOptimizer(loss, Adam(1e-3))
    optimizer.set_train_summary(TrainSummary("/tmp/az_lenet", "lenet"))
    # kick off training
    for i in range(5):
        optimizer.optimize(end_trigger=MaxEpoch(i + 1))

    saver = tf.train.Saver()
    saver.save(optimizer.sess, "/tmp/lenet/")
Example #13
    def test_tf_optimizer_with_sparse_gradient_using_keras(self):
        import tensorflow as tf

        ids = np.random.randint(0, 10, size=[40])
        labels = np.random.randint(0, 5, size=[40])
        id_rdd = self.sc.parallelize(ids)
        label_rdd = self.sc.parallelize(labels)
        training_rdd = id_rdd.zip(label_rdd).map(lambda x: [x[0], x[1]])
        with tf.Graph().as_default():
            dataset = TFDataset.from_rdd(training_rdd,
                                         names=["ids", "labels"],
                                         shapes=[[], []],
                                         types=[tf.int32, tf.int32],
                                         batch_size=8)
            from tensorflow.python.ops import variable_scope

            def variable_creator(**kwargs):
                kwargs["use_resource"] = False
                return variable_scope.default_variable_creator(None, **kwargs)

            getter = lambda next_creator, **kwargs: variable_creator(**kwargs)
            with variable_scope.variable_creator_scope(getter):
                words_input = tf.keras.layers.Input(shape=(),
                                                    name='words_input')
                embedding_layer = tf.keras.layers.Embedding(
                    input_dim=10, output_dim=5, name='word_embedding')
                word_embeddings = embedding_layer(words_input)
                embedding = tf.keras.layers.Flatten()(word_embeddings)
                output = tf.keras.layers.Dense(5,
                                               activation="softmax")(embedding)
                model = tf.keras.models.Model(inputs=[words_input],
                                              outputs=[output])
                model.compile(optimizer="sgd",
                              loss="sparse_categorical_crossentropy")

            optimizer = TFOptimizer.from_keras(model, dataset)
            optimizer.optimize(end_trigger=MaxEpoch(1))
            optimizer.sess.close()
Example #14
# In[10]:

# Let's have a look at one element of trainRDD
trainRDD.take(1)

# We can see that `features` now consists of the list of 801 particles with 19 features each (`shape=[801, 19]`), plus the HLF (`shape=[14]`) and the encoded label (`shape=[3]`).

# In[11]:

from zoo.pipeline.api.net import TFDataset
from zoo.tfpark.model import KerasModel

# create TFDataset for TF training
dataset = TFDataset.from_rdd(trainRDD,
                             features=[(tf.float32, [801, 19]),
                                       (tf.float32, [14])],
                             labels=(tf.float32, [3]),
                             batch_size=256,
                             val_rdd=testRDD)

# ## Optimizer setup and training

# In[12]:

# Set of hyperparameters
numEpochs = 8

# The batch size used by BigDL must be a multiple of numExecutors * executorCores,
# because the data will be equally distributed across the executors.

workerBatch = 64
# numExecutors = int(sc._conf.get('spark.executor.instances'))
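# A hedged sketch of how the rule above could be applied; the original cell
# is truncated, and the conf keys below follow the commented line and the
# standard Spark property names.
numExecutors = int(sc._conf.get('spark.executor.instances'))
executorCores = int(sc._conf.get('spark.executor.cores'))
# Global batch size: a multiple of numExecutors * executorCores.
batchSize = workerBatch * numExecutors * executorCores
# e.g. workerBatch=64 with 2 executors of 2 cores each gives 64 * 2 * 2 = 256,
# matching the batch_size=256 passed to TFDataset.from_rdd above.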
Example #15
        output_dim=FC_LINEAR_DIMENSION,  # size: 32 -> 64.
        activation="sigmoid"))

# BigDL does not support parameter sharing, so this is a workaround.
both_feature = TimeDistributed(layer=convolve_net,
                               input_shape=input_shape)(both_input)

encode_left = both_feature.index_select(1, 0)
encode_right = both_feature.index_select(1, 1)

distance = autograd.abs(encode_left - encode_right)
predict = Dense(output_dim=NUM_CLASS_LABEL, activation="sigmoid")(distance)

siamese_net = Model(input=both_input, output=predict)
siamese_net.compile(optimizer="adam",
                    loss='sparse_categorical_crossentropy',
                    metrics=["accuracy"])

# Construct the distributed dataset object.
data_set = TFDataset.from_rdd(train_rdd,
                              shapes=[input_shape, [1]],
                              batch_size=args.batch_size,
                              val_rdd=test_rdd)

optimizer = TFOptimizer.from_keras(siamese_net, data_set)
app_name = "Siamese Network"

optimizer.set_train_summary(TrainSummary("tmp", app_name))
optimizer.set_val_summary(ValidationSummary("tmp", app_name))
optimizer.optimize(end_trigger=MaxEpoch(args.num_epoch))