Example #1
    def fit(self,
            x=None,
            y=None,
            batch_size=None,
            epochs=1,
            validation_split=0.,
            validation_data=None,
            distributed=False,
            **kwargs):
        if isinstance(x, TFDataset):
            # todo check arguments
            if not x.has_batch:
                raise ValueError("The batch_size of TFDataset must be " +
                                 "specified when used in KerasModel fit.")
            if isinstance(x, TFNdarrayDataset):
                x = _standarize_feature_label_dataset(x, self.model)
            self._fit_distributed(x, validation_split, epochs, **kwargs)

        elif distributed:
            dataset = TFDataset.from_ndarrays((x, y),
                                              val_tensors=validation_data)
            self._fit_distributed(dataset, validation_split, epochs, **kwargs)

        else:
            self.model.fit(x=x,
                           y=y,
                           batch_size=batch_size,
                           epochs=epochs,
                           validation_split=validation_split,
                           validation_data=validation_data,
                           **kwargs)
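Usage sketch (not part of the scraped sources above): a minimal call of this fit on a TFDataset. It assumes KerasModel wraps a compiled tf.keras model via KerasModel(keras_model) (consistent with the import in Example #22), that TFDataset.from_ndarrays accepts a batch_size argument analogous to from_rdd, and that an NNContext is initialized; names and shapes are illustrative only.

import numpy as np
import tensorflow as tf
from zoo.common.nncontext import init_nncontext
from zoo.tfpark.model import KerasModel
from zoo.pipeline.api.net import TFDataset

sc = init_nncontext()

# Tiny illustrative model; any compiled tf.keras model would do.
inp = tf.keras.layers.Input(shape=(4,))
out = tf.keras.layers.Dense(2, activation="softmax")(inp)
knet = tf.keras.models.Model(inp, out)
knet.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy")
model = KerasModel(knet)

x_train = np.random.rand(16, 4).astype(np.float32)
y_train = np.random.randint(0, 2, size=(16,))

# batch_size has to be set on the TFDataset itself; otherwise has_batch is
# False and fit raises the ValueError shown above.
dataset = TFDataset.from_ndarrays((x_train, y_train), batch_size=8)
model.fit(dataset, epochs=1)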
Example #2
 def evaluate(self,
              x=None,
              y=None,
              batch_per_thread=None,
              distributed=False):
     if isinstance(x, TFDataset):
         x = _standarize_feature_label_dataset(x, self.model)
         # todo check arguments
         return self._evaluate_distributed(x)
     else:
         if distributed:
             sc = getOrCreateSparkContext()
             rdd, types, shapes = _create_rdd_x_y(
                 x, y, self.model._feed_input_names,
                 self.model._feed_output_names, sc)
             names = self.model._feed_input_names + self.model._feed_output_names
             dataset = TFDataset.from_rdd(
                 rdd,
                 names=names,
                 types=types,
                 shapes=shapes,
                 batch_per_thread=-1
                 if batch_per_thread is None else batch_per_thread)
             return self._evaluate_distributed(dataset)
         else:
             return self.model.evaluate(x=x,
                                        y=y,
                                        batch_size=batch_per_thread)
Example #3
    def test_tf_optimizer_with_sparse_gradient(self):
        import tensorflow as tf

        ids = np.random.randint(0, 10, size=[40])
        labels = np.random.randint(0, 5, size=[40])
        id_rdd = self.sc.parallelize(ids)
        label_rdd = self.sc.parallelize(labels)
        training_rdd = id_rdd.zip(label_rdd).map(lambda x: [x[0], x[1]])
        with tf.Graph().as_default():
            dataset = TFDataset.from_rdd(training_rdd,
                                         names=["ids", "labels"],
                                         shapes=[[], []],
                                         types=[tf.int32, tf.int32],
                                         batch_size=8)
            id_tensor, label_tensor = dataset.tensors
            embedding_table = tf.get_variable(name="word_embedding",
                                              shape=[10, 5])

            embedding = tf.nn.embedding_lookup(embedding_table, id_tensor)
            loss = tf.reduce_mean(
                tf.losses.sparse_softmax_cross_entropy(logits=embedding,
                                                       labels=label_tensor))
            optimizer = TFOptimizer(loss, Adam(1e-3))
            optimizer.optimize(end_trigger=MaxEpoch(1))
            optimizer.sess.close()
Example #4
 def evaluate(self,
              x=None,
              y=None,
              batch_per_thread=None,
              distributed=False):
     if isinstance(x, TFDataset):
         if not x.has_batch:
             raise ValueError("The batch_per_thread of TFDataset must be " +
                              "specified when used in KerasModel evaluate.")
         x = _standarize_feature_label_dataset(x, self.model)
         # todo check arguments
         return self._evaluate_distributed(x)
     else:
         if distributed:
             sc = getOrCreateSparkContext()
             rdd, types, shapes = _create_rdd_x_y(
                 x, y, self.model._feed_input_names,
                 self.model._feed_output_names, sc)
             names = self.model._feed_input_names + self.model._feed_output_names
             dataset = TFDataset.from_rdd(
                 rdd,
                 names=names,
                 types=types,
                 shapes=shapes,
                 batch_per_thread=-1
                 if batch_per_thread is None else batch_per_thread)
             return self._evaluate_distributed(dataset)
         else:
             return self.model.evaluate(x=x,
                                        y=y,
                                        batch_size=batch_per_thread)
Example #5
    def predict(self, x, batch_per_thread=None, distributed=False):

        if isinstance(x, TFDataset):
            # todo check arguments
            x = _standarize_feature_dataset(x, self.model)
            return self._predict_distributed(x)
        else:
            if distributed:
                sc = getOrCreateSparkContext()
                rdd, types, shapes = _create_rdd_x(
                    x, self.model._feed_input_names, sc)

                dataset = TFDataset.from_rdd(
                    rdd,
                    names=self.model._feed_input_names,
                    types=types,
                    shapes=shapes,
                    batch_per_thread=-1
                    if batch_per_thread is None else batch_per_thread)
                results = self._predict_distributed(dataset).collect()
                output_num = len(self.model.outputs)
                if output_num == 1:
                    return np.stack(results)
                else:
                    predictions = []
                    for i in range(0, output_num):
                        predictions.append(
                            np.stack([res[i] for res in results]))
                    return predictions
            else:
                return self.model.predict(x=x, batch_size=batch_per_thread)
Example #6
def _standarize_feature_label_dataset(dataset, model):
    input_names = model.input_names
    output_names = model.output_names
    rdd = dataset.rdd.map(lambda x: (x[0], _process_labels(x[1])))\
        .map(lambda sample: _training_reorder(sample, input_names, output_names))
    if dataset.val_rdd is not None:
        val_rdd = dataset.val_rdd.map(lambda x: (x[0], _process_labels(x[1])))\
            .map(lambda sample: _training_reorder(sample, input_names, output_names))
    else:
        val_rdd = None
    tensor_structure = _training_reorder(dataset.tensor_structure, input_names,
                                         output_names)
    new_dataset = TFDataset(rdd, tensor_structure, dataset.batch_size, -1,
                            dataset.hard_code_batch_size, val_rdd)
    new_dataset.batch_per_thread = dataset.batch_per_thread
    return new_dataset
Example #7
def _standarize_feature_dataset(dataset, model):
    input_names = model.input_names
    rdd = dataset.rdd.map(lambda sample: _reorder(sample, input_names))
    feature_schema = _reorder(dataset.tensor_structure[0], input_names)

    dataset = TFDataset(rdd, feature_schema, dataset.batch_size, -1,
                        dataset.hard_code_batch_size)
    return dataset
Example #8
 def test_tf_net_predict_dataset(self):
     resource_path = os.path.join(os.path.split(__file__)[0], "../../resources")
     tfnet_path = os.path.join(resource_path, "tfnet")
     net = TFNet.from_export_folder(tfnet_path)
     dataset = TFDataset.from_ndarrays((np.random.rand(16, 4),))
     output = net.predict(dataset)
     output = np.stack(output.collect())
     assert output.shape == (16, 2)
Example #9
def _standarize_feature_label_dataset(dataset, model):
    input_names = model.input_names
    output_names = model.output_names

    def _process_labels(ys):
        if isinstance(ys, dict):
            return {
                k: np.expand_dims(y, axis=1) if y.ndim == 0 else y
                for k, y in ys.items()
            }
        elif isinstance(ys, list):
            return [
                np.expand_dims(y, axis=1) if y.ndim == 0 else y for y in ys
            ]
        else:
            return np.expand_dims(ys, axis=1) if ys.ndim == 0 else ys

    def _training_reorder(x, input_names, output_names):
        assert isinstance(x, tuple)

        return _reorder(x[0], input_names) + _reorder(x[1], output_names)

    def _reorder(x, names):
        if isinstance(x, dict):
            return [x[name] for name in names]
        elif isinstance(x, list):
            return x
        else:
            return [x]

    rdd = dataset.rdd.map(lambda x: (x[0], _process_labels(x[1])))\
        .map(lambda sample: _training_reorder(sample, input_names, output_names))
    if dataset.val_rdd is not None:
        val_rdd = dataset.val_rdd.map(lambda x: (x[0], _process_labels(x[1])))\
            .map(lambda sample: _training_reorder(sample, input_names, output_names))
    else:
        val_rdd = None
    tensor_structure = _training_reorder(dataset.tensor_structure, input_names,
                                         output_names)
    new_dataset = TFDataset(rdd, tensor_structure, dataset.batch_size, -1,
                            dataset.hard_code_batch_size, val_rdd)
    new_dataset.batch_per_thread = dataset.batch_per_thread
    return new_dataset
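To make the reordering above concrete, here is a small standalone illustration of the same _reorder logic applied to a dict-style sample; the input/output names and values are made up.

def _reorder(x, names):
    # Identical to the helper above: dicts are flattened into a list that
    # follows the model's name order, lists pass through, scalars are wrapped.
    if isinstance(x, dict):
        return [x[name] for name in names]
    elif isinstance(x, list):
        return x
    else:
        return [x]

input_names = ["ids", "mask"]   # hypothetical model.input_names
output_names = ["label"]        # hypothetical model.output_names

sample = ({"mask": 1, "ids": 7}, {"label": 3})
flat = _reorder(sample[0], input_names) + _reorder(sample[1], output_names)
print(flat)  # [7, 1, 3] -- inputs first, in input_names order, then outputs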
Example #10
    def predict(self, x, batch_per_thread=None, distributed=False):
        """
        Use a model to do prediction.

        :param x: Input data. It could be:
            - a TFDataset object
            - A Numpy array (or array-like), or a list of arrays
               (in case the model has multiple inputs).
            - A dict mapping input names to the corresponding array/tensors,
            if the model has named inputs.
        :param batch_per_thread:
          The default value is 1.
          When distributed is True, the total batch size is batch_per_thread * rdd.getNumPartitions.
          When distributed is False, the total batch size is batch_per_thread * numOfCores.
        :param distributed: Boolean. Whether to do prediction in distributed mode or local mode.
                     Default is False. In local mode, x must be a Numpy array.
        """

        if isinstance(x, TFDataset):
            # todo check arguments
            if not x.has_batch:
                raise ValueError(
                    "The batch_per_thread of TFDataset" +
                    " must be specified when used in KerasModel predict.")
            if isinstance(x, TFNdarrayDataset):
                x = _standarize_feature_dataset(x, self.model)
            return self._predict_distributed(x)
        else:
            if distributed:
                sc = getOrCreateSparkContext()
                rdd, types, shapes = _create_rdd_x(
                    x, self.model._feed_input_names, sc)

                dataset = TFDataset.from_rdd(
                    rdd,
                    names=self.model._feed_input_names,
                    types=types,
                    shapes=shapes,
                    batch_per_thread=-1
                    if batch_per_thread is None else batch_per_thread)
                results = self._predict_distributed(dataset).collect()
                output_num = len(self.model.outputs)
                if output_num == 1:
                    return np.stack(results)
                else:
                    predictions = []
                    for i in range(0, output_num):
                        predictions.append(
                            np.stack([res[i] for res in results]))
                    return predictions
            else:
                return self.model.predict(x=x, batch_size=batch_per_thread)
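Usage sketch for the two predict paths above, reusing the illustrative KerasModel wrapper `model` and the Spark context from the sketch after Example #1; the (16, 4) input mirrors Example #8 and is not from any real dataset.

x_test = np.random.rand(16, 4).astype(np.float32)

# Path 1: pass a TFDataset. batch_per_thread must already be set on the
# dataset (otherwise the ValueError above is raised), and the result is an
# RDD of per-sample predictions.
dataset = TFDataset.from_ndarrays((x_test,), batch_per_thread=4)
pred_rdd = model.predict(dataset)
print(np.stack(pred_rdd.collect()).shape)  # expected (16, 2)

# Path 2: pass a Numpy array with distributed=True; the RDD-backed TFDataset
# is built internally and a stacked Numpy array comes back.
predictions = model.predict(x_test, batch_per_thread=4, distributed=True)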
Example #11
    def fit(self,
            x=None,
            y=None,
            batch_size=None,
            epochs=1,
            validation_split=0.,
            validation_data=None,
            distributed=False,
            **kwargs):
        if isinstance(x, TFDataset):
            # todo check arguments
            if not x.has_batch:
                raise ValueError("The batch_size of TFDataset must be " +
                                 "specified when used in KerasModel fit.")
            x = _standarize_feature_label_dataset(x, self.model)
            self._fit_distributed(x, validation_split, epochs, **kwargs)

        elif distributed:
            sc = getOrCreateSparkContext()
            train_rdd, types, shapes = _create_rdd_x_y(
                x, y, self.model._feed_input_names,
                self.model._feed_output_names, sc)

            val_rdd = None
            if validation_data is not None:
                val_rdd, _, _ = _create_rdd_x_y(validation_data[0],
                                                validation_data[1],
                                                self.model._feed_input_names,
                                                self.model._feed_output_names,
                                                sc)
            names = self.model._feed_input_names + self.model._feed_output_names
            dataset = TFDataset.from_rdd(
                train_rdd,
                names=names,
                shapes=shapes,
                types=types,
                batch_size=batch_size if batch_size is not None else 32,
                val_rdd=val_rdd)
            self._fit_distributed(dataset, validation_split, epochs, **kwargs)

        else:
            self.model.fit(x=x,
                           y=y,
                           batch_size=batch_size,
                           epochs=epochs,
                           validation_split=validation_split,
                           validation_data=validation_data,
                           **kwargs)
Example #12
def main(max_epoch, data_num):
    sc = init_nncontext()

    # get data, pre-process and create TFDataset
    def get_data_rdd(dataset):
        (images_data,
         labels_data) = mnist.read_data_sets("/tmp/mnist", dataset)
        image_rdd = sc.parallelize(images_data[:data_num])
        labels_rdd = sc.parallelize(labels_data[:data_num])
        rdd = image_rdd.zip(labels_rdd) \
            .map(lambda rec_tuple: [normalizer(rec_tuple[0], mnist.TRAIN_MEAN, mnist.TRAIN_STD),
                                    np.array(rec_tuple[1])])
        return rdd

    training_rdd = get_data_rdd("train")
    testing_rdd = get_data_rdd("test")
    dataset = TFDataset.from_rdd(training_rdd,
                                 names=["features", "labels"],
                                 shapes=[[28, 28, 1], []],
                                 types=[tf.float32, tf.int32],
                                 batch_size=280,
                                 val_rdd=testing_rdd)

    # construct the model from TFDataset
    images, labels = dataset.tensors

    with slim.arg_scope(lenet.lenet_arg_scope()):
        logits, end_points = lenet.lenet(images,
                                         num_classes=10,
                                         is_training=True)

    loss = tf.reduce_mean(
        tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels))

    # create an optimizer
    optimizer = TFOptimizer(loss,
                            Adam(1e-3),
                            val_outputs=[logits],
                            val_labels=[labels],
                            val_method=Top1Accuracy())
    optimizer.set_train_summary(TrainSummary("/tmp/az_lenet", "lenet"))
    optimizer.set_val_summary(ValidationSummary("/tmp/az_lenet", "lenet"))
    # kick off training
    optimizer.optimize(end_trigger=MaxEpoch(max_epoch))

    saver = tf.train.Saver()
    saver.save(optimizer.sess, "/tmp/lenet/")
Example #13
def main(data_num):

    data = Input(shape=[28, 28, 1])

    x = Flatten()(data)
    x = Dense(64, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    predictions = Dense(10, activation='softmax')(x)

    model = Model(inputs=data, outputs=predictions)

    model.load_weights("/tmp/mnist_keras.h5")

    if DISTRIBUTED:
        # using RDD api to do distributed evaluation
        sc = init_nncontext()
        # get data, pre-process and create TFDataset
        (images_data, labels_data) = mnist.read_data_sets("/tmp/mnist", "test")
        image_rdd = sc.parallelize(images_data[:data_num])
        labels_rdd = sc.parallelize(labels_data[:data_num])
        rdd = image_rdd.zip(labels_rdd) \
            .map(lambda rec_tuple: [normalizer(rec_tuple[0], mnist.TRAIN_MEAN, mnist.TRAIN_STD)])

        dataset = TFDataset.from_rdd(rdd,
                                     names=["features"],
                                     shapes=[[28, 28, 1]],
                                     types=[tf.float32],
                                     batch_per_thread=20)
        predictor = TFPredictor.from_keras(model, dataset)

        accuracy = predictor.predict().zip(labels_rdd).map(
            lambda x: np.argmax(x[0]) == x[1]).mean()

        print("predict accuracy is %s" % accuracy)

    else:
        # using keras api for local evaluation
        model.compile(optimizer='rmsprop',
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])

        (images_data, labels_data) = mnist.read_data_sets("/tmp/mnist", "test")
        images_data = normalizer(images_data, mnist.TRAIN_MEAN,
                                 mnist.TRAIN_STD)
        result = model.evaluate(images_data, labels_data)
        print(model.metrics_names)
        print(result)
Example #14
def _standarize_feature_dataset(dataset, model):
    input_names = model.input_names

    def _reorder(x, names):
        if isinstance(x, dict):
            return [x[name] for name in names]
        elif isinstance(x, list):
            return x
        else:
            return [x]

    rdd = dataset.rdd.map(lambda sample: _reorder(sample, input_names))
    feature_schema = _reorder(dataset.tensor_structure[0], input_names)

    dataset = TFDataset(rdd, feature_schema, dataset.batch_size, -1,
                        dataset.hard_code_batch_size)
    return dataset
Example #15
def main(max_epoch, data_num):
    sc = init_nncontext()

    # get data, pre-process and create TFDataset
    def get_data_rdd(dataset):
        (images_data, labels_data) = mnist.read_data_sets("/tmp/mnist", dataset)
        image_rdd = sc.parallelize(images_data[:data_num])
        labels_rdd = sc.parallelize(labels_data[:data_num])
        rdd = image_rdd.zip(labels_rdd) \
            .map(lambda rec_tuple: [normalizer(rec_tuple[0], mnist.TRAIN_MEAN, mnist.TRAIN_STD),
                                    np.array(rec_tuple[1])])
        return rdd

    training_rdd = get_data_rdd("train")
    testing_rdd = get_data_rdd("test")
    dataset = TFDataset.from_rdd(training_rdd,
                                 names=["features", "labels"],
                                 shapes=[[28, 28, 1], []],
                                 types=[tf.float32, tf.int32],
                                 batch_size=280,
                                 val_rdd=testing_rdd
                                 )

    data = Input(shape=[28, 28, 1])

    x = Flatten()(data)
    x = Dense(64, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    predictions = Dense(10, activation='softmax')(x)

    model = Model(input=data, output=predictions)

    model.compile(optimizer='rmsprop',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    optimizer = TFOptimizer.from_keras(model, dataset)

    optimizer.set_train_summary(TrainSummary("/tmp/az_lenet", "lenet"))
    optimizer.set_val_summary(ValidationSummary("/tmp/az_lenet", "lenet"))
    # kick off training
    optimizer.optimize(end_trigger=MaxEpoch(max_epoch))

    saver = tf.train.Saver()
    saver.save(optimizer.sess, "/tmp/lenet/")
Example #16
    def evaluate(self,
                 x=None,
                 y=None,
                 batch_per_thread=None,
                 distributed=False):
        """
        Evaluate a model on a given dataset

        :param x: Input data. It could be:
            - a TFDataset object
            - A Numpy array (or array-like), or a list of arrays
               (in case the model has multiple inputs).
            - A dict mapping input names to the corresponding array/tensors,
            if the model has named inputs.
        :param y: Target data. Like the input data `x`,
          it should be consistent with `x` (you cannot have Numpy inputs and
          tensor targets, or vice versa). If `x` is a TFDataset, `y` should
          not be specified (since targets will be obtained from `x`).
        :param batch_per_thread:
          The default value is 1.
          When distributed is True, the total batch size is batch_per_thread * rdd.getNumPartitions.
          When distributed is False, the total batch size is batch_per_thread * numOfCores.
        :param distributed: Boolean. Whether to do evaluation in distributed mode or local mode.
                     Default is False. In local mode, x must be a Numpy array.
        """
        if isinstance(x, TFDataset):
            if not x.has_batch:
                raise ValueError("The batch_per_thread of TFDataset must be " +
                                 "specified when used in KerasModel evaluate.")
            if isinstance(x, TFNdarrayDataset):
                x = _standarize_feature_label_dataset(x, self.model)
            # todo check arguments
            return self._evaluate_distributed(x)
        else:
            if distributed:
                dataset = TFDataset.from_ndarrays(
                    (x, y),
                    batch_per_thread=-1
                    if batch_per_thread is None else batch_per_thread)
                return self._evaluate_distributed(dataset)
            else:
                return self.model.evaluate(x=x,
                                           y=y,
                                           batch_size=batch_per_thread)
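A sketch of the distributed Numpy branch of this evaluate, again reusing `model` from the sketch after Example #1; shapes are illustrative.

x_val = np.random.rand(16, 4).astype(np.float32)
y_val = np.random.randint(0, 2, size=(16,))

# distributed=True wraps (x_val, y_val) into TFDataset.from_ndarrays as above.
# Per the docstring, the total batch size is batch_per_thread *
# rdd.getNumPartitions when distributed is True, and batch_per_thread *
# numOfCores when it is False.
metrics = model.evaluate(x_val, y_val, batch_per_thread=4, distributed=True)
print(metrics)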
Example #17
def main():

    sc = init_nncontext()

    # get data, pre-process and create TFDataset
    (images_data, labels_data) = mnist.read_data_sets("/tmp/mnist", "test")
    image_rdd = sc.parallelize(images_data)
    labels_rdd = sc.parallelize(labels_data)
    rdd = image_rdd.zip(labels_rdd) \
        .map(lambda rec_tuple: [normalizer(rec_tuple[0], mnist.TRAIN_MEAN, mnist.TRAIN_STD),
                                np.array(rec_tuple[1])])

    dataset = TFDataset.from_rdd(rdd,
                                 names=["features", "labels"],
                                 shapes=[[28, 28, 1], [1]],
                                 types=[tf.float32, tf.int32],
                                 batch_per_thread=20)

    # construct the model from TFDataset
    images, labels = dataset.tensors

    labels = tf.squeeze(labels)

    with slim.arg_scope(lenet.lenet_arg_scope()):
        logits, end_points = lenet.lenet(images,
                                         num_classes=10,
                                         is_training=False)

    predictions = tf.to_int32(tf.argmax(logits, axis=1))
    correct = tf.expand_dims(tf.to_int32(tf.equal(predictions, labels)),
                             axis=1)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, "/tmp/lenet/")

        predictor = TFPredictor(sess, [correct])

        accuracy = predictor.predict().mean()

        print("predict accuracy is %s" % accuracy)
Example #18
def main():
    sc = init_nncontext()

    # get data, pre-process and create TFDataset
    (images_data, labels_data) = mnist.read_data_sets("/tmp/mnist", "train")
    image_rdd = sc.parallelize(images_data)
    labels_rdd = sc.parallelize(labels_data)
    rdd = image_rdd.zip(labels_rdd) \
        .map(lambda rec_tuple: [normalizer(rec_tuple[0], mnist.TRAIN_MEAN, mnist.TRAIN_STD),
                                np.array(rec_tuple[1])])

    dataset = TFDataset.from_rdd(rdd,
                                 names=["features", "labels"],
                                 shapes=[(None, 28, 28, 1), (None, 1)],
                                 types=[tf.float32, tf.int32]
                                 )

    # construct the model from TFDataset
    images, labels = dataset.inputs

    labels = tf.squeeze(labels)

    with slim.arg_scope(lenet.lenet_arg_scope()):
        logits, end_points = lenet.lenet(images, num_classes=10, is_training=True)

    loss = tf.reduce_mean(tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels))

    # create an optimizer
    optimizer = TFOptimizer(loss, Adam(1e-3))
    # kick off training
    # you may change the MaxIteration to MaxEpoch(5) to make it converge
    optimizer.optimize(end_trigger=MaxIteration(20), batch_size=280)

    # evaluate
    (images_data, labels_data) = mnist.read_data_sets("/tmp/mnist", "test")
    images_data = normalizer(images_data, mnist.TRAIN_MEAN, mnist.TRAIN_STD)
    predictions = tf.argmax(logits, axis=1)
    predictions_data, loss_value = optimizer.sess.run([predictions, loss],
                                                      feed_dict={images: images_data,
                                                                 labels: labels_data})
    print(np.mean(np.equal(predictions_data, labels_data)))
    print(loss_value)
Example #19
def main():
    sc = init_nncontext()

    # get data, pre-process and create TFDataset
    (images_data, labels_data) = mnist.read_data_sets("/tmp/mnist", "train")
    image_rdd = sc.parallelize(images_data)
    labels_rdd = sc.parallelize(labels_data)
    rdd = image_rdd.zip(labels_rdd) \
        .map(lambda rec_tuple: [normalizer(rec_tuple[0], mnist.TRAIN_MEAN, mnist.TRAIN_STD),
                                np.array(rec_tuple[1])])

    dataset = TFDataset.from_rdd(rdd,
                                 names=["features", "labels"],
                                 shapes=[[28, 28, 1], [1]],
                                 types=[tf.float32, tf.int32],
                                 batch_size=280)

    # construct the model from TFDataset
    images, labels = dataset.tensors

    labels = tf.squeeze(labels)

    with slim.arg_scope(lenet.lenet_arg_scope()):
        logits, end_points = lenet.lenet(images,
                                         num_classes=10,
                                         is_training=True)

    loss = tf.reduce_mean(
        tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels))

    # create an optimizer
    optimizer = TFOptimizer(loss, Adam(1e-3))
    optimizer.set_train_summary(TrainSummary("/tmp/az_lenet", "lenet"))
    # kick off training
    for i in range(5):
        optimizer.optimize(end_trigger=MaxEpoch(i + 1))

    saver = tf.train.Saver()
    saver.save(optimizer.sess, "/tmp/lenet/")
Example #20
    def test_tf_optimizer_with_sparse_gradient_using_keras(self):
        import tensorflow as tf

        ids = np.random.randint(0, 10, size=[40])
        labels = np.random.randint(0, 5, size=[40])
        id_rdd = self.sc.parallelize(ids)
        label_rdd = self.sc.parallelize(labels)
        training_rdd = id_rdd.zip(label_rdd).map(lambda x: [x[0], x[1]])
        with tf.Graph().as_default():
            dataset = TFDataset.from_rdd(training_rdd,
                                         names=["ids", "labels"],
                                         shapes=[[], []],
                                         types=[tf.int32, tf.int32],
                                         batch_size=8)
            from tensorflow.python.ops import variable_scope

            def variable_creator(**kwargs):
                kwargs["use_resource"] = False
                return variable_scope.default_variable_creator(None, **kwargs)

            getter = lambda next_creator, **kwargs: variable_creator(**kwargs)
            with variable_scope.variable_creator_scope(getter):
                words_input = tf.keras.layers.Input(shape=(),
                                                    name='words_input')
                embedding_layer = tf.keras.layers.Embedding(
                    input_dim=10, output_dim=5, name='word_embedding')
                word_embeddings = embedding_layer(words_input)
                embedding = tf.keras.layers.Flatten()(word_embeddings)
                output = tf.keras.layers.Dense(5,
                                               activation="softmax")(embedding)
                model = tf.keras.models.Model(inputs=[words_input],
                                              outputs=[output])
                model.compile(optimizer="sgd",
                              loss="sparse_categorical_crossentropy")

            optimizer = TFOptimizer.from_keras(model, dataset)
            optimizer.optimize(end_trigger=MaxEpoch(1))
            optimizer.sess.close()
Example #21
 def evaluate(self,
              x=None,
              y=None,
              batch_per_thread=None,
              distributed=False):
     if isinstance(x, TFDataset):
         if not x.has_batch:
             raise ValueError("The batch_per_thread of TFDataset must be " +
                              "specified when used in KerasModel evaluate.")
         if isinstance(x, TFNdarrayDataset):
             x = _standarize_feature_label_dataset(x, self.model)
         # todo check arguments
         return self._evaluate_distributed(x)
     else:
         if distributed:
             dataset = TFDataset.from_ndarrays(
                 (x, y),
                 batch_per_thread=-1
                 if batch_per_thread is None else batch_per_thread)
             return self._evaluate_distributed(dataset)
         else:
             return self.model.evaluate(x=x,
                                        y=y,
                                        batch_size=batch_per_thread)
Example #22
# In[10]:

# Let's have a look at one element of trainRDD
trainRDD.take(1)

# We can see that `features` is now composed of the list of 801 particles with 19 features each (`shape=[801, 19]`) plus the HLF (`shape=[14]`) and the encoded label (`shape=[3]`).

# In[11]:

from zoo.pipeline.api.net import TFDataset
from zoo.tfpark.model import KerasModel

# create TFDataset for TF training
dataset = TFDataset.from_rdd(trainRDD,
                             features=[(tf.float32, [801, 19]),
                                       (tf.float32, [14])],
                             labels=(tf.float32, [3]),
                             batch_size=256,
                             val_rdd=testRDD)

# ## Optimizer setup and training

# In[12]:

# Set of hyperparameters
numEpochs = 8

# The batch used by BDL must be a multiple of numExecutors * executorCores
# because data will be equally distributed inside each executor

workerBatch = 64
# numExecutors = int(sc._conf.get('spark.executor.instances'))
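Continuing the notebook's reasoning, a hedged sketch of how the global batch size could be derived from the Spark configuration; the spark.executor.* properties follow the commented-out line above and must be set on the cluster for the lookups to succeed.

numExecutors = int(sc._conf.get('spark.executor.instances'))
executorCores = int(sc._conf.get('spark.executor.cores'))

# The BDL batch must be a multiple of numExecutors * executorCores, so scale
# the per-worker batch by the total number of cores.
batchSize = workerBatch * numExecutors * executorCores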
Example #23
        output_dim=FC_LINEAR_DIMENSION,  # size: 32 -> 64.
        activation="sigmoid"))

# BigDL does not support parameter sharing, so this is a workaround.
both_feature = TimeDistributed(layer=convolve_net,
                               input_shape=input_shape)(both_input)

encode_left = both_feature.index_select(1, 0)
encode_right = both_feature.index_select(1, 1)

distance = autograd.abs(encode_left - encode_right)
predict = Dense(output_dim=NUM_CLASS_LABEL, activation="sigmoid")(distance)

siamese_net = Model(input=both_input, output=predict)
siamese_net.compile(optimizer="adam",
                    loss='sparse_categorical_crossentropy',
                    metrics=["accuracy"])

# Construct the distributed dataset object.
data_set = TFDataset.from_rdd(train_rdd,
                              shapes=[input_shape, [1]],
                              batch_size=args.batch_size,
                              val_rdd=test_rdd)

optimizer = TFOptimizer.from_keras(siamese_net, data_set)
app_name = "Siamese Network"

optimizer.set_train_summary(TrainSummary("tmp", app_name))
optimizer.set_val_summary(ValidationSummary("tmp", app_name))
optimizer.optimize(end_trigger=MaxEpoch(args.num_epoch))
Example #24
    def fit(self,
            x=None,
            y=None,
            batch_size=None,
            epochs=1,
            validation_split=0.,
            validation_data=None,
            distributed=False,
            **kwargs):
        """
        Train the model for a fixed number of epochs

        Arguments:
        :param x: Input data. It could be:
            - a TFDataset object
            - A Numpy array (or array-like), or a list of arrays
               (in case the model has multiple inputs).
            - A dict mapping input names to the corresponding array/tensors,
            if the model has named inputs.
        :param y: Target data. Like the input data `x`,
          it should be consistent with `x` (you cannot have Numpy inputs and
          tensor targets, or vice versa). If `x` is a TFDataset, `y` should
          not be specified (since targets will be obtained from `x`).
        :param batch_size: Integer or `None`.
            Number of samples per gradient update.
            If `x` is a TFDataset, you do not need to specify batch_size.
        :param epochs: Integer. Number of epochs to train the model.
            An epoch is an iteration over the entire `x` and `y`
            data provided.
        :param validation_split: Float between 0 and 1.
            Fraction of the training data to be used as validation data.
            The model will set apart this fraction of the training data,
            will not train on it, and will evaluate
            the loss and any model metrics
            on this data at the end of each epoch.
        :param validation_data: Data on which to evaluate
            the loss and any model metrics at the end of each epoch.
            The model will not be trained on this data.
            `validation_data` will override `validation_split`.
            `validation_data` could be:
              - tuple `(x_val, y_val)` of Numpy arrays or tensors
              - `TFDataset`
        :param distributed: Boolean. Whether to do training in distributed mode or local mode.
                     Default is False. In local mode, x must be a Numpy array.
        """
        if isinstance(x, TFDataset):
            # todo check arguments
            if not x.has_batch:
                raise ValueError("The batch_size of TFDataset must be " +
                                 "specified when used in KerasModel fit.")
            if isinstance(x, TFNdarrayDataset):
                x = _standarize_feature_label_dataset(x, self.model)
            self._fit_distributed(x, validation_split, epochs, **kwargs)

        elif distributed:
            dataset = TFDataset.from_ndarrays((x, y),
                                              val_tensors=validation_data)
            self._fit_distributed(dataset, validation_split, epochs, **kwargs)

        else:
            self.model.fit(x=x,
                           y=y,
                           batch_size=batch_size,
                           epochs=epochs,
                           validation_split=validation_split,
                           validation_data=validation_data,
                           **kwargs)
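Finally, a sketch of the distributed Numpy branch of this fit with a held-out validation set, reusing `model` from the sketch after Example #1; shapes are illustrative.

x_train = np.random.rand(32, 4).astype(np.float32)
y_train = np.random.randint(0, 2, size=(32,))
x_val = np.random.rand(8, 4).astype(np.float32)
y_val = np.random.randint(0, 2, size=(8,))

# distributed=True packs (x_train, y_train) into TFDataset.from_ndarrays with
# val_tensors=(x_val, y_val) and then trains through _fit_distributed, as in
# the elif branch above.
model.fit(x=x_train, y=y_train,
          epochs=2,
          validation_data=(x_val, y_val),
          distributed=True)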