def from_train_op(cls, train_op, loss, *, inputs=None, labels=None, metrics=None, updates=None,
                      sess=None, dataset=None, tensor_with_value=None, session_config=None,
                      model_dir=None):
        sess = TFOptimizer._get_or_create_session(sess)
        grads, variables = TFOptimizer._get_vars_grads_from_train_op(train_op)
        if dataset is None:
            dataset = TFOptimizer._get_dataset_from_loss(loss)
        _ = dataset.tensors  # trigger tensor creation if it is not already available
        dataset_inputs = dataset._original_tensors
        if isinstance(dataset_inputs, tuple) and len(dataset_inputs) == 2:
            if inputs is None:
                inputs = dataset_inputs[0]

            if labels is None:
                labels = dataset_inputs[1]
        else:
            if inputs is None:
                inputs = dataset_inputs

            if labels is None:
                labels = []

        inputs = nest.flatten(inputs)
        labels = nest.flatten(labels)
        return TFOptimizer._from_grads(loss=loss, sess=sess, inputs=inputs, labels=labels,
                                       grads=grads,
                                       variables=variables, dataset=dataset, metrics=metrics,
                                       tensor_with_value=tensor_with_value,
                                       optim_method=FakeOptimMethod(),
                                       session_config=session_config, updates=updates,
                                       model_dir=model_dir, train_op=train_op)
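
For reference, a minimal usage sketch of `from_train_op` (not part of the original source). The data loading via `TFDataset.from_ndarrays`, the tiny model and the `MaxEpoch` trigger are illustrative assumptions about the surrounding Analytics Zoo / BigDL API.

import tensorflow as tf
from bigdl.optim.optimizer import MaxEpoch
from zoo.tfpark import TFDataset, TFOptimizer

# Hypothetical TFDataset built from in-memory numpy arrays train_x (float32) and train_y (int).
dataset = TFDataset.from_ndarrays((train_x, train_y), batch_size=64)
feature_tensor, label_tensor = dataset.tensors

logits = tf.layers.dense(feature_tensor, 10)
loss = tf.reduce_mean(
    tf.losses.sparse_softmax_cross_entropy(labels=label_tensor, logits=logits))
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)

# Wrap the user-defined train_op; gradients and variables are recovered from it internally.
optimizer = TFOptimizer.from_train_op(train_op, loss, metrics={"loss": loss})
optimizer.optimize(end_trigger=MaxEpoch(5))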
Example #2
    def evaluate(self,
                 data,
                 batch_size=32,
                 feature_cols=None,
                 labels_cols=None,
                 hard_code_batch_size=False):

        assert self.metrics is not None, \
            "metrics is None, it should not be None in evaluate"

        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in evaluation"
            assert labels_cols is not None, \
                "label columns is None; it should not be None in evaluation"

        dataset = to_dataset(data,
                             batch_size=-1,
                             batch_per_thread=batch_size,
                             validation_data=None,
                             feature_cols=feature_cols,
                             labels_cols=labels_cols,
                             hard_code_batch_size=hard_code_batch_size,
                             sequential_order=True,
                             shuffle=False)

        flat_inputs = nest.flatten(self.inputs)
        flat_labels = nest.flatten(self.labels)

        return evaluate_metrics(flat_inputs + flat_labels,
                                sess=self.sess,
                                dataset=dataset,
                                metrics=self.metrics)
Example #3
    def predict(self,
                data,
                batch_size=4,
                feature_cols=None,
                hard_code_batch_size=False):

        assert self.outputs is not None, \
            "output is None, it should not be None in prediction"
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in prediction"

        dataset = to_dataset(data,
                             batch_size=-1,
                             batch_per_thread=batch_size,
                             validation_data=None,
                             feature_cols=feature_cols,
                             labels_cols=None,
                             hard_code_batch_size=hard_code_batch_size,
                             sequential_order=True,
                             shuffle=False)

        flat_inputs = nest.flatten(self.inputs)
        flat_outputs = nest.flatten(self.outputs)
        tfnet = TFNet.from_session(sess=self.sess,
                                   inputs=flat_inputs,
                                   outputs=flat_outputs)
        predicted_rdd = tfnet.predict(dataset)
        if isinstance(data, DataFrame):
            return convert_predict_to_dataframe(data, predicted_rdd)
        else:
            return predicted_rdd
Example #4
    def predict(self, input_fn, checkpoint_path=None):
        with tf.Graph().as_default() as g:
            result = self.estimator._call_input_fn(
                input_fn, tf.estimator.ModeKeys.PREDICT)
            if isinstance(result, TFDataset):
                spec = self._call_model_fn(result.feature_tensors, None,
                                           tf.estimator.ModeKeys.PREDICT,
                                           self.config)
                latest_checkpoint = self.estimator.latest_checkpoint()

                if latest_checkpoint:
                    checkpoint_path = latest_checkpoint

                with tf.Session() as sess:
                    saver = tf.train.Saver()
                    if checkpoint_path:
                        saver.restore(sess, checkpoint_path)
                    else:
                        sess.run(tf.global_variables_initializer())
                    inputs = nest.flatten(result.feature_tensors)
                    outputs = nest.flatten(spec.predictions)
                    tfnet = TFNet.from_session(sess,
                                               inputs=inputs,
                                               outputs=outputs)

                    rdd = result.rdd.map(lambda t: Sample.from_ndarray(
                        nest.flatten(t), np.array([0.0])))

                    results = tfnet.predict(rdd, result.batch_per_thread)
                    return results

        return self.estimator.predict(input_fn,
                                      checkpoint_path=checkpoint_path)
Example #5
    def _create_placeholders(self):
        import tensorflow as tf
        if not self.hard_code_batch_size:
            tensors = nest.pack_sequence_as(self.tensor_structure, [
                tf.placeholder(
                    name=t.name, dtype=t.dtype, shape=[None] + list(t.shape))
                for t in nest.flatten(self.tensor_structure)
            ])
        else:
            if self.batch_per_thread > 0:
                tensors = nest.pack_sequence_as(self.tensor_structure, [
                    tf.placeholder(
                        name=t.name,
                        dtype=t.dtype,
                        shape=[self.batch_per_thread] + list(t.shape))
                    for t in nest.flatten(self.tensor_structure)
                ])
            else:
                tensors = nest.pack_sequence_as(self.tensor_structure, [
                    tf.placeholder(
                        name=t.name,
                        dtype=t.dtype,
                        shape=[self.batch_size // self.total_core_num] +
                        list(t.shape))
                    for t in nest.flatten(self.tensor_structure)
                ])

        for tensor in nest.flatten(tensors):
            tf.get_default_graph().clear_collection(tensor.name)
            tf.add_to_collection(tensor.name, self)

        return tensors
Example #6
    def predict(
        self,
        data,
        batch_size=4,
        feature_cols=None,
        hard_code_batch_size=False,
        auto_shard_files=False,
    ):
        """
        Predict input data
        :param data: data to be predicted. It can be XShards, Spark DataFrame.
        If data is XShards, each element needs to be {'x': a feature numpy array
         or a tuple of feature numpy arrays}.
        :param batch_size: batch size per thread
        :param feature_cols: list of feature column names if input data is Spark DataFrame.
        :param hard_code_batch_size: whether to hard code batch size for prediction.
         The default value is False.
        :return: predicted result.
         If input data is XShards or tf.data.Dataset, the predict result is a XShards,
         and the schema for each result is: {'prediction': predicted numpy array or
          list of predicted numpy arrays}.
         If input data is Spark DataFrame, the predict result is a DataFrame which includes original
         columns plus 'prediction' column. The 'prediction' column can be FloatType, VectorUDT
         or Array of VectorUDT depending on model outputs shape.
        """

        assert self.outputs is not None, \
            "output is None, it should not be None in prediction"
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in prediction"

        assert not is_tf_data_dataset(data), "tf.data.Dataset currently cannot be used for " \
                                             "estimator prediction"

        dataset = to_dataset(
            data,
            batch_size=-1,
            batch_per_thread=batch_size,
            validation_data=None,
            feature_cols=feature_cols,
            label_cols=None,
            hard_code_batch_size=hard_code_batch_size,
            sequential_order=True,
            shuffle=False,
            auto_shard_files=auto_shard_files,
        )

        flat_inputs = nest.flatten(self.inputs)
        flat_outputs = nest.flatten(self.outputs)
        tfnet = TFNet.from_session(sess=self.sess,
                                   inputs=flat_inputs,
                                   outputs=flat_outputs)
        predicted_rdd = tfnet.predict(dataset)
        if isinstance(data, DataFrame):
            return convert_predict_rdd_to_dataframe(data, predicted_rdd)
        elif isinstance(data, SparkXShards):
            return convert_predict_rdd_to_xshard(data, predicted_rdd)
        else:
            return predicted_rdd
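
A hedged usage sketch for the `predict` above on a Spark DataFrame. The estimator object `est`, the DataFrame `df` and the column name "features" are illustrative assumptions.

# `df` has a feature column named "features" (an array or vector column).
prediction_df = est.predict(df, batch_size=4, feature_cols=["features"])
# The result keeps the original columns and appends a 'prediction' column.
prediction_df.select("prediction").show(5, truncate=False)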
Example #7
    def __init__(self, file_path, parse_fn, batch_size,
                 batch_per_thread, hard_code_batch_size=False, validation_file_path=None):
        import tensorflow as tf
        g = tf.Graph()
        with g.as_default():
            serialized_example = tf.placeholder(dtype=tf.string, shape=[])
            results = parse_fn(serialized_example)

            flattened = nest.flatten(results)
            output_names = [tf.cast(t, dtype=tf.float32).name for t in flattened]

        serialized_graph = bytearray(g.as_graph_def().SerializeToString())

        sc = getOrCreateSparkContext()
        train_rdd = callBigDlFunc("float", "createRDDFromTFRecords",
                                  file_path, sc, serialized_graph,
                                  serialized_example.name, output_names)
        validation_rdd = None
        if validation_file_path is not None:
            validation_rdd = callBigDlFunc("float", "createRDDFromTFRecords",
                                           validation_file_path, sc, serialized_graph,
                                           serialized_example.name, output_names)

        tensor_structure = nest.pack_sequence_as(results,
                                                 [TensorMeta(tf.as_dtype(t.dtype),
                                                  shape=t.shape,
                                                  name="data_%s" % i)
                                                  for i, t in enumerate(nest.flatten(results))])

        super(TFRecordDataset, self).__init__(tensor_structure, batch_size,
                                              batch_per_thread, hard_code_batch_size)

        self.train_rdd = train_rdd
        self.validation_rdd = validation_rdd
Example #8
    def predict(self, input_fn, checkpoint_path=None):
        """Outputs predictions for given features.

        :param input_fn: A function that constructs the features.
              * A `TFDataset` object, each elements of which is a tuple `(features, None)`.
              * A `tf.data.Dataset` object: Outputs of `Dataset` object must have
                same constraints as below.
              * features: A `tf.Tensor` or a dictionary of string feature name to
                `Tensor`. features are consumed by `model_fn`. They should satisfy
                the expectation of `model_fn` from inputs.
              * A tuple, in which case the first item is extracted as features.

        :param checkpoint_path: Path of a specific checkpoint to predict. If `None`, the
            latest checkpoint in `model_dir` is used.  If there are no checkpoints
            in `model_dir`, prediction is run with newly initialized `Variables`
            instead of ones restored from checkpoint.


        Return:
          Evaluated values of `predictions` tensors.

        """
        with tf.Graph().as_default() as g:
            result = self.estimator._call_input_fn(input_fn, tf.estimator.ModeKeys.PREDICT)
            if isinstance(result, TFDataset):
                spec = self._call_model_fn(result.feature_tensors,
                                           None,
                                           tf.estimator.ModeKeys.PREDICT,
                                           self.config)
                latest_checkpoint = self.estimator.latest_checkpoint()

                if latest_checkpoint:
                    checkpoint_path = latest_checkpoint

                with tf.Session() as sess:
                    if checkpoint_path:
                        saver = tf.train.Saver()
                        saver.restore(sess, checkpoint_path)
                    else:
                        sess.run(tf.global_variables_initializer())
                    inputs = nest.flatten(result._original_tensors[0])
                    outputs = nest.flatten(spec.predictions)
                    tfnet = TFNet.from_session(sess, inputs=inputs, outputs=outputs)
                    predictions = tfnet.predict(result.get_prediction_data(), mini_batch=True)

                    # If spec.predictions is a dict, add the keys back so that the
                    # result is a dict as well.
                    if isinstance(spec.predictions, dict):
                        # Given a list of outputs, return a dict of outputs keyed by prediction name.
                        def zip_key(outs, keys):
                            assert len(outs) == len(keys)
                            res_dict = {}
                            for out, key in zip(outs, keys):
                                res_dict[key] = out
                            return res_dict

                        pred_keys = sorted(spec.predictions.keys())
                        predictions = predictions.map(lambda res: zip_key(res, pred_keys))
                    return predictions

        return list(self.estimator.predict(input_fn, checkpoint_path=checkpoint_path))
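
A hedged usage sketch for this estimator-style `predict`. The `TFDataset.from_ndarrays` call, the array `test_x` and the wrapper object `zoo_estimator` are illustrative assumptions; the docstring only requires `input_fn` to return a `TFDataset`, a `tf.data.Dataset`, features, or a tuple whose first item is the features.

def input_fn():
    # Only features are consumed at prediction time, so no labels are provided here.
    return TFDataset.from_ndarrays(test_x, batch_per_thread=8)

pred_rdd = zoo_estimator.predict(input_fn)  # an RDD of prediction arrays for TFDataset input
print(pred_rdd.take(3))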
Example #9
    def __init__(self, rdd, tensor_structure, batch_size,
                 batch_per_thread, hard_code_batch_size=False, val_rdd=None):
        '''
        TFDataset represents a distributed collection of elements to be fed into a TensorFlow
        graph. A TFDataset can be created from an RDD, each record of which is one or more
        numpy.ndarray objects of the same nested structure, representing the tensors to be fed
        into the TensorFlow graph on each iteration. A TFDataset must be used with TFOptimizer
        or TFPredictor.
        '''

        if batch_size > 0 and batch_per_thread > 0:
            raise ValueError("bath_size and batch_per_thread should not be set simultaneously")

        self.has_batch = True
        node_num, core_num = get_node_and_core_number()
        self.total_core_num = node_num * core_num
        if batch_size > 0:
            if batch_size % self.total_core_num != 0:
                raise ValueError("batch_size should be a multiple " +
                                 "of total core number, but got batch_size: " +
                                 "%s where total core number is %s" % (batch_size,
                                                                       self.total_core_num))
        if batch_size <= 0 and batch_per_thread <= 0:
            batch_per_thread = 1
            batch_size = self.total_core_num
            self.has_batch = False

        self.batch_size = batch_size
        self.batch_per_thread = batch_per_thread
        self.hard_code_batch_size = hard_code_batch_size
        self.tensor_structure = tensor_structure

        self.val_rdd = val_rdd

        if not self.hard_code_batch_size:
            self.output_shapes = nest.pack_sequence_as(
                self.tensor_structure, [[None] + list(t.shape)
                                        if t is not None else None
                                        for t in nest.flatten(self.tensor_structure)])
        else:
            if self.batch_per_thread > 0:
                self.output_shapes = nest.pack_sequence_as(
                    self.tensor_structure, [[self.batch_per_thread] + list(t.shape)
                                            if t is not None else None
                                            for t in nest.flatten(self.tensor_structure)])
            else:
                self.output_shapes = nest.pack_sequence_as(
                    self.tensor_structure, [[self.batch_size // self.total_core_num] + list(t.shape)
                                            if t is not None else None
                                            for t in nest.flatten(self.tensor_structure)])

        self.rdd = rdd
        self.input_names = nest.pack_sequence_as(
            self.tensor_structure, [t.name
                                    if t is not None else None
                                    for t in nest.flatten(self.tensor_structure)])

        self._tensors = None
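
To make the constructor above concrete, a minimal sketch that builds a `tensor_structure` from `TensorMeta` objects and wraps an RDD of numpy records. The shapes, names, the existing SparkContext `sc`, and calling this constructor directly (rather than via a factory method) are illustrative assumptions.

import numpy as np
import tensorflow as tf

# Each RDD record is a (feature, label) pair of numpy arrays.
record_rdd = sc.parallelize(
    [(np.random.rand(28, 28).astype(np.float32),
      np.array(i % 10, dtype=np.float32)) for i in range(1000)])

tensor_structure = (TensorMeta(tf.float32, shape=(28, 28), name="input_0"),
                    TensorMeta(tf.float32, shape=(), name="label_0"))

# batch_size is the global training batch size; batch_per_thread stays negative
# because the two options are mutually exclusive.
dataset = TFDataset(record_rdd, tensor_structure, batch_size=32, batch_per_thread=-1)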
Example #10
    def evaluate(self,
                 input_fn,
                 eval_methods,
                 steps=None,
                 checkpoint_path=None):
        if not all(
                isinstance(metric, six.string_types)
                for metric in eval_methods):
            raise ValueError("All metrics should be string types")
        with tf.Graph().as_default() as g:
            result = self.estimator._call_input_fn(input_fn,
                                                   tf.estimator.ModeKeys.EVAL)
            if isinstance(result, TFDataset):
                spec = self._call_model_fn(result.feature_tensors,
                                           result.label_tensors,
                                           tf.estimator.ModeKeys.PREDICT,
                                           self.config)
                latest_checkpoint = self.estimator.latest_checkpoint()

                if latest_checkpoint:
                    checkpoint_path = latest_checkpoint

                with tf.Session() as sess:
                    if checkpoint_path:
                        saver = tf.train.Saver()
                        saver.restore(sess, checkpoint_path)
                    else:
                        sess.run(tf.global_variables_initializer())
                    inputs = nest.flatten(result._original_tensors[0])
                    outputs = nest.flatten(spec.predictions)
                    tfnet = TFNet.from_session(sess,
                                               inputs=inputs,
                                               outputs=outputs)

                    rdd = result.rdd.map(lambda t: Sample.from_ndarray(
                        nest.flatten(t[0]), nest.flatten(t[1])))
                    if result.batch_per_thread < 0:
                        batch_size = result.batch_size
                    else:
                        batch_size = result.batch_per_thread * result.rdd.getNumPartitions()

                    eval_methods = [
                        self._to_bigdl_metric(m) for m in eval_methods
                    ]
                    results = tfnet.evaluate(rdd, batch_size, eval_methods)
                    final_result = dict([(r.method, r.result)
                                         for r in results])
                    return final_result

        return self.estimator.evaluate(input_fn,
                                       steps,
                                       checkpoint_path=checkpoint_path)
Example #11
    def predict(self, data, batch_size=32):
        assert self.outputs is not None, \
            "output is None, it should not be None in prediction"

        dataset = _to_dataset(data, batch_size=-1, batch_per_thread=batch_size)

        flat_inputs = nest.flatten(self.inputs)
        flat_outputs = nest.flatten(self.outputs)
        tfnet = TFNet.from_session(sess=self.sess,
                                   inputs=flat_inputs,
                                   outputs=flat_outputs)
        return tfnet.predict(dataset)
Example #12
    def evaluate(
        self,
        data,
        batch_size=32,
        feature_cols=None,
        label_cols=None,
        auto_shard_files=False,
    ):
        """
        Evaluate model.

        :param data: evaluation data. It can be XShards, Spark DataFrame, tf.data.Dataset.
        If data is XShards, each partition is a dictionary of  {'x': feature, 'y': label}, where
        feature(label) is a numpy array or a tuple of numpy arrays.
        If data is tf.data.Dataset, each element is a tuple of input tensors.
        :param batch_size: batch size per thread.
        :param feature_cols: feature_cols: feature column names if train data is Spark DataFrame.
        :param label_cols: label column names if train data is Spark DataFrame.
        :param auto_shard_files: whether to automatically detect if the dataset is file-based and
        and apply sharding on files, otherwise sharding on records. Default is False.
        :return: evaluation result as a dictionary of {'metric name': metric value}
        """

        assert self.metrics is not None, \
            "metrics is None, it should not be None in evaluate"

        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in evaluation"
            assert label_cols is not None, \
                "label columns is None; it should not be None in evaluation"

        dataset = to_dataset(
            data,
            batch_size=-1,
            batch_per_thread=batch_size,
            validation_data=None,
            feature_cols=feature_cols,
            label_cols=label_cols,
            hard_code_batch_size=False,
            sequential_order=True,
            shuffle=False,
            auto_shard_files=auto_shard_files,
        )

        flat_inputs = nest.flatten(self.inputs)
        flat_labels = nest.flatten(self.labels)

        return evaluate_metrics(flat_inputs + flat_labels,
                                sess=self.sess,
                                dataset=dataset,
                                metrics=self.metrics)
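
A hedged usage sketch for this `evaluate` on XShards data. The `XShards.partition` call, the arrays `val_x`/`val_y` and the estimator `est` are illustrative assumptions.

import numpy as np
from zoo.orca.data import XShards

# Each resulting shard is a dict of {'x': features, 'y': labels}, as the docstring requires.
shards = XShards.partition({"x": val_x.astype(np.float32), "y": val_y})
metric_values = est.evaluate(shards, batch_size=64)
print(metric_values)  # a dict of {'metric name': metric value}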
Example #13
    def evaluate(
        self,
        data,
        batch_size=32,
        feature_cols=None,
        labels_cols=None,
        hard_code_batch_size=False,
        auto_shard_files=True,
    ):
        """
        Evaluate model.
        :param data: evaluation data. It can be XShards, Spark DataFrame, tf.data.Dataset.
        If data is XShards, each element needs to be {'x': a feature numpy array
         or a tuple of feature numpy arrays, 'y': a label numpy array or a tuple of
         label numpy arrays}
        If data is tf.data.Dataset, each element is a tuple of input tensors.
        :param batch_size: batch size per thread.
        :param feature_cols: feature_cols: feature column names if train data is Spark DataFrame.
        :param labels_cols: label column names if train data is Spark DataFrame.
        :param hard_code_batch_size: whether to hard code batch size for evaluation.
        :return: evaluation result as a dictionary of {'metric name': metric value}
        """

        assert self.metrics is not None, \
            "metrics is None, it should not be None in evaluate"

        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in evaluation"
            assert labels_cols is not None, \
                "label columns is None; it should not be None in evaluation"

        dataset = to_dataset(
            data,
            batch_size=-1,
            batch_per_thread=batch_size,
            validation_data=None,
            feature_cols=feature_cols,
            labels_cols=labels_cols,
            hard_code_batch_size=hard_code_batch_size,
            sequential_order=True,
            shuffle=False,
            auto_shard_files=auto_shard_files,
        )

        flat_inputs = nest.flatten(self.inputs)
        flat_labels = nest.flatten(self.labels)

        return evaluate_metrics(flat_inputs + flat_labels,
                                sess=self.sess,
                                dataset=dataset,
                                metrics=self.metrics)
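
And a corresponding hedged sketch for Spark DataFrame input, where feature and label column names must be given explicitly; the DataFrame `val_df` and the column names are illustrative.

metrics = est.evaluate(val_df,
                       batch_size=32,
                       feature_cols=["features"],
                       labels_cols=["label"])
print(metrics)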
Example #14
    def evaluate(self, data, batch_size=32):
        assert self.metrics is not None, \
            "metrics is None, it should not be None in evaluate"

        dataset = _to_dataset(data, batch_size=-1, batch_per_thread=batch_size)

        flat_inputs = nest.flatten(self.inputs)
        flat_labels = nest.flatten(self.labels)

        return evaluate_metrics(flat_inputs + flat_labels,
                                sess=self.sess,
                                dataset=dataset,
                                metrics=self.metrics)
Example #15
        def to_dataset(iter):
            data_list = list(iter)

            import tensorflow as tf
            if not data_list:
                return []

            datasets = [create_dataset_fn(data) for data in data_list]
            from functools import reduce
            dataset = reduce(lambda x, y: x.concatenate(y), datasets)
            dataset = dataset.batch(batch_per_shard, drop_remainder)
            iterator = dataset.make_initializable_iterator()
            train_next_ops = nest.flatten(iterator.get_next())
            output_types = [
                t.as_datatype_enum for t in nest.flatten(dataset.output_types)
            ]

            init_op_name = iterator.initializer.name
            table_init_op = tf.tables_initializer().name
            output_names = [op.name for op in train_next_ops]

            graph = train_next_ops[0].graph

            flatten_shapes = nest.flatten(dataset.output_shapes)

            flatten_shapes = [shape[1:] for shape in flatten_shapes]

            flatten_tensor_structure = [
                TensorMeta(dtype=output_types[i],
                           shape=list(flatten_shapes[i]),
                           name="zoo_input_{}".format(i))
                for i in range(len(flatten_shapes))
            ]
            structure = dataset.output_types
            if isinstance(structure, tf.DType):
                structure = (structure, )
            tensor_structure = nest.pack_sequence_as(structure,
                                                     flatten_tensor_structure)

            meta_info = {
                "init_op_name": init_op_name,
                "table_init_op": table_init_op,
                "output_names": output_names,
                "output_types": output_types,
                "tensor_structure": tensor_structure
            }

            return [(bytearray(graph.as_graph_def().SerializeToString()),
                     meta_info)]
Example #16
    def _get_arguments_from_loss(loss, optim_method, session, val_outputs,
                                 val_labels, val_method):
        import tensorflow as tf
        if session is None:
            sess = tf.Session()
            sess.run(tf.global_variables_initializer())
        else:
            sess = session
        grads_vars = tf.train.GradientDescentOptimizer(0).compute_gradients(
            loss)
        grads_vars.sort(key=lambda grad_var: grad_var[1].name)
        variables = []
        grads = []
        for (grad, var) in grads_vars:
            if grad is not None:
                variables.append(var)
                grads.append(grad)

        all_required_inputs = _find_placeholders([loss])
        dataset = tf.get_collection(all_required_inputs[0].name)[0]

        inputs = nest.flatten(dataset._original_tensors)

        return [
            loss, optim_method, sess, dataset, inputs, grads, variables,
            loss.graph, val_outputs, val_labels, val_method
        ]
Example #17
def _tensors_to_rdd(tensors, sc, splits):
    import tensorflow as tf
    if isinstance(tensors, np.ndarray):
        tensors = (tensors,)

    if isinstance(tensors, list):
        for i in range(len(tensors)):
            if tensors[i].dtype == np.dtype("float64"):
                tensors[i] = np.float32(tensors[i])

        data_list = _splits(tensors)
        rdd = sc.parallelize(data_list, splits)
        tensor_structure = [TensorMeta(tf.as_dtype(t.dtype),
                                       shape=t.shape[1:],
                                       name="input_%s" % i)
                            for i, t in enumerate(tensors)]
    else:
        flattened = nest.flatten(tensors)
        for i in range(len(flattened)):
            if flattened[i].dtype == np.dtype("float64"):
                flattened[i] = np.float32(flattened[i])
        data_list = _splits(flattened)
        rdd = sc.parallelize(data_list, splits)
        rdd = rdd.map(lambda x: nest.pack_sequence_as(tensors, x))
        tensor_structure = nest.pack_sequence_as(tensors,
                                                 [TensorMeta(tf.as_dtype(t.dtype),
                                                             shape=t.shape[1:],
                                                             name="input_%s" % i)
                                                  for i, t in enumerate(flattened)])
    return rdd, tensor_structure
Example #18
 def get_training_data(self):
     sample_rdd = self.rdd.map(
         lambda t: Sample.from_ndarray(nest.flatten(t), np.array([0.0])))
     fs = FeatureSet.sample_rdd(sample_rdd,
                                sequential_order=self.sequential_order,
                                shuffle=self.shuffle)
     return fs
Example #19
 def get_prediction_data(self):
     rdd = self.rdd.map(lambda t: Sample.from_ndarray(
         nest.flatten(t[0]
                      if isinstance(t, tuple) else t), np.array([0.0])))
     rdd_wrapper = callZooFunc("float", "zooRDDSampleToMiniBatch", rdd,
                               self.batch_per_thread)
     return rdd_wrapper.value().toJavaRDD()
Example #20
    def predict(self, data, batch_size=32):
        assert self.outputs is not None, \
            "output is None, it should not be None in prediction"

        if isinstance(data, SparkXShards):
            dataset = _xshards_to_tf_dataset(data,
                                             batch_per_thread=batch_size)
        elif isinstance(data, Dataset):
            dataset = TFDataDataset2(data, batch_size=-1,
                                     batch_per_thread=batch_size)
        else:
            raise ValueError("data must be a SparkXShards or an orca.data.tf.Dataset")

        flat_inputs = nest.flatten(self.inputs)
        flat_outputs = nest.flatten(self.outputs)
        tfnet = TFNet.from_session(sess=self.sess, inputs=flat_inputs, outputs=flat_outputs)
        return tfnet.predict(dataset)
Example #21
    def from_loss(cls, loss, optim_method, session=None, val_outputs=None,
                  val_labels=None, val_method=None, val_split=0.0,
                  clip_norm=None, clip_value=None, metrics=None,
                  tensor_with_value=None, session_config=None, model_dir=None, updates=None):
        """
        Create a TFOptimizer from a TensorFlow loss tensor.
        The loss tensor must come from a TensorFlow graph that only takes TFDataset.tensors and
        the tensors in `tensor_with_value` as inputs.
        :param loss: The loss tensor of the TensorFlow model, should be a scalar
        :param optim_method: the optimization method to be used, such as bigdl.optim.optimizer.Adam
        :param session: the current TensorFlow Session, if you want to used a pre-trained model,
        you should use the Session to load the pre-trained variables and pass it to TFOptimizer.
        :param val_outputs: the validation output TensorFlow tensor to be used by val_methods
        :param val_labels: the validation label TensorFlow tensor to be used by val_methods
        :param val_method: the BigDL val_method(s) to be used.
        :param val_split: Float between 0 and 1. Fraction of the training data to be used as
        validation data.
        :param clip_norm: float >= 0. Gradients will be clipped when their L2 norm exceeds
        this value.
        :param clip_value: float >= 0. Gradients will be clipped when their absolute value
        exceeds this value.
        :param metrics: a dictionary. The key should be a string representing the metric's name
        and the value should be the corresponding TensorFlow tensor, which should be a scalar.
        :param tensor_with_value: a dictionary. The key is TensorFlow tensor, usually a
        placeholder, the value of the dictionary is a tuple of two elements. The first one of
        the tuple is the value to feed to the tensor in training phase and the second one
        is the value to feed to the tensor in validation phase.
        :return: a TFOptimizer
        """
        sess = TFOptimizer._get_or_create_session(session)
        grads, variables = TFOptimizer._get_vars_grads(loss)
        dataset = TFOptimizer._get_dataset_from_loss(loss)
        inputs = nest.flatten(dataset._original_tensors)

        if clip_value is not None:
            if isinstance(clip_value, float) or isinstance(clip_value, int):
                if clip_value <= 0:
                    ValueError("The clip_value argument should be positive number")
                clip_value = (-float(clip_value), float(clip_value))

            if not isinstance(clip_value, tuple):
                raise ValueError("The clip_value argument should be" +
                                 " a positive float/int which clips to" +
                                 " (-clip_value, clip_value); " +
                                 "or a tuple which clips to (min_value, max_value)")

        if val_method is not None:
            val_methods = to_list(val_method)
            if metrics is None:
                metrics = {}

            for i, method in enumerate(val_methods):
                metrics['bigdl_metric_' + str(i)] = BigDLMetric(method, val_outputs, val_labels)

        return TFOptimizer._from_grads(loss, sess, inputs, grads, variables, dataset, optim_method,
                                       val_split, clip_norm, clip_value,
                                       metrics, tensor_with_value, session_config,
                                       model_dir, updates)
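
A minimal usage sketch for `from_loss` (not part of the original source). The model graph, the BigDL `Adam`/`MaxEpoch` imports and the pre-existing `dataset` are illustrative assumptions; the key constraint is that the loss only depends on `TFDataset.tensors` (plus any `tensor_with_value` entries).

import tensorflow as tf
from bigdl.optim.optimizer import Adam, MaxEpoch

# `dataset` is a TFDataset; its tensors are the only placeholders feeding the loss graph.
feature_tensor, label_tensor = dataset.tensors
logits = tf.layers.dense(feature_tensor, 10)
loss = tf.reduce_mean(
    tf.losses.sparse_softmax_cross_entropy(labels=label_tensor, logits=logits))

optimizer = TFOptimizer.from_loss(loss, Adam(learningrate=1e-3), clip_norm=5.0)
optimizer.optimize(end_trigger=MaxEpoch(2))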
Example #22
    def partition(data, num_shards=None):
        """
        Partition local in-memory data and form a SparkXShards.
        :param data: np.ndarray, a tuple, list, dict of np.ndarray, or a nested structure
        made of tuple, list, dict with ndarray as the leaf value
        :param num_shards: the number of shards that the data will be partitioned into
        :return: a SparkXShards
        """
        sc = init_nncontext()
        node_num, core_num = get_node_and_core_number()
        shard_num = node_num * core_num if num_shards is None else num_shards
        import numpy as np
        type_err_msg = """
The types supported in zoo.orca.data.XShards.partition are
1. np.ndarray
2. a tuple, list, dict of np.ndarray
3. nested structure made of tuple, list, dict with ndarray as the leaf value

But got data of type {}
        """.format(type(data))
        supported_types = {list, tuple, dict}
        if isinstance(data, np.ndarray):
            if data.shape[0] < shard_num:
                raise ValueError(
                    "The length of data {} is smaller than the total number "
                    "of shards {}. Please adjust the num_shards option to be "
                    "at most {}.".format(data.shape[0], shard_num,
                                         data.shape[0]))
            arrays = np.array_split(data, shard_num)
            rdd = sc.parallelize(arrays)
        else:
            assert type(data) in supported_types, type_err_msg
            flattened = nest.flatten(data)
            data_length = len(flattened[0])
            data_to_be_shard = []
            if data_length < shard_num:
                raise ValueError(
                    "The length of data {} is smaller than the total number "
                    "of shards {}. Please adjust the num_shards option to be "
                    "at most {}.".format(data_length, shard_num, data_length))
            for i in range(shard_num):
                data_to_be_shard.append([])
            for x in flattened:
                assert len(x) == data_length, \
                    "the ndarrays in data must all have the same size in first dimension, " \
                    "got first ndarray of size {} and another {}".format(data_length, len(x))
                x_parts = np.array_split(x, shard_num)
                for idx, x_part in enumerate(x_parts):
                    data_to_be_shard[idx].append(x_part)

            data_to_be_shard = [
                nest.pack_sequence_as(data, shard)
                for shard in data_to_be_shard
            ]
            rdd = sc.parallelize(data_to_be_shard)

        data_shards = SparkXShards(rdd)
        return data_shards
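
A hedged usage sketch for `partition`; the array contents and the shard count are illustrative (the import path follows the `zoo.orca.data.XShards` name used in the error message above).

import numpy as np
from zoo.orca.data import XShards

x = np.random.randn(1000, 32).astype(np.float32)
y = np.random.randint(0, 10, size=(1000,))

# Split a dict of ndarrays into 8 shards; each shard keeps the same nested structure
# with a slice of every array.
shards = XShards.partition({"x": x, "y": y}, num_shards=8)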
Example #23
    def _get_evaluation_data(self):

        feature_length = len(nest.flatten(self.tensor_structure[0]))
        jvalue = callZooFunc("float", "createMiniBatchRDDFromTFDatasetEval",
                             self.rdd.map(lambda x: x[0]), self.init_op_name, self.table_init_op,
                             self.output_names,
                             self.output_types, self.shard_index_op_name, feature_length)
        rdd = jvalue.value().toJavaRDD()
        return rdd
Example #24
 def get_validation_data(self):
     if self.val_rdd is not None:
         sample_rdd = self.val_rdd.map(lambda t: Sample.from_ndarray(
             nest.flatten(t), np.array([0.0])))
         return FeatureSet.sample_rdd(
             sample_rdd,
             sequential_order=self.sequential_order,
             shuffle=self.shuffle)
     return None
Example #25
 def get_validation_data(self):
     if self.val_rdd is not None:
         sample_rdd = self.val_rdd.map(lambda t: Sample.from_ndarray(
             nest.flatten(t), np.array([0.0])))
         fs = FeatureSet.sample_rdd(sample_rdd,
                                    sequential_order=self.sequential_order,
                                    shuffle=self.shuffle)
         fs = fs.transform(SampleToMiniBatch(self.batch_size))
         return fs
     return None
Example #26
 def from_train_op(cls, train_op, loss, metrics=None, updates=None, sess=None, dataset=None,
                   tensor_with_value=None, session_config=None, model_dir=None):
     sess = TFOptimizer._get_or_create_session(sess)
     grads, variables = TFOptimizer._get_vars_grads_from_train_op(train_op)
     if dataset is None:
         dataset = TFOptimizer._get_dataset_from_loss(loss)
     inputs = nest.flatten(dataset._original_tensors)
     return TFOptimizer._from_grads(loss=loss, sess=sess, inputs=inputs, grads=grads,
                                    variables=variables, dataset=dataset, metrics=metrics,
                                    tensor_with_value=tensor_with_value,
                                    optim_method=FakeOptimMethod(),
                                    session_config=session_config, updates=updates,
                                    model_dir=model_dir, train_op=train_op)
Example #27
    def _expand_inputs(inputs, tensors_with_value, loss):
        additional_inputs = []
        additional_values = []
        inputs = nest.flatten(inputs)
        names = set([i.name for i in inputs])

        if tensors_with_value:
            for t, v in tensors_with_value.items():
                if t.name in names:
                    msg = f"tensor {t} already in inputs, cannot put it in tensor_with_value"
                    raise ValueError(msg)
                additional_inputs.append(t)
                additional_values.append(v)

        return inputs, additional_inputs, additional_values
Example #28
    def _expand_inputs(inputs, tensors_with_value, loss):
        additional_inputs = []
        additional_values = []
        all_required_inputs = find_placeholders([loss])
        all_required_inputs_names = [v.name for v in all_required_inputs]
        if tensors_with_value:
            for t, v in tensors_with_value.items():
                if t.name in all_required_inputs_names:
                    additional_inputs.append(t)
                    additional_values.append(v)

        if not isinstance(inputs, list):
            inputs = nest.flatten(inputs)

        return inputs, additional_inputs, additional_values
Example #29
    def partition(data):
        """
        Partition local in-memory data and form a SparkXShards.
        :param data: np.ndarray, a tuple, list, dict of np.ndarray, or a nested structure
        made of tuple, list, dict with ndarray as the leaf value
        :return: a SparkXShards
        """
        sc = init_nncontext()
        node_num, core_num = get_node_and_core_number()
        total_core_num = node_num * core_num
        import numpy as np
        type_err_msg = """
The types supported in zoo.orca.data.XShards.partition are
1. np.ndarray
2. a tuple, list, dict of np.ndarray
3. nested structure made of tuple, list, dict with ndarray as the leaf value

But got data of type {}
        """.format(type(data))
        supported_types = {list, tuple, dict}
        if isinstance(data, np.ndarray):
            arrays = np.array_split(data, total_core_num)
            rdd = sc.parallelize(arrays)
        else:
            assert type(data) in supported_types, type_err_msg
            flattened = nest.flatten(data)
            data_length = len(flattened[0])
            data_to_be_shard = []
            for i in range(total_core_num):
                data_to_be_shard.append([])
            for x in flattened:
                assert len(x) == data_length, \
                    "the ndarrays in data must all have the same size in first dimension, " \
                    "got first ndarray of size {} and another {}".format(data_length, len(x))
                x_parts = np.array_split(x, total_core_num)
                for idx, x_part in enumerate(x_parts):
                    data_to_be_shard[idx].append(x_part)

            data_to_be_shard = [
                nest.pack_sequence_as(data, shard)
                for shard in data_to_be_shard
            ]
            rdd = sc.parallelize(data_to_be_shard)

        data_shards = SparkXShards(rdd)
        return data_shards
Example #30
    def _get_arguments_from_loss(loss, optim_method, session, val_outputs,
                                 val_labels, val_method):
        import tensorflow as tf
        if session is None:
            sess = tf.Session()
            sess.run(tf.global_variables_initializer())
        else:
            sess = session

        grads, variables = TFOptimizer._get_vars_grads(loss)
        all_required_inputs = _find_placeholders([loss])
        dataset = tf.get_collection(all_required_inputs[0].name)[0]

        inputs = nest.flatten(dataset._original_tensors)

        return [
            loss, optim_method, sess, dataset, inputs, grads, variables,
            loss.graph, val_outputs, val_labels, val_method
        ]