Beispiel #1
0
def xshards_to_tf_dataset(data_shard,
                          batch_size=-1, batch_per_thread=-1,
                          validation_data_shard=None,
                          hard_code_batch_size=False,
                          sequential_order=False,
                          shuffle=True):
    """
    Build a TFDataset from a collection of data shards.

    The feature/label specs are read from the first result of applying
    ``get_spec(allow_tuple=True, allow_list=False)`` to the shards, and the
    shards themselves are flattened with
    ``flatten_xy(allow_tuple=True, allow_list=False)`` before being handed to
    ``TFDataset.from_rdd``.

    :param data_shard: training shards; must provide ``_for_each(...)`` and an
        ``rdd`` attribute supporting ``flatMap``.
    :param batch_size: forwarded to ``TFDataset.from_rdd``; -1 means "not set".
    :param batch_per_thread: forwarded to ``TFDataset.from_rdd``; -1 means
        "not set". At least one of batch_size / batch_per_thread must differ
        from -1.
    :param validation_data_shard: optional shards flattened the same way and
        passed as ``val_rdd``; None passes ``val_rdd=None``.
    :param hard_code_batch_size: forwarded unchanged to ``TFDataset.from_rdd``.
    :param sequential_order: forwarded unchanged to ``TFDataset.from_rdd``.
    :param shuffle: forwarded unchanged to ``TFDataset.from_rdd``.
    :return: the object returned by ``TFDataset.from_rdd``.
    :raises AssertionError: if both batch_size and batch_per_thread are -1.
    """
    def _as_tf_specs(raw_specs):
        # Only the first entry of each spec goes through tf.dtypes.as_dtype;
        # the second entry (presumably the shape) is passed through untouched.
        return [(tf.dtypes.as_dtype(entry[0]), entry[1]) for entry in raw_specs]

    # todo data_shard.head ?
    per_shard_specs = data_shard._for_each(get_spec(allow_tuple=True, allow_list=False))
    feature_spec, label_spec = per_shard_specs.first()

    feature_spec = _as_tf_specs(feature_spec)
    if label_spec is not None:
        label_spec = _as_tf_specs(label_spec)

    assert batch_size != -1 or batch_per_thread != -1, \
        "one of batch_size and batch_per_thread should be specified"

    if validation_data_shard is None:
        val_rdd = None
    else:
        val_rdd = validation_data_shard.rdd.flatMap(
            flatten_xy(allow_tuple=True, allow_list=False))

    train_rdd = data_shard.rdd.flatMap(
        flatten_xy(allow_tuple=True, allow_list=False))

    return TFDataset.from_rdd(train_rdd,
                              features=feature_spec,
                              labels=label_spec,
                              batch_size=batch_size,
                              batch_per_thread=batch_per_thread,
                              val_rdd=val_rdd,
                              hard_code_batch_size=hard_code_batch_size,
                              sequential_order=sequential_order,
                              shuffle=shuffle)
Beispiel #2
0
    def predict(self,
                x,
                batch_per_thread=None,
                distributed=False):
        """
        Use the wrapped model to do prediction.

        :param x: Input data. It could be:
            - a TFDataset object (it must already carry batch information;
              ``batch_per_thread`` and ``distributed`` are ignored in this case)
            - a Numpy array (or array-like), or a list of arrays
              (in case the model has multiple inputs)
            - a dict mapping input names to the corresponding array/tensors,
              if the model has named inputs
        :param batch_per_thread:
          Defaults to None. In distributed mode None is translated to -1 before
          being passed to ``TFDataset.from_rdd``; in local mode the value is
          passed as ``batch_size`` to the underlying model's ``predict``.
          NOTE(review): earlier documentation stated an effective default of 1
          and a total batch size of batch_per_thread * rdd.getNumPartitions
          (distributed) or batch_per_thread * numOfCores (local) -- confirm
          against TFDataset / the underlying model.
        :param distributed: Boolean. Whether to do prediction in distributed mode
          or local mode. Default is False.
        :return:
            - for a TFDataset input: whatever ``self._predict_distributed``
              returns (the result is not collected here)
            - for distributed prediction on raw data: the collected results
              stacked with ``np.stack``; a single array when the model has one
              output, otherwise a list with one stacked array per output
            - for local prediction: the result of ``self.model.predict``
        :raises ValueError: if ``x`` is a TFDataset without batch information.
        """
        if isinstance(x, TFDataset):
            # todo check arguments
            if not x.has_batch:
                raise ValueError("The batch_per_thread of TFDataset"
                                 " must be specified when used in KerasModel predict.")
            if isinstance(x, TFNdarrayDataset):
                x = _standarize_feature_dataset(x, self.model)
            return self._predict_distributed(x)

        if not distributed:
            # Local mode: delegate straight to the underlying model.
            return self.model.predict(x=x, batch_size=batch_per_thread)

        spark_context = getOrCreateSparkContext()
        rdd, types, shapes = _create_rdd_x(x, self.model._feed_input_names, spark_context)

        if batch_per_thread is None:
            per_thread = -1
        else:
            per_thread = batch_per_thread

        dataset = TFDataset.from_rdd(rdd,
                                     names=self.model._feed_input_names,
                                     types=types,
                                     shapes=shapes,
                                     batch_per_thread=per_thread)
        collected = self._predict_distributed(dataset).collect()

        output_count = len(self.model.outputs)
        if output_count == 1:
            return np.stack(collected)

        # Multi-output model: regroup the per-record results by output index.
        return [np.stack([record[idx] for record in collected])
                for idx in range(output_count)]
Beispiel #3
0
def _xshards_to_tf_dataset(data_shard,
                           batch_size=-1,
                           batch_per_thread=-1,
                           validation_data_shard=None):
    """
    Build a TFDataset from shards that are dicts of numpy arrays.

    Every shard must be a dict with an "x" entry and, optionally, a "y" entry.
    Each entry is either a numpy ndarray or a tuple of ndarrays. The first axis
    of every array is indexed per sample: the specs drop it (``shape[1:]``) and
    the shards are flattened into one record per index along that axis.

    :param data_shard: training shards; its ``rdd`` attribute must support
        ``map(...).first()`` and ``flatMap``. The feature/label specs are taken
        from the first shard only.
    :param batch_size: forwarded to ``TFDataset.from_rdd``; -1 means "not set".
    :param batch_per_thread: forwarded to ``TFDataset.from_rdd``; -1 means
        "not set". At least one of batch_size / batch_per_thread must differ
        from -1.
    :param validation_data_shard: optional shards flattened the same way and
        passed as ``val_rdd``; None passes ``val_rdd=None``.
    :return: the object returned by ``TFDataset.from_rdd``.
    :raises AssertionError: if both batch_size and batch_per_thread are -1, or
        if a shard is not a dict or has no "x" key.
    :raises ValueError: if the "x" or "y" entry of a shard is neither an
        ndarray nor a tuple of ndarrays.

    NOTE(review): shard validation runs inside the functions handed to the rdd,
    so apart from the first shard (checked eagerly via ``first()``) errors
    presumably surface only when the rdd is evaluated, possibly wrapped by the
    execution engine -- confirm.
    """
    # todo data_shard.head ?
    import numpy as np

    def as_ndarray_group(value, key):
        # Normalise one entry to a sequence of ndarrays; `key` is only used to
        # name the offending entry in the error message.
        if isinstance(value, np.ndarray):
            return [value]
        if isinstance(value, tuple) and all(
                isinstance(item, np.ndarray) for item in value):
            return value
        raise ValueError(
            "value of %s should be a ndarray or a tuple of ndarrays" % key)

    def check_data_type_and_to_list(data):
        assert isinstance(data, dict), "each shard should be an dict"
        assert "x" in data, "key x should in each shard"
        result = {"x": as_ndarray_group(data["x"], "x")}
        if "y" in data:
            # Previously a bad "y" was reported as a problem with "x".
            result["y"] = as_ndarray_group(data["y"], "y")
        return result

    def get_spec(data):
        data = check_data_type_and_to_list(data)
        # Drop the leading (per-sample) axis from every shape.
        feature_spec = [(feat.dtype, feat.shape[1:]) for feat in data["x"]]
        if "y" in data:
            label_spec = [(label.dtype, label.shape[1:])
                          for label in data["y"]]
        else:
            label_spec = None
        return (feature_spec, label_spec)

    (feature_spec, label_spec) = data_shard.rdd.map(get_spec).first()

    feature_spec = [(tf.dtypes.as_dtype(spec[0]), spec[1])
                    for spec in feature_spec]
    label_spec = [(tf.dtypes.as_dtype(spec[0]), spec[1]) for spec in label_spec] \
        if label_spec is not None else None

    assert batch_size != -1 or batch_per_thread != -1, \
        "one of batch_size and batch_per_thread should be specified"

    # todo this might be very slow
    def flatten(data):
        data = check_data_type_and_to_list(data)
        features = data["x"]

        has_label = "y" in data
        labels = data["y"] if has_label else None
        # The sample count is taken from the first feature array only.
        length = features[0].shape[0]

        for i in range(length):
            fs = [feat[i] for feat in features]
            if has_label:
                ls = [label[i] for label in labels]
                yield (fs, ls)
            else:
                yield (fs, )

    val_rdd = None if validation_data_shard is None \
        else validation_data_shard.rdd.flatMap(flatten)

    dataset = TFDataset.from_rdd(data_shard.rdd.flatMap(flatten),
                                 features=feature_spec,
                                 labels=label_spec,
                                 batch_size=batch_size,
                                 batch_per_thread=batch_per_thread,
                                 val_rdd=val_rdd)

    return dataset