Example No. 1
    def _evaluate(self, rdd: RDD, **kwargs):
        yaml_model = self.master_network.to_yaml()
        optimizer = deserialize_optimizer(self.master_optimizer)
        loss = self.master_loss
        weights = self.master_network.get_weights()
        weights = rdd.context.broadcast(weights)
        custom_objects = self.custom_objects
        metrics = self.master_metrics

        def _evaluate(model, optimizer, loss, custom_objects, metrics, kwargs,
                      data_iterator):
            model = model_from_yaml(model, custom_objects)
            model.compile(optimizer, loss, metrics)
            model.set_weights(weights.value)
            # duplicate the partition iterator so features and labels can be read separately
            feature_iterator, label_iterator = tee(data_iterator, 2)
            x_test = np.asarray([x for x, y in feature_iterator])
            y_test = np.asarray([y for x, y in label_iterator])
            return [model.evaluate(x_test, y_test, **kwargs)]

        if self.num_workers:
            rdd = rdd.repartition(self.num_workers)
        results = rdd.mapPartitions(
            partial(_evaluate, yaml_model, optimizer, loss, custom_objects,
                    metrics, kwargs))
        if not metrics:
            # if no metrics, we can just return the scalar corresponding to the loss value
            return results.mean()
        else:
            # if we do have metrics, we want to return a list of [loss value, metric value] - to match the keras API
            loss_value = results.map(lambda x: x[0]).mean()
            metric_value = results.map(lambda x: x[1]).mean()
            return [loss_value, metric_value]
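The method above belongs to a larger distributed-Keras wrapper class, so it is not runnable on its own. The underlying pattern, though, is plain PySpark: broadcast read-only state, compute a partial result per partition with `mapPartitions`, and average on the driver. A minimal, self-contained sketch of that pattern (all names here are illustrative, not part of the original class):

import numpy as np
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
sc = spark.sparkContext

weights = sc.broadcast(np.array([0.5, -1.0]))   # read-only state shared with executors

def score_partition(rows):
    # one partial score per partition; the driver averages them afterwards
    x = np.asarray(list(rows))
    yield float((x @ weights.value).mean())

rdd = sc.parallelize([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]], 2)
print(rdd.mapPartitions(score_partition).mean())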
Example No. 2
    def _fit(self, rdd: RDD, **kwargs):
        """Protected train method to make wrapping of modes easier
        """
        self._master_network.compile(optimizer=get_optimizer(
            self.master_optimizer),
                                     loss=self.master_loss,
                                     metrics=self.master_metrics)
        if self.mode in ['asynchronous', 'hogwild']:
            self.start_server()
        train_config = kwargs
        freq = self.frequency
        optimizer = deserialize_optimizer(self.master_optimizer)
        loss = self.master_loss
        metrics = self.master_metrics
        custom = self.custom_objects

        yaml = self._master_network.to_yaml()
        init = self._master_network.get_weights()
        parameters = rdd.context.broadcast(init)

        if self.mode in ['asynchronous', 'hogwild']:
            print('>>> Initialize workers')
            worker = AsynchronousSparkWorker(yaml, parameters, self.client,
                                             train_config, freq, optimizer,
                                             loss, metrics, custom)
            print('>>> Distribute load')
            rdd.mapPartitions(worker.train).collect()
            print('>>> Async training complete.')
            new_parameters = self.client.get_parameters()
        elif self.mode == 'synchronous':
            worker = SparkWorker(yaml, parameters, train_config, optimizer,
                                 loss, metrics, custom)
            training_outcomes = rdd.mapPartitions(worker.train).collect()
            new_parameters = self._master_network.get_weights()
            number_of_sub_models = len(training_outcomes)
            for training_outcome in training_outcomes:
                grad, history = training_outcome
                self.training_histories.append(history)
                weighted_grad = divide_by(grad, number_of_sub_models)
                new_parameters = subtract_params(new_parameters, weighted_grad)
            print('>>> Synchronous training complete.')
        else:
            raise ValueError("Unsupported mode {}".format(self.mode))
        self._master_network.set_weights(new_parameters)
        if self.mode in ['asynchronous', 'hogwild']:
            self.stop_server()
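In the synchronous branch above, every partition returns a gradient (a list of weight arrays) and the driver averages them before applying the update. `divide_by` and `subtract_params` are helpers from the surrounding code base that are not shown; the numpy sketch below assumes the obvious element-wise implementations, purely to illustrate the aggregation step:

import numpy as np

def divide_by(params, n):
    # assumed behaviour of the helper used above: scale every weight array by 1/n
    return [p / n for p in params]

def subtract_params(params, delta):
    # assumed behaviour of the helper used above: element-wise subtraction per array
    return [p - d for p, d in zip(params, delta)]

master = [np.ones((2, 2)), np.zeros(2)]                   # current master weights
worker_grads = [[np.full((2, 2), 0.2), np.full(2, 0.1)],  # one gradient list per partition
                [np.full((2, 2), 0.4), np.full(2, 0.3)]]

new_parameters = master
for grad in worker_grads:
    new_parameters = subtract_params(new_parameters,
                                     divide_by(grad, len(worker_grads)))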
Example No. 3
    def _predict(self, rdd: RDD):
        if self.num_workers:
            rdd = rdd.repartition(self.num_workers)
        yaml_model = self.master_network.to_yaml()
        weights = self.master_network.get_weights()
        weights = rdd.context.broadcast(weights)
        custom_objects = self.custom_objects

        def _predict(model, custom_objects, data):
            model = model_from_yaml(model, custom_objects)
            model.set_weights(weights.value)
            data = np.array([x for x in data])
            return model.predict(data)

        predictions = rdd.mapPartitions(partial(_predict, yaml_model, custom_objects)).collect()
        return predictions
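Note that the inner function returns the 2-D array from `model.predict` as-is; `mapPartitions` iterates it, so `collect()` yields one prediction row per input sample. A toy, Keras-free sketch of that behaviour (`fake_predict` is purely illustrative):

import numpy as np
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
sc = spark.sparkContext

def fake_predict(rows):
    # stand-in for model.predict: return a 2-D array with one row per input
    x = np.array(list(rows))
    return x * 10.0

rdd = sc.parallelize([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], 2)
print(rdd.mapPartitions(fake_predict).collect())
# [array([10., 20.]), array([30., 40.]), array([50., 60.])]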
Example No. 4
    def join(rdd: RDD, hash_value: dict) -> RDD:
        def map_partitions(rows, local_hash_value=None):
            rows = list(rows)
            result = []

            if local_hash_value is None:
                local_hash_value = {}

            # emulate a left outer join: keys missing from the dict map to None
            for row in rows:
                key = row[0]
                value = row[1]
                result.append((key, (value, local_hash_value.get(key))))

            return result

        return rdd.mapPartitions(lambda rows: map_partitions(
            rows=rows, local_hash_value=hash_value))
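A usage sketch for the helper above. In the source it sits inside a class (so it would normally be reached as a static method); here it is assumed to be in scope as a plain function. The dict acts as a map-side lookup table, producing a left-outer-join result:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
sc = spark.sparkContext

left = sc.parallelize([("a", 1), ("b", 2), ("c", 3)], 2)
lookup = {"a": 10, "c": 30}   # small lookup table, shipped with the task closure

print(join(left, lookup).collect())
# [('a', (1, 10)), ('b', (2, None)), ('c', (3, 30))]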
Example No. 5
def collapse_rdd_data(rdd: RDD, collapse_function: Callable[[Iterator, Any],
                                                            Generator],
                      *args: Any) -> RDD:
    """Apply a collapse function to reduce the size of the data
    set. The function is applied for each partition (mapPartitions).

    Parameters
    ----------
    rdd : RDD[list[float]]
        RDD of lists of floats. Each list holds the coordinates of a point (e.g. x, y, z).
    collapse_function : function
        collapse function to reduce the size of the data set. See
        `CollapseFunctions` for more information.
    *args : Any
        Any additional positional arguments to pass to `collapse_function`.

    Returns
    -------
    RDD
        RDD whose elements are the result of the collapse function for
        each partition.

    Examples
    --------
    List of coordinates (can be 2D, 3D, ..., nD)
    >>> mylist = [
    ...     [1., 2., 4.], [3., 4., 1.], [5., 6., 5.],
    ...     [9., 10., 7.], [1., 2., 7.], [3., 4., 6.],
    ...     [5., 6., 9.], [7., 8., 6.], [9., 10., 10.]]

    Distribute over 2 partitions
    >>> rdd = sc.parallelize(mylist, 2)

    Compute the centroid for each partition
    >>> cf = CollapseFunctions()
    >>> data = collapse_rdd_data(rdd, cf.kmeans, 1).collect()
    >>> print(data) # doctest: +NORMALIZE_WHITESPACE
    [(array([[ 4.5 ,  5.5 ,  4.25]]), 4), (array([[ 5. ,  6. ,  7.6]]), 5)]

    """
    return rdd.mapPartitions(
        lambda partition: collapse_function(partition, *args))
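`CollapseFunctions` is not shown here, but any generator with a `(partition, *args)` signature works as a collapse function. The sketch below uses a hypothetical centroid-based collapse and the same data and `sc` as the docstring example:

import numpy as np

def centroid_collapse(partition, *args):
    # hypothetical collapse function: reduce one partition of points to
    # (centroid, number of points), much like the kmeans(1) case in the docstring
    points = np.asarray(list(partition))
    if points.size:
        yield points.mean(axis=0), len(points)

mylist = [
    [1., 2., 4.], [3., 4., 1.], [5., 6., 5.],
    [9., 10., 7.], [1., 2., 7.], [3., 4., 6.],
    [5., 6., 9.], [7., 8., 6.], [9., 10., 10.]]
rdd = sc.parallelize(mylist, 2)
print(collapse_rdd_data(rdd, centroid_collapse).collect())
# centroids match the docstring example: (4.5, 5.5, 4.25) over 4 points, (5.0, 6.0, 7.6) over 5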
Example No. 6
def _zipWithIndex(rdd: RDD, to_rows: bool = False) -> RDD:
    """Zip every element of ``rdd`` with its global index, computed from
    per-partition element counts. If ``to_rows`` is True, the index is
    appended to each row (``list(v) + [i]``) instead of being returned as
    an ``(index, value)`` pair.

    Modified from
    https://github.com/davies/spark/blob/cebe5bfe263baf3349353f1473f097396821514a/python/pyspark/rdd.py
    """
    starts = [0]
    if rdd.getNumPartitions() > 1:
        nums = rdd.mapPartitions(lambda it: [sum(1 for i in it)]).collect()
        for i in range(len(nums) - 1):
            starts.append(starts[-1] + nums[i])

    def func1(k, it):  # pragma: no cover
        for i, v in enumerate(it, starts[k]):
            yield i, v

    def func2(k, it):  # pragma: no cover
        for i, v in enumerate(it, starts[k]):
            yield list(v) + [i]

    if not to_rows:
        return rdd.mapPartitionsWithIndex(func1)
    else:
        return rdd.mapPartitionsWithIndex(func2)
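A usage sketch for both modes of the helper above, assuming a live SparkContext:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
sc = spark.sparkContext

rows = sc.parallelize([["a"], ["b"], ["c"], ["d"]], 2)
print(_zipWithIndex(rows).collect())
# [(0, ['a']), (1, ['b']), (2, ['c']), (3, ['d'])]
print(_zipWithIndex(rows, to_rows=True).collect())
# [['a', 0], ['b', 1], ['c', 2], ['d', 3]]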