def _evaluate(self, rdd: RDD, **kwargs):
    yaml_model = self.master_network.to_yaml()
    optimizer = deserialize_optimizer(self.master_optimizer)
    loss = self.master_loss
    weights = self.master_network.get_weights()
    weights = rdd.context.broadcast(weights)
    custom_objects = self.custom_objects
    metrics = self.master_metrics

    def _evaluate(model, optimizer, loss, custom_objects, metrics, kwargs, data_iterator):
        model = model_from_yaml(model, custom_objects)
        model.compile(optimizer, loss, metrics)
        model.set_weights(weights.value)
        feature_iterator, label_iterator = tee(data_iterator, 2)
        x_test = np.asarray([x for x, y in feature_iterator])
        y_test = np.asarray([y for x, y in label_iterator])
        return [model.evaluate(x_test, y_test, **kwargs)]

    if self.num_workers:
        rdd = rdd.repartition(self.num_workers)
    results = rdd.mapPartitions(
        partial(_evaluate, yaml_model, optimizer, loss, custom_objects, metrics, kwargs))
    if not metrics:
        # if no metrics, we can just return the scalar corresponding to the loss value
        return results.mean()
    else:
        # if we do have metrics, we want to return a list of [loss value, metric value]
        # to match the Keras API
        loss_value = results.map(lambda x: x[0]).mean()
        metric_value = results.map(lambda x: x[1]).mean()
        return [loss_value, metric_value]
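# Illustrative sketch (not part of the method above): the tee-based split used in
# `_evaluate` duplicates a single-pass iterator of (features, label) pairs so that
# features and labels can be collected into separate numpy arrays.
from itertools import tee

import numpy as np

pairs = iter([(np.array([1.0, 2.0]), 0), (np.array([3.0, 4.0]), 1)])
feature_iterator, label_iterator = tee(pairs, 2)
x_test = np.asarray([x for x, y in feature_iterator])  # shape (2, 2)
y_test = np.asarray([y for x, y in label_iterator])    # shape (2,)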
def _fit(self, rdd: RDD, **kwargs):
    """Protected train method to make wrapping of modes easier
    """
    self._master_network.compile(optimizer=get_optimizer(self.master_optimizer),
                                 loss=self.master_loss,
                                 metrics=self.master_metrics)
    if self.mode in ['asynchronous', 'hogwild']:
        self.start_server()
    train_config = kwargs
    freq = self.frequency
    optimizer = deserialize_optimizer(self.master_optimizer)
    loss = self.master_loss
    metrics = self.master_metrics
    custom = self.custom_objects

    yaml = self._master_network.to_yaml()
    init = self._master_network.get_weights()
    parameters = rdd.context.broadcast(init)

    if self.mode in ['asynchronous', 'hogwild']:
        print('>>> Initialize workers')
        worker = AsynchronousSparkWorker(
            yaml, parameters, self.client, train_config, freq, optimizer, loss, metrics, custom)
        print('>>> Distribute load')
        rdd.mapPartitions(worker.train).collect()
        print('>>> Async training complete.')
        new_parameters = self.client.get_parameters()
    elif self.mode == 'synchronous':
        worker = SparkWorker(yaml, parameters, train_config, optimizer, loss, metrics, custom)
        training_outcomes = rdd.mapPartitions(worker.train).collect()
        new_parameters = self._master_network.get_weights()
        number_of_sub_models = len(training_outcomes)
        for training_outcome in training_outcomes:
            grad, history = training_outcome
            self.training_histories.append(history)
            weighted_grad = divide_by(grad, number_of_sub_models)
            new_parameters = subtract_params(new_parameters, weighted_grad)
        print('>>> Synchronous training complete.')
    else:
        raise ValueError("Unsupported mode {}".format(self.mode))
    self._master_network.set_weights(new_parameters)
    if self.mode in ['asynchronous', 'hogwild']:
        self.stop_server()
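# Minimal sketch of the synchronous update performed in `_fit` above: each worker
# returns a gradient (a list of per-layer weight deltas), every gradient is scaled
# by 1 / number_of_sub_models, and the scaled gradients are subtracted from the
# master weights. The element-wise behaviour of `divide_by` and `subtract_params`
# assumed here (shown as local `_divide_by` / `_subtract_params` stand-ins) is an
# illustration, not the library's implementation.
import numpy as np

def _divide_by(params, factor):
    return [p / factor for p in params]

def _subtract_params(a, b):
    return [x - y for x, y in zip(a, b)]

master_weights = [np.ones((2, 2))]
gradients = [[np.full((2, 2), 0.5)], [np.full((2, 2), 1.5)]]  # two workers
new_weights = master_weights
for grad in gradients:
    new_weights = _subtract_params(new_weights, _divide_by(grad, len(gradients)))
# new_weights[0] is master_weights[0] minus the mean gradient (all zeros here)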
def _predict(self, rdd: RDD):
    if self.num_workers:
        rdd = rdd.repartition(self.num_workers)
    yaml_model = self.master_network.to_yaml()
    weights = self.master_network.get_weights()
    weights = rdd.context.broadcast(weights)
    custom_objects = self.custom_objects

    def _predict(model, custom_objects, data):
        model = model_from_yaml(model, custom_objects)
        model.set_weights(weights.value)
        data = np.array([x for x in data])
        return model.predict(data)

    predictions = rdd.mapPartitions(partial(_predict, yaml_model, custom_objects)).collect()
    return predictions
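# Minimal sketch of the broadcast-and-mapPartitions pattern used by `_predict`
# (assumes a live SparkContext `sc`; a simple linear function stands in for the
# Keras model so the example stays self-contained).
import numpy as np

weights = np.array([2.0, 1.0])             # stand-in for model parameters
broadcast_weights = sc.broadcast(weights)  # shipped once to every executor

def predict_partition(rows):
    w = broadcast_weights.value
    data = np.array(list(rows))
    return (data @ w).tolist()             # stand-in for model.predict(data)

rdd = sc.parallelize([[1.0, 1.0], [2.0, 2.0]], 2)
rdd.mapPartitions(predict_partition).collect()  # [3.0, 6.0]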
def join(rdd: RDD, hash_value: dict) -> RDD:
    def map_partitions(rows, local_hash_value=None):
        rows = list(rows)
        result = []
        if local_hash_value is None:
            local_hash_value = {}
        for row in rows:
            key = row[0]
            value = row[1]
            result.append((key, (value, local_hash_value.get(key))))
        return result

    return rdd.mapPartitions(lambda rows: map_partitions(
        rows=rows, local_hash_value=hash_value))
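# Illustrative usage of `join` (assumes a live SparkContext `sc`; the sample data
# is hypothetical). Every (key, value) pair is looked up in the driver-side dict,
# and keys missing from it are paired with None.
rdd = sc.parallelize([("a", 1), ("b", 2), ("c", 3)], 2)
lookup = {"a": 10.0, "b": 20.0}
join(rdd, lookup).collect()
# [('a', (1, 10.0)), ('b', (2, 20.0)), ('c', (3, None))]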
def collapse_rdd_data(rdd: RDD, collapse_function: Callable[[Iterator, Any], Generator],
                      *args: Any) -> RDD:
    """Apply a collapse function to reduce the size of the data set.

    The function is applied for each partition (mapPartitions).

    Parameters
    ----------
    rdd : RDD[list[float]]
        RDD of list of float. Each list is the coordinate of a point (x, y, z).
    collapse_function : function
        Collapse function to reduce the size of the data set.
        See `CollapseFunctions` for more information.
    args : Any
        Any arguments that have to be passed to `collapse_function`.
        Must be comma-separated.

    Returns
    -------
    RDD
        RDD whose elements are the result of the collapse function
        for each partition.

    Examples
    --------
    List of coordinates (can be 2D, 3D, ..., nD)

    >>> mylist = [
    ...     [1., 2., 4.], [3., 4., 1.], [5., 6., 5.],
    ...     [9., 10., 7.], [1., 2., 7.], [3., 4., 6.],
    ...     [5., 6., 9.], [7., 8., 6.], [9., 10., 10.]]

    Distribute over 2 partitions

    >>> rdd = sc.parallelize(mylist, 2)

    Compute the centroid for each partition

    >>> cf = CollapseFunctions()
    >>> data = collapse_rdd_data(rdd, cf.kmeans, 1).collect()
    >>> print(data)  # doctest: +NORMALIZE_WHITESPACE
    [(array([[ 4.5 , 5.5 , 4.25]]), 4), (array([[ 5. , 6. , 7.6]]), 5)]
    """
    return rdd.mapPartitions(
        lambda partition: collapse_function(partition, *args))
def _zipWithIndex(rdd: RDD, to_rows: bool = False) -> RDD:
    """
    Modified from
    https://github.com/davies/spark/blob/cebe5bfe263baf3349353f1473f097396821514a/python/pyspark/rdd.py
    """
    starts = [0]
    if rdd.getNumPartitions() > 1:
        nums = rdd.mapPartitions(lambda it: [sum(1 for i in it)]).collect()
        for i in range(len(nums) - 1):
            starts.append(starts[-1] + nums[i])

    def func1(k, it):  # pragma: no cover
        for i, v in enumerate(it, starts[k]):
            yield i, v

    def func2(k, it):  # pragma: no cover
        for i, v in enumerate(it, starts[k]):
            yield list(v) + [i]

    if not to_rows:
        return rdd.mapPartitionsWithIndex(func1)
    else:
        return rdd.mapPartitionsWithIndex(func2)
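# Illustrative usage of `_zipWithIndex` (assumes a live SparkContext `sc`).
# Indices are globally consecutive across partitions; with to_rows=True the index
# is appended to each row instead of being returned as an (index, value) pair.
rdd = sc.parallelize(["a", "b", "c", "d"], 2)
_zipWithIndex(rdd).collect()
# [(0, 'a'), (1, 'b'), (2, 'c'), (3, 'd')]
_zipWithIndex(sc.parallelize([[1.0], [2.0]], 1), to_rows=True).collect()
# [[1.0, 0], [2.0, 1]]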