def _evaluate(self, rdd: RDD, **kwargs):
    yaml_model = self.master_network.to_yaml()
    optimizer = deserialize_optimizer(self.master_optimizer)
    loss = self.master_loss
    weights = self.master_network.get_weights()
    weights = rdd.context.broadcast(weights)
    custom_objects = self.custom_objects
    metrics = self.master_metrics

    def _evaluate(model, optimizer, loss, custom_objects, metrics, kwargs, data_iterator):
        model = model_from_yaml(model, custom_objects)
        model.compile(optimizer, loss, metrics)
        model.set_weights(weights.value)
        feature_iterator, label_iterator = tee(data_iterator, 2)
        x_test = np.asarray([x for x, y in feature_iterator])
        y_test = np.asarray([y for x, y in label_iterator])
        return [model.evaluate(x_test, y_test, **kwargs)]

    if self.num_workers:
        rdd = rdd.repartition(self.num_workers)
    results = rdd.mapPartitions(
        partial(_evaluate, yaml_model, optimizer, loss, custom_objects, metrics, kwargs))
    if not metrics:
        # if no metrics, we can just return the scalar corresponding to the loss value
        return results.mean()
    else:
        # if we do have metrics, we want to return a list of [loss value, metric value]
        # to match the Keras API
        loss_value = results.map(lambda x: x[0]).mean()
        metric_value = results.map(lambda x: x[1]).mean()
        return [loss_value, metric_value]
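# Illustrative sketch (not part of the method above): the tee-based split used in
# `_evaluate` duplicates a single-pass iterator of (features, label) pairs so that
# features and labels can be collected into separate numpy arrays.
from itertools import tee

import numpy as np

pairs = iter([(np.array([1.0, 2.0]), 0), (np.array([3.0, 4.0]), 1)])
feature_iterator, label_iterator = tee(pairs, 2)
x_test = np.asarray([x for x, y in feature_iterator])  # shape (2, 2)
y_test = np.asarray([y for x, y in label_iterator])    # shape (2,)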
def _fit(self, rdd: RDD, **kwargs):
    """Protected train method to make wrapping of modes easier
    """
    self._master_network.compile(optimizer=get_optimizer(self.master_optimizer),
                                 loss=self.master_loss,
                                 metrics=self.master_metrics)
    if self.mode in ['asynchronous', 'hogwild']:
        self.start_server()
    train_config = kwargs
    freq = self.frequency
    optimizer = deserialize_optimizer(self.master_optimizer)
    loss = self.master_loss
    metrics = self.master_metrics
    custom = self.custom_objects

    yaml = self._master_network.to_yaml()
    init = self._master_network.get_weights()
    parameters = rdd.context.broadcast(init)

    if self.mode in ['asynchronous', 'hogwild']:
        print('>>> Initialize workers')
        worker = AsynchronousSparkWorker(
            yaml, parameters, self.client, train_config, freq, optimizer, loss, metrics, custom)
        print('>>> Distribute load')
        rdd.mapPartitions(worker.train).collect()
        print('>>> Async training complete.')
        new_parameters = self.client.get_parameters()
    elif self.mode == 'synchronous':
        worker = SparkWorker(yaml, parameters, train_config, optimizer, loss, metrics, custom)
        training_outcomes = rdd.mapPartitions(worker.train).collect()
        new_parameters = self._master_network.get_weights()
        number_of_sub_models = len(training_outcomes)
        for training_outcome in training_outcomes:
            grad, history = training_outcome
            self.training_histories.append(history)
            weighted_grad = divide_by(grad, number_of_sub_models)
            new_parameters = subtract_params(new_parameters, weighted_grad)
        print('>>> Synchronous training complete.')
    else:
        raise ValueError("Unsupported mode {}".format(self.mode))
    self._master_network.set_weights(new_parameters)
    if self.mode in ['asynchronous', 'hogwild']:
        self.stop_server()
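# Minimal sketch of the synchronous update performed in `_fit` above: each worker
# returns a gradient (a list of per-layer weight deltas), every gradient is scaled
# by 1 / number_of_sub_models, and the scaled gradients are subtracted from the
# master weights. The element-wise behaviour of `divide_by` and `subtract_params`
# assumed here (shown as local `_divide_by` / `_subtract_params` stand-ins) is an
# illustration, not the library's implementation.
import numpy as np

def _divide_by(params, factor):
    return [p / factor for p in params]

def _subtract_params(a, b):
    return [x - y for x, y in zip(a, b)]

master_weights = [np.ones((2, 2))]
gradients = [[np.full((2, 2), 0.5)], [np.full((2, 2), 1.5)]]  # two workers
new_weights = master_weights
for grad in gradients:
    new_weights = _subtract_params(new_weights, _divide_by(grad, len(gradients)))
# new_weights[0] is master_weights[0] minus the mean gradient (all zeros here)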
def _predict(self, rdd: RDD):
    if self.num_workers:
        rdd = rdd.repartition(self.num_workers)
    yaml_model = self.master_network.to_yaml()
    weights = self.master_network.get_weights()
    weights = rdd.context.broadcast(weights)
    custom_objects = self.custom_objects

    def _predict(model, custom_objects, data):
        model = model_from_yaml(model, custom_objects)
        model.set_weights(weights.value)
        data = np.array([x for x in data])
        return model.predict(data)

    predictions = rdd.mapPartitions(partial(_predict, yaml_model, custom_objects)).collect()
    return predictions
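# Minimal sketch of the broadcast-and-mapPartitions pattern used by `_predict`
# (assumes a live SparkContext `sc`; a simple linear function stands in for the
# Keras model so the example stays self-contained).
import numpy as np

weights = np.array([2.0, 1.0])             # stand-in for model parameters
broadcast_weights = sc.broadcast(weights)  # shipped once to every executor

def predict_partition(rows):
    w = broadcast_weights.value
    data = np.array(list(rows))
    return (data @ w).tolist()             # stand-in for model.predict(data)

rdd = sc.parallelize([[1.0, 1.0], [2.0, 2.0]], 2)
rdd.mapPartitions(predict_partition).collect()  # [3.0, 6.0]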
def join(rdd: RDD, hash_value: dict) -> RDD:
    def map_partitions(rows, local_hash_value=None):
        rows = list(rows)
        result = []
        if local_hash_value is None:
            local_hash_value = {}
        for row in rows:
            key = row[0]
            value = row[1]
            result.append((key, (value, local_hash_value.get(key))))
        return result

    return rdd.mapPartitions(lambda rows: map_partitions(
        rows=rows, local_hash_value=hash_value))
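# Illustrative usage of `join` (assumes a live SparkContext `sc`; the sample data
# is hypothetical). Every (key, value) pair is looked up in the driver-side dict,
# and keys missing from it are paired with None.
rdd = sc.parallelize([("a", 1), ("b", 2), ("c", 3)], 2)
lookup = {"a": 10.0, "b": 20.0}
join(rdd, lookup).collect()
# [('a', (1, 10.0)), ('b', (2, 20.0)), ('c', (3, None))]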
def collapse_rdd_data(rdd: RDD, collapse_function: Callable[[Iterator, Any], Generator],
                      *args: Any) -> RDD:
    """Apply a collapse function to reduce the size of the data set.

    The function is applied for each partition (mapPartitions).

    Parameters
    ----------
    rdd : RDD[list[float]]
        RDD of list of float. Each list is the coordinate of a point (x, y, z).
    collapse_function : function
        Collapse function to reduce the size of the data set.
        See `CollapseFunctions` for more information.
    args : Any
        Any arguments that have to be passed to `collapse_function`.
        Must be comma-separated.

    Returns
    -------
    RDD
        RDD whose elements are the result of the collapse function
        for each partition.

    Examples
    --------
    List of coordinates (can be 2D, 3D, ..., nD)

    >>> mylist = [
    ...     [1., 2., 4.], [3., 4., 1.], [5., 6., 5.],
    ...     [9., 10., 7.], [1., 2., 7.], [3., 4., 6.],
    ...     [5., 6., 9.], [7., 8., 6.], [9., 10., 10.]]

    Distribute over 2 partitions

    >>> rdd = sc.parallelize(mylist, 2)

    Compute the centroid for each partition

    >>> cf = CollapseFunctions()
    >>> data = collapse_rdd_data(rdd, cf.kmeans, 1).collect()
    >>> print(data)  # doctest: +NORMALIZE_WHITESPACE
    [(array([[ 4.5 , 5.5 , 4.25]]), 4), (array([[ 5. , 6. , 7.6]]), 5)]
    """
    return rdd.mapPartitions(
        lambda partition: collapse_function(partition, *args))
def _zipWithIndex(rdd: RDD, to_rows: bool = False) -> RDD:
    """
    Modified from
    https://github.com/davies/spark/blob/cebe5bfe263baf3349353f1473f097396821514a/python/pyspark/rdd.py
    """
    starts = [0]
    if rdd.getNumPartitions() > 1:
        nums = rdd.mapPartitions(lambda it: [sum(1 for i in it)]).collect()
        for i in range(len(nums) - 1):
            starts.append(starts[-1] + nums[i])

    def func1(k, it):  # pragma: no cover
        for i, v in enumerate(it, starts[k]):
            yield i, v

    def func2(k, it):  # pragma: no cover
        for i, v in enumerate(it, starts[k]):
            yield list(v) + [i]

    if not to_rows:
        return rdd.mapPartitionsWithIndex(func1)
    else:
        return rdd.mapPartitionsWithIndex(func2)
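# Illustrative usage of `_zipWithIndex` (assumes a live SparkContext `sc`).
# Indices are globally consecutive across partitions; with to_rows=True the index
# is appended to each row instead of being returned as an (index, value) pair.
rdd = sc.parallelize(["a", "b", "c", "d"], 2)
_zipWithIndex(rdd).collect()
# [(0, 'a'), (1, 'b'), (2, 'c'), (3, 'd')]
_zipWithIndex(sc.parallelize([[1.0], [2.0]], 1), to_rows=True).collect()
# [[1.0, 0], [2.0, 1]]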