import uuid
from typing import Any

from pyspark import RDD
from pyspark.util import fail_on_stopiteration

# STORAGE_LEVEL, eggroll_util and SampleAndAssignBuckets are assumed to be
# provided elsewhere in this module.


def _flat_map(rdd: RDD, func):
    from itertools import chain

    def _fn(x):
        return func(x[0], x[1])

    def _func(_, iterator):
        return chain.from_iterable(map(fail_on_stopiteration(_fn), iterator))

    return rdd.mapPartitionsWithIndex(_func, preservesPartitioning=False)
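
# Usage sketch for _flat_map: expand each (key, value) pair into zero or more
# output pairs. Assumes a live SparkContext `sc`; all names in the demo are
# illustrative only.
def _demo_flat_map(sc):  # pragma: no cover
    pairs = sc.parallelize([("k", [1, 2]), ("q", [3])])
    flat = _flat_map(pairs, lambda k, v: [(k, x) for x in v])
    return flat.collect()  # expected: [("k", 1), ("k", 2), ("q", 3)]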

def _save_as_func(rdd: RDD, name, namespace, partition, persistent):
    from arch.api import session
    dup = session.table(name=name, namespace=namespace, partition=partition, persistent=persistent)

    def _func(_, it):
        eggroll_util.maybe_create_eggroll_client()
        dup.put_all(list(it))
        # dummy value so the collect() below has something cheap to gather
        return 1,

    # run once per partition purely for the side effect of writing into `dup`
    rdd.mapPartitionsWithIndex(_func, preservesPartitioning=False).collect()
    return dup
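
# Usage sketch of the call pattern for _save_as_func: copy an RDD of
# (key, value) pairs into an eggroll table. Assumes a FATE session has been
# initialized elsewhere; the table name and namespace below are illustrative.
def _demo_save_as(rdd):  # pragma: no cover
    return _save_as_func(rdd, name="demo_table", namespace="demo_ns",
                         partition=rdd.getNumPartitions(), persistent=False)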

def _filter(rdd: RDD, func):
    def _fn(x):
        return func(x[0], x[1])

    def _func(_, iterator):
        return filter(fail_on_stopiteration(_fn), iterator)

    return rdd.mapPartitionsWithIndex(_func, preservesPartitioning=True)
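
# Usage sketch for _filter: keep only pairs whose value is even. Keys are
# untouched, so partitioning is preserved. Assumes a live SparkContext `sc`.
def _demo_filter(sc):  # pragma: no cover
    pairs = sc.parallelize([("a", 1), ("b", 2), ("c", 4)])
    return _filter(pairs, lambda k, v: v % 2 == 0).collect()  # [("b", 2), ("c", 4)]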

def __call__(self, head: RDD):
    index = self.partition_index

    def partition_filter(split_index, part):
        if split_index == index:
            for row in part:
                yield row

    return head.mapPartitionsWithIndex(partition_filter, True)
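
# Sketch of what __call__ does, as a hypothetical standalone helper: yield
# only the rows of one chosen partition, leaving all other partitions empty.
def _demo_take_partition(rdd, index):  # pragma: no cover
    return rdd.mapPartitionsWithIndex(
        lambda split_index, part: part if split_index == index else iter([]),
        True,
    )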

def _sample(rdd: RDD, fraction: float, seed: int):
    from pyspark.rddsampler import RDDSampler
    assert fraction >= 0.0, "Negative fraction value: %s" % fraction
    _sample_func = RDDSampler(False, fraction, seed).func

    def _func(split, iterator):
        return _sample_func(split, iterator)

    return rdd.mapPartitionsWithIndex(_func, preservesPartitioning=True)
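
# Usage sketch for _sample: draw a Bernoulli (without-replacement) sample of
# roughly half the elements; the seed fixes the draw. Assumes a live
# SparkContext `sc`.
def _demo_sample(sc):  # pragma: no cover
    rdd = sc.parallelize(range(100))
    return _sample(rdd, fraction=0.5, seed=42).count()  # roughly 50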

def _zipWithIndex(rdd: RDD, to_rows: bool = False) -> RDD:
    """
    Modified from https://github.com/davies/spark/blob/cebe5bfe263baf3349353f1473f097396821514a/python/pyspark/rdd.py
    """
    starts = [0]
    if rdd.getNumPartitions() > 1:
        nums = rdd.mapPartitions(lambda it: [sum(1 for i in it)]).collect()
        for i in range(len(nums) - 1):
            starts.append(starts[-1] + nums[i])

    def func1(k, it):  # pragma: no cover
        for i, v in enumerate(it, starts[k]):
            yield i, v

    def func2(k, it):  # pragma: no cover
        for i, v in enumerate(it, starts[k]):
            yield list(v) + [i]

    if not to_rows:
        return rdd.mapPartitionsWithIndex(func1)
    else:
        return rdd.mapPartitionsWithIndex(func2)
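
# Usage sketch for _zipWithIndex: assign a global 0-based index to every
# element without shuffling, at the cost of one extra job that counts the
# size of each partition. Assumes a live SparkContext `sc`.
def _demo_zip_with_index(sc):  # pragma: no cover
    rdd = sc.parallelize(["a", "b", "c"], 2)
    return _zipWithIndex(rdd).collect()  # [(0, "a"), (1, "b"), (2, "c")]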

def _glom(rdd: RDD):
    def _func(_, iterator):
        yield list(iterator)

    return rdd.mapPartitionsWithIndex(_func)
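
# Usage sketch for _glom: collect each partition into a single list,
# mirroring RDD.glom. Assumes a live SparkContext `sc`.
def _demo_glom(sc):  # pragma: no cover
    rdd = sc.parallelize([1, 2, 3, 4], 2)
    return _glom(rdd).collect()  # [[1, 2], [3, 4]]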

def _map_partitions(rdd: RDD, func):
    def _func(_, iterator):
        # wrap the whole-partition result under a fresh random key; since
        # keys change, the original partitioning cannot be preserved
        return [(str(uuid.uuid1()), func(iterator))]

    return rdd.mapPartitionsWithIndex(_func, preservesPartitioning=False)
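
# Usage sketch for _map_partitions: reduce each partition to a single value,
# returned as one (uuid, partition_result) pair per partition. Assumes a live
# SparkContext `sc`.
def _demo_map_partitions(sc):  # pragma: no cover
    rdd = sc.parallelize([("a", 1), ("b", 2)], 2)
    # expected: two pairs, each (random uuid string, sum of that partition)
    return _map_partitions(rdd, lambda it: sum(v for _, v in it)).collect()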

def materialize(rdd: RDD):
    rdd.persist(STORAGE_LEVEL)
    # cheap action: computing every partition once populates the persisted cache
    rdd.mapPartitionsWithIndex(lambda ind, it: (1,)).collect()
    return rdd
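
# Usage sketch for materialize: force computation and caching up front so
# that later actions reuse the cached partitions. Assumes STORAGE_LEVEL is
# defined at module level and a live SparkContext `sc`.
def _demo_materialize(sc):  # pragma: no cover
    rdd = sc.parallelize(range(10)).map(lambda x: x * x)
    return materialize(rdd)  # rdd is now computed and persisted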

def group(rdd: RDD, **kwargs: Any) -> RDD:
    rdd = rdd.mapPartitionsWithIndex(SampleAndAssignBuckets.extract_idx, preservesPartitioning=True)
    rdd = rdd.groupByKey()
    return rdd
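
# Sketch of the pattern group() relies on: re-key each element inside its
# partition, then gather values per key. `extract_idx` below is a hypothetical
# stand-in for SampleAndAssignBuckets.extract_idx, whose real behavior is
# defined elsewhere. Assumes a live SparkContext `sc`.
def _demo_group(sc):  # pragma: no cover
    def extract_idx(_, it):
        for bucket, value in it:
            yield bucket, value

    rdd = sc.parallelize([(0, "a"), (1, "b"), (0, "c")])
    grouped = rdd.mapPartitionsWithIndex(extract_idx, preservesPartitioning=True).groupByKey()
    return sorted((k, sorted(vs)) for k, vs in grouped.collect())  # [(0, ["a", "c"]), (1, ["b"])]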