import uuid
from typing import Any

from pyspark import RDD
from pyspark.util import fail_on_stopiteration

# STORAGE_LEVEL, eggroll_util and SampleAndAssignBuckets are assumed to be
# provided elsewhere in this module.


def _flat_map(rdd: RDD, func):
    from itertools import chain

    def _fn(x):
        return func(x[0], x[1])

    def _func(_, iterator):
        return chain.from_iterable(map(fail_on_stopiteration(_fn), iterator))

    return rdd.mapPartitionsWithIndex(_func, preservesPartitioning=False)
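
# Usage sketch for _flat_map: expand each (key, value) pair into zero or more
# output pairs. Assumes a live SparkContext `sc`; all names in the demo are
# illustrative only.
def _demo_flat_map(sc):  # pragma: no cover
    pairs = sc.parallelize([("k", [1, 2]), ("q", [3])])
    flat = _flat_map(pairs, lambda k, v: [(k, x) for x in v])
    return flat.collect()  # expected: [("k", 1), ("k", 2), ("q", 3)]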

def _save_as_func(rdd: RDD, name, namespace, partition, persistent):
    from arch.api import session
    dup = session.table(name=name, namespace=namespace, partition=partition, persistent=persistent)

    def _func(_, it):
        eggroll_util.maybe_create_eggroll_client()
        dup.put_all(list(it))
        # dummy value so the collect() below has something cheap to gather
        return 1,

    # run once per partition purely for the side effect of writing into `dup`
    rdd.mapPartitionsWithIndex(_func, preservesPartitioning=False).collect()
    return dup
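
# Usage sketch of the call pattern for _save_as_func: copy an RDD of
# (key, value) pairs into an eggroll table. Assumes a FATE session has been
# initialized elsewhere; the table name and namespace below are illustrative.
def _demo_save_as(rdd):  # pragma: no cover
    return _save_as_func(rdd, name="demo_table", namespace="demo_ns",
                         partition=rdd.getNumPartitions(), persistent=False)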

def _filter(rdd: RDD, func):
    def _fn(x):
        return func(x[0], x[1])

    def _func(_, iterator):
        return filter(fail_on_stopiteration(_fn), iterator)

    return rdd.mapPartitionsWithIndex(_func, preservesPartitioning=True)
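
# Usage sketch for _filter: keep only pairs whose value is even. Keys are
# untouched, so partitioning is preserved. Assumes a live SparkContext `sc`.
def _demo_filter(sc):  # pragma: no cover
    pairs = sc.parallelize([("a", 1), ("b", 2), ("c", 4)])
    return _filter(pairs, lambda k, v: v % 2 == 0).collect()  # [("b", 2), ("c", 4)]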

def __call__(self, head: RDD):
    index = self.partition_index

    def partition_filter(split_index, part):
        if split_index == index:
            for row in part:
                yield row

    return head.mapPartitionsWithIndex(partition_filter, True)
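
# Sketch of what __call__ does, as a hypothetical standalone helper: yield
# only the rows of one chosen partition, leaving all other partitions empty.
def _demo_take_partition(rdd, index):  # pragma: no cover
    return rdd.mapPartitionsWithIndex(
        lambda split_index, part: part if split_index == index else iter([]),
        True,
    )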

def _sample(rdd: RDD, fraction: float, seed: int):
    from pyspark.rddsampler import RDDSampler
    assert fraction >= 0.0, "Negative fraction value: %s" % fraction
    _sample_func = RDDSampler(False, fraction, seed).func

    def _func(split, iterator):
        return _sample_func(split, iterator)

    return rdd.mapPartitionsWithIndex(_func, preservesPartitioning=True)
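
# Usage sketch for _sample: draw a Bernoulli (without-replacement) sample of
# roughly half the elements; the seed fixes the draw. Assumes a live
# SparkContext `sc`.
def _demo_sample(sc):  # pragma: no cover
    rdd = sc.parallelize(range(100))
    return _sample(rdd, fraction=0.5, seed=42).count()  # roughly 50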

def _zipWithIndex(rdd: RDD, to_rows: bool = False) -> RDD:
    """
    Modified from https://github.com/davies/spark/blob/cebe5bfe263baf3349353f1473f097396821514a/python/pyspark/rdd.py
    """
    starts = [0]
    if rdd.getNumPartitions() > 1:
        nums = rdd.mapPartitions(lambda it: [sum(1 for i in it)]).collect()
        for i in range(len(nums) - 1):
            starts.append(starts[-1] + nums[i])

    def func1(k, it):  # pragma: no cover
        for i, v in enumerate(it, starts[k]):
            yield i, v

    def func2(k, it):  # pragma: no cover
        for i, v in enumerate(it, starts[k]):
            yield list(v) + [i]

    if not to_rows:
        return rdd.mapPartitionsWithIndex(func1)
    else:
        return rdd.mapPartitionsWithIndex(func2)
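
# Usage sketch for _zipWithIndex: assign a global 0-based index to every
# element without shuffling, at the cost of one extra job that counts the
# size of each partition. Assumes a live SparkContext `sc`.
def _demo_zip_with_index(sc):  # pragma: no cover
    rdd = sc.parallelize(["a", "b", "c"], 2)
    return _zipWithIndex(rdd).collect()  # [(0, "a"), (1, "b"), (2, "c")]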

def _glom(rdd: RDD):
    def _func(_, iterator):
        yield list(iterator)

    return rdd.mapPartitionsWithIndex(_func)
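
# Usage sketch for _glom: collect each partition into a single list,
# mirroring RDD.glom. Assumes a live SparkContext `sc`.
def _demo_glom(sc):  # pragma: no cover
    rdd = sc.parallelize([1, 2, 3, 4], 2)
    return _glom(rdd).collect()  # [[1, 2], [3, 4]]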

def _map_partitions(rdd: RDD, func):
    def _func(_, iterator):
        # wrap the whole-partition result under a fresh random key; since
        # keys change, the original partitioning cannot be preserved
        return [(str(uuid.uuid1()), func(iterator))]

    return rdd.mapPartitionsWithIndex(_func, preservesPartitioning=False)
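
# Usage sketch for _map_partitions: reduce each partition to a single value,
# returned as one (uuid, partition_result) pair per partition. Assumes a live
# SparkContext `sc`.
def _demo_map_partitions(sc):  # pragma: no cover
    rdd = sc.parallelize([("a", 1), ("b", 2)], 2)
    # expected: two pairs, each (random uuid string, sum of that partition)
    return _map_partitions(rdd, lambda it: sum(v for _, v in it)).collect()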

def materialize(rdd: RDD):
    rdd.persist(STORAGE_LEVEL)
    # cheap action: computing every partition once populates the persisted cache
    rdd.mapPartitionsWithIndex(lambda ind, it: (1,)).collect()
    return rdd
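
# Usage sketch for materialize: force computation and caching up front so
# that later actions reuse the cached partitions. Assumes STORAGE_LEVEL is
# defined at module level and a live SparkContext `sc`.
def _demo_materialize(sc):  # pragma: no cover
    rdd = sc.parallelize(range(10)).map(lambda x: x * x)
    return materialize(rdd)  # rdd is now computed and persisted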

def group(rdd: RDD, **kwargs: Any) -> RDD:
    rdd = rdd.mapPartitionsWithIndex(SampleAndAssignBuckets.extract_idx, preservesPartitioning=True)
    rdd = rdd.groupByKey()
    return rdd
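
# Sketch of the pattern group() relies on: re-key each element inside its
# partition, then gather values per key. `extract_idx` below is a hypothetical
# stand-in for SampleAndAssignBuckets.extract_idx, whose real behavior is
# defined elsewhere. Assumes a live SparkContext `sc`.
def _demo_group(sc):  # pragma: no cover
    def extract_idx(_, it):
        for bucket, value in it:
            yield bucket, value

    rdd = sc.parallelize([(0, "a"), (1, "b"), (0, "c")])
    grouped = rdd.mapPartitionsWithIndex(extract_idx, preservesPartitioning=True).groupByKey()
    return sorted((k, sorted(vs)) for k, vs in grouped.collect())  # [(0, ["a", "c"]), (1, ["b"])]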