def transform(self, fn, dtype, skip_undefined, seed):
    """
    Implementation of apply(fn, dtype, skip_undefined, seed).

    Transform each element of the RDD by a given function.  The result
    RDD is of type ``dtype``.  ``fn`` should be a function that returns
    exactly one value which can be cast into the type specified by
    ``dtype``.

    Parameters
    ----------
    fn : callable
        Called with one element; its result is passed to ``dtype``.
    dtype : callable
        Applied to the value returned by ``fn`` to produce the output
        element, and forwarded to ``self._rv`` with the result.
    skip_undefined : bool
        When true, elements for which ``is_missing`` is true are mapped
        to None without calling ``fn``.
    seed : object
        When truthy, it is handed to ``distribute_seed`` together with
        the RDD and used to seed the ``random`` module.  A falsy seed
        (None, 0) leaves the random state untouched.

    Returns
    -------
    Whatever ``self._rv(res, dtype)`` returns for the mapped RDD.

    Raises
    ------
    ValueError
        With message 'type conversion failure' if ``fn`` or the ``dtype``
        conversion raised TypeError or ValueError for any element.
    """
    self._entry(fn, dtype, skip_undefined, seed)
    if seed:
        distribute_seed(self._rdd, seed)
        random.seed(seed)

    def apply_and_cast(x):
        # Test the cheap flag first so is_missing is only consulted when
        # skipping is actually requested.
        if skip_undefined and is_missing(x):
            return None
        try:
            return dtype(fn(x))
        except (TypeError, ValueError):
            # Conversions such as int('x') fail with ValueError rather than
            # TypeError; both are reported through the same marker.  The
            # TypeError class itself is the marker searched for below.
            return TypeError

    res = self._rdd.map(apply_and_cast)
    # search for the failure marker and throw exception
    errs = res.filter(lambda x: x is TypeError).take(1)
    if errs:
        raise ValueError('type conversion failure')
    self._exit()
    return self._rv(res, dtype)
def flat_map(self, fn, dtype, skip_undefined, seed):
    """
    Implementation of flat_map(fn, dtype, skip_undefined, seed).

    Transform each element of the RDD by a given function, then flatten.
    The result RDD is of type ``dtype``.  ``fn`` should be a function
    that returns a list of values which can be cast into the type
    specified by ``dtype``.

    Parameters
    ----------
    fn : callable
        Called with one element; must return an iterable of items.
    dtype : callable
        Applied to every item returned by ``fn``, and forwarded to
        ``self._rv`` with the result.
    skip_undefined : bool
        When true, elements for which ``is_missing`` is true contribute
        nothing to the output, and missing items returned by ``fn`` are
        dropped before conversion.
    seed : object
        When truthy, it is handed to ``distribute_seed`` together with
        the RDD and used to seed the ``random`` module.  A falsy seed
        (None, 0) leaves the random state untouched.

    Returns
    -------
    Whatever ``self._rv(res, dtype)`` returns for the flattened RDD.

    Raises
    ------
    ValueError
        With message 'type conversion failure' if ``fn`` or the ``dtype``
        conversion raised TypeError or ValueError for any element, or if
        the check for such failures itself raised any exception.
    """
    self._entry(fn, dtype, skip_undefined, seed)
    if seed:
        distribute_seed(self._rdd, seed)
        random.seed(seed)

    def apply_and_cast(x):
        # Test the cheap flag first so is_missing is only consulted when
        # skipping is actually requested.
        if skip_undefined and is_missing(x):
            return []
        try:
            items = fn(x)
            if skip_undefined:
                return [dtype(item) for item in items if not is_missing(item)]
            return [dtype(item) for item in items]
        except (TypeError, ValueError):
            # The result of this function is iterated by flatMap, so the
            # failure marker must be wrapped in a list; a bare TypeError
            # class is not iterable and would never reach the check below.
            return [TypeError]

    res = self._rdd.flatMap(apply_and_cast)
    # search for the failure marker and throw exception
    try:
        errs = res.filter(lambda x: x is TypeError).take(1)
    except Exception:
        # Kept from the original behavior: any failure while running the
        # check is reported as a conversion failure.
        raise ValueError('type conversion failure')
    if errs:
        raise ValueError('type conversion failure')
    self._exit()
    return self._rv(res, dtype)
def filter(self, fn, skip_undefined, seed):
    """
    Filter this RDD by a function.

    Each element is handed to ``fn`` and the outcome is given to the
    filter of the underlying RDD, which decides whether the element is
    kept; the filtered RDD is wrapped by ``self._rv`` and returned.

    When ``skip_undefined`` is true, elements that are None are never
    passed to ``fn``; None is reported for them instead.

    When ``seed`` is truthy, it is handed to ``distribute_seed`` together
    with the RDD and used to seed the ``random`` module.
    """
    self._entry(fn, skip_undefined, seed)
    if seed:
        distribute_seed(self._rdd, seed)
        random.seed(seed)

    def keep(element):
        # Undefined elements bypass the user predicate when skipping is on.
        if skip_undefined and element is None:
            return None
        return fn(element)

    filtered = self._rdd.filter(keep)
    self._exit()
    return self._rv(filtered)