def transform(self, fn, dtype, skip_undefined, seed):
    """
    Implementation of apply(fn, dtype, skip_undefined, seed).

    Transform each element of the RDD by a given function.  The result
    RDD is of type ``dtype``.  ``fn`` should be a function that returns
    exactly one value which can be cast into the type specified by
    ``dtype``.

    Parameters
    ----------
    fn : callable
        Called once per element with that element as its only argument.
    dtype : type or None
        Target type of the results.  ``None`` leaves the value returned by
        ``fn`` uncast.  ``array.array`` is built with a typecode chosen
        from the first item of the result ('l' for int, 'd' for float).
    skip_undefined : bool
        When true, a missing input yields ``None`` without calling ``fn``,
        and a missing result yields ``None`` without casting.
    seed : int or None
        When truthy, it is passed to ``distribute_seed`` for the RDD and
        to ``random.seed`` before mapping.  A falsy seed (``None`` or 0)
        leaves the random state untouched.

    Raises
    ------
    ValueError
        If ``fn`` raised on at least one element, or at least one result
        could not be converted to ``dtype``.
    """
    self._entry(dtype=dtype, skip_undefined=skip_undefined, seed=seed)
    if seed:
        distribute_seed(self._rdd, seed)
        random.seed(seed)

    def array_typecode(val):
        # Typecode for array.array, picked from a sample item.  None (for
        # any other type) makes array.array raise TypeError, which is
        # reported as a conversion failure below.
        if isinstance(val, int):
            return 'l'
        if isinstance(val, float):
            return 'd'
        return None

    # noinspection PyShadowingNames
    def apply_and_cast(x, fn, dtype, skip_undefined):
        if is_missing(x) and skip_undefined:
            return None
        # Failures are returned as ApplyError markers instead of raised, so
        # they can be collected after the map rather than aborting the job.
        # noinspection PyBroadException
        try:
            fnx = fn(x)
        except Exception:
            return ApplyError('Error evaluating function on "{}"'.format(x))
        if is_missing(fnx) and skip_undefined:
            return None
        if dtype is None:
            return fnx
        try:
            if dtype is array.array:
                return array.array(array_typecode(fnx[0]), fnx)
            return dtype(fnx)
        except (TypeError, ValueError, LookupError, OverflowError):
            # A failed cast is not always a TypeError: int('abc') raises
            # ValueError, fnx[0] raises IndexError/KeyError on an empty
            # sequence or a dict, and array.array raises OverflowError for
            # out-of-range ints.  All of them are conversion failures.
            return ApplyError('Error converting "{}" to {}'.format(fnx, dtype))

    res = self._rdd.map(lambda x: apply_and_cast(x, fn, dtype, skip_undefined))

    # Search for error markers and raise an exception.
    # TODO this forces evaluation -- consider not doing it
    # At most 100 markers are fetched, so the reported count is capped at 100.
    errs = res.filter(lambda x: isinstance(x, ApplyError)).take(100)
    if errs:
        raise ValueError('Transformation failures: errs {}'.format(len(errs)))
    return self._rv(res, dtype)
def transform(self, fn, dtype, skip_undefined, seed):
    """
    Implementation of apply(fn, dtype, skip_undefined, seed).

    Transform each element of the RDD by a given function.  The result
    RDD is of type ``dtype``.  ``fn`` should be a function that returns
    exactly one value which can be cast into the type specified by
    ``dtype``.

    Parameters
    ----------
    fn : callable
        Called once per element with that element as its only argument.
    dtype : type or None
        Target type of the results.  ``None`` leaves the value returned by
        ``fn`` uncast.  ``array.array`` is built with a typecode chosen
        from the first item of the result ('l' for int, 'd' for float).
    skip_undefined : bool
        When true, a missing input yields ``None`` without calling ``fn``,
        and a missing result yields ``None`` without casting.
    seed : int or None
        When truthy, it is passed to ``distribute_seed`` for the RDD and
        to ``random.seed`` before mapping.  A falsy seed (``None`` or 0)
        leaves the random state untouched.

    Raises
    ------
    ValueError
        If ``fn`` raised on at least one element, or at least one result
        could not be converted to ``dtype``.
    """
    self._entry(dtype=dtype, skip_undefined=skip_undefined, seed=seed)
    if seed:
        distribute_seed(self._rdd, seed)
        random.seed(seed)

    def array_typecode(val):
        # Typecode for array.array, picked from a sample item.  None (for
        # any other type) makes array.array raise TypeError, which is
        # reported as a conversion failure below.
        if isinstance(val, int):
            return 'l'
        if isinstance(val, float):
            return 'd'
        return None

    # noinspection PyShadowingNames
    def apply_and_cast(x, fn, dtype, skip_undefined):
        if is_missing(x) and skip_undefined:
            return None
        # Failures are returned as ApplyError markers instead of raised, so
        # they can be collected after the map rather than aborting the job.
        # noinspection PyBroadException
        try:
            fnx = fn(x)
        except Exception:
            return ApplyError('Error evaluating function on "{}"'.format(x))
        if is_missing(fnx) and skip_undefined:
            return None
        if dtype is None:
            return fnx
        try:
            if dtype is array.array:
                return array.array(array_typecode(fnx[0]), fnx)
            return dtype(fnx)
        except (TypeError, ValueError, LookupError, OverflowError):
            # A failed cast is not always a TypeError: int('abc') raises
            # ValueError, fnx[0] raises IndexError/KeyError on an empty
            # sequence or a dict, and array.array raises OverflowError for
            # out-of-range ints.  All of them are conversion failures.
            return ApplyError('Error converting "{}" to {}'.format(fnx, dtype))

    res = self._rdd.map(lambda x: apply_and_cast(x, fn, dtype, skip_undefined))

    # Search for error markers and raise an exception.
    # TODO this forces evaluation -- consider not doing it
    # At most 100 markers are fetched, so the reported count is capped at 100.
    errs = res.filter(lambda x: isinstance(x, ApplyError)).take(100)
    if errs:
        raise ValueError('Transformation failures: errs {}'.format(len(errs)))
    return self._rv(res, dtype)
def flat_map(self, fn, dtype, skip_undefined, seed):
    """
    Implementation of flat_map(fn, dtype, skip_undefined, seed).

    Apply ``fn`` to every element of the RDD and flatten the iterables it
    returns into a single RDD.  Each produced item is cast with ``dtype``
    unless ``dtype`` is None.  With ``skip_undefined`` set, a missing input
    contributes nothing and missing items in a result are dropped.

    A truthy ``seed`` is passed to ``distribute_seed`` for the RDD and to
    ``random.seed`` before mapping.

    Raises ValueError when evaluating the mapped RDD fails, or when a
    TypeError occurred while expanding or casting at least one element.
    """
    self._entry(dtype=dtype, skip_undefined=skip_undefined, seed=seed)
    if seed:
        distribute_seed(self._rdd, seed)
        random.seed(seed)

    # noinspection PyShadowingNames
    def expand(elem, fn, dtype, skip_undefined):
        if is_missing(elem) and skip_undefined:
            return []
        # The filtering and casting are written out inline: building a
        # lambda on the fly here is tempting, but leads to serialization
        # difficulties.
        try:
            converted = []
            for item in fn(elem):
                if skip_undefined and is_missing(item):
                    continue
                converted.append(item if dtype is None else dtype(item))
            return converted
        except TypeError:
            # Returned as a one-item marker list so it survives flattening.
            return [ApplyError('TypeError')]

    flattened = self._rdd.flatMap(
        lambda x: expand(x, fn, dtype, skip_undefined))

    # Look for error markers; taking them forces evaluation, so any other
    # exception raised inside the mapper surfaces here as well.
    try:
        failures = flattened.filter(lambda x: type(x) is ApplyError).take(100)
    except Exception:
        raise ValueError('Type conversion failure: {}'.format(dtype))
    if failures:
        raise ValueError(
            'Type conversion failures errs: {}'.format(len(failures)))
    return self._rv(flattened, dtype)
def flat_map(self, fn, dtype, skip_undefined, seed):
    """
    Implementation of flat_map(fn, dtype, skip_undefined, seed).

    Apply ``fn`` to every element of the RDD and flatten the iterables it
    returns into a single RDD.  Each produced item is cast with ``dtype``
    unless ``dtype`` is None.  With ``skip_undefined`` set, a missing input
    contributes nothing and missing items in a result are dropped.

    A truthy ``seed`` is passed to ``distribute_seed`` for the RDD and to
    ``random.seed`` before mapping.

    Raises ValueError when evaluating the mapped RDD fails, or when a
    TypeError occurred while expanding or casting at least one element.
    """
    self._entry(dtype=dtype, skip_undefined=skip_undefined, seed=seed)
    if seed:
        distribute_seed(self._rdd, seed)
        random.seed(seed)

    # noinspection PyShadowingNames
    def expand(elem, fn, dtype, skip_undefined):
        if is_missing(elem) and skip_undefined:
            return []
        # The filtering and casting are written out inline: building a
        # lambda on the fly here is tempting, but leads to serialization
        # difficulties.
        try:
            converted = []
            for item in fn(elem):
                if skip_undefined and is_missing(item):
                    continue
                converted.append(item if dtype is None else dtype(item))
            return converted
        except TypeError:
            # Returned as a one-item marker list so it survives flattening.
            return [ApplyError('TypeError')]

    flattened = self._rdd.flatMap(
        lambda x: expand(x, fn, dtype, skip_undefined))

    # Look for error markers; taking them forces evaluation, so any other
    # exception raised inside the mapper surfaces here as well.
    try:
        failures = flattened.filter(lambda x: type(x) is ApplyError).take(100)
    except Exception:
        raise ValueError('Type conversion failure: {}'.format(dtype))
    if failures:
        raise ValueError(
            'Type conversion failures errs: {}'.format(len(failures)))
    return self._rv(flattened, dtype)
def filter(self, fn, skip_undefined, seed):
    """
    Filter this RDD by a function.

    Returns a new RDD built by passing a predicate to the underlying RDD's
    ``filter``.  The predicate's result for an element is whatever ``fn``
    returns for it, except that with ``skip_undefined`` set, ``fn`` is not
    called for elements that are None and the predicate returns None.

    A truthy ``seed`` is passed to ``distribute_seed`` for the RDD and to
    ``random.seed`` before filtering.
    """
    self._entry(skip_undefined=skip_undefined, seed=seed)
    if seed:
        distribute_seed(self._rdd, seed)
        random.seed(seed)

    # noinspection PyShadowingNames
    def keep(elem, fn, skip_undefined):
        # With skip_undefined set, None elements are not passed to fn.
        return None if (skip_undefined and elem is None) else fn(elem)

    kept = self._rdd.filter(lambda elem: keep(elem, fn, skip_undefined))
    return self._rv(kept)