Example #1
0
    def transform(self, fn, dtype, skip_undefined, seed):
        """
        Implementation of apply(fn, dtype, skip_undefined, seed).

        Transform each element of the RDD by a given function. The result
        RDD is of type ``dtype``. ``fn`` should be a function that returns
        exactly one value which can be cast into the type specified by
        ``dtype``.

        Parameters
        ----------
        fn : callable
            Applied to each element of the RDD.
        dtype : type or None
            Callable used to cast each result of ``fn``.  ``None`` leaves
            the result as returned.  ``array.array`` is special-cased: the
            typecode is picked from the first item of the result.
        skip_undefined : bool
            If true, missing inputs are mapped to ``None`` without calling
            ``fn``, and missing results are mapped to ``None`` without
            being cast.
        seed : int or None
            If truthy, handed to ``distribute_seed`` and ``random.seed``
            before the map is built.

        Returns
        -------
        Whatever ``self._rv`` builds from the mapped RDD and ``dtype``.

        Raises
        ------
        ValueError
            If evaluating ``fn`` or casting its result failed for at least
            one element.
        """
        self._entry(dtype=dtype, skip_undefined=skip_undefined, seed=seed)
        # NOTE(review): a seed of 0 is falsy and therefore skipped here --
        # confirm whether callers rely on that.
        if seed:
            distribute_seed(self._rdd, seed)
            random.seed(seed)

        def array_typecode(val):
            # Only ints and floats have a typecode here; anything else gives
            # None, which makes array.array() raise TypeError below.
            if isinstance(val, int):
                return 'l'
            if isinstance(val, float):
                return 'd'
            return None

        # Everything a failed cast can raise: wrong type (TypeError), bad
        # value such as int('abc') (ValueError), empty or non-indexable
        # result for array.array (LookupError), value out of range for the
        # array typecode (OverflowError).
        conversion_errors = (
            TypeError, ValueError, LookupError, OverflowError)

        # noinspection PyShadowingNames
        def apply_and_cast(x, fn, dtype, skip_undefined):
            if is_missing(x) and skip_undefined:
                return None
            # noinspection PyBroadException
            try:
                fnx = fn(x)
            except Exception:
                # Failures are returned as values so that they can be
                # counted on the driver instead of killing the task.
                return ApplyError(
                    'Error evaluating function on "{}"'.format(x))
            if is_missing(fnx) and skip_undefined:
                return None
            if dtype is None:
                return fnx
            try:
                if dtype is array.array:
                    return array.array(array_typecode(fnx[0]), fnx)
                return dtype(fnx)
            except conversion_errors:
                return ApplyError('Error converting "{}" to {}'.format(
                    fnx, dtype))

        res = self._rdd.map(
            lambda x: apply_and_cast(x, fn, dtype, skip_undefined))
        # search for captured errors and raise exception
        # TODO this forces evaluation -- consider not doing it
        errs = res.filter(lambda x: isinstance(x, ApplyError)).take(100)
        if errs:
            raise ValueError('Transformation failures: errs {}'.format(
                len(errs)))
        return self._rv(res, dtype)
Example #2
0
    def transform(self, fn, dtype, skip_undefined, seed):
        """
        Implementation of apply(fn, dtype, skip_undefined, seed).

        Transform each element of the RDD by a given function. The result
        RDD is of type ``dtype``. ``fn`` should be a function that returns
        exactly one value which can be cast into the type specified by
        ``dtype``.

        ``fn`` is applied to each element; ``dtype`` (when not None) is then
        called on the result, with ``array.array`` special-cased so that its
        typecode is taken from the first item of the result.  When
        ``skip_undefined`` is true, missing inputs and missing results become
        ``None`` without being passed on.  A truthy ``seed`` is handed to
        ``distribute_seed`` and ``random.seed`` first.

        Returns the value of ``self._rv`` for the mapped RDD and ``dtype``.

        Raises ValueError if evaluating ``fn`` or casting its result failed
        for at least one element.
        """
        self._entry(dtype=dtype, skip_undefined=skip_undefined, seed=seed)
        # NOTE(review): seed == 0 is falsy, so it is treated as "no seed" -- confirm.
        if seed:
            distribute_seed(self._rdd, seed)
            random.seed(seed)

        def array_typecode(val):
            # None for unsupported item types makes array.array() raise TypeError.
            if isinstance(val, int):
                return 'l'
            if isinstance(val, float):
                return 'd'
            return None

        # noinspection PyShadowingNames
        def apply_and_cast(x, fn, dtype, skip_undefined):
            if is_missing(x) and skip_undefined:
                return None
            # noinspection PyBroadException
            try:
                fnx = fn(x)
            except Exception:
                # Returned (not raised) so the driver can count failures.
                return ApplyError('Error evaluating function on "{}"'.format(x))
            if is_missing(fnx) and skip_undefined:
                return None
            if dtype is None:
                return fnx
            try:
                if dtype is array.array:
                    return array.array(array_typecode(fnx[0]), fnx)
                return dtype(fnx)
            except (TypeError, ValueError, LookupError, OverflowError):
                # A failed cast is not always a TypeError: int('abc') raises
                # ValueError, an empty result has no fnx[0] (IndexError), and an
                # out-of-range array item raises OverflowError.
                return ApplyError('Error converting "{}" to {}'.format(fnx, dtype))

        res = self._rdd.map(lambda x: apply_and_cast(x, fn, dtype, skip_undefined))
        # search for captured errors and raise exception
        # TODO this forces evaluation -- consider not doing it
        errs = res.filter(lambda x: isinstance(x, ApplyError)).take(100)
        if errs:
            raise ValueError('Transformation failures: errs {}'.format(len(errs)))
        return self._rv(res, dtype)
Example #3
0
    def flat_map(self, fn, dtype, skip_undefined, seed):
        """
        Implementation of flat_map(fn, dtype, skip_undefined, seed).

        Apply ``fn`` to every element of the RDD and flatten the sequences
        it returns into one RDD.  Each produced item is cast with ``dtype``
        unless ``dtype`` is None.  With ``skip_undefined`` set, missing
        inputs contribute nothing and missing items are dropped before the
        cast.  A truthy ``seed`` is handed to ``distribute_seed`` and
        ``random.seed`` first.

        Raises ValueError when a TypeError was captured for any element, or
        when scanning the result for such errors itself fails.
        """
        self._entry(dtype=dtype, skip_undefined=skip_undefined, seed=seed)
        if seed:
            distribute_seed(self._rdd, seed)
            random.seed(seed)

        # noinspection PyShadowingNames
        def expand(x, fn, dtype, skip_undefined):
            if is_missing(x) and skip_undefined:
                return []
            try:
                # The helper takes fn/dtype as arguments rather than
                # building a lambda on the fly, which leads to
                # serialization difficulties.
                produced = fn(x)
                if skip_undefined:
                    # Lazy filter: each item is tested and then cast, one
                    # at a time, still inside this try block.
                    produced = (
                        item for item in produced if not is_missing(item))
                if dtype is None:
                    return list(produced)
                return [dtype(item) for item in produced]
            except TypeError:
                return [ApplyError('TypeError')]

        flattened = self._rdd.flatMap(
            lambda x: expand(x, fn, dtype, skip_undefined))

        # Look for captured errors; this forces evaluation of the RDD.
        try:
            failures = flattened.filter(
                lambda item: type(item) is ApplyError).take(100)
        except Exception:
            raise ValueError('Type conversion failure: {}'.format(dtype))
        failure_count = len(failures)
        if failure_count > 0:
            raise ValueError(
                'Type conversion failures  errs: {}'.format(failure_count))
        return self._rv(flattened, dtype)
Example #4
0
    def flat_map(self, fn, dtype, skip_undefined, seed):
        """
        Implementation of flat_map(fn, dtype, skip_undefined, seed).

        Map every element of the RDD through ``fn``, which should return a
        list of values, and flatten the results.  Items are cast with ``dtype``
        when it is not None.  When ``skip_undefined`` is true, missing inputs
        yield no items and missing items are filtered out before casting.

        Raises ValueError if a TypeError was captured while expanding any
        element, or if the scan for captured errors raises.
        """
        self._entry(dtype=dtype, skip_undefined=skip_undefined, seed=seed)
        if seed:
            distribute_seed(self._rdd, seed)
            random.seed(seed)

        # noinspection PyShadowingNames
        def expand(x, fn, dtype, skip_undefined):
            if is_missing(x) and skip_undefined:
                return []
            try:
                # fn and dtype are passed in explicitly; defining the lambda on
                # the fly leads to serialization difficulties.
                values = fn(x)
                if skip_undefined:
                    values = (v for v in values if not is_missing(v))
                return list(values) if dtype is None else [dtype(v) for v in values]
            except TypeError:
                return [ApplyError('TypeError')]

        out = self._rdd.flatMap(lambda x: expand(x, fn, dtype, skip_undefined))

        # search for captured errors and raise exception
        try:
            found = out.filter(lambda v: type(v) is ApplyError).take(100)
        except Exception:
            raise ValueError('Type conversion failure: {}'.format(dtype))
        if len(found) > 0:
            raise ValueError('Type conversion failures  errs: {}'.format(len(found)))
        return self._rv(out, dtype)
Example #5
0
    def filter(self, fn, skip_undefined, seed):
        """
        Filter this RDD by a function.

        Builds a new RDD holding the elements for which ``fn`` returns a
        truthy value.  When ``skip_undefined`` is true, ``None`` elements
        are dropped without calling ``fn``.  A truthy ``seed`` is handed to
        ``distribute_seed`` and ``random.seed`` before filtering.
        """
        self._entry(skip_undefined=skip_undefined, seed=seed)

        if seed:
            distribute_seed(self._rdd, seed)
            random.seed(seed)

        # noinspection PyShadowingNames
        def keep(x, fn, skip_undefined):
            # A None predicate result is falsy, so the element is dropped.
            drop_undefined = x is None and skip_undefined
            return None if drop_undefined else fn(x)

        kept = self._rdd.filter(lambda x: keep(x, fn, skip_undefined))
        return self._rv(kept)
Example #6
0
    def filter(self, fn, skip_undefined, seed):
        """
        Filter this RDD by a function.

        An element is copied to the new RDD when ``fn`` evaluates it to a
        truthy value.  If ``skip_undefined`` is true, elements that are
        ``None`` are never passed to ``fn`` and are left out of the result.
        """
        self._entry(skip_undefined=skip_undefined, seed=seed)

        if seed:
            distribute_seed(self._rdd, seed)
            random.seed(seed)

        # noinspection PyShadowingNames
        def predicate(x, fn, skip_undefined):
            if not (x is None and skip_undefined):
                return fn(x)
            # Undefined element being skipped: falsy result drops it.
            return None

        return self._rv(
            self._rdd.filter(lambda x: predicate(x, fn, skip_undefined)))