Code example #1
    def reduceByKeyAndWindow(self, func, invFunc, windowDuration, slideDuration=None,
                             numPartitions=None, filterFunc=None):
        """
        Return a new DStream by applying incremental `reduceByKey` over a sliding window.

        The reduced value over a new window is calculated incrementally from the old window's reduced value:
         1. reduce the new values that entered the window (e.g., adding new counts)
         2. "inverse reduce" the old values that left the window (e.g., subtracting old counts)

        `invFunc` can be None; in that case the whole window is reduced from all of its RDDs,
        which can be slower than providing `invFunc`.

        @param func:           associative and commutative reduce function
        @param invFunc:        inverse function of `func`
        @param windowDuration: width of the window; must be a multiple of this DStream's
                              batching interval
        @param slideDuration:  sliding interval of the window (i.e., the interval after which
                              the new DStream will generate RDDs); must be a multiple of this
                              DStream's batching interval
        @param numPartitions:  number of partitions of each RDD in the new DStream.
        @param filterFunc:     function to filter expired key-value pairs;
                              only pairs that satisfy the function are retained;
                              set this to None if you do not want to filter
        """
        self._validate_window_param(windowDuration, slideDuration)
        if numPartitions is None:
            numPartitions = self._sc.defaultParallelism

        reduced = self.reduceByKey(func, numPartitions)

        if invFunc:
            def reduceFunc(t, a, b):
                b = b.reduceByKey(func, numPartitions)
                r = a.union(b).reduceByKey(func, numPartitions) if a else b
                if filterFunc:
                    r = r.filter(filterFunc)
                return r

            def invReduceFunc(t, a, b):
                b = b.reduceByKey(func, numPartitions)
                joined = a.leftOuterJoin(b, numPartitions)
                return joined.mapValues(lambda kv: invFunc(kv[0], kv[1])
                                        if kv[1] is not None else kv[0])

            jreduceFunc = TransformFunction(self._sc, reduceFunc, reduced._jrdd_deserializer)
            jinvReduceFunc = TransformFunction(self._sc, invReduceFunc, reduced._jrdd_deserializer)
            if slideDuration is None:
                slideDuration = self._slideDuration
            dstream = self._sc._jvm.PythonReducedWindowedDStream(
                reduced._jdstream.dstream(),
                jreduceFunc, jinvReduceFunc,
                self._ssc._jduration(windowDuration),
                self._ssc._jduration(slideDuration))
            return DStream(dstream.asJavaDStream(), self._ssc, self._sc.serializer)
        else:
            return reduced.window(windowDuration, slideDuration).reduceByKey(func, numPartitions)
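
A minimal usage sketch of the incremental path, assuming a socket text source; the host, port, checkpoint directory, and durations are placeholders and do not come from the snippet above:

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext(appName="WindowedWordCount")
ssc = StreamingContext(sc, batchDuration=1)
ssc.checkpoint("/tmp/checkpoint")  # the incremental path requires checkpointing

pairs = (ssc.socketTextStream("localhost", 9999)
         .flatMap(lambda line: line.split())
         .map(lambda word: (word, 1)))

# Counts over the last 30 seconds, emitted every 10 seconds. Because invFunc is
# supplied, only the batches entering and leaving the window are added and
# subtracted, instead of re-reducing the whole window each slide.
windowed = pairs.reduceByKeyAndWindow(
    lambda a, b: a + b,   # func: fold new counts in
    lambda a, b: a - b,   # invFunc: fold old counts out
    windowDuration=30,
    slideDuration=10,
)
windowed.pprint()

ssc.start()
ssc.awaitTermination()
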
Code example #2
    def updateStateByKey(self, updateFunc, numPartitions=None, initialRDD=None):
        """
        Return a new "state" DStream where the state for each key is updated by applying
        the given function on the previous state of the key and the new values of the key.

        @param updateFunc: State update function. If this function returns None, then
                           the corresponding state key-value pair will be eliminated.
        """
        if numPartitions is None:
            numPartitions = self._sc.defaultParallelism

        if initialRDD and not isinstance(initialRDD, RDD):
            initialRDD = self._sc.parallelize(initialRDD)

        def reduceFunc(t, a, b):
            if a is None:
                g = b.groupByKey(numPartitions).mapValues(lambda vs: (list(vs), None))
            else:
                g = a.cogroup(b.partitionBy(numPartitions), numPartitions)
                g = g.mapValues(lambda ab: (list(ab[1]), list(ab[0])[0] if len(ab[0]) else None))
            state = g.mapValues(lambda vs_s: updateFunc(vs_s[0], vs_s[1]))
            return state.filter(lambda k_v: k_v[1] is not None)

        jreduceFunc = TransformFunction(self._sc, reduceFunc,
                                        self._sc.serializer, self._jrdd_deserializer)
        if initialRDD:
            initialRDD = initialRDD._reserialize(self._jrdd_deserializer)
            dstream = self._sc._jvm.PythonStateDStream(self._jdstream.dstream(), jreduceFunc,
                                                       initialRDD._jrdd)
        else:
            dstream = self._sc._jvm.PythonStateDStream(self._jdstream.dstream(), jreduceFunc)

        return DStream(dstream.asJavaDStream(), self._ssc, self._sc.serializer)
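
A hedged usage sketch of the `initialRDD` seeding; `sc`, `ssc`, and a `pairs` stream of (key, 1) tuples are assumed to exist, and the checkpoint directory is a placeholder:

ssc.checkpoint("/tmp/checkpoint")  # state DStreams require checkpointing

initial_state = sc.parallelize([("apple", 10), ("banana", 5)])

def update(new_values, last_sum):
    # Returning None here would drop the key from the state entirely.
    return sum(new_values) + (last_sum or 0)

running_counts = pairs.updateStateByKey(update, initialRDD=initial_state)
running_counts.pprint()
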
Code example #3
File: dstream.py  Project: zhengruifeng/spark
    def transformWith(
        self: "DStream[T]",
        func: Union[Callable[[RDD[T], RDD[U]], RDD[V]],
                    Callable[[datetime, RDD[T], RDD[U]], RDD[V]], ],
        other: "DStream[U]",
        keepSerializer: bool = False,
    ) -> "DStream[V]":
        """
        Return a new DStream in which each RDD is generated by applying a function
        on each RDD of this DStream and 'other' DStream.

        `func` can take either two arguments, (`rdd_a`, `rdd_b`), or three
        arguments, (`time`, `rdd_a`, `rdd_b`).
        """
        if func.__code__.co_argcount == 2:
            oldfunc = func

            def func(_: datetime, a: RDD[T], b: RDD[U]) -> RDD[V]:
                return oldfunc(a, b)  # type: ignore[call-arg, arg-type]

        assert func.__code__.co_argcount == 3, "func should take two or three arguments"
        jfunc = TransformFunction(
            self._sc,
            func,
            self._jrdd_deserializer,
            other._jrdd_deserializer,
        )
        assert self._sc._jvm is not None
        dstream = self._sc._jvm.PythonTransformed2DStream(
            self._jdstream.dstream(), other._jdstream.dstream(), jfunc)
        jrdd_serializer = self._jrdd_deserializer if keepSerializer else self._sc.serializer
        return DStream(dstream.asJavaDStream(), self._ssc, jrdd_serializer)
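
A short sketch of both calling conventions; the two hypothetical keyed streams `clicks` and `views` stand in for real sources:

# Two-argument form: func receives the per-batch RDDs of both streams.
joined = clicks.transformWith(lambda a, b: a.join(b), views)

# Three-argument form: the batch time is passed first as a datetime.
stamped = clicks.transformWith(
    lambda t, a, b: a.join(b).map(lambda kv: (t, kv)), views)
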
Code example #4
# Assumed imports for this snippet; psycopg2 and the psqlDB/psqlUser/psqlPwd/psql
# environment variables must be available where the driver runs.
import os
from datetime import datetime

import psycopg2


def foreachRDD_modified(self, func):
    """
    Apply a function to each RDD in this DStream.
    """
    if func.__code__.co_argcount == 1:
        old_func = func
        func = lambda t, rdd: old_func(rdd)
    jfunc = TransformFunction(self._sc, func, self._jrdd_deserializer)
    api = self._ssc._jvm.PythonDStream
    print("")
    print("############################################################")
    print("#                                                          #")
    print("#                  Opening DB Connection                   #")
    print("#                                                          #")
    print("############################################################")
    print("Spark Processing Starting: " +
          str(datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    print("")
    print("------------------------------------------------------------")
    print("")
    # Create the DB connection here: with Spark's lazy evaluation this runs once,
    # on the driver, at the start of the job.
    global conn
    conn = psycopg2.connect("dbname=%s user=%s password=%s host=%s" %
                            (os.environ['psqlDB'], os.environ['psqlUser'],
                             os.environ['psqlPwd'], os.environ['psql']))
    global cur
    cur = conn.cursor()
    api.callForeachRDD(self._jdstream, jfunc)
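
The snippet above opens one global connection on the driver before the job starts, which only works if the writes that use `conn`/`cur` also run on the driver (database connections do not serialize to executors). A more conventional sketch, assuming the same psycopg2 DSN and environment variables, a hypothetical `events` table, and an existing `dstream` of two-element tuples, opens one connection per partition on the executors via the stock `foreachRDD`:

import os
import psycopg2

def save_partition(rows):
    # One short-lived connection per partition, created where the data lives.
    conn = psycopg2.connect("dbname=%s user=%s password=%s host=%s" %
                            (os.environ['psqlDB'], os.environ['psqlUser'],
                             os.environ['psqlPwd'], os.environ['psql']))
    cur = conn.cursor()
    for row in rows:
        cur.execute("INSERT INTO events VALUES (%s, %s)", row)  # hypothetical table/schema
    conn.commit()
    conn.close()

dstream.foreachRDD(lambda rdd: rdd.foreachPartition(save_partition))
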
Code example #5
    def updateStateByKey(self, updateFunc, numPartitions=None):
        """
        Return a new "state" DStream where the state for each key is updated by applying
        the given function on the previous state of the key and the new values of the key.

        @param updateFunc: State update function. If this function returns None, then
                           the corresponding state key-value pair will be eliminated.
        """
        if numPartitions is None:
            numPartitions = self._sc.defaultParallelism

        def reduceFunc(t, a, b):
            if a is None:
                g = b.groupByKey(numPartitions).mapValues(lambda vs:
                                                          (list(vs), None))
            else:
                g = a.cogroup(b.partitionBy(numPartitions), numPartitions)
                g = g.mapValues(lambda va_vb: (list(va_vb[1]), list(va_vb[0])[0]
                                               if len(va_vb[0]) else None))
            state = g.mapValues(lambda vs_s: updateFunc(vs_s[0], vs_s[1]))
            return state.filter(lambda k_v: k_v[1] is not None)

        jreduceFunc = TransformFunction(self._sc, reduceFunc,
                                        self._sc.serializer,
                                        self._jrdd_deserializer)
        dstream = self._sc._jvm.PythonStateDStream(self._jdstream.dstream(),
                                                   jreduceFunc)
        return DStream(dstream.asJavaDStream(), self._ssc, self._sc.serializer)
Code example #6
File: dstream.py  Project: maduhu/HDP-spark
    def _jdstream(self):
        if self._jdstream_val is not None:
            return self._jdstream_val

        jfunc = TransformFunction(self._sc, self.func, self.prev._jrdd_deserializer)
        dstream = self._sc._jvm.PythonTransformedDStream(self.prev._jdstream.dstream(), jfunc)
        self._jdstream_val = dstream.asJavaDStream()
        return self._jdstream_val
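
A brief, hypothetical illustration of why the caching matters: the JVM-side PythonTransformedDStream is only created the first time the handle is needed, and later output operations on the same DStream reuse it. `lines` stands in for any existing DStream:

mapped = lines.map(lambda s: s.upper())  # builds a Python-side TransformedDStream lazily
mapped.pprint()   # the first output op accesses _jdstream and creates the JVM DStream
mapped.pprint()   # a second output op on the same DStream returns the cached _jdstream_val
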
Code example #7
File: kafka.py  Project: taichi44/spark
    def _jdstream(self):
        if self._jdstream_val is not None:
            return self._jdstream_val

        jfunc = TransformFunction(self._sc, self.func, self.prev._jrdd_deserializer) \
            .rdd_wrapper(lambda jrdd, ctx, ser: KafkaRDD(jrdd, ctx, ser))
        dstream = self._sc._jvm.PythonTransformedDStream(self.prev._jdstream.dstream(), jfunc)
        self._jdstream_val = dstream.asJavaDStream()
        return self._jdstream_val
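
The `rdd_wrapper` call is what makes each batch arrive as a `KafkaRDD` rather than a plain `RDD`, so offset metadata stays reachable from Python. A hedged sketch using the old `pyspark.streaming.kafka` direct-stream API (removed in newer Spark releases); the broker address and topic name are placeholders:

from pyspark.streaming.kafka import KafkaUtils

stream = KafkaUtils.createDirectStream(
    ssc, ["events"], {"metadata.broker.list": "broker:9092"})

def log_offsets(rdd):
    # Works because the wrapped batch RDD is a KafkaRDD exposing offsetRanges().
    for o in rdd.offsetRanges():
        print(o.topic, o.partition, o.fromOffset, o.untilOffset)

stream.foreachRDD(log_offsets)
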
Code example #8
File: dstream.py  Project: zhengruifeng/spark
    def updateStateByKey(
        self: "DStream[Tuple[K, V]]",
        updateFunc: Callable[[Iterable[V], Optional[S]], S],
        numPartitions: Optional[int] = None,
        initialRDD: Optional[Union[RDD[Tuple[K, S]],
                                   Iterable[Tuple[K, S]]]] = None,
    ) -> "DStream[Tuple[K, S]]":
        """
        Return a new "state" DStream where the state for each key is updated by applying
        the given function on the previous state of the key and the new values of the key.

        Parameters
        ----------
        updateFunc : function
            State update function. If this function returns None, then
            the corresponding state key-value pair will be eliminated.
        """
        if numPartitions is None:
            numPartitions = self._sc.defaultParallelism

        if initialRDD and not isinstance(initialRDD, RDD):
            initialRDD = self._sc.parallelize(initialRDD)

        def reduceFunc(t: datetime, a: Any, b: Any) -> Any:
            if a is None:
                g = b.groupByKey(numPartitions).mapValues(lambda vs:
                                                          (list(vs), None))
            else:
                g = a.cogroup(b.partitionBy(cast(int, numPartitions)),
                              numPartitions)
                g = g.mapValues(lambda ab: (list(ab[1]), list(ab[0])[0]
                                            if len(ab[0]) else None))
            state = g.mapValues(lambda vs_s: updateFunc(vs_s[0], vs_s[1]))
            return state.filter(lambda k_v: k_v[1] is not None)

        jreduceFunc = TransformFunction(
            self._sc,
            reduceFunc,
            self._sc.serializer,
            self._jrdd_deserializer,
        )
        if initialRDD:
            initialRDD = cast(RDD[Tuple[K, S]],
                              initialRDD)._reserialize(self._jrdd_deserializer)
            assert self._sc._jvm is not None
            dstream = self._sc._jvm.PythonStateDStream(
                self._jdstream.dstream(),
                jreduceFunc,
                initialRDD._jrdd,
            )
        else:
            assert self._sc._jvm is not None
            dstream = self._sc._jvm.PythonStateDStream(
                self._jdstream.dstream(), jreduceFunc)

        return DStream(dstream.asJavaDStream(), self._ssc, self._sc.serializer)
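
One behaviour worth illustrating is the key-elimination rule mentioned in the docstring: returning None from the update function removes that key's state pair. A small, hypothetical session-timeout sketch; the `events` stream of (key, value) pairs and the idle threshold are assumptions:

def update_session(new_events, session):
    # New batch values for the key come first, previous state (or None) second.
    if not new_events and session is not None and session["idle"] >= 3:
        return None  # drop the key: its state pair is eliminated
    session = session or {"count": 0, "idle": 0}
    return {
        "count": session["count"] + len(new_events),
        "idle": 0 if new_events else session["idle"] + 1,
    }

sessions = events.updateStateByKey(update_session)
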
Code example #9
    def foreachRDD(self, func):
        """
        Apply a function to each RDD in this DStream.
        """
        if func.__code__.co_argcount == 1:
            old_func = func
            func = lambda t, rdd: old_func(rdd)
        jfunc = TransformFunction(self._sc, func, self._jrdd_deserializer)
        api = self._ssc._jvm.PythonDStream
        api.callForeachRDD(self._jdstream, jfunc)
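
A minimal sketch of both argument shapes accepted here; the `counts` stream is a placeholder:

# One-argument form: only the batch RDD is passed.
counts.foreachRDD(lambda rdd: print("batch size:", rdd.count()))

# Two-argument form: the batch time comes first.
def dump(time, rdd):
    print("batch at", time, "->", rdd.take(5))

counts.foreachRDD(dump)
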
Code example #10
File: context.py  Project: skambha/spark-ri
    def transform(self, dstreams, transformFunc):
        """
        Create a new DStream in which each RDD is generated by applying
        a function on RDDs of the DStreams. The order of the JavaRDDs in
        the transform function parameter will be the same as the order
        of corresponding DStreams in the list.
        """
        jdstreams = [d._jdstream for d in dstreams]
        # change the final serializer to sc.serializer
        func = TransformFunction(self._sc,
                                 lambda t, *rdds: transformFunc(rdds),
                                 *[d._jrdd_deserializer for d in dstreams])
        jfunc = self._jvm.TransformFunction(func)
        jdstream = self._jssc.transform(jdstreams, jfunc)
        return DStream(jdstream, self, self._sc.serializer)
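
A hedged sketch of this driver-side entry point, assuming two existing keyed DStreams `orders` and `payments` on the same StreamingContext `ssc`:

def join_batches(rdds):
    # The RDDs arrive in the same order as the DStreams passed to transform().
    orders_rdd, payments_rdd = rdds
    return orders_rdd.join(payments_rdd)

joined = ssc.transform([orders, payments], join_batches)
joined.pprint()
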
Code example #11
File: dstream.py  Project: maduhu/HDP-spark
    def transformWith(self, func, other, keepSerializer=False):
        """
        Return a new DStream in which each RDD is generated by applying a function
        on each RDD of this DStream and 'other' DStream.

        `func` can take either two arguments, (`rdd_a`, `rdd_b`), or three
        arguments, (`time`, `rdd_a`, `rdd_b`).
        """
        if func.__code__.co_argcount == 2:
            oldfunc = func
            func = lambda t, a, b: oldfunc(a, b)
        assert func.__code__.co_argcount == 3, "func should take two or three arguments"
        jfunc = TransformFunction(self._sc, func, self._jrdd_deserializer, other._jrdd_deserializer)
        dstream = self._sc._jvm.PythonTransformed2DStream(self._jdstream.dstream(),
                                                          other._jdstream.dstream(), jfunc)
        jrdd_serializer = self._jrdd_deserializer if keepSerializer else self._sc.serializer
        return DStream(dstream.asJavaDStream(), self._ssc, jrdd_serializer)
Code example #12
File: dstream.py  Project: zhengruifeng/spark
    def foreachRDD(
        self: "DStream[T]",
        func: Union[Callable[[RDD[T]], None], Callable[[datetime, RDD[T]],
                                                       None]],
    ) -> None:
        """
        Apply a function to each RDD in this DStream.
        """
        if func.__code__.co_argcount == 1:
            old_func = func

            def func(_: datetime, rdd: "RDD[T]") -> None:
                return old_func(rdd)  # type: ignore[call-arg, arg-type]

        jfunc = TransformFunction(self._sc, func, self._jrdd_deserializer)
        assert self._ssc._jvm is not None
        api = self._ssc._jvm.PythonDStream
        api.callForeachRDD(self._jdstream, jfunc)
Code example #13
File: context.py  Project: slachiewicz/spark
    def transform(self, dstreams: List["DStream[Any]"],
                  transformFunc: Callable[..., RDD[T]]) -> "DStream[T]":
        """
        Create a new DStream in which each RDD is generated by applying
        a function on RDDs of the DStreams. The order of the JavaRDDs in
        the transform function parameter will be the same as the order
        of corresponding DStreams in the list.
        """
        jdstreams = [d._jdstream
                     for d in dstreams]  # type: ignore[attr-defined]
        # change the final serializer to sc.serializer
        func = TransformFunction(
            self._sc,
            lambda t, *rdds: transformFunc(rdds),
            *[d._jrdd_deserializer
              for d in dstreams],  # type: ignore[attr-defined]
        )

        assert self._jvm is not None
        jfunc = self._jvm.TransformFunction(func)
        jdstream = self._jssc.transform(jdstreams, jfunc)
        return DStream(jdstream, self, self._sc.serializer)
Code example #14
File: dstream.py  Project: zhengruifeng/spark
    def reduceByKeyAndWindow(
        self: "DStream[Tuple[K, V]]",
        func: Callable[[V, V], V],
        invFunc: Optional[Callable[[V, V], V]],
        windowDuration: int,
        slideDuration: Optional[int] = None,
        numPartitions: Optional[int] = None,
        filterFunc: Optional[Callable[[Tuple[K, V]], bool]] = None,
    ) -> "DStream[Tuple[K, V]]":
        """
        Return a new DStream by applying incremental `reduceByKey` over a sliding window.

        The reduced value over a new window is calculated incrementally from the old window's reduced value:
         1. reduce the new values that entered the window (e.g., adding new counts)
         2. "inverse reduce" the old values that left the window (e.g., subtracting old counts)

        `invFunc` can be None; in that case the whole window is reduced from all of its RDDs,
        which can be slower than providing `invFunc`.

        Parameters
        ----------
        func : function
            associative and commutative reduce function
        invFunc : function
            inverse function of `func`
        windowDuration : int
            width of the window; must be a multiple of this DStream's
            batching interval
        slideDuration : int, optional
            sliding interval of the window (i.e., the interval after which
            the new DStream will generate RDDs); must be a multiple of this
            DStream's batching interval
        numPartitions : int, optional
            number of partitions of each RDD in the new DStream.
        filterFunc : function, optional
            function to filter expired key-value pairs;
            only pairs that satisfy the function are retained;
            set this to None if you do not want to filter
        """
        self._validate_window_param(windowDuration, slideDuration)
        if numPartitions is None:
            numPartitions = self._sc.defaultParallelism

        reduced = self.reduceByKey(func, numPartitions)

        if invFunc:

            def reduceFunc(t: datetime, a: Any, b: Any) -> Any:
                b = b.reduceByKey(func, numPartitions)
                r = a.union(b).reduceByKey(func, numPartitions) if a else b
                if filterFunc:
                    r = r.filter(filterFunc)
                return r

            def invReduceFunc(t: datetime, a: Any, b: Any) -> Any:
                b = b.reduceByKey(func, numPartitions)
                joined = a.leftOuterJoin(b, numPartitions)
                return joined.mapValues(
                    lambda kv: invFunc(kv[0], kv[1])  # type: ignore[misc]
                    if kv[1] is not None else kv[0])

            jreduceFunc = TransformFunction(self._sc, reduceFunc,
                                            reduced._jrdd_deserializer)
            jinvReduceFunc = TransformFunction(self._sc, invReduceFunc,
                                               reduced._jrdd_deserializer)
            if slideDuration is None:
                slideDuration = self._slideDuration
            assert self._sc._jvm is not None
            dstream = self._sc._jvm.PythonReducedWindowedDStream(
                reduced._jdstream.dstream(),
                jreduceFunc,
                jinvReduceFunc,
                self._ssc._jduration(windowDuration),
                self._ssc._jduration(slideDuration),  # type: ignore[arg-type]
            )
            return DStream(dstream.asJavaDStream(), self._ssc,
                           self._sc.serializer)
        else:
            return reduced.window(windowDuration, slideDuration).reduceByKey(
                func,
                numPartitions  # type: ignore[arg-type]
            )
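
With an inverse function, counts for keys that have left the window decay to zero but the zeroed pairs themselves linger in the state; `filterFunc` is the hook for pruning them. A brief sketch reusing the hypothetical `pairs` stream from the earlier example:

windowed = pairs.reduceByKeyAndWindow(
    lambda a, b: a + b,
    lambda a, b: a - b,
    windowDuration=30,
    slideDuration=10,
    filterFunc=lambda kv: kv[1] > 0,  # drop keys whose windowed count fell to 0
)
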