コード例 #1
0
    def topk_index(self, topk, reverse):
        """
        Create an RDD indicating which elements are in the top k.

        Every element of the current RDD is paired with its position (via
        ``zipWithIndex``).  The positions of the ``topk`` selected elements
        are collected on the driver, and each position is then tested for
        membership in that collection.

        Args:
            topk: Number of elements to flag.  Must be an ``int``.  When it
                is 0, every entry of the result is the integer ``0``.
            reverse: If true, the ``topk`` smallest elements are flagged;
                otherwise the ``topk`` largest elements are flagged.

        Returns:
            Whatever ``self._rv`` produces for the flag RDD.  For a non-zero
            ``topk`` the flags are booleans: ``True`` where the element is
            one of the selected elements and ``False`` elsewhere.

        Raises:
            TypeError: If ``topk`` is not an ``int``.
        """
        self._entry(topk=topk, reverse=reverse)
        if not isinstance(topk, int):
            raise TypeError(
                "'Topk_index' -- topk must be integer ({})".format(topk))

        if topk == 0:
            # Nothing can belong to an empty top-k, so flag every element 0.
            return self._rv(self._rdd.map(lambda y: 0))

        # (element, position) pairs; the position identifies each element
        # even when values repeat.
        pairs = self._rdd.zipWithIndex()
        # pairs is consumed twice: by takeOrdered and by the map below.
        cache(pairs)
        try:
            # takeOrdered always sorts ascending, so the largest elements are
            # obtained by wrapping the key in ReverseCmp (presumably a
            # comparison-inverting wrapper defined elsewhere -- confirm).
            if reverse:
                top_pairs = pairs.takeOrdered(topk, lambda x: x[0])
            else:
                top_pairs = pairs.takeOrdered(topk, lambda x: ReverseCmp(x[0]))
            # A frozenset gives constant-time membership tests; the lookup
            # runs once per element of the RDD.
            top_ranks = frozenset(v[1] for v in top_pairs)
            res = pairs.map(lambda z: z[1] in top_ranks)
        finally:
            # Release the cached pairs even if takeOrdered fails.
            # NOTE(review): map is a lazy transformation, so res is probably
            # evaluated after this uncache -- confirm the cache is still
            # useful for the second pass.
            uncache(pairs)
        return self._rv(res)
コード例 #2
0
    def topk_index(self, topk, reverse):
        """
        Create an RDD indicating which elements are in the top k.

        The current RDD is zipped with its element positions, the positions
        of the ``topk`` selected elements are gathered on the driver, and the
        result flags each position according to whether it was gathered.

        Args:
            topk: How many elements to flag; must be an ``int``.  A value of
                0 produces a result whose entries are all the integer ``0``.
            reverse: When true the ``topk`` smallest elements are selected,
                otherwise the ``topk`` largest.

        Returns:
            The result of ``self._rv`` applied to the flag RDD.  With a
            non-zero ``topk`` each flag is a boolean that is ``True`` only
            for the selected elements.

        Raises:
            TypeError: If ``topk`` is not an ``int``.
        """
        self._entry(topk=topk, reverse=reverse)
        if not isinstance(topk, int):
            raise TypeError("'Topk_index' -- topk must be integer ({})".format(topk))

        if topk == 0:
            # An empty top-k contains no element: every flag is 0.
            res = self._rdd.map(lambda y: 0)
            return self._rv(res)

        # Each record becomes (element, position).
        pairs = self._rdd.zipWithIndex()
        # Used by both the takeOrdered action and the final map.
        cache(pairs)
        try:
            if reverse:
                # takeOrdered sorts ascending: the smallest elements come first.
                sort_key = lambda x: x[0]
            else:
                # ReverseCmp is defined elsewhere; presumably it inverts the
                # ordering so the largest elements come first -- confirm.
                sort_key = lambda x: ReverseCmp(x[0])
            top_pairs = pairs.takeOrdered(topk, sort_key)
            # Set membership is O(1); the test below runs for every element.
            top_ranks = frozenset(position for _, position in top_pairs)
            res = pairs.map(lambda z: z[1] in top_ranks)
        finally:
            # Always release the cache, including when takeOrdered raises.
            # NOTE(review): res is built lazily, so it is likely computed
            # after this point -- confirm the caching still pays off.
            uncache(pairs)
        return self._rv(res)
コード例 #3
0
 def tail(self, n):
     """
     Return the last ``n`` elements of the current RDD.

     Elements are paired with their position, and only those whose position
     is at least ``count - n`` are kept.  Consequently an ``n`` greater than
     or equal to the number of elements keeps everything, and an ``n`` of
     zero or less keeps nothing.

     Args:
         n: Number of trailing elements to keep.

     Returns:
         Whatever ``self._rv`` produces for the RDD of kept elements.
     """
     self._entry(n=n)
     # (element, position) pairs.
     pairs = self._rdd.zipWithIndex()
     cache(pairs)
     try:
         # First position that belongs to the tail.
         start = pairs.count() - n
         filtered_pairs = pairs.filter(lambda x: x[1] >= start)
     finally:
         # Release the cache even if count() fails.
         uncache(pairs)
     # Drop the positions, leaving only the original elements.
     res = filtered_pairs.keys()
     return self._rv(res)
コード例 #4
0
 def tail(self, n):
     """
     Return the trailing ``n`` elements of the current RDD.

     Each element is zipped with its position; an element is kept when its
     position is greater than or equal to ``count - n``.  If ``n`` is at
     least the element count, every element is kept; if ``n`` is zero or
     negative, none is.

     Args:
         n: How many elements to keep from the end.

     Returns:
         The result of ``self._rv`` applied to the RDD of kept elements.
     """
     self._entry(n=n)
     # Pair every element with its position.
     pairs = self._rdd.zipWithIndex()
     cache(pairs)
     try:
         # Positions at or beyond this threshold form the tail.
         start = pairs.count() - n
         filtered_pairs = pairs.filter(lambda x: x[1] >= start)
     finally:
         # Make sure the cached pairs are released when count() raises too.
         uncache(pairs)
     # keys() discards the position half of each pair.
     return self._rv(filtered_pairs.keys())