コード例 #1
0
    def topk_index(self, topk, reverse):
        """
        Create an RDD indicating which elements are in the top k.

        Each element of the current RDD is paired with its position
        (``zipWithIndex``), the ``topk`` best pairs are collected with
        ``takeOrdered``, and every position is then mapped to a membership
        flag.

        Parameters
        ----------
        topk : int
            Number of elements to flag.  Must be exactly of type ``int``.
        reverse : bool
            If true, elements are ordered ascending, so the ``topk`` smallest
            are selected.  Otherwise each element is wrapped in
            ``ReverseCmp`` so the ordering is descending and the ``topk``
            largest are selected.

        Returns
        -------
        The result of ``self._rv`` applied to the flag RDD.  Flags are
        ``True``/``False`` (which compare equal to 1/0) when ``topk`` is
        non-zero, and the integer ``0`` for every element when ``topk`` is
        zero.

        Raises
        ------
        TypeError
            If ``topk`` is not an ``int``.
        """
        self._entry(topk, reverse)
        # Exact type check (not isinstance): int subclasses such as bool are
        # rejected, as before.
        if type(topk) is not int:
            raise TypeError("'Topk_index' -- topk must be integer ({})".format(topk))

        if topk == 0:
            res = self._rdd.map(lambda x: 0)
        else:
            def sort_key(pair):
                # takeOrdered always sorts ascending; wrapping the value in
                # ReverseCmp turns that into a descending order.
                value = pair[0]
                return value if reverse else ReverseCmp(value)

            pairs = self._rdd.zipWithIndex()
            # we are going to use this twice
            cache(pairs)
            try:
                top_pairs = pairs.takeOrdered(topk, sort_key)
                # A frozenset gives constant-time membership tests inside the
                # per-element map below (a list would be scanned each time).
                top_ranks = frozenset(rank for _, rank in top_pairs)
                res = pairs.map(lambda x: x[1] in top_ranks)
            finally:
                # Release the cached pairs even if takeOrdered/map raised.
                uncache(pairs)
        self._exit()
        return self._rv(res)
コード例 #2
0
 def tail(self, n):
     """
     Build a result holding the last ``n`` elements of the current RDD.

     Every element is paired with its position via ``zipWithIndex``; only
     the pairs whose position is at or beyond ``count - n`` are kept, and
     the positions are then dropped again with ``keys``.  The filtered
     elements are wrapped with ``self._rv`` and returned.
     """
     self._entry(n)
     indexed = self._rdd.zipWithIndex()
     cache(indexed)

     # First position that still belongs to the tail.
     first_kept = indexed.count() - n

     def in_tail(pair):
         return pair[1] >= first_kept

     tail_pairs = indexed.filter(in_tail)
     # NOTE(review): if filter is evaluated lazily (as Spark RDD
     # transformations are), the cache is released here before the filter
     # actually runs -- confirm this is intended.
     uncache(indexed)
     res = tail_pairs.keys()
     self._exit()
     return self._rv(res)