def topk_index(self, topk, reverse):
    """
    Create an RDD indicating which elements are in the top k.

    Each entry of the result is truthy if the corresponding element of the
    current RDD is one of the top k elements, and falsy otherwise.  When
    topk is 0 every entry is the integer 0; otherwise the entries are the
    booleans produced by the membership test.

    Parameters
    ----------
    topk : int
        Number of elements to flag.  The check is on the exact type, so
        int subclasses (including bool) are rejected.
    reverse : bool
        If true, elements are ranked in ascending order; otherwise each
        value is wrapped in ReverseCmp so that the ascending sort done by
        takeOrdered ranks them in descending order.

    Returns
    -------
    Whatever self._rv produces for the RDD of flags.

    Raises
    ------
    TypeError
        If topk is not an int.
    """
    self._entry(topk, reverse)
    if type(topk) is not int:
        raise TypeError("'Topk_index' -- topk must be integer ({})".format(topk))

    if topk == 0:
        res = self._rdd.map(lambda x: 0)
    else:
        # Pair every element with its position so that the selected
        # elements can be identified by index afterwards.
        pairs = self._rdd.zipWithIndex()
        # we are going to use this twice
        cache(pairs)

        # takeOrdered always sorts ascending
        # topk needs to sort descending if reverse is False, ascending if True
        def ascending_key(pair):
            return pair[0]

        def descending_key(pair):
            return ReverseCmp(pair[0])

        key_fn = ascending_key if reverse else descending_key
        top_pairs = pairs.takeOrdered(topk, key_fn)

        # A frozenset gives constant-time membership tests; a list would be
        # scanned linearly once for every element of the RDD.
        top_ranks = frozenset(index for _, index in top_pairs)
        res = pairs.map(lambda pair: pair[1] in top_ranks)
        # NOTE(review): res has not been evaluated at this point, so if
        # uncache drops the cached data immediately, pairs may be recomputed
        # when res is eventually used -- confirm this is intended.
        uncache(pairs)
    self._exit()
    return self._rv(res)
def tail(self, n):
    """
    Create an RDD holding the trailing elements of the current RDD.

    Every element is paired with its position, and only those whose
    position is at least (element count - n) are kept.

    Parameters
    ----------
    n
        Number of trailing elements to keep.

    Returns
    -------
    Whatever self._rv produces for the RDD of kept elements.
    """
    self._entry(n)
    indexed = self._rdd.zipWithIndex()
    # indexed is used both for the count and for the filter below
    cache(indexed)
    # Position of the first element that belongs to the tail.
    first_kept = indexed.count() - n

    def in_tail(pair):
        return pair[1] >= first_kept

    tail_pairs = indexed.filter(in_tail)
    uncache(indexed)
    # Drop the positions again, leaving only the original elements.
    res = tail_pairs.keys()
    self._exit()
    return self._rv(res)