def __call__(self, head: RDD):
    """Repartition ``head``, optionally distributing records by a key.

    If ``self.keymap`` is ``None`` the RDD is simply coalesced via
    ``head.coalesce(self.partitions, self.shuffle)``.

    Otherwise the records are distributed with
    ``partitionBy(self.partitions)`` and the keys are stripped again
    afterwards, so the result contains only the values.

    Args:
        head: The RDD to repartition.

    Returns:
        The RDD produced by ``coalesce`` (no keymap) or by
        ``partitionBy`` followed by a value-extracting ``map``.
    """
    # Bind the attributes to locals so the lambdas handed to ``map`` close
    # over plain values instead of ``self``; a closure that references
    # ``self`` drags the whole object along when it is serialized.
    keymap = self.keymap
    partitions = self.partitions

    if keymap is None:
        return head.coalesce(partitions, self.shuffle)

    # Probe the keymap with a sentinel to detect an identity function.
    # A keymap that cannot handle a plain string is treated as
    # "not identity". Only ``Exception`` is caught so that
    # KeyboardInterrupt / SystemExit are not silently swallowed.
    try:
        probe = keymap("probe")
    except Exception:
        probe = None

    if probe != "probe":
        # Build (key, record) pairs so partitionBy has a key to hash.
        head = head.map(lambda record: (keymap(record), record))
    # NOTE(review): with an identity keymap the records are passed to
    # partitionBy untouched, so they are presumably already (key, value)
    # pairs and the final map drops their keys -- confirm with callers.

    return head.partitionBy(partitions).map(lambda pair: pair[1])
def repar_rdd(rdd: RDD, rdd_count: int, example_per_par=100000, coalesce_only=True):
    """Resize ``rdd`` based on the number of examples it holds.

    The target partition count is
    ``max(1, rdd_count // example_per_par)``.

    * target < current partitions: the RDD is coalesced down to the target.
    * target > current partitions: the RDD is repartitioned up to the
      target only when ``coalesce_only`` is falsy; with
      ``coalesce_only=True`` (the default) nothing happens.
    * target == current partitions: the RDD is returned unchanged.

    Args:
        rdd: The RDD to resize.
        rdd_count: Number of examples in ``rdd`` (supplied by the caller;
            it is not recomputed here).
        example_per_par: Desired number of examples per partition.
        coalesce_only: When truthy, only ever reduce the partition count.

    Returns:
        The coalesced/repartitioned RDD, or ``rdd`` itself when no change
        is needed.

    Raises:
        ZeroDivisionError: If ``example_per_par`` is 0.
    """
    current_partitions = rdd.getNumPartitions()
    # Floor division keeps the arithmetic in integers for int inputs
    # instead of round-tripping through a float.
    target_partitions = max(1, int(rdd_count // example_per_par))

    if target_partitions < current_partitions:
        return rdd.coalesce(target_partitions)
    # Truthiness test (not ``is False``) so falsy flags that are not the
    # ``False`` singleton, e.g. ``numpy.bool_(False)``, are honoured too.
    if target_partitions > current_partitions and not coalesce_only:
        return rdd.repartition(target_partitions)
    return rdd
def __call__(self, head: RDD):
    """Coalesce ``head`` using this object's configured settings.

    Delegates to ``head.coalesce``, passing ``self.partitions`` and
    ``self.shuffle`` positionally, and returns whatever that call returns.
    """
    target_partitions = self.partitions
    shuffle_flag = self.shuffle
    return head.coalesce(target_partitions, shuffle_flag)