Ejemplo n.º 1
0
 def __call__(self, head: RDD):
     """
     Repartition `head`, optionally grouping the records by a key.

     When `self.keymap` is None the RDD is coalesced to `self.partitions`
     partitions and `self.shuffle` is forwarded to `coalesce()`. Otherwise the
     records are partitioned by the key which `self.keymap` extracts, and only
     the second element of every (key, record) pair is kept afterwards.

     :param head: the RDD to repartition.
     :return: the repartitioned RDD.
     """
     if self.keymap is None:
         return head.coalesce(self.partitions, self.shuffle)
     # Bind the function to a local name so that the lambda below captures only
     # `keymap` and not `self`; otherwise the whole object has to be pickled in
     # order to ship the closure to the workers.
     keymap = self.keymap
     try:
         # Feed a sentinel string through keymap to detect an identity mapping.
         probe = keymap("probe")
     except Exception:
         # A keymap which cannot digest a plain string is not an identity.
         # KeyboardInterrupt / SystemExit are deliberately left to propagate.
         probe = None
     if probe != "probe":
         head = head.map(lambda x: (keymap(x), x))
     # NOTE(review): with an identity keymap the records reach partitionBy()
     # untouched, so they are presumably (key, value) pairs already - confirm
     # against the callers.
     return head \
         .partitionBy(self.partitions) \
         .map(lambda x: x[1])
def repar_rdd(rdd: RDD,
              rdd_count: int,
              example_per_par=100000,
              coalesce_only=True):
    """
    Resize the number of partitions of an RDD according to its number of examples.

    The target is `rdd_count // example_per_par` partitions, but never fewer
    than one. The RDD is coalesced when it has more partitions than the target.
    It is repartitioned upwards only when `coalesce_only` is false; with the
    default `coalesce_only=True` an RDD which has fewer partitions than the
    target is returned unchanged.

    :param rdd: the RDD to resize.
    :param rdd_count: number of examples in `rdd`; it is supplied by the caller \
                      and not computed here.
    :param example_per_par: desired number of examples per partition.
    :param coalesce_only: if true, the number of partitions is never increased.
    :return: the coalesced or repartitioned RDD, or `rdd` itself when the \
             partition count did not have to change.
    """
    num_partition = rdd.getNumPartitions()
    # Floor division keeps integer counts exact; "/" would go through a float.
    expect_partition = max(1, int(rdd_count // example_per_par))

    if expect_partition < num_partition:
        rdd = rdd.coalesce(expect_partition)
    elif expect_partition > num_partition and not coalesce_only:
        # Truthiness instead of `is False`, so that 0 / None also enable growing.
        rdd = rdd.repartition(expect_partition)

    return rdd
Ejemplo n.º 3
0
 def __call__(self, head: RDD):
     """
     Coalesce `head` into `self.partitions` partitions.

     `self.shuffle` is forwarded as the second positional argument of
     `coalesce()`.

     :param head: the RDD to coalesce.
     :return: whatever `head.coalesce()` returns.
     """
     coalesce = head.coalesce
     return coalesce(self.partitions, self.shuffle)