def add_shuffle_key(split, iterator): buckets = defaultdict(list) c, batch = 0, min(10 * numPartitions, 1000) for k, v in iterator: buckets[partitionFunc(k) % numPartitions].append((k, v)) c += 1 # check used memory and avg size of chunk of objects if (c % 1000 == 0 and get_used_memory() > limit or c > batch): n, size = len(buckets), 0 for split in buckets.keys(): yield pack_long(split) d = outputSerializer.dumps(buckets[split]) del buckets[split] yield d size += len(d) avg = (size / n) >> 20 # let 1M < avg < 10M if avg < 1: batch *= 1.5 elif avg > 10: batch = max(batch / 1.5, 1) c = 0 for split, items in buckets.iteritems(): yield pack_long(split) yield outputSerializer.dumps(items)
def add_shuffle_key(split, iterator): buckets = defaultdict(list) for (k, v) in iterator: buckets[partitionFunc(k) % numPartitions].append((k, v)) for (split, items) in buckets.iteritems(): yield pack_long(split) yield dump_pickle(Batch(items))
def add_shuffle_key(split, iterator): buckets = defaultdict(list) for (k, v) in iterator: buckets[partitionFunc(k) % numPartitions].append((k, v)) for (split, items) in buckets.iteritems(): yield pack_long(split) yield outputSerializer.dumps(items)