from spark_pytools.jobs.init import sc

# Compute the average of an RDD of numbers with aggregate(): the zero value,
# the seqOp, and the combOp all carry a running (sum, count) pair.
nums = sc.parallelize([1, 3, 5, 9])
sumCount = nums.aggregate(
    (0, 0),
    (lambda acc, value: (acc[0] + value, acc[1] + 1)),            # seqOp: fold one value into (sum, count)
    (lambda acc1, acc2: (acc1[0] + acc2[0], acc1[1] + acc2[1])))  # combOp: merge per-partition pairs
print(sumCount[0] / float(sumCount[1]))
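# For comparison (not part of the original example, and assuming the same `sc`
# from spark_pytools.jobs.init): a minimal sketch of the same average computed
# with map() + reduce() over (value, 1) pairs, which mirrors what aggregate()
# does with its seqOp and combOp.
pairs = sc.parallelize([1, 3, 5, 9]).map(lambda x: (x, 1))
total, count = pairs.reduce(lambda a, b: (a[0] + b[0], a[1] + b[1]))
print(total / float(count))  # 4.5, matching the aggregate() result above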
# Compute the per-key average with combineByKey(), as in Example 4-12.
from spark_pytools.jobs.init import sc

nums = sc.parallelize([(1, 2), (3, 4), (3, 6)])
sum_count = nums.combineByKey(
    (lambda x: (x, 1)),                        # createCombiner: start a (sum, count) pair for a new key
    (lambda x, y: (x[0] + y, x[1] + 1)),       # mergeValue: fold another value into the pair
    (lambda x, y: (x[0] + y[0], x[1] + y[1]))  # mergeCombiners: merge pairs across partitions
)
print(sum_count.mapValues(lambda p: float(p[0]) / p[1]).collectAsMap())
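# For comparison (not part of the original example, and assuming the same `sc`
# as above): a sketch of the same per-key average via mapValues() and
# reduceByKey(), trading combineByKey's three explicit functions for a
# pre-mapped (sum, count) pair per value.
pairs = sc.parallelize([(1, 2), (3, 4), (3, 6)])
sum_count_alt = pairs.mapValues(lambda v: (v, 1)).reduceByKey(
    lambda a, b: (a[0] + b[0], a[1] + b[1]))
print(sum_count_alt.mapValues(lambda p: float(p[0]) / p[1]).collectAsMap())
# {1: 2.0, 3: 5.0}, matching the combineByKey() result above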