def _frequent_words(sc, path, min_count=20):
    """Build an RDD of frequent words found in the text file at *path*.

    Every non-word character is replaced by a space, the line is split on
    whitespace, and the resulting words are lower-cased and counted. Only
    words occurring at least *min_count* times are kept.

    Args:
        sc: The context used to read the file (must provide ``textFile``).
        path: Path of the text file to analyse.
        min_count: Minimum number of occurrences for a word to be kept.

    Returns:
        An RDD of ``(word, (count, path))`` pairs.
    """
    return (
        sc.textFile(path)
        # Raw string: ``\w`` must reach the regex engine unescaped.
        .flatMap(lambda line: re.sub(r"[^\w]", " ", line).split())
        .map(lambda word: (word.lower(), 1))
        .reduceByKey(lambda left, right: left + right)
        .filter(lambda pair: pair[1] >= min_count)
        # Tag each count with its source file so the origin survives the union.
        .map(lambda pair: (pair[0], (pair[1], path)))
    )


# The script needs exactly three input files; fail with a readable message
# instead of a bare IndexError when they are missing.
if len(sys.argv) < 4:
    sys.exit("usage: %s FILE1 FILE2 FILE3" % sys.argv[0])

sc = Context()

file1 = sys.argv[1]
rdd_part_1 = _frequent_words(sc, file1)

file2 = sys.argv[2]
rdd_part_2 = _frequent_words(sc, file2)

file3 = sys.argv[3]
rdd_part_3 = _frequent_words(sc, file3)

# Merge the per-file results, collect all (count, file) entries of a word
# under that word, and order the output by word.
rdd_max = sc.union([rdd_part_1, rdd_part_2, rdd_part_3]).groupByKey().sortByKey()

vals = rdd_max.collect()
for item in vals:
    # Parenthesised form prints the same text on Python 2 and Python 3.
    print(item)
def test_union():
    """Check that a context-level union of two one-element RDDs keeps both."""
    context = Context()
    greeting = context.parallelize(["Hello"])
    subject = context.parallelize(["World"])

    merged = context.union([greeting, subject]).collect()

    # Separate asserts so a failure pinpoints which expectation broke.
    assert len(merged) == 2
    assert "Hello" in merged
    assert "World" in merged
def test_union():
    """Check that unioning an RDD with itself doubles its element count."""
    context = Context()
    numbers = context.parallelize([4, 9, 7, 3, 2, 5], 3)

    doubled = numbers.union(numbers)

    # Six input elements, so the self-union must report twelve.
    assert doubled.count() == 12
def test_union():
    """Check that ``Context.union`` combines two single-element RDDs."""
    spark = Context()
    parts = [spark.parallelize(['Hello']), spark.parallelize(['World'])]

    combined = spark.union(parts).collect()

    assert len(combined) == 2
    # Both source elements must be present in the collected result.
    for expected in ('Hello', 'World'):
        assert expected in combined