Ejemplo n.º 1
0
sc = Context()


def _frequent_words(context, path, min_count=20):
    """Return an RDD of ``(word, (count, path))`` for frequent words in *path*.

    Every character that is not a word character is replaced by a space,
    the line is split on whitespace, words are lower-cased and counted,
    and only words seen at least *min_count* times are kept. Each
    surviving count is tagged with *path* so the originating file is
    still known after the per-file RDDs are merged.
    """
    return (
        context.textFile(path)
        # Raw string: "\w" in a plain literal is an invalid string escape.
        .flatMap(lambda line: re.sub(r"[^\w]", " ", line).split())
        .map(lambda word: (word.lower(), 1))
        .reduceByKey(lambda left, right: left + right)
        .filter(lambda pair: pair[1] >= min_count)
        .map(lambda pair: (pair[0], (pair[1], path)))
    )


# The three input files are taken from the command line, in order.
file1 = sys.argv[1]
rdd_part_1 = _frequent_words(sc, file1)

file2 = sys.argv[2]
rdd_part_2 = _frequent_words(sc, file2)

file3 = sys.argv[3]
rdd_part_3 = _frequent_words(sc, file3)

# Merge the per-file results, gather all (count, file) entries that share
# a word, and order the result by word.
rdd_max = sc.union([rdd_part_1, rdd_part_2,
                    rdd_part_3]).groupByKey().sortByKey()

vals = rdd_max.collect()

for item in vals:
    # Call form with a single argument prints the same text on Python 2
    # and Python 3; the bare `print item` statement only parses on 2.
    print(item)
Ejemplo n.º 2
0
def test_union():
    """Check that Context.union merges two one-element collections.

    The collected result must hold exactly two items, one from each
    input.
    """
    context = Context()
    parts = [context.parallelize([word]) for word in ("Hello", "World")]
    merged = context.union(parts).collect()
    assert len(merged) == 2
    assert "Hello" in merged
    assert "World" in merged
Ejemplo n.º 3
0
def test_union():
    """Check that unioning a six-element RDD with itself counts 12 items."""
    values = [4, 9, 7, 3, 2, 5]
    # Second argument to parallelize is passed through unchanged (3).
    rdd = Context().parallelize(values, 3)
    doubled = rdd.union(rdd)
    assert doubled.count() == 12
Ejemplo n.º 4
0
def test_union():
    """Verify Context.union over two single-item inputs keeps both items."""
    ctx = Context()
    hello_rdd = ctx.parallelize(['Hello'])
    world_rdd = ctx.parallelize(['World'])
    collected = ctx.union([hello_rdd, world_rdd]).collect()
    # All three conditions must hold: exact size and both members present.
    expected_size = len(collected) == 2
    has_both = 'Hello' in collected and 'World' in collected
    assert expected_size and has_both