Ejemplo n.º 1
0
def test_lineage():
    """Exercise dep_lineage_counts and stage pipeline bookkeeping for unions."""
    Scope.reset()

    ctx = DparkContext()

    rdd1 = ctx.union([ctx.makeRDD([(1, 2)]) for _ in range(5)])
    assert len(rdd1.dep_lineage_counts) == 1

    rdd2 = ctx.union([ctx.makeRDD([(1, 2)]) for _ in range(3)])
    rdd3 = rdd1.union(rdd2)
    assert len(rdd3.dep_lineage_counts) == 2

    # union of unions: outer lineage collapses to a single counted dep
    inner_unions = [ctx.union([ctx.makeRDD([(1, 2)]) for _ in range(2)])
                    for _ in range(4)]
    rdd4 = ctx.union(inner_unions)
    assert len(rdd4.dep_lineage_counts) == 1
    first_dep = list(rdd4.dependencies)[0]
    assert len(first_dep.rdd.dep_lineage_counts) == 1
    rdd5 = rdd3.groupWith(rdd4)

    print("rdd1", rdd1.id, rdd1.dep_lineage_counts)
    stage = ctx.scheduler.newStage(rdd1, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert list(stage.pipelines) == [rdd1.id]
    assert stage.pipeline_edges == {}

    stage = ctx.scheduler.newStage(rdd3, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert sorted(stage.pipelines) == [rdd1.id, rdd2.id, rdd3.id]
    expected_edges = {
        ((-1, rdd1.id), (-1, rdd3.id)): 1,
        ((-1, rdd2.id), (-1, rdd3.id)): 1,
    }
    assert stage.pipeline_edges == expected_edges

    stage = ctx.scheduler.newStage(rdd4, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert list(stage.pipelines) == [rdd4.id]
    assert stage.pipeline_edges == {}

    print("rdd5", rdd5.id, rdd3.id, rdd4.id)
    stage = ctx.scheduler.newStage(rdd5, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert sorted(stage.pipelines) == [rdd5.id]
    parent_edges = [((s.id, s.rdd.id), (-1, rdd5.id)) for s in stage.parents]
    assert sorted(stage.pipeline_edges) == sorted(parent_edges)

    print('-' * 100)
    pprint(stage.get_pipeline_graph())

    for parent in stage.parents:
        if parent.rdd.id == rdd4.id:
            assert list(parent.pipelines) == [rdd4.id]
            assert parent.pipeline_edges == {}
        elif parent.rdd.id == rdd3.id:
            assert sorted(parent.pipelines) == [rdd1.id, rdd2.id, rdd3.id]
            assert parent.pipeline_edges == {
                ((-1, rdd1.id), (-1, rdd3.id)): 1,
                ((-1, rdd2.id), (-1, rdd3.id)): 1,
            }
        else:
            assert False

        pprint(parent.get_pipeline_graph())
Ejemplo n.º 2
0
    def get_rdd(self):
        """Union one text-file RDD per path in self.paths, parsed into Weblog rows."""
        ctx = DparkContext()
        # 64 MB split size for each input file
        parts = [ctx.textFile(path, splitSize=64 << 20) for path in self.paths]
        return ctx.union(parts).map(Weblog.from_line)
Ejemplo n.º 3
0
def test_call_graph_union():
    """Verify node ids and edge multiplicities in the call graph of a union of unions."""
    dc = DparkContext()
    Scope.reset()
    left = dc.union([dc.makeRDD([(1, 2)]) for _ in range(2)])
    right = dc.union([dc.makeRDD([(3, 4)]) for _ in range(2)])
    rdd = left.union(right)
    dc.scheduler.current_scope = Scope.get("")
    graph = dc.scheduler.get_call_graph(rdd)
    # formatting must not raise, even though the result is unused here
    dc.scheduler.fmt_call_graph(graph)
    expected_nodes = [0, 1, 2, 3, 4, 5]
    expected_edges = {(0, 1): 2, (1, 4): 1, (2, 3): 2, (3, 4): 1, (4, 5): 1}
    assert graph == (expected_nodes, expected_edges)
Ejemplo n.º 4
0
def test_call_graph_union():
    """The call graph of union(union, union) has 6 nodes and 5 weighted edges."""
    dc = DparkContext()
    Scope.reset()
    r1 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(2)])
    r2 = dc.union([dc.makeRDD([(3, 4)]) for _ in range(2)])
    combined = r1.union(r2)
    dc.scheduler.current_scope = Scope.get("")
    g = dc.scheduler.get_call_graph(combined)
    # exercise the formatter as well; its output is not asserted
    dc.scheduler.fmt_call_graph(g)
    assert g == (
        list(range(6)),
        {
            (0, 1): 2,
            (1, 4): 1,
            (2, 3): 2,
            (3, 4): 1,
            (4, 5): 1,
        },
    )
Ejemplo n.º 5
0
def test_lineage():
    """Check lineage counts and pipeline graphs for nested unions and groupWith."""
    Scope.reset()

    dc = DparkContext()

    def pair_rdd():
        # one single-pair source RDD on the shared context
        return dc.makeRDD([(1, 2)])

    rdd1 = dc.union([pair_rdd() for _ in range(5)])
    assert len(rdd1.dep_lineage_counts) == 1

    rdd2 = dc.union([pair_rdd() for _ in range(3)])
    rdd3 = rdd1.union(rdd2)
    assert len(rdd3.dep_lineage_counts) == 2

    rdd4 = dc.union(
        [dc.union([pair_rdd() for _ in range(2)]) for _ in range(4)])
    assert len(rdd4.dep_lineage_counts) == 1
    assert len(list(rdd4.dependencies)[0].rdd.dep_lineage_counts) == 1
    rdd5 = rdd3.groupWith(rdd4)

    print("rdd1", rdd1.id, rdd1.dep_lineage_counts)
    stage = dc.scheduler.newStage(rdd1, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert list(stage.pipelines.keys()) == [rdd1.id]
    assert stage.pipeline_edges == {}

    stage = dc.scheduler.newStage(rdd3, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert sorted(stage.pipelines.keys()) == [rdd1.id, rdd2.id, rdd3.id]
    assert stage.pipeline_edges == {((-1, rdd1.id), (-1, rdd3.id)): 1,
                                    ((-1, rdd2.id), (-1, rdd3.id)): 1}

    stage = dc.scheduler.newStage(rdd4, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert list(stage.pipelines.keys()) == [rdd4.id]
    assert stage.pipeline_edges == {}

    print("rdd5", rdd5.id, rdd3.id, rdd4.id)
    stage = dc.scheduler.newStage(rdd5, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert sorted(stage.pipelines.keys()) == [rdd5.id]
    assert sorted(stage.pipeline_edges) == sorted(
        ((s.id, s.rdd.id), (-1, rdd5.id)) for s in stage.parents)

    print('-' * 100)
    pprint(stage.get_pipeline_graph())

    for s in stage.parents:
        if s.rdd.id == rdd4.id:
            assert list(s.pipelines.keys()) == [rdd4.id]
            assert s.pipeline_edges == {}
        elif s.rdd.id == rdd3.id:
            assert sorted(s.pipelines.keys()) == [rdd1.id, rdd2.id, rdd3.id]
            assert s.pipeline_edges == {((-1, rdd1.id), (-1, rdd3.id)): 1,
                                        ((-1, rdd2.id), (-1, rdd3.id)): 1}
        else:
            assert False

        pprint(s.get_pipeline_graph())
Ejemplo n.º 6
0
def m(x):
    """Identity map function: return the input unchanged."""
    return x


def r(x, y):
    """Reduce function: combine two values with ``+``."""
    return x + y


def src():
    """Return a fresh two-partition RDD holding the single pair (1, 1)."""
    return dc.makeRDD([(1, 1)], 2)


dc = DparkContext("mesos")

rdd1 = src()
rdd2 = src().reduceByKey(r)

# Three leaf sources are unioned twice at the middle level, then merged
# with rdd1/rdd2 and reduced at the top level.
level1 = [src() for _ in range(2)] + [src()]
level2 = [dc.union(level1) for _ in range(2)] + [rdd2, rdd1]
level3 = [dc.union(level2).map(m).reduceByKey(r)] + [rdd2]
rdd3 = dc.union(level3)
rdd4 = rdd2.join(rdd2)

for each in (rdd1, rdd2, rdd3, rdd4):
    each.collect()
Ejemplo n.º 7
0
    def get_rdd(self):
        """Union the weblog text files and parse every line into a Weblog."""
        ctx = DparkContext()
        rdds = []
        for path in self.paths:
            # split each input file into 64 MB chunks
            rdds.append(ctx.textFile(path, splitSize=64 << 20))
        return ctx.union(rdds).map(Weblog.from_line)
Ejemplo n.º 8
0
# -*- coding: utf-8 -*-

from dpark import DparkContext


dc = DparkContext()


def get_rdd():
    """Return a fresh single-pair RDD [(1, 1)] on the shared context."""
    return dc.makeRDD([(1, 1)])


rdd1 = get_rdd()
rdd2 = dc.union([get_rdd() for _ in range(2)])
rdd3 = get_rdd().groupByKey()
dc.union([rdd1, rdd2, rdd3]).collect()
Ejemplo n.º 9
0
def m(x):
    """Pass-through map function used in the pipeline below."""
    return x


def r(x, y):
    """Additive reducer: fold two values together with ``+``."""
    return x + y


def src():
    """Build a new RDD containing the pair (1, 1), split over 2 partitions."""
    return dc.makeRDD([(1, 1)], 2)


dc = DparkContext("mesos")

rdd1 = src()
rdd2 = src().reduceByKey(r)

# Three leaf sources feed two identical mid-level unions, which are then
# merged with rdd1/rdd2, mapped, reduced, and unioned with rdd2 once more.
leaves = [src(), src(), src()]
mid_unions = [dc.union(leaves), dc.union(leaves)]
combined = dc.union(mid_unions + [rdd2, rdd1]).map(m).reduceByKey(r)
rdd3 = dc.union([combined, rdd2])
rdd4 = rdd2.join(rdd2)

rdd1.collect()
rdd2.collect()
rdd3.collect()
rdd4.collect()
Ejemplo n.º 10
0
# -*- coding: utf-8 -*-

from dpark import DparkContext

dc = DparkContext()


def get_rdd():
    """Fresh single-pair RDD bound to the module-level context."""
    return dc.makeRDD([(1, 1)])


pieces = [
    get_rdd(),
    dc.union([get_rdd() for i in range(2)]),
    get_rdd().groupByKey(),
]
dc.union(pieces).collect()