Example #1
# Imports assumed from the original test module:
import os
import time

import ray
from ray.streaming import StreamingContext


def test_simple_word_count():
    ray.init(_load_code_from_local=True, _include_java=True)
    ctx = StreamingContext.Builder() \
        .build()
    sink_file = "/tmp/ray_streaming_test_simple_word_count.txt"
    if os.path.exists(sink_file):
        os.remove(sink_file)

    def sink_func(x):
        with open(sink_file, "a") as f:
            line = "{}:{},".format(x[0], x[1])
            print("sink_func", line)
            f.write(line)

    ctx.from_values("a", "b", "c") \
        .set_parallelism(1) \
        .flat_map(lambda x: [x, x]) \
        .map(lambda x: (x, 1)) \
        .key_by(lambda x: x[0]) \
        .reduce(lambda old_value, new_value:
                (old_value[0], old_value[1] + new_value[1])) \
        .sink(sink_func)
    ctx.submit("word_count")
    # Crude fixed wait for the job to produce output; Example #2 polls instead.
    time.sleep(3)
    ray.shutdown()
    with open(sink_file, "r") as f:
        result = f.read()
        assert "a:2" in result
        assert "b:2" in result
        assert "c:2" in result
Example #2
def test_simple_word_count():
    ray.init(_load_code_from_local=True)
    ctx = StreamingContext.Builder() \
        .build()
    sink_file = "/tmp/ray_streaming_test_simple_word_count.txt"
    if os.path.exists(sink_file):
        os.remove(sink_file)

    def sink_func(x):
        with open(sink_file, "a") as f:
            line = "{}:{},".format(x[0], x[1])
            print("sink_func", line)
            f.write(line)

    ctx.from_values("a", "b", "c") \
        .set_parallelism(1) \
        .flat_map(lambda x: [x, x]) \
        .map(lambda x: (x, 1)) \
        .key_by(lambda x: x[0]) \
        .reduce(lambda old_value, new_value:
                (old_value[0], old_value[1] + new_value[1])) \
        .sink(sink_func)
    ctx.submit("word_count")

    def check_succeed():
        if os.path.exists(sink_file):
            with open(sink_file, "r") as f:
                result = f.read()
                return "a:2" in result and "b:2" in result and "c:2" in result
        return False

    # wait_for_condition is imported from Ray's test utilities in the original
    # module; see the stand-in sketch below.
    wait_for_condition(check_succeed, timeout=60, retry_interval_ms=1000)
    print("Execution succeeded")
    ray.shutdown()
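The wait_for_condition helper above has moved between modules across Ray versions (ray.test_utils in Ray 1.x). To run the example standalone, a minimal stand-in with the same call signature might look like this (a sketch, not Ray's actual implementation):

import time


def wait_for_condition(condition_predictor, timeout=60, retry_interval_ms=1000):
    # Poll the predicate until it returns True or the timeout elapses.
    deadline = time.time() + timeout
    while time.time() < deadline:
        if condition_predictor():
            return
        time.sleep(retry_interval_ms / 1000.0)
    raise RuntimeError("The condition was not met before the timeout expired.")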
Example #3
def test_word_count():
    try:
        ray.init(job_config=ray.job_config.JobConfig(
            code_search_path=sys.path))
        # time.sleep(10) # for gdb to attach
        ctx = StreamingContext.Builder() \
            .option("streaming.context-backend.type", "local_file") \
            .option(
                "streaming.context-backend.file-state.root",
                "/tmp/ray/cp_files/"
            ) \
            .option("streaming.checkpoint.timeout.secs", "3") \
            .build()

        print("-----------submit job-------------")

        ctx.read_text_file(__file__) \
            .set_parallelism(1) \
            .flat_map(lambda x: x.split()) \
            .map(lambda x: (x, 1)) \
            .key_by(lambda x: x[0]) \
            .reduce(lambda old_value, new_value:
                    (old_value[0], old_value[1] + new_value[1])) \
            .filter(lambda x: "ray" not in x) \
            .sink(lambda x: print("####result", x))
        ctx.submit("word_count")

        print("-----------checking output-------------")
        retry_count = 180 // 5  # wait up to 3 minutes (36 retries x 5 s)
        while not has_sink_output():
            time.sleep(5)
            retry_count -= 1
            if retry_count <= 0:
                raise RuntimeError("Can not find output")

        print("-----------killing worker-------------")
        time.sleep(5)
        kill_all_worker()

        print("-----------checking checkpoint-------------")
        cp_ok_num = checkpoint_success_num()
        retry_count = 300 // 5  # wait up to 5 minutes (60 retries x 5 s)
        while True:
            cur_cp_num = checkpoint_success_num()
            print("-----------checking checkpoint"
                  ", cur_cp_num={}, old_cp_num={}-------------".format(
                      cur_cp_num, cp_ok_num))
            if cur_cp_num > cp_ok_num:
                print("--------------TEST OK!------------------")
                break
            time.sleep(5)
            retry_count -= 1
            if retry_count <= 0:
                raise RuntimeError(
                    "Checkpoint keeps failing after fail-over, test failed!")
    finally:
        ray.shutdown()
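has_sink_output, checkpoint_success_num, and kill_all_worker are helpers defined elsewhere in the original test module: the first detects whether the sink's "####result" marker has shown up, the second counts completed checkpoints under the configured file-state root, and the third kills the job's worker processes to force a fail-over. A hypothetical sketch of the first two (assumed implementations, not the originals):

import glob


def has_sink_output():
    # The sink prints "####result"; look for that marker in Ray's worker logs.
    # (The log location is an assumption and varies by Ray version.)
    for log_file in glob.glob("/tmp/ray/session_latest/logs/worker*"):
        with open(log_file, errors="ignore") as f:
            if "####result" in f.read():
                return True
    return False


def checkpoint_success_num():
    # Count checkpoint files under the file-state root configured above.
    return len(glob.glob("/tmp/ray/cp_files/*"))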
Example #4
def test_hybrid_stream():
    subprocess.check_call(
        [
            "bazel",
            "build",
            "@com_github_ray_streaming//java:all_streaming_tests_deploy.jar",
        ]
    )
    current_dir = os.path.abspath(os.path.dirname(__file__))
    jar_path = os.path.join(
        current_dir, "../../../bazel-bin/streaming/java/all_streaming_tests_deploy.jar"
    )
    jar_path = os.path.abspath(jar_path)
    print("jar_path", jar_path)
    assert not ray.is_initialized()
    ray.init(
        job_config=ray.job_config.JobConfig(code_search_path=sys.path + [jar_path])
    )

    sink_file = "/tmp/ray_streaming_test_hybrid_stream.txt"
    if os.path.exists(sink_file):
        os.remove(sink_file)

    def sink_func(x):
        print("HybridStreamTest", x)
        with open(sink_file, "a") as f:
            f.write(str(x))
            f.flush()

    ctx = StreamingContext.Builder().build()
    ctx.from_values("a", "b", "c").as_java_stream().map(
        "io.ray.streaming.runtime.demo.HybridStreamTest$Mapper1"
    ).filter(
        "io.ray.streaming.runtime.demo.HybridStreamTest$Filter1"
    ).as_python_stream().sink(
        sink_func
    )
    ctx.submit("HybridStreamTest")

    def check_succeed():
        if os.path.exists(sink_file):
            import time

            time.sleep(3)  # Wait for all data to be written
            with open(sink_file, "r") as f:
                result = f.read()
                assert "a" in result
                assert "b" not in result
                assert "c" in result
            print("Execution succeed")
            return True
        return False

    wait_for_condition(check_succeed, timeout=60, retry_interval_ms=1000)
    print("Execution succeed")
    ray.shutdown()
Example #5
def test_data_stream():
    ray.init(job_config=ray.job_config.JobConfig(code_search_path=sys.path))
    ctx = StreamingContext.Builder().build()
    stream = ctx.from_values(1, 2, 3)
    java_stream = stream.as_java_stream()
    python_stream = java_stream.as_python_stream()
    assert stream.get_id() == java_stream.get_id()
    assert stream.get_id() == python_stream.get_id()
    python_stream.set_parallelism(10)
    assert stream.get_parallelism() == java_stream.get_parallelism()
    assert stream.get_parallelism() == python_stream.get_parallelism()
    ray.shutdown()
Example #6
def test_data_stream():
    ray.init(_load_code_from_local=True)
    ctx = StreamingContext.Builder().build()
    stream = ctx.from_values(1, 2, 3)
    java_stream = stream.as_java_stream()
    python_stream = java_stream.as_python_stream()
    assert stream.get_id() == java_stream.get_id()
    assert stream.get_id() == python_stream.get_id()
    python_stream.set_parallelism(10)
    assert stream.get_parallelism() == java_stream.get_parallelism()
    assert stream.get_parallelism() == python_stream.get_parallelism()
    ray.shutdown()
Example #7
def test_hybrid_stream():
    subprocess.check_call(
        ["bazel", "build", "//streaming/java:all_streaming_tests_deploy.jar"])
    current_dir = os.path.abspath(os.path.dirname(__file__))
    jar_path = os.path.join(
        current_dir,
        "../../../bazel-bin/streaming/java/all_streaming_tests_deploy.jar")
    jar_path = os.path.abspath(jar_path)
    print("jar_path", jar_path)
    java_worker_options = json.dumps(["-classpath", jar_path])
    print("java_worker_options", java_worker_options)
    assert not ray.is_initialized()
    ray.init(_load_code_from_local=True,
             _include_java=True,
             _java_worker_options=java_worker_options,
             _system_config={"num_workers_per_process_java": 1})

    sink_file = "/tmp/ray_streaming_test_hybrid_stream.txt"
    if os.path.exists(sink_file):
        os.remove(sink_file)

    def sink_func(x):
        print("HybridStreamTest", x)
        with open(sink_file, "a") as f:
            f.write(str(x))
            f.flush()

    ctx = StreamingContext.Builder().build()
    ctx.from_values("a", "b", "c") \
        .as_java_stream() \
        .map("io.ray.streaming.runtime.demo.HybridStreamTest$Mapper1") \
        .filter("io.ray.streaming.runtime.demo.HybridStreamTest$Filter1") \
        .as_python_stream() \
        .sink(sink_func)
    ctx.submit("HybridStreamTest")

    def check_succeed():
        if os.path.exists(sink_file):
            import time
            time.sleep(3)  # Wait for all data to be written
            with open(sink_file, "r") as f:
                result = f.read()
                assert "a" in result
                assert "b" not in result
                assert "c" in result
            print("Execution succeed")
            return True
        return False

    wait_for_condition(check_succeed, timeout=60, retry_interval_ms=1000)
    print("Execution succeed")
    ray.shutdown()
Example #8
def test_key_data_stream():
    ray.init(job_config=ray.job_config.JobConfig(code_search_path=sys.path))
    ctx = StreamingContext.Builder().build()
    key_stream = (ctx.from_values(
        "a", "b", "c").map(lambda x: (x, 1)).key_by(lambda x: x[0]))
    java_stream = key_stream.as_java_stream()
    python_stream = java_stream.as_python_stream()
    assert key_stream.get_id() == java_stream.get_id()
    assert key_stream.get_id() == python_stream.get_id()
    python_stream.set_parallelism(10)
    assert key_stream.get_parallelism() == java_stream.get_parallelism()
    assert key_stream.get_parallelism() == python_stream.get_parallelism()
    ray.shutdown()
Example #9
def test_key_data_stream():
    ray.init(_load_code_from_local=True)
    ctx = StreamingContext.Builder().build()
    key_stream = ctx.from_values(
        "a", "b", "c").map(lambda x: (x, 1)).key_by(lambda x: x[0])
    java_stream = key_stream.as_java_stream()
    python_stream = java_stream.as_python_stream()
    assert key_stream.get_id() == java_stream.get_id()
    assert key_stream.get_id() == python_stream.get_id()
    python_stream.set_parallelism(10)
    assert key_stream.get_parallelism() == java_stream.get_parallelism()
    assert key_stream.get_parallelism() == python_stream.get_parallelism()
    ray.shutdown()
Example #10
def test_stream_config():
    ray.init(job_config=ray.job_config.JobConfig(code_search_path=sys.path))
    ctx = StreamingContext.Builder().build()
    stream = ctx.from_values(1, 2, 3)
    stream.with_config("k1", "v1")
    print("config", stream.get_config())
    assert stream.get_config() == {"k1": "v1"}
    stream.with_config(conf={"k2": "v2", "k3": "v3"})
    print("config", stream.get_config())
    assert stream.get_config() == {"k1": "v1", "k2": "v2", "k3": "v3"}
    java_stream = stream.as_java_stream()
    java_stream.with_config(conf={"k4": "v4"})
    config = java_stream.get_config()
    print("config", config)
    assert config == {"k1": "v1", "k2": "v2", "k3": "v3", "k4": "v4"}
    ray.shutdown()
Example #11
def test_stream_config():
    ray.init(_load_code_from_local=True)
    ctx = StreamingContext.Builder().build()
    stream = ctx.from_values(1, 2, 3)
    stream.with_config("k1", "v1")
    print("config", stream.get_config())
    assert stream.get_config() == {"k1": "v1"}
    stream.with_config(conf={"k2": "v2", "k3": "v3"})
    print("config", stream.get_config())
    assert stream.get_config() == {"k1": "v1", "k2": "v2", "k3": "v3"}
    java_stream = stream.as_java_stream()
    java_stream.with_config(conf={"k4": "v4"})
    config = java_stream.get_config()
    print("config", config)
    assert config == {"k1": "v1", "k2": "v2", "k3": "v3", "k4": "v4"}
    ray.shutdown()
Example #12
def test_word_count():
    ray.init(_load_code_from_local=True)
    ctx = StreamingContext.Builder() \
        .build()
    ctx.read_text_file(__file__) \
        .set_parallelism(1) \
        .flat_map(lambda x: x.split()) \
        .map(lambda x: (x, 1)) \
        .key_by(lambda x: x[0]) \
        .reduce(lambda old_value, new_value:
                (old_value[0], old_value[1] + new_value[1])) \
        .filter(lambda x: "ray" not in x) \
        .sink(lambda x: print("result", x))
    ctx.submit("word_count")
    import time
    time.sleep(3)  # Crude fixed wait for the job to print its results
    ray.shutdown()
Example #13
def test_union_stream():
    ray.init(job_config=ray.job_config.JobConfig(code_search_path=sys.path))
    ctx = StreamingContext.Builder() \
        .option("streaming.metrics.reporters", "") \
        .build()
    sink_file = "/tmp/test_union_stream.txt"
    if os.path.exists(sink_file):
        os.remove(sink_file)

    def sink_func(x):
        with open(sink_file, "a") as f:
            print("sink_func", x)
            f.write(str(x))

    stream1 = ctx.from_values(1, 2)
    stream2 = ctx.from_values(3, 4)
    stream3 = ctx.from_values(5, 6)
    stream1.union(stream2, stream3).sink(sink_func)
    ctx.submit("test_union_stream")
    import time
    slept_time = 0
    while True:
        if os.path.exists(sink_file):
            time.sleep(3)
            with open(sink_file, "r") as f:
                result = f.read()
                print("sink result", result)
                assert set(result) == {"1", "2", "3", "4", "5", "6"}
            print("Execution succeed")
            break
        if slept_time >= 60:
            raise Exception("Execution not finished")
        slept_time = slept_time + 1
        print("Wait finish...")
        time.sleep(1)

    ray.shutdown()
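The hand-rolled polling loop above can instead be written with the wait_for_condition helper used in Example #2 (a sketch reusing sink_file from above):

def check_union_succeed():
    if not os.path.exists(sink_file):
        return False
    with open(sink_file, "r") as f:
        # Each value is written as a single character, in arbitrary order.
        return set(f.read()) == {"1", "2", "3", "4", "5", "6"}


wait_for_condition(check_union_succeed, timeout=60, retry_interval_ms=1000)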
Example #14
# Assumed imports from the original script (paths per the Ray streaming examples):
#   import time
#   import ray
#   from ray.streaming import StreamingContext
#   from ray.streaming.config import Config
# `Wikipedia` is the example's custom source that reads the listed articles.


# Splits an input line into words and
# outputs records of the form (word, 1).
def splitter(line):
    return [(word, 1) for word in line.split()]
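
# A quick illustration of splitter's output:
#   splitter("to be or not to be")
#   == [("to", 1), ("be", 1), ("or", 1), ("not", 1), ("to", 1), ("be", 1)]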


if __name__ == "__main__":
    # Get program parameters. `parser` is defined at module level in the
    # original script; a minimal equivalent would be:
    #   import argparse
    #   parser = argparse.ArgumentParser()
    #   parser.add_argument("--titles-file", required=True)
    args = parser.parse_args()
    titles_file = str(args.titles_file)

    ray.init(_load_code_from_local=True)

    ctx = StreamingContext.Builder() \
        .option(Config.CHANNEL_TYPE, Config.NATIVE_CHANNEL) \
        .build()
    # A Ray streaming context configured to use native channels
    ctx.set_parallelism(1)  # Each operator will be executed by one actor

    # Reads articles from wikipedia, splits them in words,
    # shuffles words, and counts the occurrences of each word.
    stream = ctx.source(Wikipedia(titles_file)) \
        .flat_map(splitter) \
        .key_by(lambda x: x[0]) \
        .reduce(lambda old_value, new_value:
                (old_value[0], old_value[1] + new_value[1])) \
        .sink(print)
    start = time.time()
    ctx.execute("wordcount")
    end = time.time()
    print("Elapsed time:", end - start, "seconds")