def test_simple_word_count():
    """Run a small word-count pipeline and verify the counts seen by the sink.

    Each of "a", "b" and "c" is duplicated by the flat_map, so the keyed
    reduce is expected to emit a count of 2 for every word.  The sink appends
    "<word>:<count>," records to a file, which is polled until all expected
    records are present or a timeout expires.
    """
    import time

    sink_file = "/tmp/ray_streaming_test_simple_word_count.txt"
    expected = ("a:2", "b:2", "c:2")
    timeout_secs = 60

    def sink_func(x):
        with open(sink_file, "a") as f:
            line = "{}:{},".format(x[0], x[1])
            print("sink_func", line)
            f.write(line)

    def read_sink():
        # The file only exists once the sink has written its first record.
        if not os.path.exists(sink_file):
            return ""
        with open(sink_file, "r") as f:
            return f.read()

    ray.init(_load_code_from_local=True, _include_java=True)
    try:
        ctx = StreamingContext.Builder().build()
        # Remove output left by a previous run so that stale records cannot
        # satisfy the assertions below.
        if os.path.exists(sink_file):
            os.remove(sink_file)
        (
            ctx.from_values("a", "b", "c")
            .set_parallelism(1)
            .flat_map(lambda x: [x, x])
            .map(lambda x: (x, 1))
            .key_by(lambda x: x[0])
            .reduce(
                lambda old_value, new_value: (
                    old_value[0],
                    old_value[1] + new_value[1],
                )
            )
            .sink(sink_func)
        )
        ctx.submit("word_count")

        # Poll instead of sleeping for a fixed time: the job may need more
        # than a few seconds to produce its output on a loaded machine.
        deadline = time.monotonic() + timeout_secs
        result = read_sink()
        while (
            not all(record in result for record in expected)
            and time.monotonic() < deadline
        ):
            time.sleep(1)
            result = read_sink()
    finally:
        # Always shut down so a failure here cannot leak an initialized
        # runtime into the following tests.
        ray.shutdown()

    assert "a:2" in result
    assert "b:2" in result
    assert "c:2" in result
def test_simple_word_count():
    """Run a small word-count pipeline and wait for the expected sink output.

    Each of "a", "b" and "c" is duplicated by the flat_map, so the keyed
    reduce is expected to emit a count of 2 for every word.  The sink appends
    "<word>:<count>," records to a file that is checked via
    ``wait_for_condition``.
    """
    ray.init(_load_code_from_local=True)
    try:
        ctx = StreamingContext.Builder().build()
        sink_file = "/tmp/ray_streaming_test_simple_word_count.txt"
        # Remove output left by a previous run so that stale records cannot
        # satisfy the check below.
        if os.path.exists(sink_file):
            os.remove(sink_file)

        def sink_func(x):
            with open(sink_file, "a") as f:
                line = "{}:{},".format(x[0], x[1])
                print("sink_func", line)
                f.write(line)

        (
            ctx.from_values("a", "b", "c")
            .set_parallelism(1)
            .flat_map(lambda x: [x, x])
            .map(lambda x: (x, 1))
            .key_by(lambda x: x[0])
            .reduce(
                lambda old_value, new_value: (
                    old_value[0],
                    old_value[1] + new_value[1],
                )
            )
            .sink(sink_func)
        )
        ctx.submit("word_count")

        def check_succeed():
            # The file only exists once the sink has written something.
            if not os.path.exists(sink_file):
                return False
            with open(sink_file, "r") as f:
                result = f.read()
            return "a:2" in result and "b:2" in result and "c:2" in result

        wait_for_condition(check_succeed, timeout=60, retry_interval_ms=1000)
        print("Execution succeed")
    finally:
        # Shut down even when the wait fails, so an initialized runtime is
        # not leaked into the following tests.
        ray.shutdown()
def test_word_count():
    """Word count with fail-over: kill the workers and expect new checkpoints.

    The job counts the words of this very file using a local-file context
    backend.  Once sink output is observed (``has_sink_output``), all workers
    are killed (``kill_all_worker``) and the test waits until
    ``checkpoint_success_num`` reports a larger value than before.

    Raises:
        RuntimeError: if no sink output shows up within 3 minutes, or if the
            checkpoint count does not grow within 5 minutes.
    """
    poll_interval_secs = 5
    try:
        ray.init(
            job_config=ray.job_config.JobConfig(code_search_path=sys.path))
        # NOTE: add a time.sleep(10) here when gdb needs time to attach.
        ctx = (
            StreamingContext.Builder()
            .option("streaming.context-backend.type", "local_file")
            .option(
                "streaming.context-backend.file-state.root",
                "/tmp/ray/cp_files/",
            )
            .option("streaming.checkpoint.timeout.secs", "3")
            .build()
        )

        print("-----------submit job-------------")
        (
            ctx.read_text_file(__file__)
            .set_parallelism(1)
            .flat_map(lambda x: x.split())
            .map(lambda x: (x, 1))
            .key_by(lambda x: x[0])
            .reduce(
                lambda old_value, new_value: (
                    old_value[0],
                    old_value[1] + new_value[1],
                )
            )
            .filter(lambda x: "ray" not in x)
            .sink(lambda x: print("####result", x))
        )
        ctx.submit("word_count")

        print("-----------checking output-------------")
        retry_count = 180 // poll_interval_secs  # wait for 3min
        while not has_sink_output():
            time.sleep(poll_interval_secs)
            retry_count -= 1
            if retry_count <= 0:
                raise RuntimeError("Can not find output")

        print("-----------killing worker-------------")
        time.sleep(5)
        kill_all_worker()

        print("-----------checking checkpoint-------------")
        cp_ok_num = checkpoint_success_num()
        # 300 seconds / 5 seconds per retry == 5 minutes.  The previous value
        # (300000 / 5) made a failing run wait for days instead of minutes.
        retry_count = 300 // poll_interval_secs  # wait for 5min
        while True:
            cur_cp_num = checkpoint_success_num()
            print("-----------checking checkpoint"
                  ", cur_cp_num={}, old_cp_num={}-------------".format(
                      cur_cp_num, cp_ok_num))
            if cur_cp_num > cp_ok_num:
                print("--------------TEST OK!------------------")
                break
            time.sleep(poll_interval_secs)
            retry_count -= 1
            if retry_count <= 0:
                raise RuntimeError(
                    "Checkpoint keeps failing after fail-over, test failed!")
    finally:
        ray.shutdown()
def test_hybrid_stream():
    """Run a pipeline that hops Python -> Java -> Python and check the sink.

    The Java test jar is built with bazel and added to the job's code search
    path.  The values "a", "b", "c" go through the Java ``Mapper1`` and
    ``Filter1`` operators; the test expects "a" and "c" but not "b" in the
    file written by the Python sink.
    """
    subprocess.check_call(
        [
            "bazel",
            "build",
            "@com_github_ray_streaming//java:all_streaming_tests_deploy.jar",
        ]
    )
    current_dir = os.path.abspath(os.path.dirname(__file__))
    jar_path = os.path.join(
        current_dir,
        "../../../bazel-bin/streaming/java/all_streaming_tests_deploy.jar",
    )
    jar_path = os.path.abspath(jar_path)
    print("jar_path", jar_path)

    assert not ray.is_initialized()
    ray.init(
        job_config=ray.job_config.JobConfig(
            code_search_path=sys.path + [jar_path])
    )
    try:
        sink_file = "/tmp/ray_streaming_test_hybrid_stream.txt"
        # Remove output left by a previous run.
        if os.path.exists(sink_file):
            os.remove(sink_file)

        def sink_func(x):
            print("HybridStreamTest", x)
            with open(sink_file, "a") as f:
                f.write(str(x))
                f.flush()

        ctx = StreamingContext.Builder().build()
        (
            ctx.from_values("a", "b", "c")
            .as_java_stream()
            .map("io.ray.streaming.runtime.demo.HybridStreamTest$Mapper1")
            .filter("io.ray.streaming.runtime.demo.HybridStreamTest$Filter1")
            .as_python_stream()
            .sink(sink_func)
        )
        ctx.submit("HybridStreamTest")

        def check_succeed():
            if os.path.exists(sink_file):
                import time

                time.sleep(3)  # Wait all data be written
                with open(sink_file, "r") as f:
                    result = f.read()
                    assert "a" in result
                    assert "b" not in result
                    assert "c" in result
                    print("Execution succeed")
                    return True
            return False

        wait_for_condition(check_succeed, timeout=60, retry_interval_ms=1000)
        print("Execution succeed")
    finally:
        # Shut down even on failure; otherwise the next test that asserts
        # `not ray.is_initialized()` fails for an unrelated reason.
        ray.shutdown()
def test_data_stream():
    """Check that language conversions keep a stream's id and parallelism.

    A Python stream is converted to a Java stream and back; all three handles
    must report the same id, and a parallelism set through one handle must be
    visible through the others.
    """
    ray.init(job_config=ray.job_config.JobConfig(code_search_path=sys.path))
    try:
        ctx = StreamingContext.Builder().build()
        stream = ctx.from_values(1, 2, 3)
        java_stream = stream.as_java_stream()
        python_stream = java_stream.as_python_stream()

        assert stream.get_id() == java_stream.get_id()
        assert stream.get_id() == python_stream.get_id()

        python_stream.set_parallelism(10)
        assert stream.get_parallelism() == java_stream.get_parallelism()
        assert stream.get_parallelism() == python_stream.get_parallelism()
    finally:
        # A failing assertion must not leave the runtime initialized for the
        # tests that follow.
        ray.shutdown()
def test_data_stream():
    """Check that language conversions keep a stream's id and parallelism.

    A Python stream is converted to a Java stream and back; all three handles
    must report the same id, and a parallelism set through one handle must be
    visible through the others.
    """
    ray.init(_load_code_from_local=True)
    try:
        ctx = StreamingContext.Builder().build()
        stream = ctx.from_values(1, 2, 3)
        java_stream = stream.as_java_stream()
        python_stream = java_stream.as_python_stream()

        assert stream.get_id() == java_stream.get_id()
        assert stream.get_id() == python_stream.get_id()

        python_stream.set_parallelism(10)
        assert stream.get_parallelism() == java_stream.get_parallelism()
        assert stream.get_parallelism() == python_stream.get_parallelism()
    finally:
        # A failing assertion must not leave the runtime initialized for the
        # tests that follow.
        ray.shutdown()
def test_hybrid_stream():
    """Run a pipeline that hops Python -> Java -> Python and check the sink.

    The Java test jar is built with bazel and passed to the Java workers via
    ``-classpath``.  The values "a", "b", "c" go through the Java ``Mapper1``
    and ``Filter1`` operators; the test expects "a" and "c" but not "b" in
    the file written by the Python sink.
    """
    subprocess.check_call(
        ["bazel", "build", "//streaming/java:all_streaming_tests_deploy.jar"])
    current_dir = os.path.abspath(os.path.dirname(__file__))
    jar_path = os.path.join(
        current_dir,
        "../../../bazel-bin/streaming/java/all_streaming_tests_deploy.jar")
    jar_path = os.path.abspath(jar_path)
    print("jar_path", jar_path)
    java_worker_options = json.dumps(["-classpath", jar_path])
    print("java_worker_options", java_worker_options)

    assert not ray.is_initialized()
    ray.init(
        _load_code_from_local=True,
        _include_java=True,
        _java_worker_options=java_worker_options,
        _system_config={"num_workers_per_process_java": 1},
    )
    try:
        sink_file = "/tmp/ray_streaming_test_hybrid_stream.txt"
        # Remove output left by a previous run.
        if os.path.exists(sink_file):
            os.remove(sink_file)

        def sink_func(x):
            print("HybridStreamTest", x)
            with open(sink_file, "a") as f:
                f.write(str(x))
                f.flush()

        ctx = StreamingContext.Builder().build()
        (
            ctx.from_values("a", "b", "c")
            .as_java_stream()
            .map("io.ray.streaming.runtime.demo.HybridStreamTest$Mapper1")
            .filter("io.ray.streaming.runtime.demo.HybridStreamTest$Filter1")
            .as_python_stream()
            .sink(sink_func)
        )
        ctx.submit("HybridStreamTest")

        def check_succeed():
            if os.path.exists(sink_file):
                import time

                time.sleep(3)  # Wait all data be written
                with open(sink_file, "r") as f:
                    result = f.read()
                    assert "a" in result
                    assert "b" not in result
                    assert "c" in result
                    print("Execution succeed")
                    return True
            return False

        wait_for_condition(check_succeed, timeout=60, retry_interval_ms=1000)
        print("Execution succeed")
    finally:
        # Shut down even on failure; otherwise a later
        # `assert not ray.is_initialized()` fails for an unrelated reason.
        ray.shutdown()
def test_key_data_stream():
    """Check that a keyed stream keeps its id and parallelism across languages.

    A keyed Python stream is converted to a Java stream and back; all three
    handles must report the same id, and a parallelism set through one handle
    must be visible through the others.
    """
    ray.init(job_config=ray.job_config.JobConfig(code_search_path=sys.path))
    try:
        ctx = StreamingContext.Builder().build()
        key_stream = (
            ctx.from_values("a", "b", "c")
            .map(lambda x: (x, 1))
            .key_by(lambda x: x[0])
        )
        java_stream = key_stream.as_java_stream()
        python_stream = java_stream.as_python_stream()

        assert key_stream.get_id() == java_stream.get_id()
        assert key_stream.get_id() == python_stream.get_id()

        python_stream.set_parallelism(10)
        assert key_stream.get_parallelism() == java_stream.get_parallelism()
        assert key_stream.get_parallelism() == python_stream.get_parallelism()
    finally:
        # A failing assertion must not leave the runtime initialized for the
        # tests that follow.
        ray.shutdown()
def test_key_data_stream():
    """Check that a keyed stream keeps its id and parallelism across languages.

    A keyed Python stream is converted to a Java stream and back; all three
    handles must report the same id, and a parallelism set through one handle
    must be visible through the others.
    """
    ray.init(_load_code_from_local=True)
    try:
        ctx = StreamingContext.Builder().build()
        key_stream = (
            ctx.from_values("a", "b", "c")
            .map(lambda x: (x, 1))
            .key_by(lambda x: x[0])
        )
        java_stream = key_stream.as_java_stream()
        python_stream = java_stream.as_python_stream()

        assert key_stream.get_id() == java_stream.get_id()
        assert key_stream.get_id() == python_stream.get_id()

        python_stream.set_parallelism(10)
        assert key_stream.get_parallelism() == java_stream.get_parallelism()
        assert key_stream.get_parallelism() == python_stream.get_parallelism()
    finally:
        # A failing assertion must not leave the runtime initialized for the
        # tests that follow.
        ray.shutdown()
def test_stream_config():
    """Check that ``with_config`` accumulates entries on a stream.

    Config is added as a single key/value pair, then as a dict, then through
    the Java view of the same stream; after each step ``get_config`` must
    return everything added so far.
    """
    ray.init(job_config=ray.job_config.JobConfig(code_search_path=sys.path))
    try:
        ctx = StreamingContext.Builder().build()
        stream = ctx.from_values(1, 2, 3)

        stream.with_config("k1", "v1")
        print("config", stream.get_config())
        assert stream.get_config() == {"k1": "v1"}

        stream.with_config(conf={"k2": "v2", "k3": "v3"})
        print("config", stream.get_config())
        assert stream.get_config() == {"k1": "v1", "k2": "v2", "k3": "v3"}

        java_stream = stream.as_java_stream()
        java_stream.with_config(conf={"k4": "v4"})
        config = java_stream.get_config()
        print("config", config)
        assert config == {"k1": "v1", "k2": "v2", "k3": "v3", "k4": "v4"}
    finally:
        # A failing assertion must not leave the runtime initialized for the
        # tests that follow.
        ray.shutdown()
def test_stream_config():
    """Check that ``with_config`` accumulates entries on a stream.

    Config is added as a single key/value pair, then as a dict, then through
    the Java view of the same stream; after each step ``get_config`` must
    return everything added so far.
    """
    ray.init(_load_code_from_local=True)
    try:
        ctx = StreamingContext.Builder().build()
        stream = ctx.from_values(1, 2, 3)

        stream.with_config("k1", "v1")
        print("config", stream.get_config())
        assert stream.get_config() == {"k1": "v1"}

        stream.with_config(conf={"k2": "v2", "k3": "v3"})
        print("config", stream.get_config())
        assert stream.get_config() == {"k1": "v1", "k2": "v2", "k3": "v3"}

        java_stream = stream.as_java_stream()
        java_stream.with_config(conf={"k4": "v4"})
        config = java_stream.get_config()
        print("config", config)
        assert config == {"k1": "v1", "k2": "v2", "k3": "v3", "k4": "v4"}
    finally:
        # A failing assertion must not leave the runtime initialized for the
        # tests that follow.
        ray.shutdown()
def test_word_count():
    """Smoke test: submit a word count over this file and let it run briefly.

    The job splits every line of this source file into words, counts them per
    key, drops results containing "ray" and prints the rest.  Nothing is
    asserted; the test only fails if building or submitting the job raises.
    """
    import time

    ray.init(_load_code_from_local=True)
    try:
        ctx = StreamingContext.Builder().build()
        (
            ctx.read_text_file(__file__)
            .set_parallelism(1)
            .flat_map(lambda x: x.split())
            .map(lambda x: (x, 1))
            .key_by(lambda x: x[0])
            .reduce(
                lambda old_value, new_value: (
                    old_value[0],
                    old_value[1] + new_value[1],
                )
            )
            .filter(lambda x: "ray" not in x)
            .sink(lambda x: print("result", x))
        )
        ctx.submit("word_count")
        # Give the submitted job a moment to run before tearing down.
        time.sleep(3)
    finally:
        # Shut down even when build/submit raises, so the runtime is not
        # leaked into the following tests.
        ray.shutdown()
def test_union_stream():
    """Union three source streams and check that the sink sees every value.

    The sink appends ``str(x)`` for each record to a file.  The test waits
    (about one minute at most) for the file to appear, gives the writers a few
    more seconds, and then expects the set of characters in the file to be
    exactly the digits 1 to 6.

    Raises:
        Exception: if the sink file does not appear in time.
    """
    import time

    ray.init(job_config=ray.job_config.JobConfig(code_search_path=sys.path))
    try:
        ctx = (
            StreamingContext.Builder()
            .option("streaming.metrics.reporters", "")
            .build()
        )
        sink_file = "/tmp/test_union_stream.txt"
        # Remove output left by a previous run.
        if os.path.exists(sink_file):
            os.remove(sink_file)

        def sink_func(x):
            with open(sink_file, "a") as f:
                print("sink_func", x)
                f.write(str(x))

        stream1 = ctx.from_values(1, 2)
        stream2 = ctx.from_values(3, 4)
        stream3 = ctx.from_values(5, 6)
        stream1.union(stream2, stream3).sink(sink_func)
        ctx.submit("test_union_stream")

        slept_time = 0
        while True:
            if os.path.exists(sink_file):
                # The file appears on the first write; wait a little so the
                # remaining records can be written too.
                time.sleep(3)
                with open(sink_file, "r") as f:
                    result = f.read()
                    print("sink result", result)
                    assert set(result) == {"1", "2", "3", "4", "5", "6"}
                print("Execution succeed")
                break
            if slept_time >= 60:
                raise Exception("Execution not finished")
            slept_time = slept_time + 1
            print("Wait finish...")
            time.sleep(1)
    finally:
        # Shut down on timeout or assertion failure as well, so the runtime
        # is not leaked into the following tests.
        ray.shutdown()
def splitter(line):
    """Split *line* on whitespace and pair every word with a count of 1.

    Returns a list of ``(word, 1)`` tuples, in the order the words occur.
    """
    return list(map(lambda word: (word, 1), line.split()))


if __name__ == "__main__":
    # Read the program parameters.
    args = parser.parse_args()
    titles_file = str(args.titles_file)

    ray.init(_load_code_from_local=True)

    # Build a streaming context that uses the native channel type.
    builder = StreamingContext.Builder()
    builder = builder.option(Config.CHANNEL_TYPE, Config.NATIVE_CHANNEL)
    ctx = builder.build()
    # Set the context-wide parallelism to 1.
    ctx.set_parallelism(1)

    # Pipeline: read the Wikipedia source, split into (word, 1) records,
    # key by word, sum the counts per word and print each result.
    stream = (
        ctx.source(Wikipedia(titles_file))
        .flat_map(splitter)
        .key_by(lambda x: x[0])
        .reduce(
            lambda old_value, new_value: (
                old_value[0],
                old_value[1] + new_value[1],
            )
        )
        .sink(print)
    )

    # Take timestamps around the execution of the job.
    start = time.time()
    ctx.execute("wordcount")
    end = time.time()