def serve(self, data):
    from pyjava.api.mlsql import PythonContext
    if not self.is_bind:
        raise SocketNotBindException(
            "Please invoke server.bind() before invoking server.serve()")
    conn, addr = self.socket.accept()
    sockfile = conn.makefile("rwb", int(
        os.environ.get("BUFFER_SIZE", 65536)))
    infile = sockfile  # os.fdopen(os.dup(conn.fileno()), "rb", 65536)
    out = sockfile  # os.fdopen(os.dup(conn.fileno()), "wb", 65536)
    try:
        write_int(SpecialLengths.START_ARROW_STREAM, out)
        out_data = ([df[name] for name in df]
                    for df in PythonContext.build_chunk_result(data, 1024))
        self.out_ser.dump_stream(out_data, out)
        write_int(SpecialLengths.END_OF_DATA_SECTION, out)
        write_int(SpecialLengths.END_OF_STREAM, out)
        out.flush()
        if self.is_dev:
            print("all data in ray task have been consumed.")
        read_int(infile)
    except Exception:
        try:
            write_int(SpecialLengths.ARROW_STREAM_CRASH, out)
            ex = traceback.format_exc()
            print(ex)
            write_int(SpecialLengths.PYTHON_EXCEPTION_THROWN, out)
            write_with_length(ex.encode("utf-8"), out)
            out.flush()
            read_int(infile)
        except IOError:
            # JVM closed the socket
            pass
        except Exception:
            # Write the error to stderr if it happened while serializing
            print("Py worker failed with exception:")
            print(traceback.format_exc())
    conn.close()
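# None of the SpecialLengths constants are defined in this snippet; pyjava
# reuses PySpark's stream framing, so a stand-in with Spark's values would
# look roughly like the class below. ARROW_STREAM_CRASH is pyjava-specific
# and its value here is an assumption, not taken from the source.
class SpecialLengths:
    END_OF_DATA_SECTION = -1      # no more Arrow batches follow
    PYTHON_EXCEPTION_THROWN = -2  # next frame is a length-prefixed traceback
    TIMING_DATA = -3
    END_OF_STREAM = -4            # orderly end of the whole exchange
    NULL = -5
    START_ARROW_STREAM = -6       # Arrow IPC stream starts after this marker
    ARROW_STREAM_CRASH = -7       # serve() failed mid-stream; value assumed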
def process():
    input_data = ser.load_stream(infile)
    code = compile(command, '<string>', 'exec')
    if is_interactive:
        global data_manager
        data_manager = Data(input_data, conf)
        global globals_namespace
        exec(code, globals_namespace, globals_namespace)
    else:
        data_manager = Data(input_data, conf)
        n_local = {"data_manager": data_manager}
        g_local = {}
        exec(code, g_local, n_local)
    out_iter = data_manager.output()
    try:
        write_int(SpecialLengths.START_ARROW_STREAM, outfile)
        out_ser.dump_stream(out_iter, outfile)
    finally:
        if hasattr(out_iter, 'close'):
            out_iter.close()
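# Why the non-interactive branch passes two different dicts to exec(): reads
# fall back from locals to globals, while assignments land in the locals
# mapping, so data_manager can be injected via n_local and the user script's
# results read back from it, with g_local left empty. A self-contained
# illustration (toy strings, not pyjava objects):
user_script = "result = data_manager.upper()"
g_local = {}
n_local = {"data_manager": "hello"}
exec(compile(user_script, '<string>', 'exec'), g_local, n_local)
print(n_local["result"])  # HELLO
# Caveat: functions *defined* by the user script resolve free names through
# g_local only, so they cannot see data_manager. The newer process() in the
# next block avoids this by passing the same dict as both globals and locals.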
def process():
    try:
        input_data = ser.load_stream(infile)
        code = CodeCache.get(command)
        if is_interactive:
            global data_manager
            global context
            data_manager = PythonContext(context_id, input_data, conf)
            context = data_manager
            global globals_namespace
            exec(code, globals_namespace, globals_namespace)
        else:
            data_manager = PythonContext(context_id, input_data, conf)
            n_local = {
                "data_manager": data_manager,
                "context": data_manager
            }
            exec(code, n_local, n_local)
        out_iter = data_manager.output()
        write_int(SpecialLengths.START_ARROW_STREAM, outfile)
        out_ser.dump_stream(out_iter, outfile)
    finally:
        try:
            import shutil
            shutil.rmtree(context_id)
        except Exception:
            pass
        try:
            if hasattr(out_iter, 'close'):
                out_iter.close()
        except Exception:
            # out_iter is unbound if exec() itself raised
            pass
        try:
            del data_manager
        except Exception:
            pass
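# CodeCache is not shown in this section. Judging by how it replaces the
# earlier compile() call, it memoizes compiled code objects keyed by the
# script source, so a reused worker skips recompiling an unchanged command.
# A minimal stand-in (an assumption about its behavior, not pyjava's actual
# implementation):
class CodeCache:
    _cache = {}

    @classmethod
    def get(cls, source):
        code = cls._cache.get(source)
        if code is None:
            code = compile(source, '<string>', 'exec')
            cls._cache[source] = code
        return code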
def main(infile, outfile):
    try:
        try:
            import ray
        except ImportError:
            pass

        # set up memory limits
        memory_limit_mb = int(os.environ.get('PY_EXECUTOR_MEMORY', "-1"))
        if memory_limit_mb > 0 and has_resource_module:
            total_memory = resource.RLIMIT_AS
            try:
                (soft_limit, hard_limit) = resource.getrlimit(total_memory)
                msg = "Current mem limits: {0} of max {1}\n".format(
                    soft_limit, hard_limit)
                print(msg, file=sys.stderr)
                # convert to bytes
                new_limit = memory_limit_mb * 1024 * 1024
                if soft_limit == resource.RLIM_INFINITY or new_limit < soft_limit:
                    msg = "Setting mem limits to {0} of max {1}\n".format(
                        new_limit, new_limit)
                    print(msg, file=sys.stderr)
                    resource.setrlimit(total_memory, (new_limit, new_limit))
            except (resource.error, OSError, ValueError) as e:
                # not all systems support resource limits, so warn instead of failing
                print("WARN: Failed to set memory limit: {0}\n".format(e),
                      file=sys.stderr)

        split_index = read_int(infile)
        print("split_index:%s" % split_index)
        if split_index == -1:  # for unit tests
            sys.exit(-1)

        is_barrier = read_bool(infile)
        bound_port = read_int(infile)

        conf = {}
        for i in range(read_int(infile)):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            conf[k] = v

        command = utf8_deserializer.loads(infile)
        ser = ArrowStreamSerializer()

        timezone = conf["timezone"] if "timezone" in conf else None
        out_ser = ArrowStreamPandasSerializer(timezone, True, True)
        is_interactive = os.environ.get('PY_INTERACTIVE', "no") == "yes"

        import uuid
        context_id = str(uuid.uuid4())
        if not os.path.exists(context_id):
            os.mkdir(context_id)

        def process():
            try:
                input_data = ser.load_stream(infile)
                code = CodeCache.get(command)
                if is_interactive:
                    global data_manager
                    global context
                    data_manager = PythonContext(context_id, input_data, conf)
                    context = data_manager
                    global globals_namespace
                    exec(code, globals_namespace, globals_namespace)
                else:
                    data_manager = PythonContext(context_id, input_data, conf)
                    n_local = {
                        "data_manager": data_manager,
                        "context": data_manager
                    }
                    exec(code, n_local, n_local)
                out_iter = data_manager.output()
                write_int(SpecialLengths.START_ARROW_STREAM, outfile)
                out_ser.dump_stream(out_iter, outfile)
            finally:
                try:
                    import shutil
                    shutil.rmtree(context_id)
                except Exception:
                    pass
                try:
                    if hasattr(out_iter, 'close'):
                        out_iter.close()
                except Exception:
                    # out_iter is unbound if exec() itself raised
                    pass
                try:
                    del data_manager
                except Exception:
                    pass

        process()
    except Exception:
        try:
            write_int(SpecialLengths.ARROW_STREAM_CRASH, outfile)
            write_int(SpecialLengths.PYTHON_EXCEPTION_THROWN, outfile)
            write_with_length(traceback.format_exc().encode("utf-8"), outfile)
        except IOError:
            # JVM closed the socket
            pass
        except Exception:
            # Write the error to stderr if it happened while serializing
            print("Py worker failed with exception:", file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
        sys.exit(-1)

    write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
    flag = read_int(infile)
    if flag == SpecialLengths.END_OF_STREAM:
        write_int(SpecialLengths.END_OF_STREAM, outfile)
    else:
        # write a different value to tell JVM to not reuse this worker
        write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
        sys.exit(-1)
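# The peer's half of the handshake can be reconstructed from the read_* calls
# in main(). Below is a sketch of the writer side, assuming Spark-style
# framing (4-byte big-endian signed ints, 1-byte bools, length-prefixed UTF-8
# strings); feed_worker and write_utf8 are hypothetical helpers, not pyjava
# or Spark APIs:
import struct

def write_utf8(s, out):
    b = s.encode("utf-8")
    out.write(struct.pack("!i", len(b)))
    out.write(b)

def feed_worker(out, command, conf, split_index=0):
    out.write(struct.pack("!i", split_index))  # -1 makes the worker exit
    out.write(struct.pack("!?", False))        # is_barrier
    out.write(struct.pack("!i", 0))            # bound_port
    out.write(struct.pack("!i", len(conf)))    # number of conf entries
    for k, v in conf.items():
        write_utf8(k, out)
        write_utf8(v, out)
    write_utf8(command, out)                   # the script main() will exec
    out.flush()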
def manager():
    # Create a new process group to corral our children
    os.setpgid(0, 0)

    # Create a listening socket on the AF_INET loopback interface
    listen_sock = socket.socket(AF_INET, SOCK_STREAM)
    listen_sock.bind(('127.0.0.1', 0))
    listen_sock.listen(max(1024, SOMAXCONN))
    listen_host, listen_port = listen_sock.getsockname()

    # re-open stdin/stdout in binary mode with minimal buffering
    stdin_bin = os.fdopen(sys.stdin.fileno(), 'rb', 4)
    stdout_bin = os.fdopen(sys.stdout.fileno(), 'wb', 4)
    write_int(listen_port, stdout_bin)
    stdout_bin.flush()

    def shutdown(code):
        signal.signal(SIGTERM, SIG_DFL)
        # Send SIGHUP to notify workers of shutdown
        os.kill(0, SIGHUP)
        sys.exit(code)

    def handle_sigterm(*args):
        shutdown(1)

    signal.signal(SIGTERM, handle_sigterm)  # Gracefully exit on SIGTERM
    signal.signal(SIGHUP, SIG_IGN)  # Don't die on SIGHUP
    signal.signal(SIGCHLD, SIG_IGN)

    reuse = os.environ.get("PY_WORKER_REUSE")

    # Initialization complete
    try:
        while True:
            try:
                ready_fds = select.select([0, listen_sock], [], [], 1)[0]
            except OSError as ex:
                if ex.errno == EINTR:
                    continue
                else:
                    raise

            if 0 in ready_fds:
                try:
                    worker_pid = read_int(stdin_bin)
                except EOFError:
                    # Spark told us to exit by closing stdin
                    shutdown(0)
                try:
                    os.kill(worker_pid, signal.SIGKILL)
                except OSError:
                    pass  # process already died

            if listen_sock in ready_fds:
                try:
                    sock, _ = listen_sock.accept()
                except OSError as e:
                    if e.errno == EINTR:
                        continue
                    raise

                # Launch a worker process
                try:
                    pid = os.fork()
                except OSError as e:
                    if e.errno in (EAGAIN, EINTR):
                        time.sleep(1)
                        pid = os.fork()  # error here will shutdown daemon
                    else:
                        outfile = sock.makefile(mode='wb')
                        write_int(e.errno, outfile)  # Signal that the fork failed
                        outfile.flush()
                        outfile.close()
                        sock.close()
                        continue

                if pid == 0:
                    # in child process
                    listen_sock.close()

                    # It should close the standard input in the child process so that
                    # Python native function executions stay intact.
                    #
                    # Note that if we just close the standard input (file descriptor 0),
                    # the lowest file descriptor (file descriptor 0) will be allocated
                    # later when other file descriptors happen to open.
                    #
                    # Therefore, here we redirect it to '/dev/null' by duplicating
                    # another file descriptor for '/dev/null' to the standard input (0).
                    # See SPARK-26175.
                    devnull = open(os.devnull, 'r')
                    os.dup2(devnull.fileno(), 0)
                    devnull.close()

                    try:
                        # Acknowledge that the fork was successful
                        outfile = sock.makefile(mode="wb")
                        write_int(os.getpid(), outfile)
                        outfile.flush()
                        outfile.close()
                        while True:
                            code = worker(sock)
                            if not reuse or code:
                                # wait for closing
                                try:
                                    while sock.recv(1024):
                                        pass
                                except Exception:
                                    pass
                                break
                            gc.collect()
                    except BaseException:
                        traceback.print_exc()
                        os._exit(1)
                    else:
                        os._exit(0)
                else:
                    sock.close()
    finally:
        shutdown(1)
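# Seen from the peer, the daemon contract reconstructed from manager() is:
# read listen_port from the daemon's stdout, connect to it, then read one
# 4-byte big-endian int back -- the forked worker's pid on success, or an
# errno if the fork failed. A sketch (connect_worker is a hypothetical
# helper, not part of pyjava):
import socket
import struct

def connect_worker(daemon_port):
    sock = socket.create_connection(("127.0.0.1", daemon_port))
    buf = b""
    while len(buf) < 4:
        chunk = sock.recv(4 - len(buf))
        if not chunk:
            raise EOFError("daemon closed the connection")
        buf += chunk
    pid = struct.unpack("!i", buf)[0]
    return sock, pid  # the same socket then carries the worker protocol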
import os
import socket

from pyjava.serializers import ArrowStreamPandasSerializer, read_int, write_int

out_ser = ArrowStreamPandasSerializer("Asia/Harbin", False, None)

HOST = "127.0.0.1"
PORT = 11111
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
    sock.connect((HOST, PORT))
    buffer_size = int(os.environ.get("SPARK_BUFFER_SIZE", 65536))
    infile = os.fdopen(os.dup(sock.fileno()), "rb", buffer_size)
    outfile = os.fdopen(os.dup(sock.fileno()), "wb", buffer_size)
    # arrow stream start marker
    print(read_int(infile))
    batches = out_ser.load_stream(infile)
    for item in batches:
        print(item)
    # end of data section marker
    print(read_int(infile))
    # end of stream marker
    print(read_int(infile))
    # echo END_OF_STREAM back, and flush so it actually leaves the buffer
    write_int(-4, outfile)
    outfile.flush()
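# Against a serve() endpoint, and assuming Spark-compatible SpecialLengths
# values, the script above should print -6 (START_ARROW_STREAM), the decoded
# pandas batches, then -1 (END_OF_DATA_SECTION) and -4 (END_OF_STREAM).
# Echoing -4 back unblocks the server's trailing read_int() so it can close
# the connection cleanly.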