def setUpClass(cls):
    cls.tempdir = tempfile.mkdtemp()
    os.environ["FLINK_TESTING"] = "1"
    _find_flink_home()
    logging.info("Using %s as FLINK_HOME...", os.environ["FLINK_HOME"])
def setUpClass(cls):
    cls.tempdir = tempfile.mkdtemp()
    os.environ["FLINK_TESTING"] = "1"
    os.environ['_python_worker_execution_mode'] = "process"
    _find_flink_home()
    logging.info("Using %s as FLINK_HOME...", os.environ["FLINK_HOME"])
def construct_classpath():
    """Constructs the classpath from the jars under FLINK_LIB_DIR plus the flink-python jar."""
    flink_home = _find_flink_home()
    # Resolve the real path of the (potentially tainted) value to avoid CWE-22 (path
    # traversal): constructing a path or URI from a tainted value might allow an attacker
    # to access, modify, or probe for critical or sensitive files.
    real_flink_home = os.path.realpath(flink_home)
    if 'FLINK_LIB_DIR' in os.environ:
        flink_lib_directory = os.path.realpath(os.environ['FLINK_LIB_DIR'])
    else:
        flink_lib_directory = os.path.join(real_flink_home, "lib")
    if 'FLINK_OPT_DIR' in os.environ:
        flink_opt_directory = os.path.realpath(os.environ['FLINK_OPT_DIR'])
    else:
        flink_opt_directory = os.path.join(real_flink_home, "opt")
    if on_windows():
        # The command length is limited on Windows, so keep the command as short as
        # possible by using a wildcard classpath entry instead of listing every jar.
        lib_jars = os.path.join(flink_lib_directory, "*")
    else:
        lib_jars = os.pathsep.join(
            glob.glob(os.path.join(flink_lib_directory, "*.jar")))
    flink_python_jars = glob.glob(
        os.path.join(flink_opt_directory, "flink-python*.jar"))
    if len(flink_python_jars) < 1:
        print("The flink-python jar is not found in the opt folder of the FLINK_HOME: %s"
              % flink_home)
        return lib_jars
    flink_python_jar = flink_python_jars[0]
    return os.pathsep.join([lib_jars, flink_python_jar])
def construct_log_settings():
    templates = [
        "-Dlog.file=${flink_log_dir}/flink-${flink_ident_string}-python-${hostname}.log",
        "-Dlog4j.configuration=${flink_conf_dir}/log4j-cli.properties",
        "-Dlog4j.configurationFile=${flink_conf_dir}/log4j-cli.properties",
        "-Dlogback.configurationFile=${flink_conf_dir}/logback.xml"
    ]
    flink_home = _find_flink_home()
    flink_conf_dir = os.path.join(flink_home, "conf")
    flink_log_dir = os.path.join(flink_home, "log")
    if "FLINK_IDENT_STRING" in os.environ:
        flink_ident_string = os.environ["FLINK_IDENT_STRING"]
    else:
        flink_ident_string = getpass.getuser()
    hostname = socket.gethostname()
    log_settings = []
    for template in templates:
        log_settings.append(
            Template(template).substitute(
                flink_conf_dir=flink_conf_dir,
                flink_log_dir=flink_log_dir,
                flink_ident_string=flink_ident_string,
                hostname=hostname))
    return log_settings
def _remote_execute_func(exec_func, write_func, exec_dict, jm, py):
    func_stdout = '{}/exec_{}_stdout.log'.format(get_file_dir(__file__), exec_func)
    func_stderr = '{}/exec_{}_stderr.log'.format(get_file_dir(__file__), exec_func)
    with open(func_stdout, 'a') as out, open(func_stderr, 'a') as err:
        # execute `flink run -m <remote> -py function.py` to submit the batch job
        submitted_process = Popen(
            args="{}/bin/flink run -m {} -py {}/exec_function.py -pyexec {} {} {} '{}'"
            .format(_find_flink_home(), jm, get_file_dir(__file__), py, exec_func,
                    write_func, json.dumps(exec_dict)),
            shell=True,
            stdout=out,
            stderr=err)
        submitted_process.wait()
        # decode the execution result from the table sink file
        execute_result = cloudpickle.loads(
            codecs.decode(
                pd.DataFrame(pd.read_csv(write_func))['func'].values[0].encode(),
                'base64'))
        # remove the table sink file so that stale results are not picked up by later runs
        os.remove(write_func)
        return execute_result
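# Hedged sketch (not part of the snippets above) of the writer side that
# _remote_execute_func's decode path expects: the submitted job cloudpickles its result,
# base64-encodes it, and writes it into a single-column CSV whose column is named 'func'.
# The helper name _write_execute_result and the example path are illustrative assumptions.
import codecs

import cloudpickle
import pandas as pd


def _write_execute_result(result, write_func):
    # mirror of the decode: cloudpickle.loads(codecs.decode(<csv 'func' cell>, 'base64'))
    encoded = codecs.encode(cloudpickle.dumps(result), 'base64').decode()
    pd.DataFrame({'func': [encoded]}).to_csv(write_func, index=False)


# round-trip check, e.g.:
# _write_execute_result({'answer': 42}, '/tmp/exec_result.csv')
# assert cloudpickle.loads(codecs.decode(
#     pd.DataFrame(pd.read_csv('/tmp/exec_result.csv'))['func'].values[0].encode(),
#     'base64')) == {'answer': 42}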
def find_java_executable():
    java_executable = "java.exe" if on_windows() else "java"
    flink_home = _find_flink_home()
    flink_conf_path = os.path.join(flink_home, "conf", "flink-conf.yaml")
    java_home = None
    # Resolve the real path of the (potentially tainted) value to avoid CWE-22 (path
    # traversal): constructing a path or URI from a tainted value might allow an attacker
    # to access, modify, or probe for critical or sensitive files.
    real_flink_conf_path = os.path.realpath(flink_conf_path)
    if os.path.isfile(real_flink_conf_path):
        with open(real_flink_conf_path, "r") as f:
            flink_conf_yaml = f.read()
        java_homes = re.findall(r'^[ ]*env\.java\.home[ ]*: ([^#]*).*$', flink_conf_yaml)
        if len(java_homes) > 1:
            java_home = java_homes[len(java_homes) - 1].strip()
    if java_home is None and "JAVA_HOME" in os.environ:
        java_home = os.environ["JAVA_HOME"]
    if java_home is not None:
        java_executable = os.path.join(java_home, "bin", java_executable)
    return java_executable
def prepare_environment_variables(env):
    """Fills in the FLINK_CONF_DIR, FLINK_LIB_DIR, FLINK_OPT_DIR, FLINK_PLUGINS_DIR and
    FLINK_BIN_DIR entries of the given environment dict, deriving defaults from FLINK_HOME."""
    flink_home = _find_flink_home()
    # Resolve the real path of the (potentially tainted) value to avoid CWE-22 (path
    # traversal): constructing a path or URI from a tainted value might allow an attacker
    # to access, modify, or probe for critical or sensitive files.
    real_flink_home = os.path.realpath(flink_home)
    if 'FLINK_CONF_DIR' in env:
        flink_conf_directory = os.path.realpath(env['FLINK_CONF_DIR'])
    else:
        flink_conf_directory = os.path.join(real_flink_home, "conf")
    env['FLINK_CONF_DIR'] = flink_conf_directory
    if 'FLINK_LIB_DIR' in env:
        flink_lib_directory = os.path.realpath(env['FLINK_LIB_DIR'])
    else:
        flink_lib_directory = os.path.join(real_flink_home, "lib")
    env['FLINK_LIB_DIR'] = flink_lib_directory
    if 'FLINK_OPT_DIR' in env:
        flink_opt_directory = os.path.realpath(env['FLINK_OPT_DIR'])
    else:
        flink_opt_directory = os.path.join(real_flink_home, "opt")
    env['FLINK_OPT_DIR'] = flink_opt_directory
    if 'FLINK_PLUGINS_DIR' in env:
        flink_plugins_directory = os.path.realpath(env['FLINK_PLUGINS_DIR'])
    else:
        flink_plugins_directory = os.path.join(real_flink_home, "plugins")
    env['FLINK_PLUGINS_DIR'] = flink_plugins_directory
    env["FLINK_BIN_DIR"] = os.path.join(real_flink_home, "bin")
def launch_gateway():
    # type: () -> JavaGateway
    """
    launch jvm gateway
    """
    FLINK_HOME = _find_flink_home()
    # TODO windows support
    on_windows = platform.system() == "Windows"
    if on_windows:
        raise Exception("Windows system is not supported currently.")
    script = "./bin/pyflink-gateway-server.sh"
    command = [os.path.join(FLINK_HOME, script)]
    command += ['-c', 'org.apache.flink.api.python.PythonGatewayServer']

    # Create a temporary directory where the gateway server should write the connection
    # information.
    conn_info_dir = tempfile.mkdtemp()
    try:
        fd, conn_info_file = tempfile.mkstemp(dir=conn_info_dir)
        os.close(fd)
        os.unlink(conn_info_file)

        env = dict(os.environ)
        env["_PYFLINK_CONN_INFO_PATH"] = conn_info_file

        def preexec_func():
            # ignore ctrl-c / SIGINT
            signal.signal(signal.SIGINT, signal.SIG_IGN)

        # Launch the Java gateway.
        # We open a pipe to stdin so that the Java gateway can die when the pipe is broken.
        p = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env)

        while not p.poll() and not os.path.isfile(conn_info_file):
            time.sleep(0.1)

        if not os.path.isfile(conn_info_file):
            raise Exception("Java gateway process exited before sending its port number")

        with open(conn_info_file, "rb") as info:
            gateway_port = struct.unpack("!I", info.read(4))[0]
    finally:
        shutil.rmtree(conn_info_dir)

    # Connect to the gateway
    gateway = JavaGateway(
        gateway_parameters=GatewayParameters(port=gateway_port, auto_convert=True))

    # Import the classes used by PyFlink
    java_import(gateway.jvm, "org.apache.flink.table.api.*")
    java_import(gateway.jvm, "org.apache.flink.table.api.java.*")
    java_import(gateway.jvm, "org.apache.flink.table.api.dataview.*")
    java_import(gateway.jvm, "org.apache.flink.table.sources.*")
    java_import(gateway.jvm, "org.apache.flink.table.sinks.*")
    java_import(gateway.jvm, "org.apache.flink.api.common.typeinfo.TypeInformation")
    java_import(gateway.jvm, "org.apache.flink.api.common.typeinfo.Types")
    java_import(gateway.jvm, "org.apache.flink.api.java.ExecutionEnvironment")
    java_import(gateway.jvm,
                "org.apache.flink.streaming.api.environment.StreamExecutionEnvironment")

    return gateway
def launch_gateway():
    # type: () -> JavaGateway
    """
    launch jvm gateway
    """
    if is_launch_gateway_disabled():
        raise Exception(
            "Launching the PythonGatewayServer during Python UDF execution is unexpected. "
            "This usually happens when the job code is at the top level of the Python "
            "script file and is not enclosed in an `if __name__ == '__main__'` block.")
    FLINK_HOME = _find_flink_home()
    # TODO windows support
    on_windows = platform.system() == "Windows"
    if on_windows:
        raise Exception("Windows system is not supported currently.")
    script = "./bin/pyflink-gateway-server.sh"
    command = [os.path.join(FLINK_HOME, script)]
    command += ['-c', 'org.apache.flink.client.python.PythonGatewayServer']

    submit_args = os.environ.get("SUBMIT_ARGS", "local")
    command += shlex.split(submit_args)

    # Create a temporary directory where the gateway server should write the connection
    # information.
    conn_info_dir = tempfile.mkdtemp()
    try:
        fd, conn_info_file = tempfile.mkstemp(dir=conn_info_dir)
        os.close(fd)
        os.unlink(conn_info_file)

        env = dict(os.environ)
        env["_PYFLINK_CONN_INFO_PATH"] = conn_info_file

        def preexec_func():
            # ignore ctrl-c / SIGINT
            signal.signal(signal.SIGINT, signal.SIG_IGN)

        # Launch the Java gateway.
        # We open a pipe to stdin so that the Java gateway can die when the pipe is broken.
        p = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env)

        while not p.poll() and not os.path.isfile(conn_info_file):
            time.sleep(0.1)

        if not os.path.isfile(conn_info_file):
            raise Exception(
                "Java gateway process exited before sending its port number")

        with open(conn_info_file, "rb") as info:
            gateway_port = struct.unpack("!I", info.read(4))[0]
    finally:
        shutil.rmtree(conn_info_dir)

    # Connect to the gateway
    gateway = JavaGateway(gateway_parameters=GatewayParameters(
        port=gateway_port, auto_convert=True))

    return gateway
def prepare_environment_variable(env):
    flink_home = _find_flink_home()
    env = dict(env)
    env["FLINK_CONF_DIR"] = os.path.join(flink_home, "conf")
    env["FLINK_BIN_DIR"] = os.path.join(flink_home, "bin")
    env["FLINK_PLUGINS_DIR"] = os.path.join(flink_home, "plugins")
    env["FLINK_LIB_DIR"] = os.path.join(flink_home, "lib")
    env["FLINK_OPT_DIR"] = os.path.join(flink_home, "opt")
    return env
def test_end_to_end():
    tmp_dir = tempfile.gettempdir()
    source_path = tmp_dir + '/streaming.csv'
    if os.path.isfile(source_path):
        os.remove(source_path)
    with open(source_path, 'w') as f:
        lines = '1,hi,hello\n' + '2,hi,hello\n'
        f.write(lines)
        f.close()
    _find_flink_home()
    print("using %s as FLINK_HOME..." % os.environ["FLINK_HOME"])

    t_config = TableConfig.Builder().as_streaming_execution().set_parallelism(1).build()
    t_env = TableEnvironment.get_table_environment(t_config)

    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]

    # register Orders table in table environment
    t_env.register_table_source(
        "Orders",
        CsvTableSource(source_path, field_names, field_types))

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    tmp_csv = tmp_dir + '/streaming2.csv'
    if os.path.isfile(tmp_csv):
        os.remove(tmp_csv)
    t_env.register_table_sink(
        "Results",
        field_names, field_types, CsvTableSink(tmp_csv))

    t_env.scan("Orders") \
        .where("a > 0") \
        .select("a + 1, b, c") \
        .insert_into("Results")

    t_env.execute()

    with open(tmp_csv, 'r') as f:
        lines = f.read()
        assert lines == '2,hi,hello\n' + '3,hi,hello\n'
    print("test passed, the log file is under this directory: %s/log" % os.environ["FLINK_HOME"])
def find_java_executable():
    java_executable = "java.exe" if on_windows() else "java"
    flink_home = _find_flink_home()
    flink_conf_file = os.path.join(flink_home, "conf", "flink-conf.yaml")
    java_home = read_from_config(KEY_ENV_JAVA_HOME, None, flink_conf_file)

    if java_home is None and "JAVA_HOME" in os.environ:
        java_home = os.environ["JAVA_HOME"]

    if java_home is not None:
        java_executable = os.path.join(java_home, "bin", java_executable)

    return java_executable
def _cancel_jobs(self):
    for job in self._jobs:
        cancel_stdout = '{}/cancel_{}_stdout.log'.format(get_file_dir(__file__), job)
        cancel_stderr = '{}/cancel_{}_stderr.log'.format(get_file_dir(__file__), job)
        with open(cancel_stdout, 'a') as out, open(cancel_stderr, 'a') as err:
            # execute `flink cancel <jobID>` to cancel batch job
            Popen(args='{}/bin/flink cancel {}'.format(_find_flink_home(), job),
                  shell=True,
                  stdout=out,
                  stderr=err)
def launch_gateway():
    # type: () -> JavaGateway
    """
    launch jvm gateway
    """
    FLINK_HOME = _find_flink_home()
    # TODO windows support
    on_windows = platform.system() == "Windows"
    if on_windows:
        raise Exception("Windows system is not supported currently.")
    script = "./bin/pyflink-gateway-server.sh"
    command = [os.path.join(FLINK_HOME, script)]
    command += ['-c', 'org.apache.flink.client.python.PythonGatewayServer']

    # Create a temporary directory where the gateway server should write the connection
    # information.
    conn_info_dir = tempfile.mkdtemp()
    try:
        fd, conn_info_file = tempfile.mkstemp(dir=conn_info_dir)
        os.close(fd)
        os.unlink(conn_info_file)

        env = dict(os.environ)
        env["_PYFLINK_CONN_INFO_PATH"] = conn_info_file

        def preexec_func():
            # ignore ctrl-c / SIGINT
            signal.signal(signal.SIGINT, signal.SIG_IGN)

        # Launch the Java gateway.
        # We open a pipe to stdin so that the Java gateway can die when the pipe is broken.
        p = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env)

        while not p.poll() and not os.path.isfile(conn_info_file):
            time.sleep(0.1)

        if not os.path.isfile(conn_info_file):
            raise Exception("Java gateway process exited before sending its port number")

        with open(conn_info_file, "rb") as info:
            gateway_port = struct.unpack("!I", info.read(4))[0]
    finally:
        shutil.rmtree(conn_info_dir)

    # Connect to the gateway
    gateway = JavaGateway(
        gateway_parameters=GatewayParameters(port=gateway_port, auto_convert=True))

    return gateway
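# Minimal usage sketch for launch_gateway(), assuming a local Flink distribution that
# _find_flink_home() can locate (e.g. via FLINK_HOME) and a Unix-like OS, since the
# function above raises on Windows. Only generic Py4J access is shown; no
# PyFlink-specific JVM entry points are assumed.
if __name__ == '__main__':
    gateway = launch_gateway()
    try:
        # any JVM class is reachable through the Py4J `jvm` view once the gateway is up
        print(gateway.jvm.System.getProperty("java.version"))
    finally:
        gateway.shutdown()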
def construct_log_settings(env):
    templates = [
        "-Dlog.file=${flink_log_dir}/flink-${flink_ident_string}-python-${hostname}.log",
        "-Dlog4j.configuration=${log4j_properties}",
        "-Dlog4j.configurationFile=${log4j_properties}",
        "-Dlogback.configurationFile=${logback_xml}"
    ]
    flink_home = os.path.realpath(_find_flink_home())
    flink_conf_dir = env['FLINK_CONF_DIR']
    flink_conf_file = os.path.join(env['FLINK_CONF_DIR'], "flink-conf.yaml")
    if "FLINK_LOG_DIR" in env:
        flink_log_dir = env["FLINK_LOG_DIR"]
    else:
        flink_log_dir = read_from_config(
            KEY_ENV_LOG_DIR, os.path.join(flink_home, "log"), flink_conf_file)
    if "LOG4J_PROPERTIES" in env:
        log4j_properties = env["LOG4J_PROPERTIES"]
    else:
        log4j_properties = "%s/log4j-cli.properties" % flink_conf_dir
    if "LOGBACK_XML" in env:
        logback_xml = env["LOGBACK_XML"]
    else:
        logback_xml = "%s/logback.xml" % flink_conf_dir
    if "FLINK_IDENT_STRING" in env:
        flink_ident_string = env["FLINK_IDENT_STRING"]
    else:
        flink_ident_string = getpass.getuser()
    hostname = socket.gethostname()
    log_settings = []
    for template in templates:
        log_settings.append(
            Template(template).substitute(
                log4j_properties=log4j_properties,
                logback_xml=logback_xml,
                flink_log_dir=flink_log_dir,
                flink_ident_string=flink_ident_string,
                hostname=hostname))
    return log_settings
def construct_flink_classpath(env):
    """Constructs the classpath from the jars in FLINK_LIB_DIR plus the flink-python jar
    found in FLINK_OPT_DIR."""
    flink_home = _find_flink_home()
    flink_lib_directory = env['FLINK_LIB_DIR']
    flink_opt_directory = env['FLINK_OPT_DIR']
    if on_windows():
        # The command length is limited on Windows, so keep the command as short as
        # possible by using a wildcard classpath entry instead of listing every jar.
        lib_jars = os.path.join(flink_lib_directory, "*")
    else:
        lib_jars = os.pathsep.join(glob.glob(os.path.join(flink_lib_directory, "*.jar")))
    flink_python_jars = glob.glob(os.path.join(flink_opt_directory, "flink-python*.jar"))
    if len(flink_python_jars) < 1:
        print("The flink-python jar is not found in the opt folder of the FLINK_HOME: %s"
              % flink_home)
        return lib_jars
    flink_python_jar = flink_python_jars[0]
    return os.pathsep.join([lib_jars, flink_python_jar])
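# Hedged sketch of how the env-based helpers above (prepare_environment_variables,
# construct_log_settings(env), construct_flink_classpath(env), find_java_executable)
# could be combined into a java command line for the gateway server. The main-class
# default, the helper name build_gateway_command and the final Popen call are
# illustrative assumptions, not the launcher script's actual logic.
import os
from subprocess import Popen


def build_gateway_command(main_class="org.apache.flink.client.python.PythonGatewayServer"):
    env = dict(os.environ)
    prepare_environment_variables(env)        # fills FLINK_CONF_DIR, FLINK_LIB_DIR, ...
    command = [find_java_executable()]
    command += construct_log_settings(env)    # -Dlog.file=..., -Dlog4j..., -Dlogback...
    command += ["-cp", construct_flink_classpath(env), main_class]
    return command, env


# command, env = build_gateway_command()
# Popen(command, env=env)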
def construct_log_settings():
    templates = [
        "-Dlog.file=${flink_log_dir}/flink-${flink_ident_string}-python-${hostname}.log",
        "-Dlog4j.configuration=${log4j_properties}",
        "-Dlog4j.configurationFile=${log4j_properties}",
        "-Dlogback.configurationFile=${logback_xml}"
    ]
    flink_home = _find_flink_home()
    flink_conf_dir = os.path.join(flink_home, "conf")
    if "FLINK_LOG_DIR" in os.environ:
        flink_log_dir = os.environ["FLINK_LOG_DIR"]
    else:
        flink_log_dir = os.path.join(flink_home, "log")
    if "LOG4J_PROPERTIES" in os.environ:
        log4j_properties = os.environ["LOG4J_PROPERTIES"]
    else:
        log4j_properties = "%s/log4j-cli.properties" % flink_conf_dir
    if "LOGBACK_XML" in os.environ:
        logback_xml = os.environ["LOGBACK_XML"]
    else:
        logback_xml = "%s/logback.xml" % flink_conf_dir
    if "FLINK_IDENT_STRING" in os.environ:
        flink_ident_string = os.environ["FLINK_IDENT_STRING"]
    else:
        flink_ident_string = getpass.getuser()
    hostname = socket.gethostname()
    log_settings = []
    for template in templates:
        log_settings.append(
            Template(template).substitute(
                log4j_properties=log4j_properties,
                logback_xml=logback_xml,
                flink_log_dir=flink_log_dir,
                flink_ident_string=flink_ident_string,
                hostname=hostname))
    return log_settings
def find_java_executable():
    java_executable = "java.exe" if on_windows() else "java"
    flink_home = _find_flink_home()
    flink_conf_path = os.path.join(flink_home, "conf", "flink-conf.yaml")
    java_home = None

    if os.path.isfile(flink_conf_path):
        with open(flink_conf_path, "r") as f:
            flink_conf_yaml = f.read()
        java_homes = re.findall(r'^[ ]*env\.java\.home[ ]*: ([^#]*).*$', flink_conf_yaml)
        if len(java_homes) > 1:
            java_home = java_homes[len(java_homes) - 1].strip()

    if java_home is None and "JAVA_HOME" in os.environ:
        java_home = os.environ["JAVA_HOME"]

    if java_home is not None:
        java_executable = os.path.join(java_home, "bin", java_executable)

    return java_executable