Esempio n. 1
0
    def setUpClass(cls):
        """
        Prepare class-wide fixtures: a temporary directory and the Flink test environment.

        Stores a freshly created temporary directory on ``cls.tempdir``, marks the process as
        running under test via ``FLINK_TESTING`` and then resolves the Flink home.
        """
        cls.tempdir = tempfile.mkdtemp()

        # The testing flag is set before the Flink home lookup runs.
        os.environ.update({"FLINK_TESTING": "1"})
        _find_flink_home()

        # FLINK_HOME must be present in the environment at this point
        # (presumably exported by _find_flink_home() -- confirm against its definition).
        flink_home = os.environ["FLINK_HOME"]
        logging.info("Using %s as FLINK_HOME...", flink_home)
    def setUpClass(cls):
        """
        Set up the shared temporary directory and the Flink testing environment for the class.

        ``cls.tempdir`` receives a new temporary directory; ``FLINK_TESTING`` is set to ``"1"``
        before the Flink home is looked up and logged.
        """
        environ = os.environ
        cls.tempdir = tempfile.mkdtemp()

        environ["FLINK_TESTING"] = "1"
        _find_flink_home()

        # Reading FLINK_HOME raises KeyError if the lookup above did not leave it in the
        # environment (presumably it exports it -- confirm against its definition).
        logging.info("Using %s as FLINK_HOME...", environ["FLINK_HOME"])
Esempio n. 3
0
    def setUpClass(cls):
        """
        Prepare class-wide fixtures for tests that run Python workers in "process" mode.

        Creates ``cls.tempdir``, sets the ``FLINK_TESTING`` and
        ``_python_worker_execution_mode`` environment variables, then resolves and logs the
        Flink home.
        """
        cls.tempdir = tempfile.mkdtemp()

        # Both variables are set before the Flink home lookup runs.
        for variable, value in (("FLINK_TESTING", "1"),
                                ('_python_worker_execution_mode', "process")):
            os.environ[variable] = value
        _find_flink_home()

        flink_home = os.environ["FLINK_HOME"]
        logging.info("Using %s as FLINK_HOME...", flink_home)
Esempio n. 4
0
def construct_classpath():
    """
    Build the classpath string made of the Flink lib jars plus the flink-python jar.

    The lib and opt directories come from the ``FLINK_LIB_DIR`` / ``FLINK_OPT_DIR``
    environment variables when set, otherwise from ``lib`` / ``opt`` under the Flink home.

    :return: the lib jars joined with ``os.pathsep`` (a single ``<lib>/*`` wildcard entry on
             Windows), followed by the first ``flink-python*.jar`` found in the opt directory.
             When no such jar exists a message is printed and only the lib part is returned.
    """
    flink_home = _find_flink_home()
    # Canonicalise the (tainted) home path before deriving other paths from it, so that a
    # crafted value cannot be used for path traversal (CWE-22).
    resolved_home = os.path.realpath(flink_home)

    lib_dir = (os.path.realpath(os.environ['FLINK_LIB_DIR'])
               if 'FLINK_LIB_DIR' in os.environ
               else os.path.join(resolved_home, "lib"))
    opt_dir = (os.path.realpath(os.environ['FLINK_OPT_DIR'])
               if 'FLINK_OPT_DIR' in os.environ
               else os.path.join(resolved_home, "opt"))

    if on_windows():
        # Command length is limited on Windows, so a wildcard entry is used instead of
        # listing every jar.
        lib_jars = os.path.join(lib_dir, "*")
    else:
        jar_pattern = os.path.join(lib_dir, "*.jar")
        lib_jars = os.pathsep.join(glob.glob(jar_pattern))

    python_jar_candidates = glob.glob(os.path.join(opt_dir, "flink-python*.jar"))
    if not python_jar_candidates:
        print(
            "The flink-python jar is not found in the opt folder of the FLINK_HOME: %s"
            % flink_home)
        return lib_jars

    return os.pathsep.join([lib_jars, python_jar_candidates[0]])
Esempio n. 5
0
def construct_log_settings():
    """
    Build the JVM ``-D`` logging options for the Python client.

    The log file goes to ``<FLINK_HOME>/log`` and the log4j / logback configuration files are
    taken from ``<FLINK_HOME>/conf``. The identity part of the log file name is
    ``FLINK_IDENT_STRING`` when that environment variable is set, otherwise the current user.

    :return: a list of four ``-D...`` option strings.
    """
    flink_home = _find_flink_home()

    # getpass.getuser() is only consulted when FLINK_IDENT_STRING is absent.
    ident = (os.environ["FLINK_IDENT_STRING"]
             if "FLINK_IDENT_STRING" in os.environ
             else getpass.getuser())

    placeholders = {
        "flink_conf_dir": os.path.join(flink_home, "conf"),
        "flink_log_dir": os.path.join(flink_home, "log"),
        "flink_ident_string": ident,
        "hostname": socket.gethostname(),
    }

    option_templates = (
        "-Dlog.file=${flink_log_dir}/flink-${flink_ident_string}-python-${hostname}.log",
        "-Dlog4j.configuration=${flink_conf_dir}/log4j-cli.properties",
        "-Dlog4j.configurationFile=${flink_conf_dir}/log4j-cli.properties",
        "-Dlogback.configurationFile=${flink_conf_dir}/logback.xml",
    )
    return [Template(option).substitute(placeholders) for option in option_templates]
Esempio n. 6
0
 def _remote_execute_func(exec_func, write_func, exec_dict, jm, py):
     """
     Submit ``exec_function.py`` to a remote Flink cluster and return its decoded result.

     Runs ``flink run -m <jm> -py exec_function.py -pyexec <py> <exec_func> <write_func>
     <json(exec_dict)>`` through the shell, appending the process output to
     ``exec_<exec_func>_stdout.log`` / ``exec_<exec_func>_stderr.log`` next to this file,
     waits for it to finish, then reads the CSV at ``write_func``, decodes the first value of
     its ``func`` column (base64 + cloudpickle) and deletes the CSV.

     :param exec_func: name of the function to execute; also used in the log file names.
     :param write_func: path of the CSV sink file the job writes its result to.
     :param exec_dict: JSON-serialisable arguments passed to the job on the command line.
     :param jm: job manager address given to ``flink run -m``.
     :param py: Python executable given to ``-pyexec``.
     :return: the unpickled execution result.
     """
     # Local import keeps this block self-contained; shlex is part of the standard library.
     import shlex

     func_stdout = '{}/exec_{}_stdout.log'.format(get_file_dir(__file__),
                                                  exec_func)
     func_stderr = '{}/exec_{}_stderr.log'.format(get_file_dir(__file__),
                                                  exec_func)
     with open(func_stdout, 'a') as out, open(func_stderr, 'a') as err:
         # execute `flink run -m <remote> -py function.py` to submit batch job.
         # The JSON payload is quoted with shlex.quote instead of being wrapped in literal
         # single quotes: a "'" inside exec_dict would otherwise terminate the quoting and
         # corrupt (or inject into) the shell command.
         submitted_process = Popen(
             args=
             "{}/bin/flink run -m {} -py {}/exec_function.py -pyexec {} {} {} {}"
             .format(_find_flink_home(), jm, get_file_dir(__file__), py,
                     exec_func, write_func,
                     shlex.quote(json.dumps(exec_dict))),
             shell=True,
             stdout=out,
             stderr=err)
         submitted_process.wait()
     # decode execution result from table sink file.
     # NOTE(review): cloudpickle.loads executes arbitrary code embedded in its input; this is
     # only safe as long as the sink file can be written by the submitted job alone.
     encoded_result = pd.read_csv(write_func)['func'].values[0].encode()
     execute_result = cloudpickle.loads(codecs.decode(encoded_result, 'base64'))
     # remove table sink file to clear ineffective files.
     os.remove(write_func)
     return execute_result
Esempio n. 7
0
def find_java_executable():
    """
    Determine which Java executable to use.

    The Java home is taken from the last ``env.java.home`` entry in
    ``<FLINK_HOME>/conf/flink-conf.yaml`` if there is one, otherwise from the ``JAVA_HOME``
    environment variable.

    :return: ``<java home>/bin/java`` (``java.exe`` on Windows) when a Java home was found,
             otherwise just the bare executable name.
    """
    java_executable = "java.exe" if on_windows() else "java"
    flink_home = _find_flink_home()
    flink_conf_path = os.path.join(flink_home, "conf", "flink-conf.yaml")
    java_home = None

    # get the realpath of tainted path value to avoid CWE22 problem that constructs a path or URI
    # using the tainted value and might allow an attacker to access, modify, or test the existence
    # of critical or sensitive files.
    real_flink_conf_path = os.path.realpath(flink_conf_path)
    if os.path.isfile(real_flink_conf_path):
        with open(real_flink_conf_path, "r") as f:
            flink_conf_yaml = f.read()
        # re.MULTILINE makes ^/$ anchor at every line; without it the pattern could only
        # match a configuration file consisting of a single line. The value class excludes
        # "\n" so that a match can never run on into the following lines.
        java_homes = re.findall(r'^[ ]*env\.java\.home[ ]*: ([^#\n]*).*$',
                                flink_conf_yaml, re.MULTILINE)
        if java_homes:
            # The last entry wins. A blank value is treated as "not configured" so that the
            # JAVA_HOME fallback below still applies.
            java_home = java_homes[-1].strip() or None

    if java_home is None and "JAVA_HOME" in os.environ:
        java_home = os.environ["JAVA_HOME"]

    if java_home is not None:
        java_executable = os.path.join(java_home, "bin", java_executable)

    return java_executable
Esempio n. 8
0
def prepare_environment_variables(env):
    """
    Fill in the Flink directory variables of ``env`` in place.

    For each of ``FLINK_CONF_DIR``, ``FLINK_LIB_DIR``, ``FLINK_OPT_DIR`` and
    ``FLINK_PLUGINS_DIR``: a value already present in ``env`` is replaced by its real path,
    a missing one defaults to the matching sub-directory of the Flink home.
    ``FLINK_BIN_DIR`` is always set to ``<flink home>/bin``.

    :param env: mutable mapping of environment variables; modified in place.
    """
    # Canonicalise the (tainted) home path before deriving other paths from it, so that a
    # crafted value cannot be used for path traversal (CWE-22).
    real_flink_home = os.path.realpath(_find_flink_home())

    overridable_dirs = (
        ('FLINK_CONF_DIR', "conf"),
        ('FLINK_LIB_DIR', "lib"),
        ('FLINK_OPT_DIR', "opt"),
        ('FLINK_PLUGINS_DIR', "plugins"),
    )
    for variable, default_subdir in overridable_dirs:
        if variable in env:
            directory = os.path.realpath(env[variable])
        else:
            directory = os.path.join(real_flink_home, default_subdir)
        env[variable] = directory

    env["FLINK_BIN_DIR"] = os.path.join(real_flink_home, "bin")
Esempio n. 9
0
def launch_gateway():
    # type: () -> JavaGateway
    """
    Launch the JVM gateway server and return a ``JavaGateway`` connected to it.

    Starts ``bin/pyflink-gateway-server.sh`` from the Flink home, waits until the server has
    written its port number into a temporary connection-info file, connects to that port and
    imports the Java classes used by PyFlink into the gateway's JVM view.

    :return: the connected ``JavaGateway``.
    :raises Exception: on Windows, or if the server process ends without having written the
                       connection-info file.
    """

    FLINK_HOME = _find_flink_home()
    # TODO windows support
    on_windows = platform.system() == "Windows"
    if on_windows:
        raise Exception("Windows system is not supported currently.")
    script = "./bin/pyflink-gateway-server.sh"
    command = [os.path.join(FLINK_HOME, script)]
    command += ['-c', 'org.apache.flink.api.python.PythonGatewayServer']

    # Create a temporary directory where the gateway server should write the connection information.
    conn_info_dir = tempfile.mkdtemp()
    try:
        # mkstemp is only used to reserve a unique name; the file itself is removed again so
        # that its later existence signals that the server has written it.
        fd, conn_info_file = tempfile.mkstemp(dir=conn_info_dir)
        os.close(fd)
        os.unlink(conn_info_file)

        env = dict(os.environ)
        env["_PYFLINK_CONN_INFO_PATH"] = conn_info_file

        def preexec_func():
            # ignore ctrl-c / SIGINT
            signal.signal(signal.SIGINT, signal.SIG_IGN)

        # Launch the Java gateway.
        # We open a pipe to stdin so that the Java gateway can die when the pipe is broken
        p = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env)

        # poll() returns None only while the process is still running. Testing `not p.poll()`
        # would also be true for exit code 0 and spin forever if the server exited cleanly
        # without writing the connection file.
        while p.poll() is None and not os.path.isfile(conn_info_file):
            time.sleep(0.1)

        if not os.path.isfile(conn_info_file):
            raise Exception("Java gateway process exited before sending its port number")

        # The port is read as a 4-byte big-endian unsigned integer.
        with open(conn_info_file, "rb") as info:
            gateway_port = struct.unpack("!I", info.read(4))[0]
    finally:
        shutil.rmtree(conn_info_dir)

    # Connect to the gateway
    gateway = JavaGateway(gateway_parameters=GatewayParameters(port=gateway_port, auto_convert=True))

    # Import the classes used by PyFlink
    pyflink_java_imports = (
        "org.apache.flink.table.api.*",
        "org.apache.flink.table.api.java.*",
        "org.apache.flink.table.api.dataview.*",
        "org.apache.flink.table.sources.*",
        "org.apache.flink.table.sinks.*",
        "org.apache.flink.api.common.typeinfo.TypeInformation",
        "org.apache.flink.api.common.typeinfo.Types",
        "org.apache.flink.api.java.ExecutionEnvironment",
        "org.apache.flink.streaming.api.environment.StreamExecutionEnvironment",
    )
    for java_name in pyflink_java_imports:
        java_import(gateway.jvm, java_name)

    return gateway
Esempio n. 10
0
def launch_gateway():
    # type: () -> JavaGateway
    """
    Launch the JVM gateway server and return a ``JavaGateway`` connected to it.

    Starts ``bin/pyflink-gateway-server.sh`` from the Flink home with the arguments from the
    ``SUBMIT_ARGS`` environment variable (default ``"local"``), waits until the server has
    written its port number into a temporary connection-info file and connects to that port.

    :return: the connected ``JavaGateway``.
    :raises Exception: if launching the gateway is disabled, on Windows, or if the server
                       process ends without having written the connection-info file.
    """
    if is_launch_gateway_disabled():
        raise Exception(
            "It's launching the PythonGatewayServer during Python UDF execution "
            "which is unexpected. It usually happens when the job codes are "
            "in the top level of the Python script file and are not enclosed in a "
            "`if __name__ == '__main__'` statement.")
    FLINK_HOME = _find_flink_home()
    # TODO windows support
    on_windows = platform.system() == "Windows"
    if on_windows:
        raise Exception("Windows system is not supported currently.")
    script = "./bin/pyflink-gateway-server.sh"
    command = [os.path.join(FLINK_HOME, script)]
    command += ['-c', 'org.apache.flink.client.python.PythonGatewayServer']

    submit_args = os.environ.get("SUBMIT_ARGS", "local")
    command += shlex.split(submit_args)

    # Create a temporary directory where the gateway server should write the connection information.
    conn_info_dir = tempfile.mkdtemp()
    try:
        # mkstemp is only used to reserve a unique name; the file itself is removed again so
        # that its later existence signals that the server has written it.
        fd, conn_info_file = tempfile.mkstemp(dir=conn_info_dir)
        os.close(fd)
        os.unlink(conn_info_file)

        env = dict(os.environ)
        env["_PYFLINK_CONN_INFO_PATH"] = conn_info_file

        def preexec_func():
            # ignore ctrl-c / SIGINT
            signal.signal(signal.SIGINT, signal.SIG_IGN)

        # Launch the Java gateway.
        # We open a pipe to stdin so that the Java gateway can die when the pipe is broken
        p = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env)

        # poll() returns None only while the process is still running. Testing `not p.poll()`
        # would also be true for exit code 0 and spin forever if the server exited cleanly
        # without writing the connection file.
        while p.poll() is None and not os.path.isfile(conn_info_file):
            time.sleep(0.1)

        if not os.path.isfile(conn_info_file):
            raise Exception(
                "Java gateway process exited before sending its port number")

        # The port is read as a 4-byte big-endian unsigned integer.
        with open(conn_info_file, "rb") as info:
            gateway_port = struct.unpack("!I", info.read(4))[0]
    finally:
        shutil.rmtree(conn_info_dir)

    # Connect to the gateway
    gateway = JavaGateway(gateway_parameters=GatewayParameters(
        port=gateway_port, auto_convert=True))

    return gateway
Esempio n. 11
0
def prepare_environment_variable(env):
    """
    Return a copy of ``env`` with the Flink directory variables pointing into the Flink home.

    ``FLINK_CONF_DIR``, ``FLINK_BIN_DIR``, ``FLINK_PLUGINS_DIR``, ``FLINK_LIB_DIR`` and
    ``FLINK_OPT_DIR`` are (over)written in the copy; the mapping passed in is left untouched.

    :param env: mapping of environment variables to start from.
    :return: a new ``dict`` containing ``env`` plus the five Flink directory variables.
    """
    flink_home = _find_flink_home()
    prepared = dict(env)
    for variable, subdir in (("FLINK_CONF_DIR", "conf"),
                             ("FLINK_BIN_DIR", "bin"),
                             ("FLINK_PLUGINS_DIR", "plugins"),
                             ("FLINK_LIB_DIR", "lib"),
                             ("FLINK_OPT_DIR", "opt")):
        prepared[variable] = os.path.join(flink_home, subdir)
    return prepared
Esempio n. 12
0
def test_end_to_end():
    """
    End-to-end check of a small streaming Table job.

    Writes a two-row CSV source into the system temp directory, registers it as table
    ``Orders``, runs ``where a > 0 / select a + 1, b, c`` into a CSV sink registered as
    ``Results`` and asserts the exact content of the sink file.
    """
    tmp_dir = tempfile.gettempdir()
    source_path = tmp_dir + '/streaming.csv'
    if os.path.isfile(source_path):
        os.remove(source_path)
    # The `with` block closes the file; no explicit close() is needed.
    with open(source_path, 'w') as f:
        f.write('1,hi,hello\n' + '2,hi,hello\n')
    _find_flink_home()
    print("using %s as FLINK_HOME..." % os.environ["FLINK_HOME"])

    t_config = TableConfig.Builder().as_streaming_execution().set_parallelism(1).build()
    t_env = TableEnvironment.get_table_environment(t_config)

    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]

    # register Orders table in table environment
    t_env.register_table_source(
        "Orders",
        CsvTableSource(source_path, field_names, field_types))

    # register Results table in table environment
    # A sink file left over from an earlier run is removed first.
    tmp_csv = tmp_dir + '/streaming2.csv'
    if os.path.isfile(tmp_csv):
        os.remove(tmp_csv)

    t_env.register_table_sink(
        "Results",
        field_names, field_types, CsvTableSink(tmp_csv))

    t_env.scan("Orders") \
         .where("a > 0") \
         .select("a + 1, b, c") \
         .insert_into("Results")

    t_env.execute()
    with open(tmp_csv, 'r') as f:
        result = f.read()
    assert result == '2,hi,hello\n' + '3,hi,hello\n'
    print("test passed, the log file is under this directory: %s/log" % os.environ["FLINK_HOME"])
Esempio n. 13
0
def find_java_executable():
    """
    Determine which Java executable to use.

    The Java home is read from ``<FLINK_HOME>/conf/flink-conf.yaml`` via ``read_from_config``
    (key ``KEY_ENV_JAVA_HOME``, default ``None``); when that yields ``None`` the ``JAVA_HOME``
    environment variable is used instead.

    :return: ``<java home>/bin/java`` (``java.exe`` on Windows) when a Java home was found,
             otherwise just the bare executable name.
    """
    binary_name = "java.exe" if on_windows() else "java"
    conf_file = os.path.join(_find_flink_home(), "conf", "flink-conf.yaml")

    java_home = read_from_config(KEY_ENV_JAVA_HOME, None, conf_file)
    if java_home is None:
        # Stays None when JAVA_HOME is not set either.
        java_home = os.environ.get("JAVA_HOME")

    if java_home is None:
        return binary_name
    return os.path.join(java_home, "bin", binary_name)
Esempio n. 14
0
 def _cancel_jobs(self):
     """
     Issue a ``flink cancel`` command for every job in ``self._jobs``.

     Each command is started through the shell with its output appended to
     ``cancel_<job>_stdout.log`` / ``cancel_<job>_stderr.log`` next to this file. The
     spawned processes are not waited for.
     """
     for job_id in self._jobs:
         stdout_log = '{}/cancel_{}_stdout.log'.format(
             get_file_dir(__file__), job_id)
         stderr_log = '{}/cancel_{}_stderr.log'.format(
             get_file_dir(__file__), job_id)
         with open(stdout_log, 'a') as out:
             with open(stderr_log, 'a') as err:
                 # execute `flink cancel <jobID>` to cancel batch job
                 cancel_command = '{}/bin/flink cancel {}'.format(
                     _find_flink_home(), job_id)
                 Popen(args=cancel_command,
                       shell=True,
                       stdout=out,
                       stderr=err)
Esempio n. 15
0
def launch_gateway():
    # type: () -> JavaGateway
    """
    Launch the JVM gateway server and return a ``JavaGateway`` connected to it.

    Starts ``bin/pyflink-gateway-server.sh`` from the Flink home, waits until the server has
    written its port number into a temporary connection-info file and connects to that port.

    :return: the connected ``JavaGateway``.
    :raises Exception: on Windows, or if the server process ends without having written the
                       connection-info file.
    """

    FLINK_HOME = _find_flink_home()
    # TODO windows support
    on_windows = platform.system() == "Windows"
    if on_windows:
        raise Exception("Windows system is not supported currently.")
    script = "./bin/pyflink-gateway-server.sh"
    command = [os.path.join(FLINK_HOME, script)]
    command += ['-c', 'org.apache.flink.client.python.PythonGatewayServer']

    # Create a temporary directory where the gateway server should write the connection information.
    conn_info_dir = tempfile.mkdtemp()
    try:
        # mkstemp is only used to reserve a unique name; the file itself is removed again so
        # that its later existence signals that the server has written it.
        fd, conn_info_file = tempfile.mkstemp(dir=conn_info_dir)
        os.close(fd)
        os.unlink(conn_info_file)

        env = dict(os.environ)
        env["_PYFLINK_CONN_INFO_PATH"] = conn_info_file

        def preexec_func():
            # ignore ctrl-c / SIGINT
            signal.signal(signal.SIGINT, signal.SIG_IGN)

        # Launch the Java gateway.
        # We open a pipe to stdin so that the Java gateway can die when the pipe is broken
        p = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env)

        # poll() returns None only while the process is still running. Testing `not p.poll()`
        # would also be true for exit code 0 and spin forever if the server exited cleanly
        # without writing the connection file.
        while p.poll() is None and not os.path.isfile(conn_info_file):
            time.sleep(0.1)

        if not os.path.isfile(conn_info_file):
            raise Exception("Java gateway process exited before sending its port number")

        # The port is read as a 4-byte big-endian unsigned integer.
        with open(conn_info_file, "rb") as info:
            gateway_port = struct.unpack("!I", info.read(4))[0]
    finally:
        shutil.rmtree(conn_info_dir)

    # Connect to the gateway
    gateway = JavaGateway(
        gateway_parameters=GatewayParameters(port=gateway_port, auto_convert=True))

    return gateway
Esempio n. 16
0
def construct_log_settings(env):
    """
    Build the JVM ``-D`` logging options from the given environment mapping.

    Values are taken from ``env`` when present (``FLINK_LOG_DIR``, ``LOG4J_PROPERTIES``,
    ``LOGBACK_XML``, ``FLINK_IDENT_STRING``). Otherwise the log directory is read from
    ``flink-conf.yaml`` with ``<flink home>/log`` as the default, the configuration files
    default to ``log4j-cli.properties`` / ``logback.xml`` inside ``FLINK_CONF_DIR``, and the
    identity string defaults to the current user.

    :param env: mapping of environment variables; must contain ``FLINK_CONF_DIR``.
    :return: a list of four ``-D...`` option strings.
    """
    flink_home = os.path.realpath(_find_flink_home())
    conf_dir = env['FLINK_CONF_DIR']
    conf_file = os.path.join(env['FLINK_CONF_DIR'], "flink-conf.yaml")

    # Each fallback is only evaluated when the corresponding key is missing from env.
    log_dir = (env["FLINK_LOG_DIR"]
               if "FLINK_LOG_DIR" in env
               else read_from_config(KEY_ENV_LOG_DIR,
                                     os.path.join(flink_home, "log"),
                                     conf_file))
    log4j_file = (env["LOG4J_PROPERTIES"]
                  if "LOG4J_PROPERTIES" in env
                  else "%s/log4j-cli.properties" % conf_dir)
    logback_file = (env["LOGBACK_XML"]
                    if "LOGBACK_XML" in env
                    else "%s/logback.xml" % conf_dir)
    ident = (env["FLINK_IDENT_STRING"]
             if "FLINK_IDENT_STRING" in env
             else getpass.getuser())

    placeholders = {
        "log4j_properties": log4j_file,
        "logback_xml": logback_file,
        "flink_log_dir": log_dir,
        "flink_ident_string": ident,
        "hostname": socket.gethostname(),
    }

    option_templates = (
        "-Dlog.file=${flink_log_dir}/flink-${flink_ident_string}-python-${hostname}.log",
        "-Dlog4j.configuration=${log4j_properties}",
        "-Dlog4j.configurationFile=${log4j_properties}",
        "-Dlogback.configurationFile=${logback_xml}",
    )
    return [Template(option).substitute(placeholders) for option in option_templates]
Esempio n. 17
0
def construct_flink_classpath(env):
    """
    Build the classpath string made of the Flink lib jars plus the flink-python jar.

    :param env: mapping that must contain ``FLINK_LIB_DIR`` and ``FLINK_OPT_DIR``.
    :return: the lib jars joined with ``os.pathsep`` (a single ``<lib>/*`` wildcard entry on
             Windows), followed by the first ``flink-python*.jar`` found in the opt directory.
             When no such jar exists a message is printed and only the lib part is returned.
    """
    flink_home = _find_flink_home()
    lib_dir = env['FLINK_LIB_DIR']
    opt_dir = env['FLINK_OPT_DIR']

    if on_windows():
        # Command length is limited on Windows, so a wildcard entry is used instead of
        # listing every jar.
        lib_jars = os.path.join(lib_dir, "*")
    else:
        jar_pattern = os.path.join(lib_dir, "*.jar")
        lib_jars = os.pathsep.join(glob.glob(jar_pattern))

    python_jar_candidates = glob.glob(os.path.join(opt_dir, "flink-python*.jar"))
    if not python_jar_candidates:
        print("The flink-python jar is not found in the opt folder of the FLINK_HOME: %s" %
              flink_home)
        return lib_jars

    return os.pathsep.join([lib_jars, python_jar_candidates[0]])
Esempio n. 18
0
def construct_log_settings():
    """
    Build the JVM ``-D`` logging options for the Python client.

    ``FLINK_LOG_DIR``, ``LOG4J_PROPERTIES``, ``LOGBACK_XML`` and ``FLINK_IDENT_STRING`` are
    honoured when set in the process environment. Otherwise the log directory defaults to
    ``<FLINK_HOME>/log``, the configuration files to ``log4j-cli.properties`` /
    ``logback.xml`` under ``<FLINK_HOME>/conf``, and the identity string to the current user.

    :return: a list of four ``-D...`` option strings.
    """
    environ = os.environ
    flink_home = _find_flink_home()
    conf_dir = os.path.join(flink_home, "conf")

    # Each fallback is only evaluated when the corresponding variable is not set.
    log_dir = (environ["FLINK_LOG_DIR"]
               if "FLINK_LOG_DIR" in environ
               else os.path.join(flink_home, "log"))
    log4j_file = (environ["LOG4J_PROPERTIES"]
                  if "LOG4J_PROPERTIES" in environ
                  else "%s/log4j-cli.properties" % conf_dir)
    logback_file = (environ["LOGBACK_XML"]
                    if "LOGBACK_XML" in environ
                    else "%s/logback.xml" % conf_dir)
    ident = (environ["FLINK_IDENT_STRING"]
             if "FLINK_IDENT_STRING" in environ
             else getpass.getuser())

    placeholders = {
        "log4j_properties": log4j_file,
        "logback_xml": logback_file,
        "flink_log_dir": log_dir,
        "flink_ident_string": ident,
        "hostname": socket.gethostname(),
    }

    option_templates = (
        "-Dlog.file=${flink_log_dir}/flink-${flink_ident_string}-python-${hostname}.log",
        "-Dlog4j.configuration=${log4j_properties}",
        "-Dlog4j.configurationFile=${log4j_properties}",
        "-Dlogback.configurationFile=${logback_xml}",
    )
    return [Template(option).substitute(placeholders) for option in option_templates]
Esempio n. 19
0
def find_java_executable():
    """
    Determine which Java executable to use.

    The Java home is taken from the last ``env.java.home`` entry in
    ``<FLINK_HOME>/conf/flink-conf.yaml`` if there is one, otherwise from the ``JAVA_HOME``
    environment variable.

    :return: ``<java home>/bin/java`` (``java.exe`` on Windows) when a Java home was found,
             otherwise just the bare executable name.
    """
    java_executable = "java.exe" if on_windows() else "java"
    flink_home = _find_flink_home()
    flink_conf_path = os.path.join(flink_home, "conf", "flink-conf.yaml")
    java_home = None

    if os.path.isfile(flink_conf_path):
        with open(flink_conf_path, "r") as f:
            flink_conf_yaml = f.read()
        # re.MULTILINE makes ^/$ anchor at every line; without it the pattern could only
        # match a configuration file consisting of a single line. The value class excludes
        # "\n" so that a match can never run on into the following lines.
        java_homes = re.findall(r'^[ ]*env\.java\.home[ ]*: ([^#\n]*).*$',
                                flink_conf_yaml, re.MULTILINE)
        if java_homes:
            # The last entry wins. A blank value is treated as "not configured" so that the
            # JAVA_HOME fallback below still applies.
            java_home = java_homes[-1].strip() or None

    if java_home is None and "JAVA_HOME" in os.environ:
        java_home = os.environ["JAVA_HOME"]

    if java_home is not None:
        java_executable = os.path.join(java_home, "bin", java_executable)

    return java_executable