def __init__(self, tmpdir, log, quiet, append, branching_factor,
                 skip_logging_configuration, optimizer_iterations):
        super(LocalBackend, self).__init__()

        spark_home = find_spark_home()
        hail_jar_path = os.environ.get('HAIL_JAR')
        if hail_jar_path is None:
            if pkg_resources.resource_exists(__name__, "hail-all-spark.jar"):
                hail_jar_path = pkg_resources.resource_filename(
                    __name__, "hail-all-spark.jar")
            else:
                raise RuntimeError(
                    'local backend requires a packaged jar or HAIL_JAR to be set'
                )

        # Launch a JVM-side py4j gateway with Spark's jars and the Hail jar on the classpath.
        port = launch_gateway(redirect_stdout=sys.stdout,
                              redirect_stderr=sys.stderr,
                              jarpath=f'{spark_home}/jars/py4j-0.10.9.jar',
                              classpath=f'{spark_home}/jars/*:{hail_jar_path}',
                              die_on_exit=True)
        self._gateway = JavaGateway(
            gateway_parameters=GatewayParameters(port=port, auto_convert=True))
        self._jvm = self._gateway.jvm

        # 'is' is a Python keyword, so the JVM package is.hail must be reached via getattr.
        hail_package = getattr(self._jvm, 'is').hail

        self._hail_package = hail_package
        self._utils_package_object = scala_package_object(hail_package.utils)

        # Construct the JVM-side LocalBackend and HailContext.
        self._jbackend = hail_package.backend.local.LocalBackend.apply(tmpdir)
        self._jhc = hail_package.HailContext.apply(self._jbackend, log, True,
                                                   append, branching_factor,
                                                   skip_logging_configuration,
                                                   optimizer_iterations)

        # This has to go after creating the SparkSession. Unclear why.
        # Maybe it does its own patch?
        install_exception_handler()

        from hail.context import version

        py_version = version()
        jar_version = self._jhc.version()
        if jar_version != py_version:
            raise RuntimeError(
                f"Hail version mismatch between JAR and Python library\n"
                f"  JAR:    {jar_version}\n"
                f"  Python: {py_version}")

        self._fs = LocalFS()
        self._logger = None

        if not quiet:
            connect_logger(self._utils_package_object, 'localhost', 12888)
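
For reference, the py4j gateway pattern above can be exercised on its own. A minimal, self-contained sketch, assuming only that py4j and a local JVM are installed (no Spark or Hail jars on the classpath; this shows just the wiring):

import sys
from py4j.java_gateway import GatewayParameters, JavaGateway, launch_gateway

# Start a JVM-side gateway server on an ephemeral port, then attach a
# JavaGateway client to it, mirroring the constructor above.
port = launch_gateway(redirect_stdout=sys.stdout,
                      redirect_stderr=sys.stderr,
                      die_on_exit=True)
gateway = JavaGateway(
    gateway_parameters=GatewayParameters(port=port, auto_convert=True))
print(gateway.jvm.java.lang.System.getProperty('java.version'))
gateway.shutdown()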
Example #2
    def __init__(self, tmpdir, log, quiet, append, branching_factor,
                 skip_logging_configuration, optimizer_iterations):
        SPARK_HOME = os.environ['SPARK_HOME']
        HAIL_HOME = os.environ['HAIL_HOME']
        port = launch_gateway(
            redirect_stdout=sys.stdout,
            redirect_stderr=sys.stderr,
            jarpath=f'{SPARK_HOME}/jars/py4j-0.10.7.jar',
            classpath=f'{SPARK_HOME}/jars/*:{HAIL_HOME}/hail/build/libs/hail-all-spark.jar',
            die_on_exit=True)
        self._gateway = JavaGateway(
            gateway_parameters=GatewayParameters(port=port, auto_convert=True))
        self._jvm = self._gateway.jvm

        hail_package = getattr(self._jvm, 'is').hail

        self._hail_package = hail_package
        self._utils_package_object = scala_package_object(hail_package.utils)

        self._jbackend = hail_package.backend.local.LocalBackend.apply(tmpdir)
        self._jhc = hail_package.HailContext.apply(self._jbackend, log, True,
                                                   append, branching_factor,
                                                   skip_logging_configuration,
                                                   optimizer_iterations)

        # This has to go after creating the SparkSession. Unclear why.
        # Maybe it does its own patch?
        install_exception_handler()

        from hail.context import version

        py_version = version()
        jar_version = self._jhc.version()
        if jar_version != py_version:
            raise RuntimeError(
                f"Hail version mismatch between JAR and Python library\n"
                f"  JAR:    {jar_version}\n"
                f"  Python: {py_version}")

        self._fs = LocalFS()
        self._logger = None

        if not quiet:
            connect_logger(self._utils_package_object, 'localhost', 12888)
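
Unlike the first example, this variant reads SPARK_HOME and HAIL_HOME straight from the environment, so the os.environ[...] lookups raise KeyError when either variable is unset. A pre-flight check along these lines (a sketch, not part of Hail) makes that failure mode explicit:

import os

# Verify the environment this constructor requires before calling it.
for var in ('SPARK_HOME', 'HAIL_HOME'):
    path = os.environ.get(var)
    if not path or not os.path.isdir(path):
        raise RuntimeError(f'{var} must point at an existing directory')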
Example #3
    def __init__(self, idempotent, sc, spark_conf, app_name, master, local,
                 log, quiet, append, min_block_size, branching_factor, tmpdir,
                 local_tmpdir, skip_logging_configuration,
                 optimizer_iterations):
        # The packaged jar is present for a pip-installed Hail; wire it into the Spark configuration.
        if pkg_resources.resource_exists(__name__, "hail-all-spark.jar"):
            hail_jar_path = pkg_resources.resource_filename(
                __name__, "hail-all-spark.jar")
            assert os.path.exists(
                hail_jar_path), f'{hail_jar_path} does not exist'
            conf = pyspark.SparkConf()

            base_conf = spark_conf or {}
            for k, v in base_conf.items():
                conf.set(k, v)

            jars = [hail_jar_path]

            # Optionally register the sparkmonitor listener when HAIL_SPARK_MONITOR is set.
            if os.environ.get('HAIL_SPARK_MONITOR'):
                import sparkmonitor
                jars.append(
                    os.path.join(os.path.dirname(sparkmonitor.__file__),
                                 'listener.jar'))
                conf.set("spark.extraListeners",
                         "sparkmonitor.listener.JupyterSparkMonitorListener")

            conf.set('spark.jars', ','.join(jars))
            conf.set('spark.driver.extraClassPath', ','.join(jars))
            conf.set('spark.executor.extraClassPath', './hail-all-spark.jar')
            if sc is None:
                pyspark.SparkContext._ensure_initialized(conf=conf)
            elif not quiet:
                sys.stderr.write(
                    'pip-installed Hail requires additional configuration options in Spark referring\n'
                    '  to the path to the Hail Python module directory HAIL_DIR,\n'
                    '  e.g. /path/to/python/site-packages/hail:\n'
                    '    spark.jars=HAIL_DIR/hail-all-spark.jar\n'
                    '    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar\n'
                    '    spark.executor.extraClassPath=./hail-all-spark.jar')
        else:
            pyspark.SparkContext._ensure_initialized()

        self._gateway = pyspark.SparkContext._gateway
        self._jvm = pyspark.SparkContext._jvm

        hail_package = getattr(self._jvm, 'is').hail

        self._hail_package = hail_package
        self._utils_package_object = scala_package_object(hail_package.utils)

        # Reuse the JVM-side SparkContext of a caller-supplied pyspark SparkContext, if any.
        jsc = sc._jsc.sc() if sc else None

        if idempotent:
            self._jbackend = hail_package.backend.spark.SparkBackend.getOrCreate(
                jsc, app_name, master, local, True, min_block_size, tmpdir,
                local_tmpdir)
            self._jhc = hail_package.HailContext.getOrCreate(
                self._jbackend, log, True, append, branching_factor,
                skip_logging_configuration, optimizer_iterations)
        else:
            self._jbackend = hail_package.backend.spark.SparkBackend.apply(
                jsc, app_name, master, local, True, min_block_size, tmpdir,
                local_tmpdir)
            self._jhc = hail_package.HailContext.apply(
                self._jbackend, log, True, append, branching_factor,
                skip_logging_configuration, optimizer_iterations)

        self._jsc = self._jbackend.sc()
        if sc:
            self.sc = sc
        else:
            self.sc = pyspark.SparkContext(gateway=self._gateway,
                                           jsc=self._jvm.JavaSparkContext(
                                               self._jsc))
        self._jspark_session = self._jbackend.sparkSession()
        self._spark_session = pyspark.sql.SparkSession(self.sc,
                                                       self._jspark_session)

        # This has to go after creating the SparkSession. Unclear why.
        # Maybe it does its own patch?
        install_exception_handler()

        from hail.context import version

        py_version = version()
        jar_version = self._jhc.version()
        if jar_version != py_version:
            raise RuntimeError(
                f"Hail version mismatch between JAR and Python library\n"
                f"  JAR:    {jar_version}\n"
                f"  Python: {py_version}")

        self._fs = None
        self._logger = None

        if not quiet:
            sys.stderr.write('Running on Apache Spark version {}\n'.format(
                self.sc.version))
            if self._jsc.uiWebUrl().isDefined():
                sys.stderr.write('SparkUI available at {}\n'.format(
                    self._jsc.uiWebUrl().get()))

            connect_logger(self._utils_package_object, 'localhost', 12888)

            self._jbackend.startProgressBar()
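
The classpath wiring above (spark.jars plus the driver and executor extraClassPath settings) can be reproduced with the public SparkContext constructor instead of the private _ensure_initialized hook. A sketch, where /path/to/hail-all-spark.jar is a placeholder that must point at a real jar:

import pyspark

# Put the jar on the driver and executor classpaths before the context starts.
conf = pyspark.SparkConf().setMaster('local[*]').setAppName('Hail')
conf.set('spark.jars', '/path/to/hail-all-spark.jar')
conf.set('spark.driver.extraClassPath', '/path/to/hail-all-spark.jar')
conf.set('spark.executor.extraClassPath', './hail-all-spark.jar')
sc = pyspark.SparkContext(conf=conf)
print('Running on Apache Spark version', sc.version)
sc.stop()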