Code example #1
def init_spark_on_local(cores=2,
                        conf=None,
                        python_location=None,
                        spark_log_level="WARN",
                        redirect_spark_log=True):
    """
    Create a SparkContext with Analytics Zoo configurations on the local machine.

    :param cores: The number of cores for Spark local. Default to be 2. You can also set it to "*"
           to use all the available cores, e.g. `init_spark_on_local(cores="*")`.
    :param conf: You can append extra conf for Spark in key-value format,
           e.g. conf={"spark.executor.extraJavaOptions": "-XX:+PrintGCDetails"}.
           Default to be None.
    :param python_location: The path to your running Python executable. If not specified, the
           default Python interpreter in effect would be used.
    :param spark_log_level: The log level for Spark. Default to be 'WARN'.
    :param redirect_spark_log: Whether to redirect the Spark log to local file. Default to be True.

    :return: An instance of SparkContext.
    """
    from zoo.util.spark import SparkRunner
    runner = SparkRunner(spark_log_level=spark_log_level,
                         redirect_spark_log=redirect_spark_log)
    set_python_home()
    return runner.init_spark_on_local(cores=cores,
                                      conf=conf,
                                      python_location=python_location)
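
A minimal usage sketch for the function above. The import path is an assumption based on the nncontext.py file references later in this listing, and the extra conf key is just an illustrative Spark property.

# Hedged usage sketch: assumes init_spark_on_local is importable from
# zoo.common.nncontext (as suggested by the nncontext.py file references below).
from zoo.common.nncontext import init_spark_on_local

# Use all available local cores and append one extra Spark conf entry.
sc = init_spark_on_local(cores="*",
                         conf={"spark.executor.extraJavaOptions": "-XX:+PrintGCDetails"})
print(sc.master)   # e.g. local[*]
sc.stop()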
Code example #2
def init_spark_on_local(cores=2, conf=None, python_location=None, spark_log_level="WARN",
                        redirect_spark_log=True):
    """Saves the Trainer state to the provided checkpoint path.

    Args:

        checkpoint (str): Path to target checkpoint file.
    """
    from zoo.util.spark import SparkRunner
    runner = SparkRunner(spark_log_level=spark_log_level,
                         redirect_spark_log=redirect_spark_log)
    set_python_home()
    return runner.init_spark_on_local(cores=cores, conf=conf,
                                      python_location=python_location)
Code example #3
def init_spark_on_yarn(hadoop_conf,
                       conda_name,
                       num_executors,
                       executor_cores,
                       executor_memory="2g",
                       driver_cores=4,
                       driver_memory="1g",
                       extra_executor_memory_for_ray=None,
                       extra_python_lib=None,
                       penv_archive=None,
                       additional_archive=None,
                       hadoop_user_name="root",
                       spark_yarn_archive=None,
                       spark_log_level="WARN",
                       redirect_spark_log=True,
                       jars=None,
                       conf=None):
    """Returns the local TrainingOperator object.
    
       Be careful not to perturb its state, or else you can cause the system
       to enter an inconsistent state.
       
       Returns:
            TrainingOperator: The local TrainingOperator object.
    """
    from zoo.util.spark import SparkRunner
    runner = SparkRunner(spark_log_level=spark_log_level,
                         redirect_spark_log=redirect_spark_log)
    set_python_home()
    sc = runner.init_spark_on_yarn(
        hadoop_conf=hadoop_conf,
        conda_name=conda_name,
        num_executors=num_executors,
        executor_cores=executor_cores,
        executor_memory=executor_memory,
        driver_cores=driver_cores,
        driver_memory=driver_memory,
        extra_executor_memory_for_ray=extra_executor_memory_for_ray,
        extra_python_lib=extra_python_lib,
        penv_archive=penv_archive,
        additional_archive=additional_archive,
        hadoop_user_name=hadoop_user_name,
        spark_yarn_archive=spark_yarn_archive,
        jars=jars,
        conf=conf)
    return sc
Code example #4
def init_spark_standalone(num_executors,
                          executor_cores,
                          executor_memory="2g",
                          driver_cores=4,
                          driver_memory="1g",
                          master=None,
                          extra_executor_memory_for_ray=None,
                          extra_python_lib=None,
                          spark_log_level="WARN",
                          redirect_spark_log=True,
                          conf=None,
                          jars=None,
                          python_location=None,
                          enable_numa_binding=False):
    """Returns the local TrainingOperator object.

       Be careful not to perturb its state, or else you can cause the system
       to enter an inconsistent state.

       Returns:
            TrainingOperator: The local TrainingOperator object.
    """

    from zoo.util.spark import SparkRunner
    runner = SparkRunner(spark_log_level=spark_log_level,
                         redirect_spark_log=redirect_spark_log)
    set_python_home()
    sc = runner.init_spark_standalone(
        num_executors=num_executors,
        executor_cores=executor_cores,
        executor_memory=executor_memory,
        driver_cores=driver_cores,
        driver_memory=driver_memory,
        master=master,
        extra_executor_memory_for_ray=extra_executor_memory_for_ray,
        extra_python_lib=extra_python_lib,
        conf=conf,
        jars=jars,
        python_location=python_location,
        enable_numa_binding=enable_numa_binding)
    return sc
Code example #5
File: nncontext.py  Project: xiejinglei/analytics-zoo
def init_spark_on_yarn(hadoop_conf,
                       conda_name,
                       num_executors,
                       executor_cores,
                       executor_memory="2g",
                       driver_cores=4,
                       driver_memory="2g",
                       extra_executor_memory_for_ray=None,
                       extra_python_lib=None,
                       penv_archive=None,
                       additional_archive=None,
                       hadoop_user_name="root",
                       spark_yarn_archive=None,
                       spark_log_level="WARN",
                       redirect_spark_log=True,
                       jars=None,
                       conf=None):
    """
    Create a SparkContext with Analytics Zoo configurations on Yarn cluster for yarn-client mode.
    You only need to create a conda environment and install the python dependencies in that
    environment beforehand on the driver machine. These dependencies would be automatically
    packaged and distributed to the whole Yarn cluster.

    :param hadoop_conf: The path to the yarn configuration folder.
    :param conda_name: The name of the conda environment.
    :param num_executors: The number of Spark executors.
    :param executor_cores: The number of cores for each executor.
    :param executor_memory: The memory for each executor. Default to be '2g'.
    :param driver_cores: The number of cores for the Spark driver. Default to be 4.
    :param driver_memory: The memory for the Spark driver. Default to be '2g'.
    :param extra_executor_memory_for_ray: The extra memory for Ray services. Default to be None.
    :param extra_python_lib: Extra python files or packages needed for distribution.
           Default to be None.
    :param penv_archive: Ideally, the program would auto-pack the conda environment specified by
           'conda_name', but you can also pass the path to a packed file in "tar.gz" format here.
           Default to be None.
    :param additional_archive: Comma-separated list of additional archives to be uploaded and
           unpacked on executors. Default to be None.
    :param hadoop_user_name: The user name for running the yarn cluster. Default to be 'root'.
    :param spark_yarn_archive: Conf value for setting spark.yarn.archive. Default to be None.
    :param spark_log_level: The log level for Spark. Default to be 'WARN'.
    :param redirect_spark_log: Whether to redirect the Spark log to local file. Default to be True.
    :param jars: Comma-separated list of jars to be included on driver and executor's classpath.
           Default to be None.
    :param conf: You can append extra conf for Spark in key-value format,
           e.g. conf={"spark.executor.extraJavaOptions": "-XX:+PrintGCDetails"}.
           Default to be None.

    :return: An instance of SparkContext.
    """
    from zoo.util.spark import SparkRunner
    runner = SparkRunner(spark_log_level=spark_log_level,
                         redirect_spark_log=redirect_spark_log)
    set_python_home()
    sc = runner.init_spark_on_yarn(
        hadoop_conf=hadoop_conf,
        conda_name=conda_name,
        num_executors=num_executors,
        executor_cores=executor_cores,
        executor_memory=executor_memory,
        driver_cores=driver_cores,
        driver_memory=driver_memory,
        extra_executor_memory_for_ray=extra_executor_memory_for_ray,
        extra_python_lib=extra_python_lib,
        penv_archive=penv_archive,
        additional_archive=additional_archive,
        hadoop_user_name=hadoop_user_name,
        spark_yarn_archive=spark_yarn_archive,
        jars=jars,
        conf=conf)
    return sc
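
A minimal usage sketch for yarn-client mode, following the docstring above. The conda environment name and the Hadoop configuration path are hypothetical placeholders, and the import path is an assumption based on the nncontext.py file reference above.

# Hedged usage sketch: "zoo-env" and /etc/hadoop/conf are placeholders you would
# replace with your own conda environment name and yarn configuration folder.
from zoo.common.nncontext import init_spark_on_yarn

sc = init_spark_on_yarn(hadoop_conf="/etc/hadoop/conf",   # yarn configuration folder
                        conda_name="zoo-env",             # conda env to auto-pack and ship
                        num_executors=2,
                        executor_cores=4,
                        executor_memory="4g",
                        conf={"spark.executor.extraJavaOptions": "-XX:+PrintGCDetails"})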
Code example #6
File: nncontext.py  Project: xiejinglei/analytics-zoo
def init_nncontext(conf=None, spark_log_level="WARN", redirect_spark_log=True):
    """
    Creates or gets a SparkContext with optimized configurations for BigDL performance.
    This method will also initialize the BigDL engine.

    Note: If you use spark-shell or Jupyter notebook, as the SparkContext is created
    before your code, you have to set the Spark configurations through command line options
    or the properties file before calling this method. In this case, you are recommended
    to use the launch scripts we provide:
    https://github.com/intel-analytics/analytics-zoo/tree/master/scripts.

    :param conf: An instance of SparkConf. If not specified, a new SparkConf with
           Analytics Zoo and BigDL configurations would be created and used.
           You can also input a string here to indicate the name of the application.
    :param spark_log_level: The log level for Spark. Default to be 'WARN'.
    :param redirect_spark_log: Whether to redirect the Spark log to local file. Default to be True.

    :return: An instance of SparkContext.
    """
    has_activate_sc = SparkContext._active_spark_context is not None
    # The following code copied and modified from
    # https://github.com/Valassis-Digital-Media/spylon-kernel/blob/master/
    # spylon_kernel/scala_interpreter.py
    if ZooContext.log_output and not has_activate_sc:
        import subprocess
        import pyspark.java_gateway
        spark_jvm_proc = None

        def Popen(*args, **kwargs):
            """Wraps subprocess.Popen to force stdout and stderr from the child process
            to pipe to this process without buffering.
            """
            nonlocal spark_jvm_proc
            # Override these in kwargs to avoid duplicate value errors
            # Set streams to unbuffered so that we read whatever bytes are available
            # when ready, https://docs.python.org/3.6/library/subprocess.html#popen-constructor
            kwargs['bufsize'] = 0
            # Capture everything from stdout for display in the notebook
            kwargs['stdout'] = subprocess.PIPE
            # Optionally capture stderr, otherwise it'll go to the kernel log
            kwargs['stderr'] = subprocess.PIPE
            spark_jvm_proc = subprocess.Popen(*args, **kwargs)
            return spark_jvm_proc

        pyspark.java_gateway.Popen = Popen

    if isinstance(conf, six.string_types):
        sc = getOrCreateSparkContext(conf=None, appName=conf)
    else:
        sc = getOrCreateSparkContext(conf=conf)
    sc.setLogLevel(spark_log_level)

    if ZooContext.log_output and not has_activate_sc and spark_jvm_proc is not None:
        if spark_jvm_proc.stdout is not None:
            stdout_reader = threading.Thread(target=_read_stream,
                                             daemon=True,
                                             kwargs=dict(
                                                 fd=spark_jvm_proc.stdout,
                                                 fn=sys.stdout.write))
            stdout_reader.start()
        if spark_jvm_proc.stderr is not None:
            stderr_reader = threading.Thread(target=_read_stream,
                                             daemon=True,
                                             kwargs=dict(
                                                 fd=spark_jvm_proc.stderr,
                                                 fn=sys.stderr.write))
            stderr_reader.start()
    check_version()
    if redirect_spark_log:
        redire_spark_logs()
        show_bigdl_info_logs()
    init_engine()
    set_python_home()
    return sc
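
A minimal usage sketch for init_nncontext, showing the two documented ways of passing `conf`: a SparkConf instance or a plain string that is used only as the application name. The import path is an assumption based on the nncontext.py file reference above.

from pyspark import SparkConf
from zoo.common.nncontext import init_nncontext  # assumed import path

# Variant 1: pass a SparkConf with explicit Spark properties.
conf = SparkConf().setAppName("zoo-demo").set("spark.driver.memory", "4g")
sc = init_nncontext(conf)

# Variant 2: pass a string, which only sets the application name.
# sc = init_nncontext("zoo-demo")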
Code example #7
File: nncontext.py  Project: xiejinglei/analytics-zoo
def init_spark_standalone(num_executors,
                          executor_cores,
                          executor_memory="2g",
                          driver_cores=4,
                          driver_memory="2g",
                          master=None,
                          extra_executor_memory_for_ray=None,
                          extra_python_lib=None,
                          spark_log_level="WARN",
                          redirect_spark_log=True,
                          conf=None,
                          jars=None,
                          python_location=None,
                          enable_numa_binding=False):
    """
    Create a SparkContext with Analytics Zoo configurations on Spark standalone cluster.

    You need to specify master if you already have a Spark standalone cluster. For a
    standalone cluster with multiple nodes, make sure that analytics-zoo is installed via
    pip in the Python environment on every node.
    If master is not specified, a new Spark standalone cluster on the current single node
    would be started first and the SparkContext would use its master address. You need to
    call `stop_spark_standalone` after your program finishes to shut down the cluster.

    :param num_executors: The number of Spark executors.
    :param executor_cores: The number of cores for each executor.
    :param executor_memory: The memory for each executor. Default to be '2g'.
    :param driver_cores: The number of cores for the Spark driver. Default to be 4.
    :param driver_memory: The memory for the Spark driver. Default to be '2g'.
    :param master: The master URL of an existing Spark standalone cluster: 'spark://master:port'.
           You only need to specify this if you have already started a standalone cluster.
           Default to be None and a new standalone cluster would be started in this case.
    :param extra_executor_memory_for_ray: The extra memory for Ray services. Default to be None.
    :param extra_python_lib: Extra python files or packages needed for distribution.
           Default to be None.
    :param spark_log_level: The log level for Spark. Default to be 'WARN'.
    :param redirect_spark_log: Whether to redirect the Spark log to local file. Default to be True.
    :param jars: Comma-separated list of jars to be included on driver and executor's classpath.
           Default to be None.
    :param conf: You can append extra conf for Spark in key-value format,
           e.g. conf={"spark.executor.extraJavaOptions": "-XX:+PrintGCDetails"}.
           Default to be None.
    :param python_location: The path to your running Python executable. If not specified, the
           default Python interpreter in effect would be used.
    :param enable_numa_binding: Whether to use numactl to start the Spark worker in order to bind
           different worker processes to different cpus and memory areas. This may lead to
           better performance on a multi-socket machine. Default to be False.

    :return: An instance of SparkContext.
    """
    from zoo.util.spark import SparkRunner
    runner = SparkRunner(spark_log_level=spark_log_level,
                         redirect_spark_log=redirect_spark_log)
    set_python_home()
    sc = runner.init_spark_standalone(
        num_executors=num_executors,
        executor_cores=executor_cores,
        executor_memory=executor_memory,
        driver_cores=driver_cores,
        driver_memory=driver_memory,
        master=master,
        extra_executor_memory_for_ray=extra_executor_memory_for_ray,
        extra_python_lib=extra_python_lib,
        conf=conf,
        jars=jars,
        python_location=python_location,
        enable_numa_binding=enable_numa_binding)
    return sc
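
A minimal usage sketch for a single-node standalone setup: no `master` is given, so a local standalone cluster is started, and `stop_spark_standalone` (the shutdown helper named in the docstring above, assumed to live in the same module) is called once the work is done. The import path is an assumption based on the nncontext.py file reference above.

from zoo.common.nncontext import init_spark_standalone, stop_spark_standalone

sc = init_spark_standalone(num_executors=2,
                           executor_cores=4,
                           executor_memory="4g")
try:
    # Run a trivial Spark job to confirm the context works.
    print(sc.parallelize(range(100)).count())
finally:
    sc.stop()
    stop_spark_standalone()  # shut down the standalone cluster started above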
Code example #8
def init_nncontext(conf=None, spark_log_level="WARN", redirect_spark_log=True):
    """
    Create or get a SparkContext with optimized configurations for BigDL performance.
    This method will also initialize the BigDL engine.

    The parameters are the same as in Code example #6 above.

    :return: An instance of SparkContext.
    """
    # The following code copied and modified from
    # https://github.com/Valassis-Digital-Media/spylon-kernel/blob/master/
    # spylon_kernel/scala_interpreter.py
    if ZooContext.log_output:
        import subprocess
        import pyspark.java_gateway
        spark_jvm_proc = None
        """Returns the local TrainingOperator object.

       Be careful not to perturb its state, or else you can cause the system
       to enter an inconsistent state.

       Returns:
            TrainingOperator: The local TrainingOperator object.
   
       """

        def Popen(*args, **kwargs):
            """Wraps subprocess.Popen to force stdout and stderr from the child process
            to pipe to this process without buffering.
            """
            nonlocal spark_jvm_proc
            # Override these in kwargs to avoid duplicate value errors
            # Set streams to unbuffered so that we read whatever bytes are available
            # when ready, https://docs.python.org/3.6/library/subprocess.html#popen-constructor
            kwargs['bufsize'] = 0
            # Capture everything from stdout for display in the notebook
            kwargs['stdout'] = subprocess.PIPE
            # Optionally capture stderr, otherwise it'll go to the kernel log
            kwargs['stderr'] = subprocess.PIPE
            spark_jvm_proc = subprocess.Popen(*args, **kwargs)
            return spark_jvm_proc

        pyspark.java_gateway.Popen = Popen

    if isinstance(conf, six.string_types):
        sc = getOrCreateSparkContext(conf=None, appName=conf)
    else:
        sc = getOrCreateSparkContext(conf=conf)
    sc.setLogLevel(spark_log_level)

    if ZooContext.log_output and spark_jvm_proc is not None:
        if spark_jvm_proc.stdout is not None:
            stdout_reader = threading.Thread(target=_read_stream,
                                             daemon=True,
                                             kwargs=dict(
                                                 fd=spark_jvm_proc.stdout,
                                                 fn=sys.stdout.write))
            stdout_reader.start()
        if spark_jvm_proc.stderr is not None:
            stderr_reader = threading.Thread(target=_read_stream,
                                             daemon=True,
                                             kwargs=dict(
                                                 fd=spark_jvm_proc.stderr,
                                                 fn=sys.stderr.write))
            stderr_reader.start()
    check_version()
    if redirect_spark_log:
        redire_spark_logs()
        show_bigdl_info_logs()
    init_engine()
    set_python_home()
    return sc