Code Example #1
File: spark.py Project: swarnend/analytics-zoo
    def _create_sc(self, submit_args, conf):
        # Pass the submit arguments to the JVM gateway via the environment,
        # then build the Zoo SparkConf and create the SparkContext.
        os.environ['PYSPARK_SUBMIT_ARGS'] = submit_args
        zoo_conf = init_spark_conf(conf)
        sc = init_nncontext(conf=zoo_conf,
                            spark_log_level=self.spark_log_level,
                            redirect_spark_log=self.redirect_spark_log)
        return sc
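This helper sets PYSPARK_SUBMIT_ARGS before the SparkContext is created, so the arguments are picked up when the JVM gateway starts. Below is a minimal sketch of the same pattern outside the class, assuming init_spark_conf and init_nncontext come from zoo.common.nncontext (as imported in Code Example #5); the submit arguments and the config value shown are placeholders.

import os
from zoo.common.nncontext import init_spark_conf, init_nncontext

# Placeholder submit arguments; adjust to your environment.
os.environ['PYSPARK_SUBMIT_ARGS'] = "--master local[4] --driver-memory 2g pyspark-shell"

zoo_conf = init_spark_conf().set("spark.driver.maxResultSize", "2g")
sc = init_nncontext(conf=zoo_conf, redirect_spark_log=True)
sc.setLogLevel("WARN")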
Code Example #2
    def _create_sc(self, submit_args, conf):
        # Like Code Example #1, but the log level is applied on the
        # SparkContext after it has been created.
        print("pyspark_submit_args is: {}".format(submit_args))
        os.environ['PYSPARK_SUBMIT_ARGS'] = submit_args
        zoo_conf = init_spark_conf(conf)
        sc = init_nncontext(conf=zoo_conf, redirect_spark_log=self.redirect_spark_log)
        sc.setLogLevel(self.spark_log_level)

        return sc
Code Example #3
    def init_spark_on_local(self, cores, conf=None, python_location=None):
        print("Start to getOrCreate SparkContext")
        # Use the given Python executable on executors, or auto-detect it.
        os.environ['PYSPARK_PYTHON'] = \
            python_location if python_location else self._detect_python_location()
        master = "local[{}]".format(cores)
        zoo_conf = init_spark_conf(conf).setMaster(master)
        sc = init_nncontext(conf=zoo_conf, redirect_spark_log=self.redirect_spark_log)
        sc.setLogLevel(self.spark_log_level)
        print("Successfully got a SparkContext")
        return sc
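A hedged usage sketch of the method above. The enclosing class appears to be SparkRunner (it is referenced by name in Code Example #4), but its import path and constructor arguments are not shown here, so both are assumptions.

# Assumed constructor arguments; the real SparkRunner signature may differ.
runner = SparkRunner(spark_log_level="WARN", redirect_spark_log=True)
sc = runner.init_spark_on_local(cores=4)
print(sc.master)  # expected: local[4]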
Code Example #4
    def init_spark_standalone(self,
                              num_executors,
                              executor_cores,
                              executor_memory="10g",
                              driver_memory="1g",
                              driver_cores=4,
                              master=None,
                              extra_executor_memory_for_ray=None,
                              extra_python_lib=None,
                              conf=None,
                              jars=None):
        import subprocess
        import pyspark
        from zoo.util.utils import get_node_ip
        from zoo.util.engine import get_analytics_zoo_classpath
        from bigdl.util.engine import get_bigdl_classpath

        if 'PYSPARK_PYTHON' not in os.environ:
            os.environ["PYSPARK_PYTHON"] = self._detect_python_location()
        if not master:
            pyspark_home = os.path.abspath(pyspark.__file__ + "/../")
            zoo_standalone_home = os.path.abspath(
                __file__ + "/../../share/bin/standalone")
            node_ip = get_node_ip()
            SparkRunner.standalone_env = {
                "SPARK_HOME": pyspark_home,
                "ZOO_STANDALONE_HOME": zoo_standalone_home,
                # If this is not set, the master defaults to the hostname
                # rather than the IP address.
                "SPARK_MASTER_HOST": node_ip
            }
            # The scripts installed from pip don't have execute permission,
            # so grant it to them first.
            pro = subprocess.Popen(
                ["chmod", "-R", "+x", "{}/sbin".format(zoo_standalone_home)])
            os.waitpid(pro.pid, 0)
            # Start master
            start_master_pro = subprocess.Popen(
                "{}/sbin/start-master.sh".format(zoo_standalone_home),
                shell=True,
                env=SparkRunner.standalone_env)
            os.waitpid(start_master_pro.pid, 0)
            master = "spark://{}:7077".format(
                node_ip)  # 7077 is the default port
            # Start worker
            start_worker_pro = subprocess.Popen(
                "{}/sbin/start-worker.sh {}".format(zoo_standalone_home,
                                                    master),
                shell=True,
                env=SparkRunner.standalone_env)
            os.waitpid(start_worker_pro.pid, 0)
        else:  # A Spark standalone cluster has already been started by the user.
            assert master.startswith("spark://"), \
                "Please input a valid master address for your Spark standalone cluster: " \
                "spark://master:port"

        # Start pyspark-shell
        submit_args = " --master " + master
        submit_args = submit_args + " --driver-cores {} --driver-memory {} --num-executors {}" \
                                    " --executor-cores {} --executor-memory {}"\
            .format(driver_cores, driver_memory, num_executors, executor_cores, executor_memory)
        if extra_python_lib:
            submit_args = submit_args + " --py-files {}".format(
                extra_python_lib)
        if jars:
            submit_args = submit_args + " --jars {}".format(jars)
        submit_args = submit_args + " pyspark-shell"
        os.environ['PYSPARK_SUBMIT_ARGS'] = submit_args

        zoo_bigdl_jar_path = ":".join(
            [get_analytics_zoo_classpath(),
             get_bigdl_classpath()])
        spark_conf = init_spark_conf(conf) \
            .set("spark.driver.cores", driver_cores) \
            .set("spark.driver.memory", driver_memory) \
            .set("spark.executor.instances", num_executors) \
            .set("spark.executor.cores", executor_cores) \
            .set("spark.cores.max", num_executors * executor_cores) \
            .set("spark.executorEnv.PYTHONHOME",
                 "/".join(self._detect_python_location().split("/")[:-2]))
        if extra_executor_memory_for_ray:
            spark_conf.set("spark.executor.memoryOverhead",
                           extra_executor_memory_for_ray)
        # Prepend the Zoo and BigDL jars to any existing executor classpath.
        if spark_conf.contains("spark.executor.extraClassPath"):
            spark_conf.set(
                "spark.executor.extraClassPath",
                "{}:{}".format(zoo_bigdl_jar_path,
                               spark_conf.get("spark.executor.extraClassPath")))
        else:
            spark_conf.set("spark.executor.extraClassPath", zoo_bigdl_jar_path)

        sc = init_nncontext(spark_conf,
                            redirect_spark_log=self.redirect_spark_log)
        sc.setLogLevel(self.spark_log_level)
        return sc
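A similar hedged sketch for the standalone variant. When master is omitted the method starts a local standalone master and worker itself, so pointing it at an already-running cluster is shown below. The constructor arguments, master address, and archive name are all assumptions.

# Assumed constructor arguments; adjust resources to your cluster.
runner = SparkRunner(spark_log_level="WARN", redirect_spark_log=True)
sc = runner.init_spark_standalone(
    num_executors=2,
    executor_cores=4,
    executor_memory="10g",
    master="spark://10.0.0.1:7077",  # hypothetical master address
    extra_python_lib="deps.zip",     # hypothetical --py-files archive
)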
Code Example #5
from bigdl.optim.optimizer import Adam
from keras.datasets import imdb
from keras.preprocessing import sequence
from zoo.pipeline.api.keras.models import Model
from zoo.pipeline.api.keras.layers import *
from zoo.pipeline.api.autograd import *
from zoo.common.nncontext import init_spark_conf
from zoo.common.nncontext import init_nncontext

conf = init_spark_conf()
conf.set("spark.executor.extraJavaOptions", "-Xss512m")
conf.set("spark.driver.extraJavaOptions", "-Xss512m")
sc = init_nncontext(conf)
max_features = 20000
max_len = 200

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_len)
print('x_train shape:', x_train.shape)