Example #1
    def __init__(self, sc=None, **create_sc_kwargs):
        if not sc:
            sc = create_sc(**create_sc_kwargs)
        if type(sc) is not SparkContext:
            raise TypeError("sparktk context init requires a valid SparkContext.  Received type %s" % type(sc))
        self._sc = sc
        self._jtc = self._sc._jvm.org.trustedanalytics.sparktk.TkContext(self._sc._jsc)
        self._jutils = JUtils(self._sc)
        self._scala_sc = self._jutils.get_scala_sc()
        loggers.set_spark(self._sc, "off")  # todo: undo this/move to config, I just want it outta my face most of the time
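A minimal usage sketch for this constructor, assuming sparktk is importable and a local Spark installation is configured; keyword arguments other than sc are simply forwarded to create_sc (the same arguments documented in Examples #2 and #3). Only one SparkContext can exist per process, so the two options below are alternatives, not a sequence:

import sparktk
from pyspark import SparkContext, SparkConf

# Option A: let TkContext build the SparkContext; extra kwargs go to create_sc.
tc = sparktk.TkContext(master="local[2]", app_name="sparktk-demo")

# Option B: wrap a SparkContext created elsewhere; anything that is not a
# SparkContext instance raises the TypeError shown above.
# sc = SparkContext(conf=SparkConf().setMaster("local[2]").setAppName("existing"))
# tc = sparktk.TkContext(sc=sc)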
Example #2
    def __init__(self,
                 sc=None,
                 master=default_spark_master,
                 py_files=None,
                 spark_home=None,
                 sparktk_home=None,
                 pyspark_submit_args=None,
                 app_name="sparktk",
                 other_libs=None,
                 extra_conf=None,
                 use_local_fs=False,
                 debug=None):
        r"""
        Creates a TkContext object

        :param sc: (SparkContext) Active Spark Context; if not provided, a new Spark Context is created using the
                   rest of the arguments
                   (see https://spark.apache.org/docs/latest/api/java/org/apache/spark/SparkContext.html)
        :param master: (str) override spark master setting; for ex. 'local[4]' or 'yarn-client'
        :param py_files: (list) list of str of paths to python dependencies; Note that the current python
                         package will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
        :param spark_home: (str) override $SPARK_HOME, the location of spark
        :param sparktk_home: (str) override $SPARKTK_HOME, the location of spark-tk
        :param pyspark_submit_args: (str) extra args passed to the pyspark submit
        :param app_name: (str) name of spark app that will be created
        :param other_libs: (list) other libraries (actual python packages or modules) that are compatible with spark-tk,
                           which need to be added to the spark context.  These libraries must be developed for use with
                           spark-tk and have particular methods implemented.  (See sparkconf.py _validate_other_libs)
        :param extra_conf: (dict) dict for any extra spark conf settings, for ex. {"spark.hadoop.fs.default.name": "file:///"}
        :param use_local_fs: (bool) convenience flag to use the local file system rather than hdfs or another file system
        :param debug: (int or str) provide a port address to attach a debugger to the JVM that gets started
        :return: TkContext

        Creating a TkContext requires creating or obtaining a SparkContext object.  It is usually recommended to have
        the TkContext create the SparkContext, since it can supply the proper locations of the sparktk-specific
        dependencies (i.e. jars).  Otherwise, specifying the classpath and jars arguments is left to the user.


        Examples
        --------

        <skip>
        Creating a TkContext using no arguments will cause a SparkContext to be created using default settings:

            >>> import sparktk

            >>> tc = sparktk.TkContext()

            >>> print tc.sc._conf.toDebugString()
            spark.app.name=sparktk
            spark.driver.extraClassPath=/opt/lib/spark/lib/*:/opt/spark-tk/sparktk-core/*
            spark.driver.extraLibraryPath=/opt/lib/hadoop/lib/native:/opt/lib/spark/lib:/opt/lib/hadoop/lib/native
            spark.jars=file:/opt/lib/spark/lib/spark-examples-1.6.0-hadoop2.6.0.jar,file:/opt/lib/spark/lib/spark-assembly.jar,file:/opt/lib/spark/lib/spark-examples.jar,file:/opt/lib/spark-tk/sparktk-core/sparktk-core-1.0-SNAPSHOT.jar,file:/opt/lib/spark-tk/sparktk-core/dependencies/spark-mllib_2.10-1.6.0.jar, ...
            spark.master=local[4]
            spark.yarn.jar=local:/opt/lib/spark/lib/spark-assembly.jar


        Another case with arguments to control some Spark Context settings:

            >>> import sparktk

            >>> tc = sparktk.TkContext(master='yarn-client',
            ...                        py_files='mylib.py',
            ...                        pyspark_submit_args='--jars /usr/lib/custom/extra.jar '
            ...                                            '--driver-class-path /usr/lib/custom/* '
            ...                                            '--executor-memory 6g',
            ...                        extra_conf={'spark.files.overwrite': 'true'},
            ...                        app_name='myapp')

            >>> print tc.sc._conf.toDebugString()
            spark.app.name=myapp
            spark.driver.extraClassPath=/usr/lib/custom/*:/opt/lib/spark/lib/*:/opt/spark-tk/sparktk-core/*
            spark.driver.extraLibraryPath=/opt/lib/hadoop/lib/native:/opt/lib/spark/lib:/opt/lib/hadoop/lib/native
            spark.executor.memory=6g
            spark.files.overwrite=true
            spark.jars=file:/usr/lib/custom/extra.jar,file:/opt/lib/spark/lib/spark-examples-1.6.0-hadoop2.6.0.jar,file:/opt/lib/spark/lib/spark-assembly.jar,file:/opt/lib/spark/lib/spark-examples.jar,file:/opt/lib/spark-tk/sparktk-core/sparktk-core-1.0-SNAPSHOT.jar,file:/opt/lib/spark-tk/sparktk-core/dependencies/spark-mllib_2.10-1.6.0.jar, ...
            spark.master=yarn-client
            spark.yarn.isPython=true
            spark.yarn.jar=local:/opt/lib/spark/lib/spark-assembly.jar

        </skip>

        """
        if not sc:
            if SparkContext._active_spark_context:
                sc = SparkContext._active_spark_context
            else:
                sc = create_sc(master=master,
                               py_files=py_files,
                               spark_home=spark_home,
                               sparktk_home=sparktk_home,
                               pyspark_submit_args=pyspark_submit_args,
                               app_name=app_name,
                               other_libs=other_libs,
                               extra_conf=extra_conf,
                               use_local_fs=use_local_fs,
                               debug=debug)
        if type(sc) is not SparkContext:
            if sc is TkContext.__mock:
                return
            raise TypeError("sparktk context init requires a valid SparkContext.  Received type %s" % type(sc))
        self._sc = sc
        self._sql_context = None
        self._jtc = self._sc._jvm.org.trustedanalytics.sparktk.TkContext(self._sc._jsc)
        self._jutils = JUtils(self._sc)
        self._scala_sc = self._jutils.get_scala_sc()
        self._other_libs = other_libs if other_libs is None or isinstance(other_libs, list) else [other_libs]
        if self._other_libs is not None:
            for lib in self._other_libs:
                lib_obj = lib.get_main_object(self)
                setattr(self, lib.__name__, lib_obj)
        loggers.set_spark(self._sc, "off")  # todo: undo this/move to config, I just want it outta my face most of the time
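The other_libs handling at the end of __init__ normalizes a single module into a one-element list and then attaches each library's main object to the TkContext under the module's __name__. A rough sketch of what a compatible library might provide, using a hypothetical module name mysparklib (the exact contract is validated by sparkconf.py _validate_other_libs):

# mysparklib/__init__.py -- hypothetical library compatible with spark-tk
class MainObject(object):
    """Entry point object handed back to the TkContext."""
    def __init__(self, tc):
        self._tc = tc  # keep a handle to the TkContext

def get_main_object(tc):
    # called once per entry in other_libs during TkContext.__init__
    return MainObject(tc)

# Usage: a single module is wrapped into a list by __init__, and afterwards
# the library is reachable as tc.mysparklib (via setattr on lib.__name__):
#
#   import sparktk, mysparklib
#   tc = sparktk.TkContext(other_libs=mysparklib)
#   tc.mysparklib  # -> MainObject instance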
Example #3
    def __init__(self,
                 sc=None,
                 master=None,
                 py_files=None,
                 spark_home=None,
                 sparktk_home=None,
                 pyspark_submit_args=None,
                 app_name="sparktk",
                 other_libs=None,
                 extra_conf=None,
                 use_local_fs=False,
                 debug=None):
        """
        Creates a TkContext.

        If a SparkContext sc is not provided, a new Spark context is created using the
        given settings, with default values for anything not specified.

        :param sc: (SparkContext) Active Spark Context; if not provided, a new Spark Context is created using the
                   rest of the arguments
        :param master: (str) override spark master setting; for ex. 'local[4]' or 'yarn-client'
        :param py_files: (list) list of str of paths to python dependencies; Note that the current python
                         package will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
        :param spark_home: (str) override $SPARK_HOME, the location of spark
        :param sparktk_home: (str) override $SPARKTK_HOME, the location of spark-tk
        :param pyspark_submit_args: (str) extra args passed to the pyspark submit
        :param app_name: (str) name of spark app that will be created
        :param other_libs: (list) other libraries (actual python packages or modules) that are compatible with spark-tk,
                           which need to be added to the spark context.  These libraries must be developed for use with
                           spark-tk and have particular methods implemented.  (See sparkconf.py _validate_other_libs)
        :param extra_conf: (dict) dict for any extra spark conf settings, for ex. {"spark.hadoop.fs.default.name": "file:///"}
        :param use_local_fs: (bool) convenience flag to use the local file system rather than hdfs or another file system
        :param debug: (int or str) provide a port address to attach a debugger to the JVM that gets started
        :return: TkContext
        """
        if not sc:
            if SparkContext._active_spark_context:
                sc = SparkContext._active_spark_context
            else:
                sc = create_sc(master=master,
                               py_files=py_files,
                               spark_home=spark_home,
                               sparktk_home=sparktk_home,
                               pyspark_submit_args=pyspark_submit_args,
                               app_name=app_name,
                               other_libs=other_libs,
                               extra_conf=extra_conf,
                               use_local_fs=use_local_fs,
                               debug=debug)
        if type(sc) is not SparkContext:
            raise TypeError("sparktk context init requires a valid SparkContext.  Received type %s" % type(sc))
        self._sc = sc
        self._sql_context = None
        self._jtc = self._sc._jvm.org.trustedanalytics.sparktk.TkContext(self._sc._jsc)
        self._jutils = JUtils(self._sc)
        self._scala_sc = self._jutils.get_scala_sc()
        self._other_libs = other_libs if other_libs is None or isinstance(other_libs, list) else [other_libs]
        if self._other_libs is not None:
            for lib in self._other_libs:
                lib_obj = lib.get_main_object(self)
                setattr(self, lib.__name__, lib_obj)
        loggers.set_spark(self._sc, "off")  # todo: undo this/move to config, I just want it outta my face most of the time
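Since __init__ checks SparkContext._active_spark_context before calling create_sc, an already-running Spark context is reused and the creation arguments only take effect when no context exists yet. A small sketch of that behavior, assuming a local Spark installation; the extra_conf key mirrors the docstring example:

import sparktk
from pyspark import SparkContext

if SparkContext._active_spark_context is None:
    # no active context: these settings are forwarded to create_sc
    tc = sparktk.TkContext(master="local[4]",
                           app_name="myapp",
                           use_local_fs=True,
                           extra_conf={"spark.files.overwrite": "true"})
else:
    # an active context already exists: the constructor simply wraps it and
    # the creation-related keyword arguments are not used
    tc = sparktk.TkContext()

assert isinstance(tc.sc, SparkContext)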