def __init__(self, sc=None, **create_sc_kwargs):
    """Create a TkContext around an existing SparkContext, or build a new one.

    :param sc: (SparkContext) active Spark Context; when falsy, a new one is
        created via create_sc(**create_sc_kwargs)
    :param create_sc_kwargs: keyword args forwarded verbatim to create_sc
    :raises TypeError: if the resulting object is not exactly a SparkContext
    """
    if not sc:
        sc = create_sc(**create_sc_kwargs)
    # Strict type check (not isinstance) — only a genuine SparkContext is accepted.
    if type(sc) is not SparkContext:
        raise TypeError("sparktk context init requires a valid SparkContext. Received type %s" % type(sc))
    self._sc = sc
    # Wire up the JVM-side TkContext and scala helpers through py4j internals.
    jvm = self._sc._jvm
    self._jtc = jvm.org.trustedanalytics.sparktk.TkContext(self._sc._jsc)
    self._jutils = JUtils(self._sc)
    self._scala_sc = self._jutils.get_scala_sc()
    # todo: undo this/move to config, I just want it outta my face most of the time
    loggers.set_spark(self._sc, "off")
def __init__(self,
             sc=None,
             master=default_spark_master,
             py_files=None,
             spark_home=None,
             sparktk_home=None,
             pyspark_submit_args=None,
             app_name="sparktk",
             other_libs=None,
             extra_conf=None,
             use_local_fs=False,
             debug=None):
    r"""
    Creates a TkContext object

    :param sc: (SparkContext) Active Spark Context, if not provided a new Spark Context is created with the
               rest of the args
               (see https://spark.apache.org/docs/latest/api/java/org/apache/spark/SparkContext.html)
    :param master: (str) override spark master setting; for ex. 'local[4]' or 'yarn-client'
    :param py_files: (list) list of str of paths to python dependencies; Note that the current python
                     package will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
    :param spark_home: (str) override $SPARK_HOME, the location of spark
    :param sparktk_home: (str) override $SPARKTK_HOME, the location of spark-tk
    :param pyspark_submit_args: (str) extra args passed to the pyspark submit
    :param app_name: (str) name of spark app that will be created
    :param other_libs: (list) other libraries (actual python packages or modules) that are compatible with spark-tk,
                       which need to be added to the spark context.  These libraries must be developed for use with
                       spark-tk and have particular methods implemented.  (See sparkconf.py _validate_other_libs)
    :param extra_conf: (dict) dict for any extra spark conf settings,
                       for ex. {"spark.hadoop.fs.default.name": "file:///"}
    :param use_local_fs: (bool) simpler way to specify using local file system, rather than hdfs or other
    :param debug: (int or str) provide a port address to attach a debugger to the JVM that gets started
    :return: TkContext

    Creating a TkContext requires creating or obtaining a SparkContext object.  It is usually recommended to
    have the TkContext create the SparkContext, since it can provide the proper locations to the sparktk specific
    dependencies (i.e. jars).  Otherwise, specifying the classpath and jars arguments is left to the user.

    Examples
    --------

    <skip>
    Creating a TkContext using no arguments will cause a SparkContext to be created using default settings:

        >>> import sparktk

        >>> tc = sparktk.TkContext()

        >>> print tc.sc._conf.toDebugString()
        spark.app.name=sparktk
        spark.driver.extraClassPath=/opt/lib/spark/lib/*:/opt/spark-tk/sparktk-core/*
        spark.driver.extraLibraryPath=/opt/lib/hadoop/lib/native:/opt/lib/spark/lib:/opt/lib/hadoop/lib/native
        spark.jars=file:/opt/lib/spark/lib/spark-examples-1.6.0-hadoop2.6.0.jar,file:/opt/lib/spark/lib/spark-assembly.jar,file:/opt/lib/spark/lib/spark-examples.jar,file:/opt/lib/spark-tk/sparktk-core/sparktk-core-1.0-SNAPSHOT.jar,file:/opt/lib/spark-tk/sparktk-core/dependencies/spark-mllib_2.10-1.6.0.jar, ...
        spark.master=local[4]
        spark.yarn.jar=local:/opt/lib/spark/lib/spark-assembly.jar

    Another case with arguments to control some Spark Context settings:

        >>> import sparktk

        >>> tc = sparktk.TkContext(master='yarn-client',
        ...                        py_files='mylib.py',
        ...                        pyspark_submit_args='--jars /usr/lib/custom/extra.jar' \
        ...                                            '--driver-class-path /usr/lib/custom/*' \
        ...                                            '--executor-memory 6g',
        ...                        extra_conf={'spark.files.overwrite': 'true'},
        ...                        app_name='myapp')

        >>> print tc.sc._conf.toDebugString()
        spark.app.name=myapp
        spark.driver.extraClassPath=/usr/lib/custom/*:/opt/lib/spark/lib/*:/opt/spark-tk/sparktk-core/*
        spark.driver.extraLibraryPath=/opt/lib/hadoop/lib/native:/opt/lib/spark/lib:/opt/lib/hadoop/lib/native
        spark.executor.memory=6g
        spark.files.overwrite=true
        spark.jars=file:/usr/local/custom/extra.jar,file:/opt/lib/spark/lib/spark-examples-1.6.0-hadoop2.6.0.jar,file:/opt/lib/spark/lib/spark-assembly.jar,file:/opt/lib/spark/lib/spark-examples.jar,file:/opt/lib/spark-tk/sparktk-core/sparktk-core-1.0-SNAPSHOT.jar,file:/opt/lib/spark-tk/sparktk-core/dependencies/spark-mllib_2.10-1.6.0.jar, ...
        spark.master=yarn-client
        spark.yarn.isPython=true
        spark.yarn.jar=local:/opt/lib/spark/lib/spark-assembly.jar

    </skip>
    """
    if not sc:
        # Reuse an already-running SparkContext if one exists; otherwise build
        # a fresh one from the constructor arguments.
        if SparkContext._active_spark_context:
            sc = SparkContext._active_spark_context
        else:
            sc = create_sc(master=master,
                           py_files=py_files,
                           spark_home=spark_home,
                           sparktk_home=sparktk_home,
                           pyspark_submit_args=pyspark_submit_args,
                           app_name=app_name,
                           other_libs=other_libs,
                           extra_conf=extra_conf,
                           use_local_fs=use_local_fs,
                           debug=debug)
    if type(sc) is not SparkContext:
        # The private __mock sentinel allows creating an uninitialized TkContext
        # (presumably for testing/doc purposes — __init__ bails out early).
        if sc is TkContext.__mock:
            return
        raise TypeError("sparktk context init requires a valid SparkContext.  Received type %s" % type(sc))
    self._sc = sc
    self._sql_context = None  # lazily created elsewhere; None until first use
    # JVM-side TkContext counterpart, reached through py4j internals.
    self._jtc = self._sc._jvm.org.trustedanalytics.sparktk.TkContext(self._sc._jsc)
    self._jutils = JUtils(self._sc)
    self._scala_sc = self._jutils.get_scala_sc()
    # Normalize other_libs: None stays None, a list passes through, a single
    # module is wrapped in a list.
    self._other_libs = other_libs if other_libs is None or isinstance(other_libs, list) else [other_libs]
    if self._other_libs is not None:
        # Each compatible library exposes get_main_object(tc); attach the
        # result as an attribute named after the library module.
        for lib in self._other_libs:
            lib_obj = lib.get_main_object(self)
            setattr(self, lib.__name__, lib_obj)
    loggers.set_spark(self._sc, "off")  # todo: undo this/move to config, I just want it outta my face most of the time
def __init__(self,
             sc=None,
             master=None,
             py_files=None,
             spark_home=None,
             sparktk_home=None,
             pyspark_submit_args=None,
             app_name="sparktk",
             other_libs=None,
             extra_conf=None,
             use_local_fs=False,
             debug=None):
    """
    Creates a TkContext.  If SparkContext sc is not provided, a new spark context will be created
    using the given settings and otherwise default values

    :param sc: (SparkContext) Active Spark Context, if not provided a new Spark Context is created with the
               rest of the args
    :param master: (str) override spark master setting; for ex. 'local[4]' or 'yarn-client'
    :param py_files: (list) list of str of paths to python dependencies; Note that the current python
                     package will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
    :param spark_home: (str) override $SPARK_HOME, the location of spark
    :param sparktk_home: (str) override $SPARKTK_HOME, the location of spark-tk
    :param pyspark_submit_args: (str) extra args passed to the pyspark submit
    :param app_name: (str) name of spark app that will be created
    :param other_libs: (list) other libraries (actual python packages or modules) that are compatible with spark-tk,
                       which need to be added to the spark context.  These libraries must be developed for use with
                       spark-tk and have particular methods implemented.  (See sparkconf.py _validate_other_libs)
    :param extra_conf: (dict) dict for any extra spark conf settings,
                       for ex. {"spark.hadoop.fs.default.name": "file:///"}
    :param use_local_fs: (bool) simpler way to specify using local file system, rather than hdfs or other
    :param debug: (int or str) provide a port address to attach a debugger to the JVM that gets started
    :return: TkContext
    """
    if not sc:
        # Prefer an already-active SparkContext; only build one when none exists.
        active = SparkContext._active_spark_context
        sc = active if active else create_sc(master=master,
                                             py_files=py_files,
                                             spark_home=spark_home,
                                             sparktk_home=sparktk_home,
                                             pyspark_submit_args=pyspark_submit_args,
                                             app_name=app_name,
                                             other_libs=other_libs,
                                             extra_conf=extra_conf,
                                             use_local_fs=use_local_fs,
                                             debug=debug)
    # Strict type check — anything other than a genuine SparkContext is rejected.
    if type(sc) is not SparkContext:
        raise TypeError("sparktk context init requires a valid SparkContext.  Received type %s" % type(sc))

    self._sc = sc
    self._sql_context = None  # created lazily; starts out unset
    jvm = self._sc._jvm
    self._jtc = jvm.org.trustedanalytics.sparktk.TkContext(self._sc._jsc)
    self._jutils = JUtils(self._sc)
    self._scala_sc = self._jutils.get_scala_sc()

    # Normalize other_libs: leave None and lists untouched, wrap a single
    # module in a one-element list.
    if other_libs is not None and not isinstance(other_libs, list):
        other_libs = [other_libs]
    self._other_libs = other_libs
    # Attach each compatible library's main object as an attribute named
    # after the library module.
    for library in (self._other_libs or []):
        setattr(self, library.__name__, library.get_main_object(self))

    loggers.set_spark(self._sc, "off")  # todo: undo this/move to config, I just want it outta my face most of the time