Example #1
def get_SparkContext(app_name='tuixing-spark', **kwargs):
    conf = SparkConf()
    conf.setAppName(app_name)
    conf.setAll(COMMON_SC)
    for key in kwargs:
        conf.set(key, kwargs[key])

    sc = SparkContext(conf=conf)
    return sc
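A usage sketch of the helper above (COMMON_SC is assumed to be a module-level list of (key, value) pairs; keys containing dots must be passed through dictionary unpacking because they are not valid keyword names):

# hypothetical call: override individual properties on top of COMMON_SC
sc = get_SparkContext('my-etl-job', **{'spark.executor.memory': '4g',
                                       'spark.executor.cores': '2'})
print(sc.getConf().get('spark.executor.memory'))  # expected: '4g'
sc.stop()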
Example #2
def create_streaming_context():
    conf = SparkConf()
    pairs = [('spark.app.name', 'Process Stories Stream'),
             ('spark.master', 'local[4]'), ('spark.ui.port', '4040')]
    conf.setAll(pairs)
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, batch_secs)
    ssc.checkpoint(checkpointDirectory)  # set checkpoint directory
    return ssc
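Because the factory sets a checkpoint directory, it can also serve as the setup function for checkpoint recovery; a minimal sketch, assuming batch_secs and checkpointDirectory are defined at module level:

# recover the streaming context from the checkpoint if one exists, otherwise build a fresh one
ssc = StreamingContext.getOrCreate(checkpointDirectory, create_streaming_context)
ssc.start()
ssc.awaitTermination()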
Example #3
def spark_session(app, cores=2, gpus=0, max_failures=1, *args):
    from pyspark import SparkConf
    from pyspark.sql import SparkSession

    with TemporaryDirectory() as tmpdir:
        metastore_path = os.path.join(tmpdir, 'metastore')

        # start a single worker with given cores when gpus are present
        # max_failures is ignored in that case
        master = 'local-cluster[1,{},1024]'.format(cores) if gpus > 0 \
            else 'local[{},{}]'.format(cores, max_failures)
        conf = SparkConf().setAppName(app).setMaster(master)
        conf = conf.setAll([
            ('spark.ui.showConsoleProgress', 'false'),
            ('spark.test.home', os.environ.get('SPARK_HOME')),
            ('spark.locality.wait', '0'),
            ('spark.unsafe.exceptionOnMemoryLeak', 'true'),
            ('spark.ui.enabled', 'false'),
            ('spark.local.dir', os.path.join(tmpdir, 'tmp')),
            ('spark.sql.warehouse.dir', os.path.join(tmpdir, 'warehouse')),
            ('javax.jdo.option.ConnectionURL',
             f'jdbc:derby:;databaseName={metastore_path};create=true'),
        ])

        with temppath() as temp_filename:
            if gpus > 0:
                with open(temp_filename, 'wb') as temp_file:
                    addresses = ', '.join('\\"{}\\"'.format(i)
                                          for i in range(gpus))
                    temp_file.write(
                        b'echo {\\"name\\": \\"gpu\\", \\"addresses\\": [' +
                        addresses.encode('ascii') + b']}')

                os.chmod(
                    temp_file.name, stat.S_IRWXU | stat.S_IXGRP | stat.S_IRGRP
                    | stat.S_IROTH | stat.S_IXOTH)

                # the single worker takes all gpus discovered, and a single executor will get them
                # each task on that executor will get a single gpu
                conf = conf.setAll([
                    ('spark.worker.resource.gpu.discoveryScript',
                     temp_filename),
                    ('spark.worker.resource.gpu.amount', str(gpus)),
                    ('spark.task.resource.gpu.amount', '1'),
                    ('spark.executor.resource.gpu.amount', str(gpus)),
                ])

            session = SparkSession \
                .builder \
                .config(conf=conf) \
                .getOrCreate()

            try:
                yield session
            finally:
                session.stop()
Example #4
 def get_spark_conf(self):
     conf = SparkConf()  # create the SparkConf object
     config = (
         ("spark.app.name", self.SPARK_APP_NAME),  # app name shown in the UI; if not set, Spark generates one
         ("spark.executor.memory", "2g"),  # executor memory for this app, default 1g
         ("spark.master", self.SPARK_URL),  # address of the Spark master
         ("spark.executor.cores", "2")  # number of CPU cores per executor
         # ('spark.sql.pivotMaxValues', '99999'),  # raise when pivoting a DF with many distinct values; default 10000
     )
     # full list of options: https://spark.apache.org/docs/latest/configuration.html
     conf.setAll(config)
     return conf
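A short, hypothetical follow-up showing how the returned SparkConf would typically be consumed (obj stands in for an instance of the surrounding class):

# build a SparkSession from the conf assembled above
conf = obj.get_spark_conf()
spark = SparkSession.builder.config(conf=conf).getOrCreate()
spark.range(10).show()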
Example #5
    def __init__(self,
                 path_files: str,
                 path_index: str,
                 path_dict: str,
                 file_name: str,
                 num_partition: int):
        path = ''.join(path_files + file_name)
        self.__file_name = file_name

        conf = SparkConf()
        conf.setAll(
            [
                ('spark.app.name', 'Challenge Data Engineer'),
                ('spark.driver.cores', '4'),
                ('spark.executor.cores', '4'),
                ('spark.driver.maxResultSize', '10g'),
                ('spark.executor.memory', '4g'),
                ('spark.executor.memoryOverhead', '4g'),
                ('spark.driver.memory', '10g'),
                ('spark.local.dir', PATH_INDEX),
                ('spark.driver.extraJavaOptions', '-Xmx1024m'),
                ('spark.memory.offHeap.enabled', 'true'),
                ('spark.memory.offHeap.size', '20g')
            ]
        )

        self.__spark = SparkSession \
            .builder \
            .config(conf=conf) \
            .getOrCreate()

        self.__df_dict = self.__spark \
            .read \
            .parquet(path_dict) \
            .repartition(numPartitions=num_partition)

        self.__df_doc = self.__spark \
            .read \
            .text(path)

        self.__df_wordid_docid = self.__spark \
            .read \
            .parquet(path_index) \
            .rdd \
            .unpersist() \
            .repartition(numPartitions=1000)

        self.__df_wordid_docid = self.__df_wordid_docid.toDF()

        logging.warning(f"Processing doc: {path}")
Example #6
 def create_spark_session(self):
     conf = SparkConf()
     config = (("spark.app.name", self.SPARK_APP_NAME),
               ("spark.executor.memory", self.SPARK_EXECUTOR_MEMORY),
               ("spark.master", self.SPARK_MASTER),
               ("spark.executor.cores", self.SPARK_EXECUTOR_CORES),
               ("spark.executor.instances", self.SPARK_EXECUTOR_INSTANCES),
               ("spark.debug.maxToStringFields", "10000"))
     conf.setAll(config)
     if self.ENABLE_HIVE_SUPPORT:
         return SparkSession.builder.config(
             conf=conf).enableHiveSupport().getOrCreate()
     else:
         return SparkSession.builder.config(conf=conf).getOrCreate()
Example #7
    def __init__(self, path_files: str, path_index: str, path_dict: str,
                 file_name: str, num_partition: int):
        path = ''.join(path_files + file_name)
        self.__file_name = file_name

        conf = SparkConf()
        # Application Properties
        # http://spark.apache.org/docs/latest/configuration.html#spark-properties
        conf.setAll([('spark.app.name', 'Challenge Data Engineer'),
                     ('spark.driver.cores', '4'),
                     ('spark.executor.cores', '4'),
                     ('spark.driver.maxResultSize', '10g'),
                     ('spark.executor.memory', '10g'),
                     ('spark.executor.memoryOverhead', '10g'),
                     ('spark.driver.memory', '10g'),
                     ('spark.local.dir', PATH_INDEX),
                     ('spark.driver.extraJavaOptions', '-Xmx1024m'),
                     ('spark.memory.offHeap.enabled', 'true'),
                     ('spark.memory.offHeap.size', '20g')])

        self.__spark = SparkSession \
            .builder \
            .config(conf=conf) \
            .getOrCreate()

        self.__df_dict = self.__spark \
            .read \
            .parquet(path_dict) \
            .repartition(numPartitions=num_partition)

        self.__df_doc = self.__spark \
            .read \
            .text(path)

        self.__df_wordid_docid = self.__spark \
            .read \
            .parquet(path_index) \
            .rdd \
            .unpersist() \
            .repartition(numPartitions=1000)

        print(self.__df_wordid_docid.getStorageLevel())
        print(self.__df_wordid_docid.getNumPartitions())
        print(self.__spark.sparkContext.getConf().getAll())
        self.__spark.sql("SET -v").show(n=200, truncate=False)

        self.__df_wordid_docid = self.__df_wordid_docid.toDF()

        self.__spark.sparkContext.setLogLevel("warn")
        logging.warning(f"Processing doc: {path}")
Example #8
    def _init_spark(self, appname):
        """Internal function to setup spark context
        
        Note: only include spark modules here so that
        the interface can be queried outside of pyspark.

        """
        # currently using LZ4 compression: should not degrade runtime much
        # but will help with some operations like shuffling, especially when
        # dealing with things object like highly compressible label volumes
        # NOTE: objects > INT_MAX will cause problems for LZ4
        worker_env = {}
        if "DVIDSPARK_WORKFLOW_TMPDIR" in os.environ and os.environ["DVIDSPARK_WORKFLOW_TMPDIR"]:
            worker_env["DVIDSPARK_WORKFLOW_TMPDIR"] = os.environ["DVIDSPARK_WORKFLOW_TMPDIR"]
        
        try:
            spark_config = self.config_data["options"]["spark-config"]
        except KeyError:
            # Old workflows haven't been updated to inherit the base Workflow schema
            spark_config = {}
        
        for k in list(spark_config.keys()):
            spark_config[k] = str(spark_config[k])
            if spark_config[k] in ('True', 'False'):
                spark_config[k] = spark_config[k].lower()
            
        # Backwards compatibility:
        # if 'corespertask' option exists, override it in the spark config
        if "corespertask" in self.config_data["options"] and self.config_data["options"]["corespertask"] != 0:
            if "spark.task.cpus" in spark_config and spark_config["spark.task.cpus"] != '1':
                raise RuntimeError("Bad config: You can't set both 'corespertask' and 'spark.task.cpus'.  Use 'spark.task.cpus'.")
            spark_config["spark.task.cpus"] = str(self.config_data["options"]["corespertask"])

        # set spark config
        from pyspark import SparkContext, SparkConf
        conf = SparkConf()
        conf.setAppName(appname)
        conf.setAll(list(spark_config.items()))
        
#         from pyspark_flame import FlameProfiler
#         flamegraph_dir = f'{self.config_dir}/flamegraphs'
#         os.makedirs(flamegraph_dir, exist_ok=True)
#         conf.set("spark.python.profile.dump", flamegraph_dir)
#         conf.set("spark.python.profile", "true")
#         worker_env['pyspark_flame.interval'] = 0.25 # Default is 0.2 seconds
#         return SparkContext(conf=conf, batchSize=1, environment=worker_env, profiler_cls=FlameProfiler)

        # Auto-batching heuristic doesn't work well with our auto-compressed numpy array pickling scheme.
        # Therefore, disable batching with batchSize=1
        return SparkContext(conf=conf, batchSize=1, environment=worker_env)
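For illustration only, a minimal config_data layout that exercises the spark-config handling and the corespertask backwards-compatibility branch above (the real schema comes from the surrounding Workflow class, so treat the exact keys as an assumption):

config_data = {
    "options": {
        # values are stringified by _init_spark; booleans become 'true'/'false'
        "spark-config": {
            "spark.executor.memory": "8g",
            "spark.speculation": True,
        },
        # legacy option, copied into spark.task.cpus unless that key is already set
        "corespertask": 4,
    }
}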
Example #9
 def _create_spark_session(self):
     conf = SparkConf()
     config = (
         ('spark.app.name', self.SPARK_APP_NAME),
         ('spark.executor.memory', self.SPARK_EXECUTOR_MEMORY),
         ('spark.master', self.SPARK_URL),
         ('spark.executor.cores', self.SPARK_EXECUTOR_CORES),
         ('spark.executor.instances', self.SPARK_EXECUTOR_INSTANCES),
     )
     conf.setAll(config)
     if self.ENABLE_HIVE_SUPPORT:
         return SparkSession.builder.config(
             conf=conf).enableHiveSupport().getOrCreate()
     else:
         return SparkSession.builder.config(conf=conf).getOrCreate()
Example #10
def spark_cluster(logfile, discovery_schedule, hosts, extra_conf=None):
    from pyspark import SparkConf
    from pyspark.sql import SparkSession

    unknown_keys = set([prop for prop, _ in extra_conf]) \
        .difference(conf.SPARK_CONF_DEFAULT_VALUES.keys()) \
        if extra_conf else None
    if unknown_keys:
        raise ValueError(
            'default values must be defined for these properties: {}'.format(
                unknown_keys))

    cluster = SparkClusterController(logfile, discovery_schedule, hosts, 1)
    try:
        cluster.start()

        config = SparkConf().setAppName('elastic spark tests').setMaster(
            cluster.master_url())
        config = config.setAll([
            # pyspark-shell JVM will OOM even with 1GB when all tests run in one process
            # SparkContext and pyspark-shell JVM gets reused even though we do SparkSession.stop()
            # pyspark-shell JVM memory footprint increases from test to test
            # when run with pytest --forked, set SPARK_DRIVER_MEM=512m env
            ('spark.driver.memory', os.environ.get('SPARK_DRIVER_MEM',
                                                   '1500m')),
            # the minimum executor memory we can set
            ('spark.executor.memory', '512m'),
            # don't pollute the log with progress bar
            ('spark.ui.showConsoleProgress', 'false'),
        ])
        # config properties once set will survive session.stop() and
        # SparkSession.config(conf=config).getOrCreate(), so we have to make sure
        # we overwrite their value if not in extra_conf
        more_conf = conf.SPARK_CONF_DEFAULT_VALUES.copy()
        more_conf.update(extra_conf or [])
        config.setAll(more_conf.items())

        session = SparkSession \
            .builder \
            .config(conf=config) \
            .getOrCreate()

        try:
            yield session
        finally:
            session.stop()
    finally:
        cluster.shutdown()
Example #11
def get_spark_config(
        predictrip_config: Mapping[str, Mapping[str, str]]) -> SparkConf:
    """
    Create an object representing the Spark configuration we want

    :param predictrip_config: mapping returned by load_config containing configuration options
    :return: pyspark.SparkConf instance
    """
    # NOTE: contrary to https://www.geomesa.org/documentation/user/spark/pyspark.html#using-geomesa-pyspark, use of
    # geomesa_pyspark.configure() no longer necessary since Spark 2.1, as long as you tell spark to include the
    # geomesa_pyspark python module some other way (e.g. spark.files)

    sc = SparkConf()
    sc = sc.setAppName('PredicTrip ' + path.basename(__file__))
    # FIXME: the following doesn't seem to be effective
    sc = sc.setAll([('fs.s3a.awsAccessKeyId',
                     predictrip_config['AWS']['access_key_id']),
                    ('fs.s3a.awsSecretAccessKey',
                     predictrip_config['AWS']['secret_access_key'])])
    # add to sc any spark options that might be set in predictrip_config
    if 'executor_cores' in predictrip_config['Spark']:
        sc = sc.set('spark.executor.cores',
                    predictrip_config['Spark']['executor_cores'])
    if 'executor_memory' in predictrip_config['Spark']:
        sc = sc.set('spark.executor.memory',
                    predictrip_config['Spark']['executor_memory'])
    return sc
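Regarding the FIXME above: Hadoop filesystem options set on a SparkConf are generally only propagated when prefixed with spark.hadoop., and the S3A connector expects fs.s3a.access.key / fs.s3a.secret.key rather than the awsAccessKeyId / awsSecretAccessKey names used above. A hedged alternative sketch:

# sketch: pass S3A credentials through the spark.hadoop. prefix, which Spark
# copies into the Hadoop Configuration of the SparkContext it creates
sc = sc.setAll([
    ('spark.hadoop.fs.s3a.access.key', predictrip_config['AWS']['access_key_id']),
    ('spark.hadoop.fs.s3a.secret.key', predictrip_config['AWS']['secret_access_key']),
])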
Example #12
 def _create_spark_session(self):
     '''Create and initialise the SparkSession for this Spark program.'''
     # 1. build the configuration
     conf = SparkConf()
     config = (
         ("spark.app.name", self.SPARK_APP_NAME),  # app name shown in the UI; if not set, Spark generates one
         ("spark.executor.memory", self.SPARK_EXECUTOR_MEMORY),  # executor memory for this app, default 2g
         ("spark.master", self.SPARK_URL),  # address of the Spark master
         ("spark.executor.cores", self.SPARK_EXECUTOR_CORES),  # CPU cores per executor, default 1
         ("spark.executor.instances", self.SPARK_EXECUTOR_INSTANCES)
     )
     conf.setAll(config)
     # 2. initialise the session from the configuration
     if self.ENABLE_HIVE_SUPPORT:
         return SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()
     else:
         return SparkSession.builder.config(conf=conf).getOrCreate()
Example #13
def build_spark_session(
        app_name: str,
        spark_config: DefaultDict[str, str] = None,
        hadoop_config: DefaultDict[str, str] = None) -> SparkSession:
    conf = SparkConf()
    if spark_config:
        conf.setAll(spark_config.items())

    sc = SparkContext(conf=conf)

    if hadoop_config:
        for k, v in hadoop_config.items():
            sc._jsc.hadoopConfiguration().set(k, v)

    return SparkSession.builder \
        .appName(app_name) \
        .config(conf=sc.getConf()) \
        .getOrCreate()
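A brief usage sketch of build_spark_session (the property names and values here are illustrative, not taken from the original project):

spark = build_spark_session(
    'my-app',
    spark_config={'spark.sql.shuffle.partitions': '64'},
    hadoop_config={'fs.s3a.endpoint': 's3.us-east-1.amazonaws.com'},
)
spark.range(5).show()
spark.stop()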
Example #14
    def _create_spark_hbase(self):
        conf = SparkConf()  # create the SparkConf object
        config = (
            ("spark.app.name", self.SPARK_APP_NAME),  # app name shown in the UI; if not set, Spark generates one
            ("spark.executor.memory", self.SPARK_EXECUTOR_MEMORY),  # executor memory for this app, default 2g
            ("spark.master", self.SPARK_URL),  # address of the Spark master
            ("spark.executor.cores", self.SPARK_EXECUTOR_CORES),  # CPU cores per executor, default 1
            ("spark.executor.instances", self.SPARK_EXECUTOR_INSTANCES),
            ("hbase.zookeeper.quorum", "192.168.19.137"),
            ("hbase.zookeeper.property.clientPort", "22181")
        )

        conf.setAll(config)

        # build the SparkSession from the config object
        if self.ENABLE_HIVE_SUPPORT:
            return SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()
        else:
            return SparkSession.builder.config(conf=conf).getOrCreate()
Example #15
def create_spark_conf():
    bigdl_conf = get_bigdl_conf()
    sparkConf = SparkConf()
    sparkConf.setAll(bigdl_conf.items())
    if not is_spark_below_2_2():
        extend_spark_driver_cp(sparkConf, get_bigdl_classpath())

    # add content in PYSPARK_FILES in spark.submit.pyFiles
    # This is a workaround for current Spark on k8s
    python_lib = os.environ.get('PYSPARK_FILES', None)
    if python_lib:
        existing_py_files = sparkConf.get("spark.submit.pyFiles")
        if existing_py_files:
            sparkConf.set(key="spark.submit.pyFiles",
                          value="%s,%s" % (python_lib, existing_py_files))
        else:
            sparkConf.set(key="spark.submit.pyFiles", value=python_lib)

    return sparkConf
Example #16
    def _create_spark_session(self):
        conf = SparkConf()

        config = (
            ("spark.app.name", self.SPARK_APP_NAME),
            ("spark.executor.memory", self.SPARK_EXECUTOR_MEMORY),
            # ("spark.master", self.SPARK_URL),
            ("spark.executor.cores", self.SPARK_EXECUTOR_CORES),
            ("spark.executor.instances", self.SPARK_EXECUTOR_INSTANCES),
            # ("spark.sql.warehouse.dir", "/root/apache-hive-2.3.7-bin/warehouse"),
            ("hive.metastore.uris", "thrift://172.18.0.2:9083"))

        conf.setAll(config)
        print(self.ENABLE_HIVE_SUPPORT, config)

        if self.ENABLE_HIVE_SUPPORT:
            return SparkSession.builder.config(
                conf=conf).enableHiveSupport().getOrCreate()
        else:
            return SparkSession.builder.config(conf=conf).getOrCreate()
Example #17
File: spark.py Project: daskos/epos
    def closure(*args, **kwargs):
        try:
            options = opts
            options.update({
                'sql_parquet_compression_codec': 'uncompressed',
                'mesos_role': role,
                'mesos_coarse': bool(coarse),
                'cores_max': int(coarse) or None,
                'executor_cores': int(executor_cores),
                'executor_memory': '{}m'.format(int(executor_memory / MiB)),
                'driver_memory': '{}m'.format(int(driver_memory / MiB)),
                'mesos_executor_memoryOverhead': int(
                    (memory_overhead or (executor_cores * python_worker_memory +
                                         0.1 * executor_memory))
                    / MiB),
                'python_worker_memory': int(python_worker_memory / MiB),
                'mesos_uris': ','.join(uris),
                'mesos_executor_docker_image': docker
            })
            options = {'spark.{}'.format(k.replace('_', '.')): str(v)
                       for k, v in options.items() if v not in (None, '')}
            environs = envs.items()
        except TypeError as e:
            # curry doesn't reraise TypeErrors:
            # https://github.com/pytoolz/toolz/issues/288
            raise Exception(repr(e))

        conf = SparkConf()
        conf.setMaster(str(master))
        conf.setAppName(str(name or fn.__name__))
        conf.setAll(pairs=options.items())
        conf.setExecutorEnv(pairs=environs)

        with SparkContext(conf=conf) as sc:
            sc.setLogLevel(str(log))
            for f in files:
                sc.addFile(f)
            for f in pyfiles:
                sc.addPyFile(f)
            # TODO: use SparkSession
            sql = SQLContext(sc)
            return fn(sc, sql, *args, **kwargs)
Example #18
 def get_spark_config(path, dependencies) -> SparkConf:
     master = 'local[2]'
     conf = SparkConf().setAppName('unit test').setMaster(master)
     return conf.setAll([
         ('spark.ui.showConsoleProgress', 'false'),
         ('spark.test.home', os.environ.get('SPARK_HOME')),
         ('spark.locality.wait', '0'),
         ('spark.driver.extraClassPath', '{}'.format(':'.join([
             os.path.join(os.getcwd(), path, 'target', 'classes'),
             os.path.join(os.getcwd(), path, 'target', 'test-classes'),
             dependencies
         ]))),
     ])
Example #19
def spark_session(app, cores=2, gpus=0, max_failures=1, *args):
    from pyspark import SparkConf
    from pyspark.sql import SparkSession

    master = 'local-cluster[{},{},1024]'.format(cores, max_failures) if gpus > 0 \
        else 'local[{},{}]'.format(cores, max_failures)
    conf = SparkConf().setAppName(app).setMaster(master)
    conf = conf.setAll([
        ('spark.ui.showConsoleProgress', 'false'),
        ('spark.test.home', os.environ.get('SPARK_HOME')),
        ('spark.locality.wait', '0'),
    ])

    with temppath() as temp_filename:
        if gpus > 0:
            with open(temp_filename, 'wb') as temp_file:
                addresses = ', '.join('\\"{}\\"'.format(i) for i in range(gpus))
                temp_file.write(b'echo {\\"name\\": \\"gpu\\", \\"addresses\\": [' +
                                addresses.encode('ascii') + b']}')

            os.chmod(temp_file.name, stat.S_IRWXU | stat.S_IXGRP | stat.S_IRGRP |
                     stat.S_IROTH | stat.S_IXOTH)

            conf = conf.setAll([
                ('spark.worker.resource.gpu.discoveryScript', temp_filename),
                ('spark.worker.resource.gpu.amount', '1'),
                ('spark.task.resource.gpu.amount', '1'),
                ('spark.executor.resource.gpu.amount', '1')
            ])

        session = SparkSession \
            .builder \
            .config(conf=conf) \
            .getOrCreate()

        try:
            yield session
        finally:
            session.stop()
Example #20
def create_spark_conf():
    bigdl_conf = get_bigdl_conf()
    sparkConf = SparkConf()
    sparkConf.setAll(bigdl_conf.items())
    if os.environ.get("BIGDL_JARS", None) and not is_spark_below_2_2():
        for jar in os.environ["BIGDL_JARS"].split(":"):
            extend_spark_driver_cp(sparkConf, jar)

    # add content in PYSPARK_FILES in spark.submit.pyFiles
    # This is a workaround for current Spark on k8s
    python_lib = os.environ.get("PYSPARK_FILES", None)
    if python_lib:
        existing_py_files = sparkConf.get("spark.submit.pyFiles")
        if existing_py_files:
            sparkConf.set(
                key="spark.submit.pyFiles",
                value="%s,%s" % (python_lib, existing_py_files),
            )
        else:
            sparkConf.set(key="spark.submit.pyFiles", value=python_lib)

    return sparkConf
Example #21
def create_spark_conf():
    bigdl_conf = get_bigdl_conf()
    sparkConf = SparkConf()
    sparkConf.setAll(bigdl_conf.items())
    return sparkConf
Example #22
			except msg2xlsx.ConvertingError:
				logger.error("converting failed, clean the previous work")
				xlsx_path = os.path.join(settings.SAVE_DIRECTORY, project_id)
				clean(project_id, dirs=[temp_file, xlsx_path])
				return
				
			# processed ack
			channel.basic_ack(delivery_tag=method.delivery_tag)
	else:
		logger.error("illegal content-type: " + header.content_type)


if __name__ == '__main__':
	# initialize spark
	conf = SparkConf().setMaster(settings.SPARK_MASTER_URL).setAppName(settings.SPARK_APP_NAME)
	conf.setAll([("spark.eventLog.enabled", "true"), ("spark.eventLog.dir", settings.LOG_DIRECTORY)])
	sc = SparkContext(conf=conf)

	# initialize rabbitmq
	credentials = pika.PlainCredentials(settings.RABBITMQ_CONN_CONF['username'], settings.RABBITMQ_CONN_CONF['password'])
	conn_params = pika.ConnectionParameters(settings.RABBITMQ_CONN_CONF['host'], credentials=credentials)
	conn_broker = pika.BlockingConnection(conn_params)

	channel = conn_broker.channel()
	channel.exchange_declare(exchange=settings.RABBITMQ_SPARK['exchange'],
							 type="direct",
							 passive=False,
							 durable=True,
							 auto_delete=False)

	channel.queue_declare(queue=settings.RABBITMQ_SPARK['queue'])
Example #23
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import *
from pyspark.storagelevel import StorageLevel
import happybase


appName = "KafkaStreams"
config = SparkConf().setAppName(appName)

props = []
props.append(("spark.rememberDuration", "10"))
props.append(("spark.batchDuration", "10"))
props.append(("spark.eventLog.enabled", "true"))
props.append(("spark.streaming.timeout", "30"))
props.append(("spark.ui.enabled", "true"))

config = config.setAll(props)

sc = SparkContext(conf=config)
ssc = StreamingContext(sc, 5)

topics = ["t1"]
kafka_params = {
    "zookeeper.connect": "localhost:5181/kafka",
    "metadata.broker.list": "localhost:9092",
    "group.id": "Kafka_MapR-Streams_to_HBase"}

raw = KafkaUtils.createDirectStream(ssc, topics, kafka_params)
raw.pprint()

server = "localhost"
table_name = "/tables/stocks"
Example #24
class SparkUtils:
    def __init__(self,
                 log: RootLogger = None,
                 parms: dict = None,
                 botoSession=None,  # boto session object; the original annotation referenced an undefined type
                 appName: str = None):
        self.log = log
        self.__parms = parms or {}
        self.__runEnv = self.__parms.get("--runEnv", "local")
        if (self.__runEnv == "aws"):
            self.__boto = botoSession
            self.__s3 = S3(log, self.__boto)

        self.__initFlags()
        self.__setupSparkSession__(appName)

        self.__dfltRDDParts = \
                int(self.__spark.conf.get("spark.executor.instances", "20")) * \
                int(self.__spark.conf.get("spark.executor.cores", "4")) * 2

    def __initFlags(self):
        '''
        Init the job level parameters needed by this class
        '''
        self.__parms["--saveDFAs"] = self.__parms.get("--saveDFAs", "NONE")

        self.__explainDF = True if "-explainDF" in self.__parms else False
        self.__printcount = True if "-printCount" in self.__parms else False
        self.__useHist = True if "-useHint" in self.__parms else False
        self.__saveDF = True if self.__parms["--saveDFAs"] != "NONE" else False

        self.__fileFmt = self.__parms.get("--fileFormat", "parquet")

        if (self.__runEnv == "aws"):
            self.__tempS3 = self.__parms.get("--tempS3", "hdfs:///temp/s3")
        if (self.__runEnv != "local"):
            self.__tempHDFS = self.__parms.get("--tempHDFS", "hdfs:///temp")
            self.log.warn(
                "For persist type 'S3', 'HDFS' will be used as the --runEnv != 'aws'"
            )

    def __setupSparkSession__(self, appName: str = None):
        '''
        Init the Spark environment with a few default configurations and start the Spark session.
        '''
        self.__conf = SparkConf()
        hmConf = {
            "spark.rps.askTimeout": "1200",
            "spark.network.timeout": "1200",
            "spark.broadcast.blockSize": "16m",
            "spark.sql.broadcastTimeout": "1200",
            "spark.broadcast.compress": "true",
            "spark.rdd.compress": "true",
            "fs.s3.enableServerSideEncryption": "true",
            "spark.kryo.unsafe": "false",
            "spark.kryoserializer.buffer": "10240",
            "spark.kryoserializer.buffer.max": "2040m",
            "spark.io.compression.codec": "org.apache.spark.io.SnappyCompressionCodec",
            "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
            "mapreduce.fileoutputcommitter.algorithm.version": "2",
            "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2",
        }
        # SparkConf.setAll expects key/value pairs, so pass the dict's items
        self.__conf.setAll(hmConf.items())
        SparkContext.setSystemProperty("com.amazonaws.services.s3.enableV4",
                                       "true")
        SparkContext.setSystemProperty("com.amazonaws.services.s3.enforceV4",
                                       "true")
        self.__spark = SparkSession \
                        .builder \
                        .config(conf=self.__conf) \
                        .appName(appName or "PySparkApp") \
                        .enableHiveSupport() \
                        .getOrCreate()
        self.__sc = self.__spark.sparkContext
        self.sqlC = SQLContext(self.__sc)
        self.__sc.setSystemProperty("com.amazonaws.services.s3.enableV4",
                                    "true")
        self.__sc.setSystemProperty("com.amazonaws.services.s3.enforceV4",
                                    "true")
        self.__sc.setLogLevel(self.__parms.get("--logLevel", "INFO"))

        # the Hadoop Configuration is only reachable through the JVM gateway,
        # and it has no setAll(), so set the keys one by one
        hdpCnf = self.__sc._jsc.hadoopConfiguration()
        for key, value in {
            "io.file.buffer.size": "65536",
            "mapreduce.fileoutputcommitter.algorithm.version": "2",
            "fs.s3a.endpoint": "%s.amazonaws.com" % self.__parms.get("--awsRegion", "s3.us-east-1"),
        }.items():
            hdpCnf.set(key, value)
        if (self.__parms.get("--runEnv", "AWS") == "AWS"):
            from botocore.credentials import InstanceMetadataProvider, InstanceMetadataFetcher
            provider = InstanceMetadataProvider(
                iam_role_fetcher=InstanceMetadataFetcher(timeout=1000,
                                                         num_attempts=2))
            creds = provider.load()
            for key, value in {
                "fs.s3a.access.key": creds.access_key,
                "fs.s3a.secret.key": creds.secret_key,
                "fs.s3a.server-side-encryption-algorithm": "SSE-KMS",
                "fs.s3.enableServerSideEncryption": "true",
                "fs.s3.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
                "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
                "fs.s3a.endpoint": "s3.%s.amazonaws.com" % self.__parms.get("--awsRegion", "us-east-1"),
            }.items():
                hdpCnf.set(key, value)

    def sql(self,
            dfName: str,
            query: str,
            partitions: int = 0,
            persistType: str = None):
        '''
        Runs the input SQL, partitions the resulting DataFrame and persists it if needed.

        Supported persistType: In addition to the pySpark native persist types, this function supports
        HIVE, HDFS, S3

        '''
        if persistType is None:
            _df = self.__spark.sql(self.handleHints(query))
            if partitions == 0:
                df = _df
            elif _df.rdd.getNumPartitions() < partitions:
                df = _df.repartition(partitions)
            else:
                df = _df.coalesce(partitions)
            return df
        else:
            df = self.storeDF(
                df=self.sql(dfName=dfName, query=query, partitions=partitions),
                dfName=dfName,
                persistType=persistType,
                partitions=partitions,
                partitionCols=self.getPartitionColumnsFromSQL(query))

        if dfName:
            df.createOrReplaceTempView(dfName)

        if self.__printcount:
            self.log.info("Number of Records in DF '%s' : %d " %
                          (dfName, df.count()))

        return df

    def storeDF(self, df: DataFrame, dfName: str, persistType: str,
                partitions: int, partitionCols: List[str]):
        '''
        Store the input dataframe, read the persisted dataframe back and return the new one.
        If memory/disk persistence is requested, run take(1) on the dataframe to force the persist.
        '''
        if self.__explainDF or \
                persistType.upper() not in ("NULL", "NONE"):
            self.log.info("Execution plan for building the DF '%s'" %
                          (dfName))
            df.explain()
            self.log.info("\n\n\n")

        saveType = self.__parms["--saveDFAs"] \
            if self.__saveDF and \
               persistType.upper() not in ("HIVE", "NULL") \
            else \
                persistType.upper()

        if saveType == "S3" and self.__runEnv == "aws":
            saveType = "HDFS"
            self.log.debug(
                "Resetting the persist type to 'HDFS' as the --runEnv != 'aws'"
            )

        df1 = df if saveType != "HDFS" and \
                    saveType != "HIVE" and \
                    saveType != "S3" \
                 else self.repartitionDF(dataFrame= df, partitions = partitions)

        if saveType == "NULL" or saveType == "NONE":
            return df1
        elif saveType == "HDFS":
            return self.persistExternal(self.__tempHDFS, dfName, df,
                                        partitionCols)
        elif saveType == "S3":
            return self.persistExternal(self.__tempS3, dfName, df,
                                        partitionCols)
        elif saveType == "":
            return self.persist2Hive(dfName, df, partitionCols)
        elif saveType == "CHECK_POINT":
            return df.cache().checkpoint(eager=True)
        else:
            return self.persistLocal(dfName, df, persistType)

    def persistExternal(self,
                        parentDirURI: str,
                        fileName: str,
                        df: DataFrame,
                        partitionCols: List[str] = None,
                        overwrite: bool = True,
                        fileFormat: str = None,
                        **kwargs):

        fullPath = "%s%s"  % (parentDirURI,fileName or "") if parentDirURI.endswith("/") else \
                   "%s/%s" % (parentDirURI,fileName or "")
        fullPath = fullPath.replace("//", "/")
        schma = df.schema  # DataFrame.schema is a property, not a method
        fileFormat = fileFormat or self.__fileFmt
        self.write2ExtrFile(fullPath=fullPath,
                            fileFormat=fileFormat,
                            df=df,
                            partitionCols=partitionCols,
                            overwrite=overwrite,
                            **kwargs)
        df.unpersist()
        if fileFormat == "parquet":
            return self.readParquet(uri=fullPath, schema=schma, **kwargs)
        elif fileFormat == "orc":
            return self.readOrc(uri=fullPath, schema=schma, **kwargs)
        elif fileFormat == "csv":
            return self.readCSV(uri=fullPath, schema=schma, **kwargs)
        else:
            return self.readParquet(uri=fullPath, schema=schma, **kwargs)

    def readParquet(self,
                    uriString: str,
                    schema: StructType = None,
                    mergeSchema: bool = False,
                    **kwargs):
        self.log.info("Reading the parquet file '%s'" % uriString)
        rdr = self.__spark.read.format("parquet")
        if mergeSchema:
            rdr.option("mergeSchema", "true")
        if schema:
            rdr.schema(schema)
        return rdr.load(uriString)

    def readOrc(self, uriString: str, schema: StructType, **kwargs):
        self.log.info("Reading the ORC file in '%s'" % uriString)
        pass  ##TODO

    def readCSV(self, uriString: str, schema: StructType, **kwargs):
        self.log.info("Reading the CSV file in '%s'" % uriString)
        pass  ##TODO

    def write2ExtrFile(self,
                       fileFormat: str,
                       fullPath: str,
                       df: DataFrame,
                       partitionCols: List[str] = None,
                       overwrite: bool = True,
                       **kwargs):

        if fullPath.startswith("s3"):
            self.__s3.waitForFile("%s/_SUCCESS" % (fullPath))

        #TODO:Yet to Implement

    def persist2Hive(self, table: str, df: DataFrame,
                     partitionCols: List[str]):
        pass  #TODO:Yet to Implement

    def persistLocal(self, dfName: str, df: DataFrame, persistType: str):
        ''' Persist the input DataFrame locally (memory/disk/none) and run `df.take(1)` to force persistence.
        '''
        lvl = self.getSparkPersistType(persistType.upper())
        if lvl:
            df.persist(lvl)

        if self.__printcount is None:
            df.take(1)

    def getSparkPersistType(self, persistTypStr: str) -> StorageLevel:
        '''
            Converts the String representation to the StorageLevel Object.
            If invalid string received, it will return the `StorageLevel.NONE`
            Supported,
                `StorageLevel.NONE`
                `StorageLevel.DISK_ONLY`
                `StorageLevel.DISK_ONLY_2`
                `StorageLevel.MEMORY_ONLY`
                `StorageLevel.MEMORY_ONLY_2`
                `StorageLevel.MEMORY_AND_DISK`
                `StorageLevel.MEMORY_AND_DISK_2`
                `StorageLevel.OFF_HEAP`
        '''

        if persistTypStr == "NONE": return None
        elif persistTypStr == "DISK_ONLY": return StorageLevel.DISK_ONLY
        elif persistTypStr == "DISK_ONLY_2": return StorageLevel.DISK_ONLY_2
        elif persistTypStr == "MEMORY_ONLY": return StorageLevel.MEMORY_ONLY
        elif persistTypStr == "MEMORY_ONLY_2":
            return StorageLevel.MEMORY_ONLY_2
        elif persistTypStr == "MEMORY_AND_DISK":
            return StorageLevel.MEMORY_AND_DISK
        elif persistTypStr == "MEMORY_AND_DISK_2":
            return StorageLevel.MEMORY_AND_DISK_2
        elif persistTypStr == "OFF_HEAP":
            return StorageLevel.OFF_HEAP
        else:
            self.log.warn(
                "Invalid Persist Type %s received. Defaulting to NONE" %
                (persistTypStr))
            return None

    def repartitionDF(self, dataFrame: DataFrame, partitions: int = 0):
        '''
            Repartition the input dataframe

            parms: df          -> dataframe
                   partitions  -> new partition count. Defaults to 0, i.e. don't repartition

            logic,
                if partitions = 0 , don't repartition
                if partitions = -1, repartition to the default number (NumOfExecutors * ExecutorCores * 2)
                if partitions > 0 , repartition/coalesce to the given number
        '''
        curParts = dataFrame.rdd.getNumPartitions()
        finalParts = min(curParts, partitions)

        if curParts == partitions or partitions == 0:
            finalParts = -1
        elif partitions == -1:
            finalParts = self.__dfltRDDParts
        elif partitions > 0:
            finalParts = partitions
        else:
            pass  # finalParts is pre-populated.

        self.log.debug("Current Partitions: %d , Requested: %d,  Final: %d " %
                       (curParts, partitions, finalParts))

        if finalParts == -1:
            return dataFrame
        elif curParts > finalParts:
            return dataFrame.coalesce(finalParts)
        else:
            return dataFrame.repartition(finalParts)

    def handleHints(self, query: str):
        '''
            Removes the SparkSQL hints if the -useHint parm is not set.

            Example:- If sql = 'select /* hint */ cols.. from ..'
               if -useHint is not set,
                  return 'select cols.. from ..'
               else
                  return 'select /* hint */ cols.. from ..'
        '''
        if self.__useHist:
            return query
        else:
            return re.sub(r'/\*+.*\*/', '', query)

    @staticmethod
    def getPartitionColumnsFromSQL(query):
        s = query.lower().strip().replace("\n", " ")
        inx = s.find(" cluster ")
        lst = []
        if inx > 0:
            lst.extend((map(lambda x: x.strip(), s[inx + 12:].split(","))))
        else:
            frm = s.find(" distribute ")
            to = s.find(" sort ", frm + 15) if frm > 0 else 0
            if to > frm:
                lst.extend((map(lambda x: x.strip(),
                                s[frm + 15:to].split(","))))
            else:
                lst.extend((map(lambda x: x.strip(), s[frm + 15:].split(","))))
        return lst
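A hypothetical driver for the class above (the logger name, parameter keys and the query are placeholders; whether enableHiveSupport succeeds depends on the local environment):

import logging

log = logging.getLogger("spark-utils-demo")
utils = SparkUtils(log=log, parms={"--runEnv": "local", "--logLevel": "WARN"})
df = utils.sql(dfName="demo", query="SELECT 1 AS id")  # persistType=None returns the DataFrame directly
df.show()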
Example #25
# -*- coding: utf-8 -*-
'''

'''
__author__ = 'Foxlora'
__time__ = '2020/10/10 22:22'

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

from pyspark.streaming.kafka import KafkaUtils
from setting.default import DefaultConfig

# 1. create the Spark Streaming context conf
conf = SparkConf()
conf.setAll(DefaultConfig.SPARK_ONLINE_CONFIG)
sc = SparkContext(conf=conf)
stream_sc = StreamingContext(sc, 60)

# 2. configuration for reading from Kafka
similar_kafka = {"metadata.broker.list": DefaultConfig.KAFKA_SERVER, "group.id": 'similar'}
SIMILAR_DS = KafkaUtils.createDirectStream(stream_sc, ['click-trace'], similar_kafka)



Example #26
specific data


Note that I use spark because there is currently no way to use SQL queries
with dask
"""

from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SQLContext, DataFrame

# This could benefit from some tweaks especially if the database becomes larger
conf = SparkConf()
conf.set("spark.sql.autoBroadcastJoinThreshold", 1024 * 1024 * 100)
conf.setAppName('Mnist_Spark_MLP').setMaster('local[8]')
conf.setAll([('spark.executor.memory', '8g'), ('spark.executor.cores', '3'),
             ('spark.cores.max', '3'), ('spark.driver.memory', '8g')])
conf.set("spark.sql.caseSensitive", "true")

# Global imports
import glob
import yaml
import logging
logging.getLogger().setLevel(logging.INFO)
import os
import textwrap
import numpy as np
import subprocess
from datetime import datetime
import copy
import time
import fnmatch
Example #27
from __future__ import print_function, division
import os
import sys
import copy
import functools

from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

conf = SparkConf().setMaster("yarn").setAppName("autofe").set(
    'spark.yarn.queue', 'solution')
# set app resources
configs = [('spark.driver.memory', '10g'), ('spark.executor.memory', '4g'),
           ('spark.executor.instances', '10'), ('spark.executor.cores', '2')]
conf.setAll(configs)
# conf = SparkConf().set('master', 'local')
sc = SparkContext.getOrCreate(conf=conf)
# sc = SparkContext.getOrCreate()
sql_context = HiveContext(sc)

# preprocess the action table
# data loading
path = "hdfs://m7-model-hdp01:8020/user/2-6-0-model-test/user_1/nodes/data-load-load-240240/out/20190717/DAG_36240/NODE_240240/SLOT_0/DataLoad/02150359716"
t = sql_context.read.parquet(path)

# inspect the data
print("first few rows")
print(t.show(5))
print("column dtypes")
print(t.dtypes)
Example #28
    def __setupSparkSession__(
        self,
        jobConf: dict,
    ) -> SparkSession:
        '''
        Init the Spark environemnt with few default configurations and start the spark session.
        '''
        conf = SparkConf()
        #
        #Setup Spark Specific configurations
        #
        hmConf = {
            "spark.executor.pyspark.memory": "512m",
            "spark.debug.maxToStringFields": "5000",
            "spark.rps.askTimeout": "1200",
            "spark.network.timeout": "1200",
            "spark.maxRemoteBlockSizeFetchToMem": "512m",
            "spark.broadcast.blockSize": "16m",
            "spark.broadcast.compress": "true",
            "spark.rdd.compress": "true",
            "spark.io.compression.codec": "org.apache.spark.io.SnappyCompressionCodec",
            "spark.kryo.unsafe": "true",
            "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
            "spark.kryoserializer.buffer": "10240",
            "spark.kryoserializer.buffer.max": "2040m",
            "hive.exec.dynamic.partition": "true",
            "hive.exec.dynamic.partition.mode": "nonstrict",
            "hive.warehouse.data.skiptrash": "true",
            "spark.sql.hive.metastorePartitionPruning": "true",
            "spark.sql.broadcastTimeout": "1200",
            "spark.sql.sources.partitionOverwriteMode": "dynamic",
            "spark.sql.orc.filterPushdown": "true",
            "spark.sql.orc.splits.include.file.footer": "true",
            "spark.sql.orc.cache.stripe.details.size": "1000",
            "spark.hadoop.parquet.enable.summary-metadata": "false",
            "spark.sql.parquet.mergeSchema": "false",
            "spark.sql.parquet.filterPushdown": "true",
            "spark.sql.parquet.fs.optimized.committer.optimization-enabled": "true",
            "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2",
            "spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored": "true",
        }

        # merge job-level overrides into the defaults (plain dict assignment, not .set)
        for (k, v) in jobConf['sparkconfs'].items():
            hmConf[k] = v

        # SparkConf.setAll expects key/value pairs
        conf.setAll(hmConf.items())
        #
        #Setup Hadoop Specific configurations
        #
        # collected in a plain dict and copied onto the SparkConf (via the spark.hadoop.
        # prefix) just before the session is built, since the JVM-side Hadoop Configuration
        # does not exist until the SparkContext has been created
        hdpCnf = {
            'io.file.buffer.size': '65536',
            'mapreduce.fileoutputcommitter.algorithm.version': '2',
        }

        for (k, v) in jobConf['hadoopconfs'].items():
            hdpCnf[k] = v

        #
        # Setup AWS specific configurations
        #
        if jobConf['appconfs']['runenv'].upper() == 'AWS':
            SparkContext.setSystemProperty(
                'com.amazonaws.services.s3.enableV4', 'true')
            SparkContext.setSystemProperty(
                'com.amazonaws.services.s3.enforceV4', 'true')
            conf.set(
                "spark.sql.parquet.output.committer.class",
                "com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter"
            )

            cred = None
            try:
                from botocore.credentials import InstanceMetadataProvider, InstanceMetadataFetcher
                provider = InstanceMetadataProvider(
                    iam_role_fetcher=InstanceMetadataFetcher(timeout=1000,
                                                             num_attempts=2))
                creds = provider.load()
                hdpCnf.update({
                    'fs.s3a.access.key': creds.access_key,
                    'fs.s3a.secret.key': creds.secret_key,
                })
            except:
                pass
            hdpCnf.update({
                'fs.s3a.server-side-encryption-algorithm': 'SSE-KMS',
                'fs.s3.enableServerSideEncryption': 'true',
                'fs.s3.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem',
                'fs.s3a.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem',
                'fs.s3a.endpoint': "s3.%s.amazonaws.com" %
                                   (jobConf['appconfs']['appdefaults'] or 'us-east-1')
            })


        # copy the collected Hadoop settings onto the SparkConf using the spark.hadoop. prefix
        for k, v in hdpCnf.items():
            conf.set('spark.hadoop.' + k, v)

        spark = SparkSession \
                .builder \
                .config(conf=conf) \
                .appName(jobConf['name'] or 'PySparkApp') \
                .enableHiveSupport() \
                .getOrCreate()

        sc = spark.sparkContext
        sc.setLogLevel(jobConf['appconfs']['logging']['sparkloglevel']
                       or 'INFO')
        if (jobConf['appconfs']['logging']['sparkloglevel'] or 'INFO') == "DEBUG":
            msg = ""
            for k in sc._conf.getAll():
                msg += "\t%50s -> %s\n" % (k[0], k[1])
            log.debug(
                "Initiated SparkSesion with below confs,\n{}".format(msg))

        return spark
Example #29
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import *
from pyspark.storagelevel import StorageLevel

appName = "KafkaStreams"
config = SparkConf().setAppName(appName)

props = []
props.append(("spark.rememberDuration", "10"))
props.append(("spark.batchDuration", "10"))
props.append(("spark.eventLog.enabled", "true"))
props.append(("spark.streaming.timeout", "30"))
props.append(("spark.ui.enabled", "true"))

config = config.setAll(props)

sc = SparkContext(conf=config)
ssc = StreamingContext(sc, 5)

topics = ["t1"]
kafka_params = {
    "zookeeper.connect": "localhost:5181/kafka",
    "metadata.broker.list": "localhost:9092",
    "group.id": "Kafka_MapR-Streams_to_HBase"
}

raw = KafkaUtils.createDirectStream(ssc, topics, kafka_params)
raw.pprint()

ssc.start()  # Start the computation
Example #30
def spark_session(spark_id, executor_num, local_dir):
    logger.info('[%s] init spark session', spark_id)

    # spark
    if 'SPARK_HOME' not in os.environ:
        os.environ['SPARK_HOME'] = SPARK_HOME

    os.environ['PYSPARK_PYTHON'] = WORKER_PYTHON
    os.environ['PYSPARK_DRIVER_PYTHON'] = DRIVER_PYTHON

    if not local_dir:
        local_dir = os.path.join(JOB_ROOT_DIR.LOCAL_ROOT, spark_id)

    os.makedirs(os.path.join(local_dir, 'tmp'))
    #os.makedirs(os.path.join(local_dir, 'metastore_db'))
    spark_conf = SparkConf()
    conf_details = [
        # ('spark.yarn.jars', ''),
        # ('spark.executorEnv.PATH', SPARK_CONFIG['WORKER_PATH']),
        # ('spark.eventLog.dir', 'hdfs://TS-CLICKH011:8020/spark/history'),
        # ('spark.yarn.historyServer.address', 'http://ts-clickh09:18080/'),
        # ('spark.executorEnv.PATH', './python3/bin/:$PATH'),
        # ('spark.appMasterEnv.PATH', SparkConfig['WORKER_PATH']),
        # ('spark.yarn.appMasterEnv.PYSPARK_PYTHON', './python3/bin/python3'),
        # ('spark.executorEnv.PYSPARK_PYTHON', './python3/bin/python3'),
        # ('spark.driver.host', '172.22.16.57'),
        # ('spark.pyspark.python', './python3/bin/python3'),
        # ('spark.pyspark.python', './python3/bin/python3'),
        # ('spark.pyspark.driver.python', '/data/anaconda3/bin/python'),
        ('spark.yarn.archive', HDFS_SPARK_JARS),
        ('spark.yarn.dist.archives', SPARK_CONFIG['SPARK_ARCHIVES']),
        ('spark.eventLog.enabled', 'true'),
        ('spark.eventLog.compress', 'true'),
        ('spark.driver.memory', '2G'),
        ('spark.driver.extraJavaOptions',
         f'-Duser.timezone=UTC+0800 -Djava.io.tmpdir={os.path.join(local_dir, "tmp")} -Dderby.system.home={os.path.abspath(local_dir)}'
         ),
        ('spark.executor.extraJavaOptions',
         '-Duser.timezone=UTC+0800 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps'
         ),
        ('spark.executor.instances', executor_num),
        ('spark.executor.memory', '8G'),
        ('spark.executor.cores', 4),
        ('spark.sql.shuffle.partitions', executor_num),
        ('spark.yarn.executor.memoryOverhead', '4G'),
        ('spark.sql.warehouse.dir', os.path.join(local_dir, 'metastore_db')),
        ('spark.local.dir', os.path.join(local_dir, 'tmp')),
        ('spark.driver.extraClassPath',
         "/data/tool/env/hadoop-lzo/lib/hadoop-lzo-0.4.19.jar"),
        ('spark.driver.extraLibraryPath',
         '/data/tool/env/hadoop-lzo/lib/native/')
    ]

    for k, v in conf_details:
        print(f'[{spark_id}] spark config {k} = {v}')

    spark_conf.setAll(conf_details)
    spark_conf.setAppName(f'{spark_id}')
    spark_conf.setMaster('yarn')

    spark = SparkSession.builder.config(
        conf=spark_conf).enableHiveSupport().getOrCreate()
    return spark
Example #31
def create_spark_conf():
    bigdl_conf = get_bigdl_conf()
    sparkConf = SparkConf()
    sparkConf.setAll(bigdl_conf.items())
    return sparkConf
Example #32
def init_spark(config, app=None, use_session=False):
    import os
    import sys
    from glob import glob

    if 'spark-home' in config:
        os.environ['SPARK_HOME'] = config['spark-home']

    if 'spark-conf-dir' in config:
        os.environ['SPARK_CONF_DIR'] = config['spark-conf-dir']

    if 'pyspark-python' in config:
        # Set python interpreter on both driver and workers
        os.environ['PYSPARK_PYTHON'] = config['pyspark-python']

    if 'yarn-conf-dir' in config:
        # Hadoop YARN configuration
        os.environ['YARN_CONF_DIR'] = config['yarn-conf-dir']

    if 'spark-classpath' in config:
        # can be used to use external folder with Hive configuration
        # e. g. spark-classpath='/etc/hive/conf.cloudera.hive1'
        os.environ['SPARK_CLASSPATH'] = config['spark-classpath']

    submit_args = []

    driver_mem = config.get('spark-prop.spark.driver.memory', None)
    if driver_mem is not None:
        submit_args.extend(["--driver-memory", driver_mem])

    driver_cp = config.get('spark-prop.spark.driver.extraClassPath', None)
    if driver_cp is not None:
        submit_args.extend(["--driver-class-path", driver_cp])

    driver_java_opt = config.get('spark-prop.spark.driver.extraJavaOptions', None)
    if driver_java_opt is not None:
        submit_args.extend(["--driver-java-options", driver_java_opt])

    jars = config.get('jars', None)
    if jars is not None:
        if isinstance(jars, str):
            jars = [jars]
        submit_args.extend(["--jars", ','.join(jars)])

    mode_yarn = config['spark-prop.spark.master'].startswith('yarn')

    if mode_yarn:
        # pyspark .zip distribution flag is set only if spark-submit have master=yarn in command-line arguments
        # see spark.yarn.isPython conf property setting code
        # in org.apache.spark.deploy.SparkSubmit#prepareSubmitEnvironment
        submit_args.extend(['--master', 'yarn'])

    # pyspark .zip distribution flag is set only if spark-submit have pyspark-shell or .py as positional argument
    # see spark.yarn.isPython conf property setting code
    # in org.apache.spark.deploy.SparkSubmit#prepareSubmitEnvironment
    submit_args.append('pyspark-shell')

    os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join(submit_args)

    spark_home = os.environ['SPARK_HOME']
    spark_python = os.path.join(spark_home, 'python')
    pyspark_libs = glob(os.path.join(spark_python, 'lib', '*.zip'))
    sys.path.extend(pyspark_libs)

    virtualenv_reqs = config['spark-prop'].get('spark.pyspark.virtualenv.requirements', None)
    if use_session:
        from pyspark.sql import SparkSession

        builder = SparkSession.builder.appName(app or config['app'])

        if mode_yarn:
            builder = builder.enableHiveSupport()

        for k, v in prop_list(config['spark-prop']).items():
            builder = builder.config(k, v)

        ss = builder.getOrCreate()
        if virtualenv_reqs is not None:
            ss.sparkContext.addFile(virtualenv_reqs)
        return ss
    else:
        from pyspark import SparkConf, SparkContext
        conf = SparkConf()
        conf.setAppName(app or config['app'])
        props = [(k, str(v)) for k, v in prop_list(config['spark-prop']).items()]
        conf.setAll(props)
        sc = SparkContext(conf=conf)
        if virtualenv_reqs is not None:
            sc.addFile(virtualenv_reqs)
        return sc
Example #33
 def _create_spark_context():
     spark_conf = SparkConf()
     spark_conf.set('spark.sql.catalogImplementation', 'hive')
     spark_conf.setAll(self._setup_options(additional_options))
     return SparkContext(conf=spark_conf)
Example #34
# -*- coding: UTF-8 -*-

import happybase
from setting.default import DefaultConfig
import redis

pool = happybase.ConnectionPool(size=10, host='hadoop-master', port=9090)

# recall data
# with decode_responses=True the stored values are returned as str; without it they come back as bytes
redis_client = redis.StrictRedis(host=DefaultConfig.REDIS_HOST,
                                 port=DefaultConfig.REDIS_PORT,
                                 db=10,
                                 decode_responses=True)

# Redis database used for caching
# with decode_responses=True the stored values are returned as str; without it they come back as bytes
cache_client = redis.StrictRedis(host=DefaultConfig.REDIS_HOST,
                                 port=DefaultConfig.REDIS_PORT,
                                 db=8,
                                 decode_responses=True)

# used by the ranking logic in sort_service.py
from pyspark import SparkConf
from pyspark.sql import SparkSession
# Spark configuration
conf = SparkConf()
conf.setAll(DefaultConfig.SPARK_GRPC_CONFIG)
SORT_SPARK = SparkSession.builder.config(conf=conf).getOrCreate()