from pyspark import SparkConf
from pyspark.sql import SparkSession


def setup_spark(app_name="my_app",
                master="local[*]",
                spark_jars=None,
                spark_config=None,
                py_files=None):
    """Sets up a SparkSession using the configuration provided."""
    spark_jars = spark_jars or []
    spark_config = spark_config or {}
    py_files = py_files or []

    conf = SparkConf().setAppName(app_name).setMaster(master)

    conf = conf.set("spark.jars", ",".join(spark_jars))

    # update spark config
    for key, val in spark_config.items():
        conf.set(key, val)

    for i in conf.getAll():
        print(i[0], "-->", i[1])

    spark_session = SparkSession.builder.config(conf=conf).getOrCreate()

    for pyf in py_files:
        spark_session.sparkContext.addPyFile(pyf)

    return spark_session
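
# Usage sketch for setup_spark above; the app name, master and config values are
# illustrative assumptions only, not taken from any particular deployment.
example_spark = setup_spark(app_name="example_app",
                            master="local[2]",
                            spark_config={"spark.sql.shuffle.partitions": "8"})
print(example_spark.version)
example_spark.stop()
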
def create_sc(app_name='AppName',
              master='local[*]',
              executor_memory='4G',
              nb_cores='0',
              driver_memory='32G',
              max_result_size='10G'):
    sc_conf = SparkConf()
    sc_conf.setAppName(app_name)
    sc_conf.setMaster(master)
    sc_conf.set('spark.executor.memory', executor_memory)
    if nb_cores != '0':
        sc_conf.set('spark.executor.cores', nb_cores)
    sc_conf.set('spark.driver.memory', driver_memory)
    sc_conf.set('spark.cores.max', '32')
    sc_conf.set('spark.driver.maxResultSize', max_result_size)
    sc_conf.set('spark.logConf', True)
    print(sc_conf.getAll())

    # getOrCreate returns the running SparkContext if there is one,
    # otherwise it creates a new one from sc_conf
    sc = SparkContext.getOrCreate(conf=sc_conf)

    return sc
Example #3
class OWSparkContext(SharedSparkContext, widget.OWWidget):
    priority = 0
    name = "Context"
    description = "Create a shared Spark (sc) and Hive (hc) Contexts"
    icon = "../icons/spark.png"

    want_main_area = False
    resizing_enabled = True

    conf = None

    def __init__(self):
        super().__init__()

        # The main label of the Control's GUI.
        # gui.label(self.controlArea, self, "Spark Context")

        self.conf = SparkConf()
        all_predefined = dict(self.conf.getAll())
        # Create parameters Box.
        box = gui.widgetBox(self.controlArea, "Spark Application", addSpace=True)

        self.gui_parameters = OrderedDict()

        main_parameters = OrderedDict()
        main_parameters['spark.app.name'] = 'OrangeSpark'
        main_parameters['spark.master'] = 'yarn-client'
        main_parameters["spark.executor.instances"] = "8"
        main_parameters["spark.executor.cores"] = "4"
        main_parameters["spark.executor.memory"] = "8g"
        main_parameters["spark.driver.cores"] = "4"
        main_parameters["spark.driver.memory"] = "2g"
        main_parameters["spark.logConf"] = "false"
        main_parameters["spark.app.id"] = "dummy"

        for k, v in main_parameters.items():
            default_value = all_predefined.setdefault(k, v)
            self.gui_parameters[k] = GuiParam(parent_widget=box, label=k,
                                              default_value=default_value)
            all_predefined.pop(k)

        for k, v in all_predefined.items():
            self.gui_parameters[k] = GuiParam(parent_widget=box, label=k,
                                              default_value=str(v))

        action_box = gui.widgetBox(box)
        # Action Button
        self.create_sc_btn = gui.button(action_box, self, label='Submit', callback=self.create_context)

    def onDeleteWidget(self):
        if self.sc:
            self.sc.stop()

    def create_context(self):

        for key, parameter in self.gui_parameters.items():
            self.conf.set(key, parameter.get_value())

        self.sc = SparkContext(conf=self.conf)
        self.hc = HiveContext(self.sc)
Example #4
def init_spark(app_name):
    sc_conf = SparkConf()
    sc_conf.setAppName(app_name)
    sc_conf.setMaster('local[*]')
    sc_conf.set('spark.executor.memory', '4g')
    sc_conf.set('spark.executor.cores', '4')
    sc_conf.set('spark.driver.memory', '32G')
    sc_conf.set('spark.cores.max', '32')
    sc_conf.set('spark.driver.maxResultSize', '10G')
    sc_conf.set('spark.logConf', True)
    print(sc_conf.getAll())
    # getOrCreate returns the running SparkContext if there is one,
    # otherwise it creates a new one from sc_conf
    sc = SparkContext.getOrCreate(conf=sc_conf)
    return sc
def create_sc():
    sc_conf = SparkConf()
    sc_conf.setAppName("finance-similarity-app")
    sc_conf.setMaster('spark://10.21.208.21:7077')
    sc_conf.set('spark.executor.memory', '2g')
    sc_conf.set('spark.executor.cores', '4')
    sc_conf.set('spark.cores.max', '40')
    sc_conf.set('spark.logConf', True)
    print(sc_conf.getAll())

    # getOrCreate returns the running SparkContext if there is one,
    # otherwise it creates a new one from sc_conf
    sc = SparkContext.getOrCreate(conf=sc_conf)

    return sc
    def create_context(self):
        sc_conf = SparkConf()
        sc_conf.setAppName(self.name)
        sc_conf.setMaster('local[*]')
        sc_conf.set('spark.executor.memory', '2g')
        sc_conf.set('spark.executor.cores', '4')
        sc_conf.set('spark.cores.max', '40')
        sc_conf.set('spark.logConf', True)
        sc_conf.set('spark.debug.maxToStringFields', '100')

        print(sc_conf.getAll())

        # getOrCreate returns the running SparkContext if there is one,
        # otherwise it creates a new one from sc_conf
        sc = SparkContext.getOrCreate(conf=sc_conf)

        return SQLContext(sc)
Example #7
def create_sc(pyFiles):
    sc_conf = SparkConf()
    sc_conf.setAppName("Weather_PCA")
    sc_conf.set('spark.executor.memory', '3g')
    sc_conf.set('spark.executor.cores', '1')
    sc_conf.set('spark.cores.max', '4')
    sc_conf.set('spark.default.parallelism', '10')
    sc_conf.set('spark.logConf', True)
    print(sc_conf.getAll())

    sc = SparkContext(conf=sc_conf, pyFiles=pyFiles)

    return sc
def mainRun():
    try:
        hdfs_path = settings.dir_path["hdfs_path"]
        spark_url = settings.spark_config["spark_url"]
        spark_conf_settings = settings.spark_config["spark_conf_settings"]
    except Exception as e:
        logger.error(str(e))
        logger.error("Exit...")
        sys.exit(1)
    app_name = "xdrProject"
    spark_conf = SparkConf().setAppName(app_name).setMaster(str(spark_url))
    for conf_name, conf_sets in spark_conf_settings.items():
        spark_conf = spark_conf.set(conf_name, conf_sets)
    sc = SparkContext(conf=spark_conf)
    logger.info("Spark config: " + str(spark_conf.getAll()))
    xdrProject(sc, hdfs_path)
def main():
  
  #parse command line options
  (options,args)=parseOptions()
  
  if len(args) != 2:
    raise Exception("need an input file and an output path")
  
  #set number of file partitions/parallelism
  if options.numPartitions is None:
    #the SparkContext does not exist yet, so base the default partition count on
    #the number of local cores (what defaultParallelism would be in local mode)
    import multiprocessing
    partFactor=1 #how many times the default parallelism
    numPartitions=multiprocessing.cpu_count()*partFactor
  else:
    numPartitions=options.numPartitions
  
  conf=SparkConf().setAppName("wordCount").setMaster("local["+str(numPartitions)+"]")
  sc = SparkContext(conf=conf)
  conf=sc.getConf()
  print("conf="+str(conf.getAll()))
  print("defaultMinPartitions="+str(sc.defaultMinPartitions))
  print("defaultParallelism="+str(sc.defaultParallelism))
  
  inputFileName = args[0]
  outputFileName= args[1]
  
  timeStart=time.time()
  file = sc.textFile(inputFileName,minPartitions=numPartitions)
  counts = file.count()
  timeEnd=time.time()
  dtRead=timeEnd-timeStart#time in seconds
  
  #write out to a file
  timeStart=time.time()
  file.saveAsTextFile(outputFileName)
  timeEnd=time.time()
  dtWrite=timeEnd-timeStart#time in seconds
  
  print("read+count time="+str(dtRead)+" s")
  print("write time="+str(dtWrite)+" s")
  print("number of lines="+str(counts))
  print("num Partitions="+str(file.getNumPartitions()))
Example #10
def create_sc():
    sc_conf = SparkConf()
    sc_conf.setAppName("finance-similarity-app")
    sc_conf.set("spark.dynamicAllocation.enabled", "false")
    sc_conf.set("spark.driver.host", "172.31.85.37")
    sc_conf.set('spark.executor.memory', '1g')
    sc_conf.set('spark.executor.cores', '2')
    sc_conf.set('spark.cores.max', '4')
    sc_conf.set('spark.logConf', True)
    print(sc_conf.getAll())

    # getOrCreate returns the running SparkContext if there is one,
    # otherwise it creates a new one from sc_conf
    sc = SparkContext.getOrCreate(conf=sc_conf)

    return sc
Example #12
import os
import time
from collections import Counter

import pandas
import spacy
from spacy.lang.en import English

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.window import Window as W
from pyspark.ml.stat import Summarizer

nlp = spacy.load("en_core_web_sm")
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3.6"
conf = (SparkConf()
        .setAppName('Spark DL Tabular Pipeline')
        .setMaster('local[6]')
        .set('spark.driver.memory', '16g')
        .set('spark.executor.memory', '6g'))

print(conf.getAll())
sc = SparkContext(conf=conf)
sql_context = SparkSession(sc)

# Load Data to Spark Dataframe
df = sql_context.read.csv('final/amazon_reviews.tsv',
                          header=True,
                          sep=r'\t',
                          inferSchema=True)

tags = [
    'SYM', 'PUNCT', 'X', 'ADJ', 'CCONJ', 'NUM', 'DET', 'PRON', 'ADP',
    'VERB', 'NOUN', 'PROPN', 'ADV', 'SPACE', 'PART', 'INTJ', 'AUX', 'SCONJ'
]

Example #13
    init_db()
    session = get_session()

    sc_conf = SparkConf()
    sc_conf.set('spark.executor.memory', '2g')  # memory used by each executor on a worker node
    sc_conf.set('spark.executor.cores', '4')  # CPU cores per executor; more cores lets an executor run more tasks concurrently
    sc_conf.set('spark.cores.max', '40')  # maximum CPU cores for the application; defaults to spark.deploy.defaultCores if unset
    sc_conf.set('spark.logConf', True)  # log the effective SparkConf as INFO when the SparkContext starts
    print(sc_conf.getAll())
    # build the conf before creating the context so the settings actually take effect
    sc = SparkContext(appName="streamingkafka", conf=sc_conf)
    sc.setLogLevel("WARN")  # reduce logs from shell
    window = 60
    ssc = StreamingContext(sc, window)  # get messages of 1 min
    brokers = 'localhost:9092'
    topic = 'test'

    tmp = init()
    #
    kafka_streaming_rdd = KafkaUtils.createDirectStream(
        ssc, [topic], {"metadata.broker.list": brokers})
    # kafka_streaming_rdd.pprint()
    lines_rdd = kafka_streaming_rdd.map(lambda x: x[1]).map(split) \
        .foreachRDD(lambda rdd: process_stream(rdd, tmp))

    ssc.start()
class OWSparkContext(SparkEnvironment, widget.OWWidget):
    priority = 0
    name = "Spark Config"
    description = "Configure the shared contexts: SparkContext (sc), SqlContext (sqlContext) and  HiveContext (hc)"
    icon = "../assets/Spark.svg"

    want_main_area = False
    resizing_enabled = True
    saved_gui_params = Setting(OrderedDict())

    conf = None

    def __init__(self):
        super().__init__()

        # The main label of the Control's GUI.
        # gui.label(self.controlArea, self, "Spark Context")

        self.conf = SparkConf()
        all_predefined = dict(self.conf.getAll())
        # Create parameters Box.
        box = gui.widgetBox(self.controlArea,
                            "Spark Application",
                            addSpace=True)

        self.gui_parameters = OrderedDict()

        main_parameters = OrderedDict()
        main_parameters['spark.app.name'] = 'weta_workflow'
        main_parameters['spark.master'] = 'local'  # 'yarn'
        main_parameters["spark.executor.instances"] = "8"
        main_parameters["spark.executor.cores"] = "4"
        main_parameters["spark.executor.memory"] = "8g"
        main_parameters["spark.driver.cores"] = "4"
        main_parameters["spark.driver.memory"] = "2g"
        main_parameters["spark.logConf"] = "false"
        main_parameters["spark.app.id"] = "dummy"

        for k, v in self.saved_gui_params.items():
            main_parameters[k] = v

        for k, v in main_parameters.items():
            default_value = all_predefined.setdefault(k, v)
            self.gui_parameters[k] = ParameterWidget(parent_widget=box,
                                                     label=k,
                                                     default_value=default_value)
            all_predefined.pop(k)

        for k, v in all_predefined.items():
            self.gui_parameters[k] = ParameterWidget(parent_widget=box,
                                                     label=k,
                                                     default_value=str(v))

        action_box = gui.widgetBox(box)
        # Action Button
        self.create_sc_btn = gui.button(action_box,
                                        self,
                                        label='Submit',
                                        callback=self.create_context)

    def onDeleteWidget(self):
        if self.sc:
            self.sc.stop()

    def create_context(self):
        if self.sc:
            self.sc.stop()

        for key, parameter in self.gui_parameters.items():
            self.conf.set(key, parameter.get_value())
            self.saved_gui_params[key] = parameter.get_value()

        self.sc = SparkContext(conf=self.conf)
        self.sqlContext = SQLContext(self.sc)
        self.hc = HiveContext(self.sc)
        self.hide()
class CommonSparkContext(object):
    __metaclass__ = Singleton

    def __init__(self):
        """
        Create a spark context.

        The spark configuration is taken from xframes/config.ini and from
        the values set in SparkInitContext.set() if this has been called.
        """

        # This is placed here because otherwise it causes an error when used in a spark slave.
        from pyspark import SparkConf, SparkContext, SQLContext, HiveContext

        # This reads from default.ini and then xframes/config.ini
        # if they exist.
        self._env = Environment.create()
        context = create_spark_config(self._env)
        verbose = self._env.get_config("xframes", "verbose", "false").lower() == "true"
        hdfs_user_name = self._env.get_config("webhdfs", "user", "hdfs")
        os.environ["HADOOP_USER_NAME"] = hdfs_user_name
        config_pairs = [(k, v) for k, v in context.iteritems()]
        self._config = SparkConf().setAll(config_pairs)
        if verbose:
            print "Spark Config: {}".format(config_pairs)

        self._sc = SparkContext(conf=self._config)
        self._sqlc = SQLContext(self._sc)
        self._hivec = HiveContext(self._sc)
        self.zip_path = []
        version = [int(n) for n in self._sc.version.split(".")]
        self.status_tracker = self._sc.statusTracker()
        if cmp(version, [1, 4, 1]) >= 0:
            self.application_id = self._sc.applicationId
        else:
            self.application_id = None

        if verbose:
            print "Spark Version: {}".format(self._sc.version)
            if self.application_id:
                print "Application Id: {}".format(self.application_id)

        if not context["spark.master"].startswith("local"):
            zip_path = self.build_zip(get_xframes_home())
            if zip_path:
                self._sc.addPyFile(zip_path)
                self.zip_path.append(zip_path)

        trace_flag = self._env.get_config("xframes", "rdd-trace", "false").lower() == "true"
        XRdd.set_trace(trace_flag)
        atexit.register(self.close_context)

    def spark_add_files(self, dirs):
        """
        Adds python files in the given directory or directories.

        Parameters
        ----------
        dirs: str or list(str)
            If a str, the pathname to a directory containing a python module.
            If a list, then it is a list of such directories.

            The python files in each directory are compiled, packed into a zip, distributed to each
            spark slave, and placed in PYTHONPATH.

            This is only done if spark is deployed on a cluster.
        """
        props = self.config()
        if props.get("spark.master", "local").startswith("local"):
            return
        if isinstance(dirs, basestring):
            dirs = [dirs]
        for path in dirs:
            zip_path = self.build_zip(path)
            if zip_path:
                self._sc.addPyFile(zip_path)
                self.zip_path.append(zip_path)

    def close_context(self):
        if self._sc:
            self._sc.stop()
            self._sc = None
            for zip_path in self.zip_path:
                os.remove(zip_path)

    def config(self):
        """
        Gets the configuration parameters used to initialize the spark context.

        Returns
        -------
        out : dict
            A dict of the properties used to initialize the spark context.
        """
        props = self._config.getAll()
        return {prop[0]: prop[1] for prop in props}

    def env(self):
        """
        Gets the config environment.

        Returns
        -------
        out : Environment
            The environment.  This contains all the values from the configuration file(s).
        """

        return self._env

    def sc(self):
        """
        Gets the spark context.

        Returns
        -------
        out : SparkContext
            The spark context.  There is a single spark context per process.
        """
        return self._sc

    def sqlc(self):
        """
        Gets the spark sql context.

        Returns
        -------
        out : sql.SqlContext
            The spark sql context.
        """
        return self._sqlc

    def hivec(self):
        """
        Gets the hive context.

        Returns
        -------
        out : sql.HiveContext
            The hive context.
        """
        return self._hivec

    def version(self):
        """
        Gets the spark version.

        Returns
        -------
        out: lst[int]
            The spark version, as a list of integers.
        """
        return [int(n) for n in self._sc.version.split(".")]

    def jobs(self):
        """
        Get the spark job ID and info for the active jobs.

        This method would normally be called by another thread from the executing job.

        Returns
        -------
        out: dict {job_id: job_info}
            A map of the active job IDs and their corresponding job info
        """
        return {job_id: self.status_tracker.getJobInfo(job_id) for job_id in self.status_tracker.getActiveJobIds()}

    def cluster_mode(self):
        """
        Get the cluster mode of the spark cluster.

        Returns
        -------
        out: boolean
            True if spark is running in cluster mode.  Cluster mode means that spark is running on a platform separate
            from the program.  In practice, cluster mode means that file arguments must be located on
            a network filesystem such as HDFS or NFS.
        """
        return not self._config.get("spark.master").startswith("local")

    # noinspection PyBroadException
    @staticmethod
    def build_zip(module_dir):
        # This can fail at writepy if there is something wrong with the files
        #  in xframes.  Go ahead anyway, but things will probably fail if this job is
        #  distributed
        try:
            tf = NamedTemporaryFile(suffix=".zip", delete=False)
            z = PyZipFile(tf, "w")
            z.writepy(module_dir)
            z.close()
            return tf.name
        except:
            logging.warn("Zip file distribution failed -- workers will not get xframes code.")
            logging.warn("Check for unexpected files in xframes directory.")
            return None

    @staticmethod
    def spark_context():
        """
        Returns the spark context.

        Returns
        -------
        out : pyspark.SparkContext
            The SparkContext object from spark.
        """
        return CommonSparkContext().sc()

    @staticmethod
    def spark_config():
        """
        Returns the spark config parameters.

        Returns
        -------
        out : list
            A list of the key-value pairs stored as tuples, used to initialize the spark context.
        """
        return CommonSparkContext().config()

    @staticmethod
    def spark_sql_context():
        """
        Returns the spark sql context.

        Returns
        -------
        out : pyspark.sql.SQLContext
            The SQLContext object from spark.
        """
        return CommonSparkContext().sqlc()

    @staticmethod
    def hive_context():
        """
        Returns the hive context.

        Returns
        -------
        out : pyspark.sql.HiveContext
            The Hive object from spark.
        """
        return CommonSparkContext().hivec()

    @staticmethod
    def spark_version():
        """
        Gets the spark version.

        Returns
        -------
        out: list[int]
            The spark version, as a list of integers.
        """
        return CommonSparkContext().version()

    @staticmethod
    def spark_cluster_mode():
        """
        Gets the cluster mode.

        Returns
        -------
        out: boolean
            True if spark is running in cluster mode.  Cluster mode means that spark is running on a platform separate
            from the program.  In practice, cluster mode means that file arguments must be located on
            a network filesystem such as HDFS or NFS.
        """
        env = Environment.create()
        config = create_spark_config(env)
        return not config.get("spark.master").startswith("local")
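
# Usage sketch for CommonSparkContext above (hedged: assumes xframes, its
# config.ini files, and a reachable Spark master are available; the directory
# path passed to spark_add_files is a placeholder, not a real module).
csc = CommonSparkContext()
sc = CommonSparkContext.spark_context()
print(CommonSparkContext.spark_version())
print(csc.cluster_mode())
csc.spark_add_files('/path/to/extra/python/modules')  # no-op when spark.master is local
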
#!/usr/bin/env python
from pyspark import SparkConf, SparkContext

# Spark Options:
# https://spark.apache.org/docs/1.6.1/api/java/org/apache/spark/SparkConf.html
conf = SparkConf().setMaster("local").setAppName("MyApp")
sc = SparkContext(conf=conf)

print(conf.getAll())
"""
run from command line
spark-submit --master yarn-client --conf key=value --conf someotherkey=someothervalue you_code.py
"""

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row

conf = SparkConf().setAppName("hello-world").setMaster('yarn-client')
conf.set("spark.files.overwrite","true")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

#log
log4jLogger = sc._jvm.org.apache.log4j
LOG = log4jLogger.LogManager.getLogger("hello.world.spark")
LOG.info("Args = " + conf.getAll().__str__())

inputFile = conf.get("spark.input")
outputFile = conf.get("spark.output")

wordcount = sc.textFile(inputFile).map(lambda line: line.replace("\"", " ").replace("{", " ").replace("}", " ").replace(".", " ").replace(":", " ")) \
    .flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .map(lambda kv: (kv[1], kv[0])) \
    .sortByKey(ascending=False) \
    .map(lambda kv: (kv[1], kv[0]))

df = wordcount.toDF(['word', 'count'])
df.save(path=outputFile, source='json', mode='overwrite')
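
# Note: DataFrame.save(...) above is the legacy Spark 1.x API. On Spark 2.x+ the
# equivalent write (a sketch, assuming the same df and outputFile) would use the
# DataFrameWriter API:
#
#     df.write.mode('overwrite').json(outputFile)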