Example #1
import queue
import threading
import time

from pyspark import SparkConf, SparkContext

def delayed(seconds):
    # Sleep before passing each element through, so jobs stay visible to the status tracker.
    def f(x):
        time.sleep(seconds)
        return x
    return f

def call_in_background(f, *args):
    # Run f(*args) on a daemon thread; its result arrives on a one-slot queue.
    result = queue.Queue(1)
    t = threading.Thread(target=lambda: result.put(f(*args)))
    t.daemon = True
    t.start()
    return result

def main():
    conf = (SparkConf().set("spark.ui.showConsoleProgress", "false")
            .setAppName("PythonStatusAPIDemo").setMaster("local[*]"))
    sc = SparkContext(conf=conf)

    def run():
        rdd = sc.parallelize(range(10), 10).map(delayed(2))
        reduced = rdd.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
        return reduced.map(delayed(2)).collect()

    result = call_in_background(run)
    status = sc.statusTracker()
    while result.empty():
        ids = status.getJobIdsForGroup()
        for id in ids:
            job = status.getJobInfo(id)
            print("Job", id, "status: ", job.status)
            for sid in job.stageIds:
                info = status.getStageInfo(sid)
                if info:
                    print("Stage %d: %d tasks total (%d active, %d complete)" %
                          (sid, info.numTasks, info.numActiveTasks, info.numCompletedTasks))
        time.sleep(1)

    print("Job results are:", result.get())
    sc.stop()

if __name__ == "__main__":
    main()
Example #2
# Constructor of a transform-job class: stores the MySQL and SMTP settings,
# opens a MySQLdb connection/cursor, and creates a local SparkContext/SQLContext.
def __init__(self):
    self.sparkname = 'transform'
    self.hostname = "localhost"
    self.dbname = 'bigdata'
    self.jdbcPort = '3306'
    self.properties = {
        "user": '******',
        "password": '******',
        "driver": 'com.mysql.jdbc.Driver'
    }
    self.mailto_list = ["*****@*****.**"]
    self.mail_host = "smtp.qq.com"
    self.mail_user = "******"
    self.mail_pass = "******"
    self.mail_postfix = "qq.com"
    self.database = MySQLdb.connect(host=self.hostname,
                                    user=self.properties.get('user'),
                                    passwd=self.properties.get('password'),
                                    db=self.dbname,
                                    charset="utf8")
    self.cursor = self.database.cursor()
    self.conf = SparkConf().setAppName(
        self.sparkname).setMaster("local[*]")
    self.sc = SparkContext(conf=self.conf)
    self.sqlContext = SQLContext(self.sc)
Example #3
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# Load the given MySQL table through the JDBC data source and display it.
def save(tablename):
    conf = SparkConf().setAppName("python model").setMaster("local[*]")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    hostname = "10.23.73.118"
    dbname = 'ouye'
    jdbcPort = '3306'
    properties = {
        "user": '******',
        "password": '******',
        "driver": 'com.mysql.jdbc.Driver'
    }
    jdbcUrl = "jdbc:mysql://{0}:{1}/{2}?characterEncoding=utf8".format(
        hostname, jdbcPort, dbname)

    df = sqlContext.read.jdbc(url=jdbcUrl,
                              table=tablename,
                              properties=properties)

    df.show()
Example #4
from pyspark import SparkConf, SparkContext

def loadMovieNames():
    # Build a dict mapping movieID -> movie title from the u.item file.
    movieNames = {}
    with open("ml-100k/u.item") as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames

# Take each line of u.data and convert it to (movieID, (rating, 1.0))
# This way we can then add up all the ratings for each movie, and
# the total number of ratings for each movie (which lets us compute the average)
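# A hypothetical worked example of that reduction: three ratings for movieID 50,
# (50, (3.0, 1.0)), (50, (5.0, 1.0)) and (50, (1.0, 1.0)), are summed element-wise
# by reduceByKey into (50, (9.0, 3.0)), so the average is 9.0 / 3.0 = 3.0.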
def parseInput(line):
    fields = line.split()
    return (int(fields[1]), (float(fields[2]), 1.0))

if __name__ == "__main__":
    # The main script - create our SparkContext
    conf = SparkConf().setAppName("WorstMovies")
    sc = SparkContext(conf = conf)

    # Load up our movie ID -> movie name lookup table
    movieNames = loadMovieNames()

    # Load up the raw u.data file
    lines = sc.textFile("hdfs:///user/maria_dev/ml-100k/u.data")

    # Convert to (movieID, (rating, 1.0))
    movieRatings = lines.map(parseInput)

    # Reduce to (movieID, (sumOfRatings, totalRatings))
    ratingTotalsAndCount = movieRatings.reduceByKey(lambda movie1, movie2: ( movie1[0] + movie2[0], movie1[1] + movie2[1] ) )

    # Filter out movies rated 10 or fewer times (the rating count sits in x[1][1])
    popularTotalsAndCount = ratingTotalsAndCount.filter(lambda x: x[1][1] > 10)
Example #5
from random import Random

from pyspark import SparkConf, SparkContext

numEdges = 200
numVertices = 100
rand = Random(42)

def generateGraph():
    # Build a random set of directed edges (src, dst) between numVertices vertices, skipping self-loops.
    edges = set()
    while len(edges) < numEdges:
        src = rand.randrange(0, numVertices)
        dst = rand.randrange(0, numVertices)
        if src != dst:
            edges.add((src, dst))
    return edges


if __name__ == "__main__":
    """
    Usage: transitive_closure [partitions]
    """
    conf = SparkConf().setAppName("PythonTransitiveClosure").setMaster(
        "local[*]")
    sc = SparkContext(conf=conf)
    # partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2
    tc = sc.parallelize(generateGraph(), 2).cache()

    # Linear transitive closure: each round grows paths by one edge,
    # by joining the graph's edges with the already-discovered paths.
    # e.g. join the path (y, z) from the TC with the edge (x, y) from
    # the graph to obtain the path (x, z).

    # Because join() joins on keys, the edges are stored in reversed order.
    edges = tc.map(lambda x_y: (x_y[1], x_y[0]))
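    # Hypothetical worked example: if tc holds the path (2, 3) and the graph has
    # the edge (1, 2), then edges holds (2, 1); joining on the shared key 2 gives
    # (2, (3, 1)), which is projected to the new path (1, 3).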

    oldCount = 0
    nextCount = tc.count()
    while True: