def Xtest():
    """Demonstrate applying a Hadamard gate to qubit state vectors on Spark.

    Builds three 2-dimensional state vectors, distributes them as an RDD,
    and multiplies each by the Hadamard matrix
    H = 1/sqrt(2) * [[1, 1], [1, -1]] on the executors via
    ``SparkContext.runJob``.  Requires a reachable Spark master at
    spark://192.168.2.221:7077.
    """
    # Configure Spark.
    # FIX(review): the original had a stray trailing line-continuation
    # backslash after the last .set(...) call, which turned the following
    # `sc = ...` assignment into a syntax error; it is removed here.
    conf = SparkConf() \
        .setMaster("spark://192.168.2.221:7077") \
        .setAppName("quantum") \
        .set("spark.cores.max", "20")
    sc = SparkContext(conf=conf)

    # Basis states |1>, |0>, and the equal-superposition state.
    t1 = np.array([0, 1])
    t2 = np.array([1, 0])
    t3 = np.array(np.sqrt([0.5, 0.5]))
    print("输入:", [t1, t2, t3])
    vectors = sc.parallelize([t1, t2, t3])

    # Hadamard gate.  NOTE(review): np.matrix is deprecated in favour of
    # np.array, but it is kept to preserve the 2-D (matrix-shaped) result
    # of np.dot below.
    # matrix=np.ones((2,2))-np.identity(2)
    matrix = np.sqrt(0.5) * np.matrix([[1, 1], [1, -1]])
    print(matrix)

    # Print each partition's elements, apply the gate to every vector,
    # then print the (unchanged) source vectors again.
    sc.runJob(vectors, lambda part: [print(x) for x in part])
    mult = sc.runJob(vectors, lambda part: [np.dot(matrix, x) for x in part])
    sc.runJob(vectors, lambda part: [print(x) for x in part])
    sc.stop()
    print("计算结果:", mult)
Author: lsl Date: 2020-09-09 21:41:33 LastEditTime: 2020-09-09 22:01:42 Description: rdd的transformation函数aggregateByKey ''' from pyspark import SparkConf, SparkContext import operator conf = SparkConf() conf.set("matster", "spark://hadoop-maste:7077") sc = SparkContext(conf=conf) # 创建rdd datas = [("a", 22), ("b", 33), ("c", 44), ("b", 55), ("a", 66)] rdd = sc.parallelize(datas, 3) # 查看rdd如何分区 res1 = sc.runJob(rdd, lambda iterator: iterator, partitions=[0]) res2 = sc.runJob(rdd, lambda iterator: iterator, partitions=[1]) res3 = sc.runJob(rdd, lambda iterator: iterator, partitions=[2]) print("res1:", res1) print("res2:", res2) print("res3:", res3) res = rdd.aggregateByKey(1, operator.add, operator.add).collect() print(res) print("==============================================") # 创建rdd datas1 = [("a", 22), ("a", 66), ("b", 33), ("b", 55), ("c", 44)] rdd1 = sc.parallelize(datas1, 2) # 查看rdd如何分区 res11 = sc.runJob(rdd1, lambda iterator: iterator, partitions=[0]) res12 = sc.runJob(rdd1, lambda iterator: iterator, partitions=[1]) print("res11:", res11)
# Show the previously computed partition layout (pr is built earlier in
# this file).
print(pr)  # [[0], [2, 3], [4, 6]]

# With more partitions than elements, glom() exposes the empty slices.
print(sc.parallelize([0, 2, 3, 4, 6], 10).glom().collect())
# [[], [0], [], [2], [], [3], [], [4], [], [6]]

# sc.range(5).collect()        # same as sc.parallelize(xrange(5))
# sc.range(2, 4).collect()     # sc.parallelize(xrange(2, 4))
# sc.range(1, 7, 2).collect()  # sc.parallelize(xrange(1, 7, 2))

print(sc.resources)

# runJob(rdd, partitionFunc, partitions=None, allowLocal=False)
# Executes partitionFunc on the given set of partitions and returns the
# results as a flat list of elements; when partitions is not specified it
# runs on all partitions.
demo_rdd = sc.parallelize(range(6), 3)
print(sc.runJob(demo_rdd, lambda part: [x * x for x in part]))  # [0, 1, 4, 9, 16, 25]
print(demo_rdd.getNumPartitions())  # 3
print(sc.runJob(demo_rdd, lambda part: [x * x for x in part], [0, 2], True))  # [0, 1, 16, 25]

# RDD persistence/caching and Checkpoint fault tolerance (article in
# Chinese): https://msd.misuland.com/pd/4146263708462485282
#
#   rdd.cache()
#   # Setting the checkpoint directory immediately creates an empty
#   # directory on HDFS:
#   sc.setCheckpointDir("hdfs://node01:8020/ckpdir")
#   # Save a checkpoint of rdd1:
#   rdd1.checkpoint()