import numpy as np
from pyspark import SparkConf, SparkContext


def Xtest():
    # Configure Spark
    conf = SparkConf() \
        .setMaster("spark://192.168.2.221:7077") \
        .setAppName("quantum") \
        .set("spark.cores.max", "20")

    sc = SparkContext(conf=conf)
    t1 = np.array([0, 1])
    t2 = np.array([1, 0])
    t3 = np.array(np.sqrt([0.5, 0.5]))
    print("Input:", [t1, t2, t3])
    vectors = sc.parallelize([t1, t2, t3])
    # matrix = np.ones((2, 2)) - np.identity(2)  # Pauli-X (NOT) gate alternative
    matrix = np.sqrt(0.5) * np.matrix([[1, 1], [1, -1]])  # Hadamard gate
    print(matrix)
    sc.runJob(vectors, lambda part: [print(x) for x in part])
    mult = sc.runJob(vectors, lambda part: [np.dot(matrix, x) for x in part])
    sc.runJob(vectors, lambda part: [print(x) for x in part])
    sc.stop()
    print("Result:", mult)
Example #2
'''
Author: lsl
Date: 2020-09-09 21:41:33
LastEditTime: 2020-09-09 22:01:42
Description: the RDD transformation function aggregateByKey
'''
from pyspark import SparkConf, SparkContext
import operator

conf = SparkConf()
conf.set("spark.master", "spark://hadoop-maste:7077")
sc = SparkContext(conf=conf)
# Create an RDD
datas = [("a", 22), ("b", 33), ("c", 44), ("b", 55), ("a", 66)]
rdd = sc.parallelize(datas, 3)
# Inspect how the RDD is partitioned
res1 = sc.runJob(rdd, lambda iterator: iterator, partitions=[0])
res2 = sc.runJob(rdd, lambda iterator: iterator, partitions=[1])
res3 = sc.runJob(rdd, lambda iterator: iterator, partitions=[2])
print("res1:", res1)
print("res2:", res2)
print("res3:", res3)
res = rdd.aggregateByKey(1, operator.add, operator.add).collect()
print(res)
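Why does the zero value 1 show up more than once for a key? aggregateByKey applies the sequence function within each partition (starting every key at the zero value) and the combine function across partitions. Below is a small plain-Python sketch of that two-phase semantics, using an assumed partition layout for the 3-slice parallelize above; the helper aggregate_by_key is illustrative only, not a Spark API.

from collections import defaultdict
import operator

def aggregate_by_key(partitions, zero, seq_func, comb_func):
    per_partition = []
    for part in partitions:
        acc = defaultdict(lambda: zero)       # every key starts at the zero value once per partition
        for k, v in part:
            acc[k] = seq_func(acc[k], v)
        per_partition.append(acc)
    merged = {}                               # combine the per-partition results
    for acc in per_partition:
        for k, v in acc.items():
            merged[k] = comb_func(merged[k], v) if k in merged else v
    return merged

# Assumed split of the 5 pairs into 3 slices
parts = [[("a", 22)], [("b", 33), ("c", 44)], [("b", 55), ("a", 66)]]
print(aggregate_by_key(parts, 1, operator.add, operator.add))
# {'a': 90, 'b': 90, 'c': 45}: 'a' and 'b' each pick up the zero value twice, 'c' once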
print("==============================================")
# Create an RDD
datas1 = [("a", 22), ("a", 66), ("b", 33), ("b", 55), ("c", 44)]
rdd1 = sc.parallelize(datas1, 2)
# Inspect how the RDD is partitioned
res11 = sc.runJob(rdd1, lambda iterator: iterator, partitions=[0])
res12 = sc.runJob(rdd1, lambda iterator: iterator, partitions=[1])
print("res11:", res11)
Example #3
from pyspark import SparkConf, SparkContext

sc = SparkContext(conf=SparkConf())
# glom() groups the elements of each partition into a list
pr = sc.parallelize([0, 2, 3, 4, 6], 3).glom().collect()
print(pr)  # [[0], [2, 3], [4, 6]]
print(sc.parallelize([0, 2, 3, 4, 6], 10).glom().collect())
# [[], [0], [], [2], [], [3], [], [4], [], [6]]

# sc.range(5).collect()        # same as sc.parallelize(range(5))
# sc.range(2, 4).collect()     # same as sc.parallelize(range(2, 4))
# sc.range(1, 7, 2).collect()  # same as sc.parallelize(range(1, 7, 2))

print(sc.resources)
"""
runJob(rdd, partitionFunc, partitions=None, allowLocal=False)
对指定的分区集执行给定的partitionFunc,以元素数组的形式返回结果。如果未指定分区,则将在所有分区上运行。
"""

myRDD = sc.parallelize(range(6), 3)
print(sc.runJob(myRDD, lambda part: [x * x for x in part]))
# [0, 1, 4, 9, 16, 25]
print(myRDD.getNumPartitions())  # 3
print(sc.runJob(myRDD, lambda part: [x * x for x in part], [0, 2], True))
# [0, 1, 16, 25]

# RDD persistence/caching and the Checkpoint fault-tolerance mechanism: https://msd.misuland.com/pd/4146263708462485282
"""
rdd.cache()

//设置检查点目录,会立即在HDFS上创建一个空目录
sc.setCheckpointDir("hdfs://node01:8020/ckpdir") 

//对rdd1进行检查点保存
rdd1.checkpoint() 
"""