def Xtest():
    """Demonstrate applying a Hadamard gate to qubit state vectors on Spark.

    Builds three 2-dimensional state vectors, distributes them as an RDD,
    and multiplies each by the Hadamard matrix
    H = 1/sqrt(2) * [[1, 1], [1, -1]] on the executors via
    ``SparkContext.runJob``.  Requires a reachable Spark master at
    spark://192.168.2.221:7077.
    """
    # Configure Spark.
    # FIX(review): the original had a stray trailing line-continuation
    # backslash after the last .set(...) call, which turned the following
    # `sc = ...` assignment into a syntax error; it is removed here.
    conf = SparkConf() \
        .setMaster("spark://192.168.2.221:7077") \
        .setAppName("quantum") \
        .set("spark.cores.max", "20")
    sc = SparkContext(conf=conf)

    # Basis states |1>, |0>, and the equal-superposition state.
    t1 = np.array([0, 1])
    t2 = np.array([1, 0])
    t3 = np.array(np.sqrt([0.5, 0.5]))
    print("输入:", [t1, t2, t3])
    vectors = sc.parallelize([t1, t2, t3])

    # Hadamard gate.  NOTE(review): np.matrix is deprecated in favour of
    # np.array, but it is kept to preserve the 2-D (matrix-shaped) result
    # of np.dot below.
    # matrix=np.ones((2,2))-np.identity(2)
    matrix = np.sqrt(0.5) * np.matrix([[1, 1], [1, -1]])
    print(matrix)

    # Print each partition's elements, apply the gate to every vector,
    # then print the (unchanged) source vectors again.
    sc.runJob(vectors, lambda part: [print(x) for x in part])
    mult = sc.runJob(vectors, lambda part: [np.dot(matrix, x) for x in part])
    sc.runJob(vectors, lambda part: [print(x) for x in part])
    sc.stop()
    print("计算结果:", mult)
Author: lsl Date: 2020-09-09 21:41:33 LastEditTime: 2020-09-09 22:01:42 Description: rdd的transformation函数aggregateByKey ''' from pyspark import SparkConf, SparkContext import operator conf = SparkConf() conf.set("matster", "spark://hadoop-maste:7077") sc = SparkContext(conf=conf) # 创建rdd datas = [("a", 22), ("b", 33), ("c", 44), ("b", 55), ("a", 66)] rdd = sc.parallelize(datas, 3) # 查看rdd如何分区 res1 = sc.runJob(rdd, lambda iterator: iterator, partitions=[0]) res2 = sc.runJob(rdd, lambda iterator: iterator, partitions=[1]) res3 = sc.runJob(rdd, lambda iterator: iterator, partitions=[2]) print("res1:", res1) print("res2:", res2) print("res3:", res3) res = rdd.aggregateByKey(1, operator.add, operator.add).collect() print(res) print("==============================================") # 创建rdd datas1 = [("a", 22), ("a", 66), ("b", 33), ("b", 55), ("c", 44)] rdd1 = sc.parallelize(datas1, 2) # 查看rdd如何分区 res11 = sc.runJob(rdd1, lambda iterator: iterator, partitions=[0]) res12 = sc.runJob(rdd1, lambda iterator: iterator, partitions=[1]) print("res11:", res11)
# Show the previously computed partition layout (pr is built earlier in
# this file).
print(pr)  # [[0], [2, 3], [4, 6]]

# With more partitions than elements, glom() exposes the empty slices.
print(sc.parallelize([0, 2, 3, 4, 6], 10).glom().collect())
# [[], [0], [], [2], [], [3], [], [4], [], [6]]

# sc.range(5).collect()        # same as sc.parallelize(xrange(5))
# sc.range(2, 4).collect()     # sc.parallelize(xrange(2, 4))
# sc.range(1, 7, 2).collect()  # sc.parallelize(xrange(1, 7, 2))

print(sc.resources)

# runJob(rdd, partitionFunc, partitions=None, allowLocal=False)
# Executes partitionFunc on the given set of partitions and returns the
# results as a flat list of elements; when partitions is not specified it
# runs on all partitions.
demo_rdd = sc.parallelize(range(6), 3)
print(sc.runJob(demo_rdd, lambda part: [x * x for x in part]))  # [0, 1, 4, 9, 16, 25]
print(demo_rdd.getNumPartitions())  # 3
print(sc.runJob(demo_rdd, lambda part: [x * x for x in part], [0, 2], True))  # [0, 1, 16, 25]

# RDD persistence/caching and Checkpoint fault tolerance (article in
# Chinese): https://msd.misuland.com/pd/4146263708462485282
#
#   rdd.cache()
#   # Setting the checkpoint directory immediately creates an empty
#   # directory on HDFS:
#   sc.setCheckpointDir("hdfs://node01:8020/ckpdir")
#   # Save a checkpoint of rdd1:
#   rdd1.checkpoint()