def SearchTiles_and_Factorize(n):
    global globalmergedtiles
    global globalcoordinates
    global factors_accum
    global spcon
    spcon = SparkContext("local[4]", "Spark_TileSearch_Optimized")
    if persisted_tiles == True:
        tileintervalsf = open("/home/shrinivaasanka/Krishna_iResearch_OpenSource/GitHub/asfer-github-code/cpp-src/miscellaneous/DiscreteHyperbolicFactorizationUpperbound_TileSearch_Optimized.tileintervals", "r")
        tileintervalslist = tileintervalsf.read().split("\n")
        #print "tileintervalslist=",tileintervalslist
        tileintervalslist_accum = spcon.accumulator(tileintervalslist, VectorAccumulatorParam())
        paralleltileintervals = spcon.parallelize(tileintervalslist)
        paralleltileintervals.foreach(tilesearch)
    else:
        factorsfile = open("DiscreteHyperbolicFactorizationUpperbound_TileSearch_Optimized.factors", "w")
        hardy_ramanujan_ray_shooting_queries(n)
        hardy_ramanujan_prime_number_theorem_ray_shooting_queries(n)
        baker_harman_pintz_ray_shooting_queries(n)
        cramer_ray_shooting_queries(n)
        zhang_ray_shooting_queries(n)
        factors_accum = spcon.accumulator(factors_of_n, FactorsAccumulatorParam())
        #spcon.parallelize(xrange(1,n)).foreach(tilesearch_nonpersistent)
        spcon.parallelize(spcon.range(1, n).collect()).foreach(tilesearch_nonpersistent)
        print "factors_accum.value = ", factors_accum.value
        factors = []
        factordict = {}
        for f in factors_accum.value:
            factors += f
        factordict[n] = factors
        json.dump(factordict, factorsfile)
        return factors
def main(argv=None): '''this is called if run from command line''' parser = argparse.ArgumentParser() parser.add_argument('-i','--input', help="Required Seq input file on cluster.", required=True) args = parser.parse_args() sc = SparkContext() global goodJsonRecords, badJsonRecords, noPublisherRecords, noPublisherNameRecords goodJsonRecords = sc.accumulator(0) badJsonRecords = sc.accumulator(0) noPublisherRecords = sc.accumulator(0) noPublisherNameRecords = sc.accumulator(0) data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text") keyCounts = data.values().flatMap(getKeys).countByValue() print "========================================" print "goodJsonRecords = %d" % goodJsonRecords.value print "badJsonRecords = %d" % badJsonRecords.value print "noPublisherRecords = %d" % noPublisherRecords.value print "noPublisherNameRecords = %d" % noPublisherNameRecords.value for k in sorted(keyCounts): print k, keyCounts[k] print "========================================" sc.stop()
def main(argv=None):
    '''this is called if run from command line'''
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', help="Seq input file on cluster.", required=True)
    parser.add_argument('-o', '--output', help="UTF-8 output file on cluster.", required=False)
    parser.add_argument('-p', '--printToLog', help="Print results to log.", required=False, action='store_true')
    args = parser.parse_args()

    sc = SparkContext()
    global goodJsonRecords, badJsonRecords
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)
    data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    tagCounts = data.values().flatMap(getTokens).countByValue()

    # So far, this code isn't useful.  The output file is written by the
    # master node into an isolated folder, and I don't know of a way to
    # retrieve it.
    if args.output != None:
        with codecs.open(args.output, 'wb', 'utf-8') as f:
            for k in sorted(tagCounts):
                f.write(k + " " + str(tagCounts[k]) + "\n")

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    if args.printToLog:
        for k in sorted(tagCounts):
            print json.dumps(k), tagCounts[k]
    print "========================================"
def main(argv=None): '''this is called if run from command line''' parser = argparse.ArgumentParser() parser.add_argument('-i','--input', help="Seq input file on cluster.", required=True) args = parser.parse_args() sc = SparkContext() global goodJsonRecords, badJsonRecords goodJsonRecords = sc.accumulator(0) badJsonRecords = sc.accumulator(0) data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text") tagTokenCounts = data.values().flatMap(getTokens).countByValue() sc.stop() print "========================================" print "goodJsonRecords = %d" % goodJsonRecords.value print "badJsonRecords = %d" % badJsonRecords.value print "========================================" # Restructure the data, grouping by tag (token type indicator): tagTokenLists = {} for tagToken in tagTokenCounts.keys(): (tag, tokenValue) = tagToken.split(":", 1) count = tagTokenCounts[tagToken] if tag not in tagTokenLists: tagTokenLists[tag] = [] tagTokenLists[tag].append(Token(tokenValue, count)) # Process each tag seperately: for tag in tagTokenLists.keys(): tokenList = tagTokenLists[tag] # Sort the tokens by descending count and ascending token value: sortedTokenList = sorted(tokenList, key=attrgetter("value")) sortedTokenList = sorted(sortedTokenList, key=attrgetter("count"), reverse=True) # Calculate the cumulative token count for each token in sorted order: totalTokens = 0 for token in sortedTokenList: totalTokens += token.count token.cumulativeCount = totalTokens # We'll use the final total later, but we need it as a float to ensure # floating point division is used: floatTotalTokens = float(totalTokens) # Print the sorted tokens with cumulative counts, fraction of # total (cunumative distribution function), and index # (enumerate the tokens per tag, starting with 1). print "========================================" tokenIndex = 0 for token in sortedTokenList: tokenIndex += 1 fractionOfTotal = token.cumulativeCount / floatTotalTokens print("{0:8d} {1:50} {2:10d} {3:10d} {4:.5f}".format(tokenIndex, json.dumps(tag + ": " + token.value), token.count, token.cumulativeCount, fractionOfTotal)) print "========================================"
def main(argv=None): """this is called if run from command line""" parser = argparse.ArgumentParser() parser.add_argument("-e", "--excludeTags", help="Comma-separated list of tags to exclude.", required=False) parser.add_argument("--includeTags", help="Comma-separated list of tags to include.", required=False) parser.add_argument("-i", "--input", help="Seq or tuple input file.", required=True) parser.add_argument("--inputTuples", help="The input file is in tuple format.", required=False, action="store_true") parser.add_argument("-o", "--output", help="UTF-8 output file on cluster.", required=False) parser.add_argument("-p", "--printToLog", help="Print results to log.", required=False, action="store_true") args = parser.parse_args() if args.excludeTags and args.includeTags: print "Pick either --excludeTags or --includeTags, not both." return 1 sc = SparkContext() global goodJsonRecords, badJsonRecords, excludedTagCount, includedTagCount, tokenCount goodJsonRecords = sc.accumulator(0) badJsonRecords = sc.accumulator(0) excludedTagCount = sc.accumulator(0) includedTagCount = sc.accumulator(0) tokenCount = sc.accumulator(0) if args.inputTuples: data = sc.textFile(args.input).map(lambda x: eval(x)) else: data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text") tagPhraseCounts = data.values().flatMap(getPhrasesMaker(args.includeTags, args.excludeTags)).countByValue() sc.stop() # So far, this code isn't useful. The output fiile is written by the # master node into an isolated folder, and I don't know of a way to # retrieve it. if args.output != None: with codecs.open(args.output, "wb", "utf-8") as f: for k in sorted(tagPhraseCounts): f.write(k + " " + str(tagPhraseCounts[k]) + "\n") print "========================================" print "goodJsonRecords = %d" % goodJsonRecords.value print "badJsonRecords = %d" % badJsonRecords.value print "excludedTagCount = %d" % excludedTagCount.value print "includedTagCount = %d" % includedTagCount.value print "tokenCount = %d" % tokenCount.value if args.printToLog: for k in sorted(tagPhraseCounts): print json.dumps(k), tagPhraseCounts[k] print "========================================"
def SparkBroadcastAccumulator(n):
    global broadcast_var
    global accumulator_var
    spcon = SparkContext("local[2]", "SparkBroadcastAccumulator")
    broadcast_var = spcon.broadcast("broadcast_message")
    accumulator_var = spcon.accumulator(0)
    # accumulator_var.add(x) updates the accumulator and returns None, which is the
    # argument broadcast_accumulator_receiver ends up receiving here
    spcon.parallelize(xrange(1, n)).foreach(lambda x: broadcast_accumulator_receiver(accumulator_var.add(x)))
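# broadcast_accumulator_receiver is defined elsewhere in the original repository and
# is not shown here. A minimal hypothetical sketch, assuming it only echoes the
# broadcast value on each worker, could be:
def broadcast_accumulator_receiver(_):
    # broadcast_var is the global Broadcast object created above; .value reads it on the worker
    print(broadcast_var.value)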
def longest_common_substring(strands): pass # create the Spark context conf = SparkConf().setAppName("longest_common_substring") sc = SparkContext(conf=conf) # create an accumulator for key-value pairs, where each key is a substring, and each value is the set of strings where the substring can be found class ArrayAccumulatorParam(AccumulatorParam): def zero(self, initialValue): return initialValue def addInPlace(self, v1, v2): if type(v2) is list: v1.extend(v2) elif type(v2) is tuple: v1.append(v2) return v1 acc = sc.accumulator([], ArrayAccumulatorParam()) def generate_substrings(data_element): k, v = data_element i = 0 while i < len(v): j = i + 1 while j < len(v): acc.add((v[i:j],k)) j += 1 i += 1 sc.parallelize([(k, v) for k, v in strands.iteritems()]).foreach(generate_substrings) all_substrings = sc.parallelize(acc.value) return all_substrings.groupByKey().filter(lambda x: set(list(x[1])) == set(strands.keys())).takeOrdered(1, key=lambda x: -len(x[0]))[0][0]
def main(argv=None):
    '''this is called if run from command line'''
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', help="Required Seq input file on cluster.", required=True)
    args = parser.parse_args()

    sc = SparkContext()
    global goodJsonRecords, badJsonRecords
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)
    data = sc.textFile(args.input).map(lambda x: eval(x))
    keyCounts = data.values().flatMap(getKeys).countByValue()
    sc.stop()

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    for k in sorted(keyCounts):
        print k, keyCounts[k]
    print "========================================"
def word_count_compute(hdfs_input,hdfs_output,min_n,max_n): #TODO add start of sent # TODO large max size sums = [] vocab_size = [] sc = SparkContext("local","Simple Language Model Computing") file = sc.textFile(hdfs_input) counts = file.flatMap(lambda a : get_ngrams(a,min_n,max_n)).reduceByKey(lambda a, b: a + b) for i in range(min_n, max_n+1): temp = counts.filter(lambda a: is_ngram(a,i)) accum = sc.accumulator(0) temp.foreach(lambda a: accum.add(a[1])) #temp_sum = sum(x[1] for x in temp.collect()) sums.append(accum.value) temp_counts = temp.count() vocab_size.append(temp_counts) #print i,temp_counts,temp_sum print sums,vocab_size return counts,sums,vocab_size
if __name__ == "__main__": if len(sys.argv) != 3: print("Usage: direct_kafka_wordcount.py <broker_list> <topic>", file=sys.stderr) exit(-1) sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount") ssc = StreamingContext(sc, 2) sqlContext = SQLContext(sc) sc.setLogLevel("WARN") ############## # Globals ############## globals()['maxTemp'] = sc.accumulator(0.0) brokers, topic = sys.argv[1:] kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers}) jsonDStream = kvs.map(lambda (key, value): value) # Define function to process RDDs of the json DStream to convert them # to DataFrame and run SQL queries def process(time, rdd): # Match local function variables to global variables maxTemp = globals()['maxTemp'] print("========= %s =========" % str(time)) print("rdd = %s" % str(rdd))
words_new = sc.broadcast(["scala", "java", "hadoop", "spark", "akka"])
data = words_new.value
print("Stored data -> {}".format(data))
elem = words_new.value[2]
print("Printing a particular element in RDD -> {}".format(elem))

"""
Accumulator variables are used for aggregating information through associative
and commutative operations. For example, an accumulator can implement a sum or
a counter (as in MapReduce). The following example shows how to use an
Accumulator variable.

Like a broadcast variable, an Accumulator has a value attribute that stores its
data and returns the accumulated result, but it is usable only in the driver
program. In this example, an accumulator variable is updated by multiple workers
and returns an accumulated value.
"""
num = sc.accumulator(10)

def f(x):
    global num
    num += x

rdd = sc.parallelize([20, 30, 40, 50])
rdd.foreach(f)
final = num.value
print("Accumulated value is -> {}".format(final))
# multiple occurrences per article shouldn't be counted
counts = words.groupByKey()
print("Absolute count of articles containing the word:", counts.count())
print("Relative count of articles containing the word:", float(counts.count()) / rdd.count())

# articles containing the word
articles = sc.textFile("/user/bigdata/wikipedia-text-tiny-clean500") \
    .filter(lambda x: WORD in re.compile('\w+').findall(x.lower()))
# Each article is saved in a file in the folder 'output'
articles.saveAsTextFile('output')

# using accumulators
overall_count = sc.accumulator(0)
word_count = sc.accumulator(0)

def increment(article):
    overall_count.add(1)
    if WORD in article:
        word_count.add(1)

rdd.foreach(increment)
print("Absolute count: %s, relative count %f" % (word_count.value, float(word_count.value) / overall_count.value))

# using data frames with sql
sqlContext = SQLContext(sc)
#Boilerplate stuff: from pyspark import SparkConf, SparkContext conf = SparkConf().setMaster("local").setAppName("DegreesOfSeparation") sc = SparkContext(conf=conf) # The characters we wish to find the degree of separation between: startCharacterID = 5306 #SpiderMan targetCharacterID = 14 #ADAM 3,031 (who?) # Our accumulator, used to signal when we find the target character during # our BFS traversal. hitCounter = sc.accumulator(0) def convertToBFS(line): fields = line.split() heroID = int(fields[0]) connections = [] for connection in fields[1:]: connections.append(int(connection)) color = 'WHITE' distance = 9999 if (heroID == startCharacterID): color = 'GRAY' distance = 0 return (heroID, (connections, distance, color))
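# The snippet above only sets up hitCounter; the part of the BFS job that updates and
# checks it is not shown. A minimal sketch of that pattern, assuming the usual
# "expand gray nodes, signal when the target is reached" map function, could be:
def bfsMap(node):
    # hypothetical mapper: when expanding a node we encounter the target,
    # bump the accumulator so the driver can stop iterating
    (characterID, (connections, distance, color)) = node
    results = []
    if color == 'GRAY':
        for connection in connections:
            if connection == targetCharacterID:
                hitCounter.add(1)
            results.append((connection, ([], distance + 1, 'GRAY')))
        color = 'BLACK'
    results.append((characterID, (connections, distance, color)))
    return results

# Driver loop (sketch): run an iteration, force evaluation with an action such as
# count(), then read hitCounter.value and stop once it is greater than zero.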
for y in xrange(length): if (x == y): bits.append(value) else: bits.append(0) return bits # Pre-bin counts of false positives for different threshold ranges BINS = 101 nthresholds = 100 def bin(similarity): return int(similarity * nthresholds) # fpCounts[i] = number of entries (possible false positives) where bin(similarity) == i zeros = [0] * BINS fpCounts = sc.accumulator(zeros, VectorAccumulatorParam()) def add_element(score): global fpCounts b = bin(score) fpCounts += set_bit(b, 1, BINS) simsFullValuesRDD.foreach(add_element) # Remove true positives from FP counts def sub_element(score): global fpCounts b = bin(score) fpCounts += set_bit(b, -1, BINS) trueDupSimsRDD.foreach(sub_element)
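# fpCounts relies on a VectorAccumulatorParam that is not defined in this snippet.
# A minimal sketch of the element-wise list accumulator it assumes (similar to the
# definitions that appear in other snippets in this collection) is:
from pyspark.accumulators import AccumulatorParam

class VectorAccumulatorParam(AccumulatorParam):
    def zero(self, value):
        # start from a zero vector of the same length
        return [0] * len(value)

    def addInPlace(self, v1, v2):
        # element-wise addition of two equal-length lists
        return [x + y for x, y in zip(v1, v2)]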
        a_wenting += 1
    elif "dongshen" in element:
        a_dongshen += 1
    elif "qifeng" in element:
        a_qifeng += 1
    else:
        a_else += 1
    return element

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: input <file>")
        exit(-1)
    sc = SparkContext(appName="accumulatorTest")
    # create the four accumulators used for counting
    a_wenting = sc.accumulator(0)
    a_dongshen = sc.accumulator(0)
    a_qifeng = sc.accumulator(0)
    a_else = sc.accumulator(0)
    lines = sc.textFile(sys.argv[1], 2).map(fun)
    for line in lines.collect():
        print line
    print a_wenting.value
    print a_else.value
    sc.stop()
from functools import reduce # Import pyspark modules from pyspark import SparkConf, SparkContext, StorageLevel # noqa # Create a Spark configuration object, to create a context into which we create # a session => The only object to be manipulated here is the spark_context spark_conf = SparkConf().setMaster('local[*]') spark_context = SparkContext(conf=spark_conf) def convert(a): return [x for x in a if a.count(x) > 1] acc = spark_context.accumulator(0) scoreVar = spark_context.broadcast([1, 42, 100, 9999]) def score(assist, dmg, kill, position): if kill == 0 or assist / kill >= 5: acc.add(1) s = assist * scoreVar.value[1] + dmg * scoreVar.value[0] + kill * scoreVar.value[2] if position == 1: s += scoreVar.value[3] return s try: # date game_size match_id match_mode party_size player_assists player_dbno player_dist_ride player_dist_walk player_dmg player_kills player_name player_survive_time team_id team_placement
from pyspark import SparkContext, SparkConf
import numpy as np

conf = SparkConf()
# point the configuration at the standalone master with setMaster
conf.setMaster('spark://hadoop-maste:7077')
context = SparkContext(conf=conf)

acc = context.accumulator(0)
print(type(acc), acc.value)

rdd = context.parallelize(np.arange(101), 5)

def acc_add(a):
    acc.add(a)
    return a

rdd2 = rdd.map(acc_add)
print(rdd2.collect())
print(acc.value)
context.stop()
#Boilerplate stuff:
from pyspark import SparkConf, SparkContext

#conf = SparkConf().setMaster("local").setAppName("DegreesOfSeparation")
#sc = SparkContext(conf = conf)
sc = SparkContext("yarn")

# The characters we wish to find the degree of separation between:
startCharacterID = 5306  #SpiderMan
targetCharacterID = 14   #ADAM 3,031 (who?)

# Our accumulator, used to signal when we find the target character during
# our BFS traversal. The variable is updated from every node because each
# node's update is relayed back to the driver and aggregated.
hitCounter = sc.accumulator(0)

def convertToBFS(line):
    fields = line.split()
    heroID = int(fields[0])
    connections = []
    for connection in fields[1:]:  # append all fields in that element (row) starting from fields[1]
        connections.append(int(connection))
    color = 'WHITE'
    distance = 9999
f = inputFiles[i] if subFileCount == filePartitionSize: subDirs.append(subDir) subDirNum += 1 subFileCount = 0 subDir = str(subDirNum) + "/" os.makedirs(input_dir + subDir) shutil.move(input_dir + f, input_dir + subDir) subFileCount += 1 if subFileCount == filePartitionSize: subDirs.append(subDir) sc = SparkContext("local[" + numCores + "]", "job", pyFiles=[realpath('helper.py')]) timeLoads = sc.accumulator([0] * len(intervals), VectorAccumulatorParam()) bs2imsi2wasActivePrevTime = defaultdict(lambda: defaultdict()) prev_idx = 0 for i in range(len(subDirs)): d = subDirs[i] end_idx = intervals.index(dirTimeBoundaries[i]) intervalBoundary = (prev_idx + 1, end_idx) #both indexes are included prev_idx = end_idx bs2data = sc.textFile(input_dir + d + '*.gz').filter(filterData).map( generateBS2Data).reduceByKey(reduceBS2IMSI2Data) bs2data.foreach(getAccumLoad) print len(bs2imsi2wasActivePrevTime) resetDirectories(subDirs, input_dir) sys.exit()
categories[category] = float(len(categories))

# Hashing used to convert categorical input features to numeric values
htf = HashingTF(5000)

# Perform feature extraction on train and test data splits to feed the data to the algorithm
trainingData = inputRdd.map(lambda x: LabeledPoint(
    categories[x[1]],
    htf.transform([x[2], x[3].split('/')[0], x[4].split(':')[0], x[8]])))
testingSet = testRdd.map(lambda x: htf.transform(
    [x[2], x[3].split('/')[0], x[4].split(':')[0], x[8]]))

# Train the model on the train split. Classifies a record with the probability of
# occurrence of a crime category given month, hour, weekday, area.
model = NaiveBayes.train(trainingData, 1.0)

# Use the trained model to predict test data. Returns predicted labels for each record.
predictions = model.predict(testingSet)

# Get actual labels for the test records
label_actual = testRdd.map(lambda x: categories[x[1]])

# Initialize a counter for records whose labels were predicted correctly
correct_labels = sc.accumulator(0)
for label_a, label_p in zip(label_actual.collect(), predictions.collect()):
    if (label_a == label_p):
        correct_labels.add(1)

print "Accuracy is: " + str(
    (float(correct_labels.value) / float(predictions.count())) * 100)
sc.stop()
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("Sum_of_numbers.py").setMaster("local[2]")
sc = SparkContext(conf=conf)

data = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
numbersRdd = sc.parallelize(data)

total = numbersRdd.reduce(lambda a, b: a + b)
print "Using reduce(): Sum Of given numbers is %i " % (total)

# accumulators are write-only on workers and are updated with add();
# the result is read back on the driver through .value
sumAcc = sc.accumulator(0)
numbersRdd.foreach(lambda x: sumAcc.add(x))
print "Using Accumulator: Sum Of given numbers is %i " % (sumAcc.value)
    for count, topic_probability in enumerate(topic_distribution.toArray().tolist()):
        topic_distribution_dict["topic_{}".format(count)] = topic_probability
    return topic_distribution_dict

if __name__ == "__main__":
    sc = SparkContext(appName="Stream Layer", master="local[2]")
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint_stream")

    sc._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    sc._jsc.hadoopConfiguration().set('fs.s3a.access.key', AWS_ACCESS_KEY_ID)
    sc._jsc.hadoopConfiguration().set('fs.s3a.secret.key', AWS_SECRET_ACCESS_KEY)

    number_of_tweets = sc.accumulator(0)

    # Kafka connection
    brokers = 'localhost:9092'
    topics = ["raw_tweets"]
    kvs = KafkaUtils.createDirectStream(ssc, topics, {"metadata.broker.list": brokers})

    # Kafka emits tuples, so we need to access the second element
    tweets = kvs.map(lambda tweet: tweet[1]).cache()

    # save to HDFS
    tweets.foreachRDD(save_stream)

    tweets = tweets.map(lambda tweet: json.loads(tweet))  # Convert strings to dicts
    tweets = parse_tweets(tweets)
import sys
import json
import math
import os

from pyspark import SparkContext
from pyspark import SparkFiles

sparkMaster = sys.argv[1]
inputFile = sys.argv[2]
outputDir = sys.argv[3]

sc = SparkContext(sparkMaster, appName="ChapterSixExample")
file = sc.textFile(inputFile)

# Count lines with KK6JKQ using accumulators
count = sc.accumulator(0)

def incrementCounter(line):
    global count  # Access the counter
    if "KK6JKQ" in line:
        count += 1

file.foreach(incrementCounter)
print "Lines with KK6JKQ %d" % count.value

# Create Accumulator[Int] initialized to 0
blankLines = sc.accumulator(0)
dataLines = sc.accumulator(0)
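# blankLines and dataLines are created above, but the fragment ends before they are
# used. A minimal sketch of the usual continuation (counting blank lines while
# extracting call signs, as in the Learning Spark example this appears to come from):
def extractCallSigns(line):
    global blankLines  # access the accumulator defined on the driver
    if (line == ""):
        blankLines += 1
    return line.split(" ")

callSigns = file.flatMap(extractCallSigns)
callSigns.saveAsTextFile(outputDir + "/callsigns")  # the action forces evaluation so the accumulator is populated
print "Blank lines: %d" % blankLines.value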
update[edge[0] - 1] = a[edge[1] - 1] h_accum.add(update) class VectorAccumulatorParam(AccumulatorParam): def zero(self, value): return np.zeros(len(value)) def addInPlace(self, v1, v2): v1 += v2 return v1 for i in range(40): a_accum = sc.accumulator(a, VectorAccumulatorParam()) edgeList.foreach(lambda edge: update_a(edge, h, a_accum)) a = a_accum.value a = a / np.amax(a) h_accum = sc.accumulator(h, VectorAccumulatorParam()) edgeList.foreach(lambda edge: update_h(edge, a, h_accum)) h = h_accum.value h = h / np.amax(h) asort = np.argsort(a) # print(list(a)) print("a Worst: ", asort[:5] + 1) print("a Best: ", asort[990:] + 1)
from pyspark import SparkContext from subprocess import call execfile('PageRank.py') # load original graph file sc = SparkContext() #graph_file = sc.textFile('hdfs:///user/leiyang/PageRank-test.txt') #index_file = sc.textFile('hdfs:///user/leiyang/toy_index.txt') graph_file = sc.textFile('hdfs:///user/leiyang/all-pages-indexed-out.txt', 80) index_file = sc.textFile('hdfs:///user/leiyang/indices.txt', 16) # initialize variables nDangling = sc.accumulator(0) lossMass = sc.accumulator(0.0) damping = 0.85 alpha = 1 - damping nTop, nIter = 200, 10 start = time() print '%s: start PageRank initialization ...' %(logTime()) graph = graph_file.flatMap(initialize).reduceByKey(accumulateMass).map(getDangling) #.cache() # get graph size G = graph.count() # broadcast dangling mass for redistribution p_dangling = sc.broadcast(1.0*nDangling.value/G) graph = graph.map(redistributeMass) print '%s: initialization completed, dangling node(s): %d, total nodes: %d' %(logTime(), nDangling.value, G) # run page rank
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark import StorageLevel
from pyspark.rdd import RDD
from termcolor import colored
import json

conf = SparkConf().setMaster('local').setAppName('PySparkShell')
sc = SparkContext(conf=conf)
# set the log level
sc.setLogLevel("WARN")
# spark = SQLContext(sc)

inputFile = 'input_demo.txt'
outputDir = 'i0out.txt'
file = sc.textFile(inputFile)

# Create an Accumulator[Int] initialized to 0
blankLines = sc.accumulator(0)

def extractCallSigns(line):
    global blankLines  # access the global accumulator
    if (line == ""):
        blankLines += 1
    return line.split(" ")

callSigns = file.flatMap(extractCallSigns)
callSigns.saveAsTextFile(outputDir + "/callsigns")
print("Blank lines: %d" % blankLines.value)
# set optimal parameters to run the algorithms
conf = SparkConf()
# disable all the timeouts so they don't cause any trouble when running big data
conf.set("spark.network.timeout", "36001s")
conf.set("spark.executor.heartbeatInterval", "36000s")
conf.set("spark.storage.blockManagerSlaveTimeoutMs", "36000s")
conf.set("spark.worker.timeout", "36000s")
conf.set("spark.sql.broadcastTimeout", "36000s")
# give both executors and drivers enough memory so they can execute faster; the exact numbers can be adjusted
conf.set("spark.executor.memory", "5g")
conf.set("spark.driver.memory", "8g")
conf.set("spark.worker.cleanup.enabled", "true")

sc = SparkContext("local[3]", "PageRanking", conf=conf)
deadendaccumulator = sc.accumulator(0)

# filter the mini database into the form (from node, list of to nodes)
data = sc.textFile("./Dataset/web-Google.txt").filter(lambda l: not str(l).startswith("#")) \
    .flatMap(addEntries) \
    .reduceByKey(lambda x, y: x + y)

# filtering for the big dataset
#data = data.filter(lambda x: int(x[1]) != 0 and int(x[1]) <= 50000000).map(lambda x: (
#    int(x[1]) - 1,
#    list(map(lambda x: int(x), filter(lambda x: x != "" and int(x) <= 50000000, str(x[0]).split(" "))))))

# pass the input to the pagerank algorithm
result = pageRank(data)

# write result to csv file
def CoKNNSVMTrainAndPredictOnSpark(self): """ 训练模型,预测结果 """ global TOTALFEATURESANDLABEL sc = SparkContext(appName="CoKNNSVMTrainAndPredictOnSpark") TOTALFEATURESANDLABEL = sc.accumulator([], ListParamForFeatureAndLabel()) features = sc.textFile(self.__filepath) def makefeatures(line): """ 根据“_v”切分出类别信息 :param line:关键帧的特征 """ classname = os.path.basename(line[0]).split("_v")[0] classnum = self.__classmap[classname] return (float(classnum), [float(x) for x in line[1]]) def getmodelandaccuary(line): """ 训练模型,预测结果 :param line: hdfs上的要读取的features目录的目录 :return: 准确率 """ global TOTALFEATURESANDLABEL TOTALFEATURESANDLABEL += [(line[0], line[1])] # features.map(lambda x:x.split(" ")).map(getmodelandaccuary).repartition(1).saveAsTextFile(self.__savepath) features.map(lambda x: x.split(" ")).map(lambda x: (x[1], x[2:])).map( makefeatures).map(getmodelandaccuary).count() totalfeaturesandlabel = TOTALFEATURESANDLABEL.value def getfeaturelistandlabellist(totalfeaturesandlabel): """ 把累加器中的label和特征的元组提出来,形成标签list和featrueslist :param totalfeaturesandlabel:label和特征的元组 :return:(标签list,featrueslist) """ TOTALFEATURES = [] TOTALLABEL = [] for i in range(0, len(totalfeaturesandlabel)): TOTALLABEL.append(totalfeaturesandlabel[i][0]) TOTALFEATURES.append(totalfeaturesandlabel[i][1]) return (TOTALLABEL, TOTALFEATURES) totallabel, totalfeatures = getfeaturelistandlabellist( totalfeaturesandlabel) # y = totallabel # x = totalfeatures # x = Co_KNN_SVM_Utilities.getfeatureforlibsvm(x) random_index = [i for i in range(len(totallabel))] # test_random_index = [i for i in range(len(x))] random.shuffle(random_index) # random.shuffle(test_random_index) random_y = [totallabel[x] for x in random_index] random_x = [totalfeatures[x] for x in random_index] # random_test_y = [test_y[x] for x in test_random_index] # random_test_x = [test_x[x] for x in test_random_index] # random_train_y = [train_y[x] for x in train_random_index] # random_train_x = [train_x[x] for x in train_random_index] # random_test_y = [test_y[x] for x in test_random_index] # random_test_x = [test_x[x] for x in test_random_index] # random_train_y = train_y # random_train_x = train_x # random_test_y = test_y # random_test_x = test_x # train_y = random_y[0:1500] # train_x = random_x[0:1500] # test_y = random_y[1500:1580] # test_x = random_x[1500:1580] train_x, test_x, train_y, test_y = train_test_split(totalfeatures, totallabel, test_size=0.2, shuffle=False) # train_y = totallabel[0:800] # train_x = totalfeatures[0:800] # test_y = totallabel[800:1580] # test_x = totalfeatures[800:1580] Co_KNN_SVM_New.Co_KNN_SVM(train_y, train_x, test_y, test_x, self.__savepath)
return Counter() def addInPlace(self, hashdict, items): hashdict.update(items) return hashdict if __name__ == '__main__': sc = SparkContext(appName="FraMultiStage") data_input = 's3://progetto-analisi-di-dati-unimi/dataset' data_output = 's3://progetto-analisi-di-dati-unimi/output_multistage/' split_by, supp, combsize = ',', 18000, 2 data = sc.textFile(data_input).map(lambda x: sorted(set(x.split(split_by)))) #Converting item names to number item_to_n = sc.accumulator(dict(), DictAccumulatorParam()) data.foreach(item_to_n.add) item_to_n = item_to_n.value data = data.map(lambda x: [item_to_n[i] for i in x]) #Hashmaps and their hash functions hashmap1 = sc.accumulator(Counter(), HashMapAccumulator()) hashmap2 = sc.accumulator(Counter(), HashMapAccumulator()) #Get frequent items from bucket def getFreq(bucket): return filter(lambda i: i in freq, bucket) #Hashmaps functions for each hashmap def hashf1(x): return sum(x) % 21243 def hashf2(x): return sum(x) % 10621
from pyspark import SparkConf, SparkContext conf = SparkConf().setMaster("local").setAppName("DegreesOfSeparation") sc = SparkContext(conf=conf) start_character_id = 5306 # Spiderman target_character_id = 14 # Adam hit_counter = sc.accumulator(0) def convert_to_bfs(line): fields = line.split() hero_id = int(fields[0]) connections = [] for connection in fields[1:]: connections.append(int(connection)) color = 'WHITE' distance = 9999 if(hero_id == start_character_id): color = 'GRAY' distance = 0 return (hero_id, (connections, distance, color)) def create_starting_rdd(): input_file = sc.textFile("../data/Marvel-graph.txt") return input_file.map(convert_to_bfs)
x = re.sub("'", '', x) return re.sub('[?!@#$\'",.;:()]', '', x).lower() def countWord(line): global count_number if (line == "Tokyo"): count_number += 1 return line.split(' ') if __name__ == "__main__": if len(sys.argv) < 4: print >> sys.stderr, "Usage: wordcount <master> <inputfile> <outputfile>" exit(-1) sc = SparkContext(sys.argv[1], "python_wordcount_sorted in bigdataprogrammiing") lines = sc.textFile(sys.argv[2], 2) count_number = sc.accumulator(0) counts = lines\ .flatMap(countWord)\ .filter(lambda x: x!="Tokyo")\ .map(lambda x: (x.lower(), 1))\ .reduceByKey(lambda x,y:x+y)\ .sortByKey(ascending=True) counts.saveAsTextFile("hdfs://localhost:9000/output") print('Number of Tokyo : ', count_number.value) sc.stop()
def main(): num_factors = int(sys.argv[1]) num_workers = int(sys.argv[2]) num_itrns = int(sys.argv[3]) beta_value = float(sys.argv[4]) lambda_value = float(sys.argv[5]) inputV_file = sys.argv[6] outputW_file = sys.argv[7] outputH_file = sys.argv[8] #initialize spark context #using conf #conf = new SparkConf().setAppName("My application").setMaster("local") #sc = new SparkContext(conf=conf) #using only SparkContext sc = SparkContext("local", "dsgd_mf") #use accumulator to update values of teh loss over every iteration L_NZSL = sc.accumulator(0) print "L_NZSL value is ", str(L_NZSL.value) #read directory or file if os.path.isdir(inputV_file): #read data and get it from the input files RDD = get_data_in_RDD_folder(sc, inputV_file) #construct the V matrix, the user and movie hashmaps RDD_matrix_V, user_Hashmap, movie_Hashmap, N_j, N_i = get_matrix_V_from_netflix( RDD) elif os.path.isfile(inputV_file): #read data and get it from the input files RDD = get_data_in_RDD_file(sc, inputV_file) #construct the V matrix, the user and movie hashmaps RDD_matrix_V, user_Hashmap, movie_Hashmap, N_j, N_i = get_matrix_V_from_autolab( RDD) start_time = time.time() print "Time is ", str(start_time) #persist the ratings matrix as we will be using this throughout RDD_matrix_V = RDD_matrix_V.persist() #construct initial W and H matrices with random value between 0 and 1 RDD_matrix_W, RDD_matrix_H = construct_initial_factors( sc, user_Hashmap, movie_Hashmap, num_factors) #now partition the data ( V and W) partition_W = RDD_matrix_W.partitionBy(num_workers).persist() partition_V = RDD_matrix_V.partitionBy(num_workers).persist() l_values = [] #stores the value of loss over every iteration block_size = len(movie_Hashmap.keys( )) / num_workers #size of the block is number of movies/ num of workers total_n = 0 #here my iteration below are over each diagonal , and not over the whole matrix at once, hence my iterations #is the product of the iteraion over the whole data once times the number of workers new_itrns = num_itrns * num_workers #Following are te initial values to keep track of convergence of W and H for autolab data prev_W_intersect = 1000000000 prev_H_intersect = 1000000000 #Now for each iteration for i in range(0, new_itrns): #get strata for this iteration by joining with W partitions = partition_V.join(partition_W, numPartitions=num_workers) #now map partition with index strata_for_this = partitions.mapPartitionsWithIndex( get_partition, preservesPartitioning=True) #Filter the strata now new_block_strata = strata_for_this.filter( partial(get_block, i, block_size, num_workers)) #now get movie id and data #get the map for H (movies) new_map = RDD_matrix_H.collectAsMap() #here, call the update_WH function now which performs the gradient update updated_W_and_H = new_block_strata.mapPartitions( partial(update_WH, lambda_value, N_i, N_j, num_workers, new_map, beta_value, total_n, L_NZSL), preservesPartitioning=True) #got updated maps from different blocks in parallel total_n = total_n + new_block_strata.count() #get RDD of new updated W and H W_list = updated_W_and_H.flatMap(lambda x: x[0]).collect() H_list = updated_W_and_H.flatMap(lambda x: x[1]).collect() RDD_matrix_W_new = sc.parallelize(W_list).sortByKey() RDD_matrix_H_new = sc.parallelize(H_list).sortByKey() #Compute the square of difference on W and H , for convergence RDD_intersect_W = sum( RDD_matrix_W_new.join(RDD_matrix_W).map( lambda x: (x[1][0] - x[1][1])**2).sum()) RDD_intersect_H = sum( RDD_matrix_H_new.join(RDD_matrix_H).map( lambda x: (x[1][0] - 
x[1][1])**2).sum()) #Update and construct the new W and H for next iteration , some entries are replicated when a user appears multiple times in a strats RDD_matrix_W = RDD_matrix_W_new.union( RDD_matrix_W.subtractByKey(RDD_matrix_W_new)) RDD_matrix_H = RDD_matrix_H_new.union( RDD_matrix_H.subtractByKey(RDD_matrix_H_new)) RDD_matrix_W = RDD_matrix_W.sortByKey() RDD_matrix_H = RDD_matrix_H.sortByKey() partition_W = RDD_matrix_W.partitionBy(num_workers) #if the whole matrix has been covered (the input value of iteration) if (i + 1) % num_workers == 0: #update and check for convergence #if prev_W_intersect - RDD_intersect_W <= 0.00001 and prev_H_intersect - RDD_intersect_H <= 0.00001: # print "W diff "+str(RDD_intersect_W - prev_W_intersect) # print "H diff "+str(RDD_intersect_H - prev_H_intersect) # print "converged" # break #prev_W_intersect = RDD_intersect_W #prev_H_intersect = RDD_intersect_H #print "W diff new "+str(prev_W_intersect) #print "H diff new "+str(prev_H_intersect) #record the loss for this iteration and reset accumulator to zero l_values.append(L_NZSL) L_NZSL = sc.accumulator(0) #Now write out the new W and H to the files mentioned for l in l_values: print "iteration: " + str(i) + " L_NZSL value is " + str(l.value) i = i + 1 print print #Now write out the final W and H matrices to the mentioned files write_matrix(RDD_matrix_H, RDD_matrix_W, outputW_file, outputH_file, user_Hashmap, movie_Hashmap, num_factors) print "time taken was ", str(start_time - time.time())
os.makedirs(input_dir + subDir) for i in range(len(inputFiles)): f = inputFiles[i] if subFileCount==filePartitionSize: subDirs.append(subDir) subDirNum += 1 subFileCount = 0 subDir = str(subDirNum) + "/" os.makedirs(input_dir + subDir) shutil.move(input_dir + f,input_dir + subDir) subFileCount += 1 if subFileCount==filePartitionSize: subDirs.append(subDir) sc = SparkContext("local[" + numCores + "]" , "job", pyFiles=[realpath('helper.py')]) timeLoads = sc.accumulator([0]*len(intervals), VectorAccumulatorParam()) prev_idx = 0 numBS = 0 for i in range(len(subDirs)): d = subDirs[i] end_idx = intervals.index(dirTimeBoundaries[i]) intervalBoundary = (prev_idx+1,end_idx) #both indexes are included prev_idx = end_idx bs2data = sc.textFile(input_dir + d + '*.gz').filter(filterData).map(generateBS2Data).reduceByKey(reduceBS2IMSI2Data) bs2data.foreach(getAccumLoad) if (bs2data.count() >= numBS): numBS = bs2data.count() mean = [float(x)/numBS for x in timeLoads.value]
# action operations
print('------------------action-------------')
# collect returns the data to the driver; with large datasets this risks running out of memory
print(rdd.collect())
# take returns the first few elements, similar to limit?
print(rdd.take(4))
# takeSample returns a random sample of elements
print(rdd.takeSample(False, 5, 0))
# first returns the first element
print(rdd.first())
# count returns the number of elements in the RDD
print(rdd.count())
# reduce accumulates step by step: (1+2)+3+4+5...
print(rdd.reduce(lambda x, y: x + y))
# foreach runs on each element without producing a new RDD -- same job as map? what is the difference between an action and a transformation?
accum = sc.accumulator(0)
rdd.foreach(lambda x: accum.add(x))
print(accum.value)

# countByKey counts the elements of the RDD per key
pairrdd = sc.parallelize([
    (1, 1),
    (1, 4),
    (2, 1),
    (3, 1),
    (1, 6),
])
print(pairrdd.countByKey())

# transformation operations
print('------------------transformation-------------')
# map applies an operation to each element
"Usage: to gracefully shutdown type echo 1 > /tmp/flag at the terminal" ) exit(-1) app_name = "Momentum" sc = SparkContext(appName=app_name) #, pyFiles = ['./cep/redisQueue.py']) ssc = StreamingContext(sc, 2) ssc.checkpoint('../checkpoint') brokers, qname, id, fn = sys.argv[1:] id = int(id) # # demonstrate how to use broadcast variable # NumProcessed = sc.accumulator(0) Q = sc.broadcast({ 'rname': 'rname', 'qname': qname, 'namespace': 'mdq', 'host': 'localhost', 'port': 6379, 'db': 3, 'alert_bot_q': ('msg_bot', 'chatq') }) Threshold = sc.broadcast(0.00015) #kvs = KafkaUtils.createDirectStream(ssc, ['ib_tick_price', 'ib_tick_size'], {"metadata.broker.list": brokers}) kvs = KafkaUtils.createStream(ssc, brokers, app_name, { 'ib_tick_price': 1, 'ib_tick_size': 1 })
import os
import re

import numpy as np

from pyspark import SparkContext

srtm_dtype = np.dtype('>i2')
filename_regex = re.compile('([NSEW]\d+[NSEW]\d+).*')

# The data directory, needs to be available to all nodes in the cluster
data_files = '/media/bitbucket/srtm/version2_1/SRTM3/North_America'

# Build up the context, using the master URL
sc = SparkContext('spark://ulex:7077', 'srtm')

# Now load all the zip files into a RDD
data = sc.binaryFiles(data_files)

# The two accumulators are used to collect values across the cluster
num_samples_acc = sc.accumulator(0)
sum_acc = sc.accumulator(0)

# Function to array
def read_array(data):
    hgt_2darray = np.flipud(np.fromstring(data, dtype=srtm_dtype).reshape(1201, 1201))
    return hgt_2darray

# Function to process a HGT file
def process_file(file):
    (name, content) = file
    filename = os.path.basename(name)
    srtm_name = filename.split('.')[0]
    match = filename_regex.match(srtm_name)
#Create W H V W, H = CreateHW() num_users = W.shape[0] num_movies = H.shape[1] V = CreateMatrix(num_users, num_movies) # Initialize sc conf = SparkConf().setAppName('DSGD').setMaster('local[%d]' % num_workers) sc = SparkContext(conf=conf) # Intialize strata init_strata = [[i, v] for i, v in enumerate(np.random.permutation(num_workers))] S = sc.parallelize(init_strata) # Initialize clock clock = sc.accumulator(0) # Iteration for i in xrange(num_iterations) : # Get rows, cols from strata split = S.map(GetRowCol).collect() # Get block from rows, cols matrices = [] for row, col in split : V_block = V.tocsr()[row, :].tocsc()[:, col] W_block = W[row, :].copy() H_block = H[:, col].copy() matrices.append((V_block, W_block, H_block)) # Set clock clk = clock.value # Calculate gradient
#!/usr/bin/env python from pyspark import SparkContext import sys if __name__ == "__main__": def extractNumLines(line): global lines lines += 1 # print lines, ":", line, "\n\n" return line sc = SparkContext(appName="CountKeys") file = sc.sequenceFile( sys.argv[1]) rdd = file.reduceByKey(lambda x, y: x) lines = sc.accumulator( 0) num_lines = rdd.map(extractNumLines) num_lines.collect() print "Num lines: %d" % lines.value
send_email(EMAIL_DESTINATION, NOTIFICATION_MESSAGE) elif (acc1.value % 10 != 0 and acc2.value == 1): if (acc2.value > 0): acc2.add(-1) conf = SparkConf().setAppName("Arduino Notification").setMaster('local[*]') sparkContext = SparkContext(conf=conf) sparkContext.setLogLevel("ERROR") streamingContext = StreamingContext(sparkContext, 1) dstream = streamingContext.socketTextStream(SOCKET_HOST, SOCKET_PORT) # Micro Batches # Sensor stream count_sensor_read = sparkContext.accumulator(0) status_mailed = sparkContext.accumulator(0) data = dstream.filter(lambda _data: float(_data) <= WATER_LEVEL_THRESHOLD) data.foreachRDD(lambda rdd: send_mail(rdd, count_sensor_read, status_mailed)) data.pprint() # End of sensor stream # End of micro batches streamingContext.start() streamingContext.awaitTermination()
import sys
from pyspark import SparkContext

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print >> sys.stderr, "Usage: AverageWordLength <file or directory>"
        exit(-1)

    sc = SparkContext()
    totalWords = sc.accumulator(0)
    totalLetters = sc.accumulator(0.0)

    words = sc.textFile(sys.argv[1]).flatMap(lambda line: line.split())

    # wordCount and letterCount are Accumulators; += calls Accumulator.add() under the hood
    def addTotals(word, wordCount, letterCount):
        wordCount += 1
        letterCount += len(word)

    words.foreach(lambda word: addTotals(word, totalWords, totalLetters))

    print "Average word length:", totalLetters.value / totalWords.value
def run_spark_job(tile_dim): from pyspark import SparkConf, SparkContext from pyspark.accumulators import AccumulatorParam class ImageSourceAccumulatorParam(AccumulatorParam): """ Accumulator that will collect our image data that will be included as part of the input to the next stage of processing. """ def zero(self, dummy): return [] def addInPlace(self, sources1, sources2): res = [] if sources1: res.extend(sources1) if sources2: res.extend(sources2) return res request_uri = sys.argv[1] # If there's more arguements, its to turn off notifications publish_notifications = True if len(sys.argv) == 3: publish_notifications = False parsed_request_uri = urlparse(request_uri) request = None if not parsed_request_uri.scheme: request = json.loads(open(request_uri).read()) else: client = boto3.client("s3") o = client.get_object(Bucket=parsed_request_uri.netloc, Key=parsed_request_uri.path[1:]) request = json.loads(o["Body"].read()) source_uris = request["images"] workspace = request["workspace"] jobId = request["jobId"] target = request["target"] if publish_notifications: notify_start(jobId) try: uri_sets = create_uri_sets(source_uris, workspace) image_count = len(uri_sets) conf = SparkConf().setAppName(APP_NAME) sc = SparkContext(conf=conf) image_source_accumulator = sc.accumulator([], ImageSourceAccumulatorParam()) def create_image_sources(uri_set, acc): image_source = create_image_source(uri_set.source_uri, uri_set.workspace_source_uri, uri_set.image_folder, uri_set.order, tile_dim) acc += [image_source] return image_source def uri_set_copy(uri_set): copy_to_workspace(uri_set.source_uri, uri_set.workspace_target) return uri_set uri_set_rdd = sc.parallelize(uri_sets, image_count).map(uri_set_copy) image_sources = uri_set_rdd.map(lambda uri_set: create_image_sources(uri_set, image_source_accumulator)) chunk_tasks = image_sources.flatMap(lambda image_source: generate_chunk_tasks(image_source, tile_dim)) chunks_count = chunk_tasks.cache().count() numPartitions = max(chunks_count / 10, min(50, image_count)) chunk_tasks.repartition(numPartitions).foreach(process_chunk_task) image_sources = image_source_accumulator.value print "Processed %d images into %d chunks" % (len(image_sources), chunks_count) input_info = map(construct_image_info, sorted(image_sources, key=lambda im: im.order)) result = { "jobId": jobId, "target": target, "tileSize": tile_dim, "input": input_info } # Save off result workspace_parsed = urlparse(workspace) if not workspace_parsed.scheme: # Save to local files system open(os.path.join(workspace, OUTPUT_FILE_NAME), 'w').write(json.dumps(result)) elif workspace_parsed.scheme == "s3": client = boto3.client("s3") bucket = workspace_parsed.netloc key = os.path.join(workspace_parsed.path, OUTPUT_FILE_NAME)[1:] client.put_object(Bucket=bucket, Key=key, Body=json.dumps(result)) except Exception, e: if publish_notifications: notify_failure(jobId, "%s: %s" % (type(e).__name__, e.message)) raise
from pyspark import SparkContext, SparkConf
from Utils import Utils

if __name__ == "__main__":
    conf = SparkConf().setAppName('StackOverflow Example').setMaster("local[*]")
    sc = SparkContext(conf=conf)

    total = sc.accumulator(0)
    missingSalaryMidPoint = sc.accumulator(0)
    responseRDD = sc.textFile("2016-stack-overflow-survey-responses.csv")
    processedBytes = sc.accumulator(0)

    # keep responses from Colombia while tracking totals, missing salaries and bytes via accumulators
    def filterResponseFromColombia(response):
        processedBytes.add(len(response.encode('utf-8')))
        splits = Utils.COMMA_DELIMITER.split(response)
        total.add(1)
        if not splits[14]:
            missingSalaryMidPoint.add(1)
        return splits[2] == "Colombia"

    responseFromColombia = responseRDD.filter(filterResponseFromColombia)

    print("Count of responses from Colombia: {}".format(responseFromColombia.count()))
    print("Total count of responses {}".format(total.value))
    print("Count of responses missing salary middle point: {}".format(missingSalaryMidPoint.value))
    print("Number of bytes processed: {}".format(processedBytes.value))
ZAQAR_URL='http://10.0.1.107:8888/' ZAQAR_VERSION=1.1 def get_client(): return zaqarclient.Client(ZAQAR_URL, ZAQAR_VERSION, conf=conf) def total_emitter(acc): client = get_client() queue = client.queue('log_totals') while True: time.sleep(5) queue.post({'body': acc.value, 'ttl': 300}) if __name__ == '__main__': sc = SparkContext(appName='SparkharaLogCounter') ssc = StreamingContext(sc, 1) total_lines = sc.accumulator(0) def rdd_print(rdd): a = rdd.collect() total_lines.add(len(a)) lines = ssc.socketTextStream('0.0.0.0', 9901) lines.foreachRDD(rdd_print) th = threading.Thread(target=total_emitter, args=(total_lines,)) th.start() ssc.start() ssc.awaitTermination()
def main(input_path, output_path): sc = SparkContext(appName='Data_Analysis') ''' Define spark accumulators for counting total records, valid records and empty string checking ''' empty_records = {} # accumulator check the emptiness of columns for column in CHECK_EMPTY_COLUMNS: empty_records[column] = sc.accumulator(0) total_records = sc.accumulator(0) valid_records = sc.accumulator(0) # collection all desired statistics stats_collector = {} # load raw dataset raw_rdd = sc.textFile(input_path).map(lambda x: x.split('|')) # vaidate whether data fulfill the definition of data dictionary validate_rdd = raw_rdd.filter(lambda x: data_validate(x, VALIDATION_LIST, total_records, valid_records)) ''' Keep only the following columns: 'IMSI', 'EVENT_TYPE', 'CGI', 'DATETIME', 'BBC' ''' # load fixed cell master file for cgi and bbc mapping cell_master_dict = {} with open(CELL_MASTER_FILE, 'r') as f: for line in f: line = line.strip() line = line.split('|') cell_master_dict[line[0]] = line[9] transform_rdd = validate_rdd.map(lambda x: data_tranform(x, cell_master_dict, empty_records)) ''' Filter out records for JABODETABEK ''' Jabodetabek_rdd = transform_rdd.filter(lambda x: x[-1] == 'JABODETABEK').cache() stats_collector['Jabodetabek_records'] = Jabodetabek_rdd.count() print 'Number of Jabodetabek records: %d' % stats_collector['Jabodetabek_records'] stats_collector['total_records'] = total_records.value print 'Number of total records: %d' % stats_collector['total_records'] stats_collector['valid_records'] = valid_records.value print 'Number of valid records: %d' % stats_collector['valid_records'] stats_collector['empty_records'] = {} if stats_collector['valid_records'] != 0: for column in CHECK_EMPTY_COLUMNS: stats_collector['empty_records'][column] = empty_records[column].value print 'Empty record in column %s is %d, with percentage %.2f' % (column, empty_records[column].value, empty_records[column].value / float( stats_collector['valid_records'])) if stats_collector['Jabodetabek_records'] > 0: ''' Generate the aggregate count distribution over time ''' # aggregate count distribution over time per event type event_datetime_pair = Jabodetabek_rdd.map(lambda x: ((x[1], round_datetime(x[3], 10, 'minutes')), 1)) agg_event_datetime_pair = event_datetime_pair.reduceByKey(lambda x, y: x + y).sortByKey(ascending=True) stats_collector['event_time_distribution'] = {} for item in agg_event_datetime_pair.collect(): if item[0][0] not in stats_collector['event_time_distribution']: stats_collector['event_time_distribution'][item[0][0]] = [[item[0][1], item[1]]] else: stats_collector['event_time_distribution'][item[0][0]].append([item[0][1], item[1]]) print 'Event time records distribution: %s' % str(stats_collector['event_time_distribution']) # overall aggregate count distribution over time event_time_distribution = stats_collector['event_time_distribution'] event_list = event_time_distribution.keys() time_series_dict = dict() for event_type in event_list: for item in event_time_distribution[event_type]: if item[0] not in time_series_dict: time_series_dict[item[0]] = item[1] else: time_series_dict[item[0]] += item[1] time_distribution = [[t, time_series_dict[t]] for t in time_series_dict] time_distribution = sorted(time_distribution, key=lambda x: x[0]) stats_collector['time_distribution'] = time_distribution print 'Time records distribution: %s' % str(stats_collector['time_distribution']) # Group by records according different event stats_collector['event_distribution'] = [[e_type, 
sum([event_time_distribution[e_type][i][1] for i in range(len(event_time_distribution[e_type]))])] for e_type in event_list] print 'Event distribution is: %s' % stats_collector['event_distribution'] ''' Convert RDD to dataframe ''' sql_context = SQLContext(sc) fields = [StructField(field_name, StringType(), True) for field_name in SCHEMA_NAMES] schema = StructType(fields) df = sql_context.createDataFrame(Jabodetabek_rdd, schema).cache() ''' Unique IMIS per hour ''' udf_round_datetime = udf(round_datetime, StringType()) df = df.select(df['*'], udf_round_datetime(df['DATETIME']).alias('Hour')) stats_collector['imsi_per_hour'] = (df.groupBy('Hour').agg(countDistinct('IMSI').alias('UNIQUE_IMSI')) .rdd.map(lambda x: [x['Hour'], x['UNIQUE_IMSI']]).collect()) ''' Number of records per imsi distribution ''' count_by_imsi = df.groupBy('IMSI').count().selectExpr('count as num_records') # ['IMSI', 'num_records'] stats_collector['total_imsi'] = count_by_imsi.count() print 'Total number of imsi is %d' % stats_collector['total_imsi'] stats = count_by_imsi.selectExpr('avg(num_records) as mean', 'stddev(num_records) as std').collect() mean_value = stats[0]['mean'] std_value = stats[0]['std'] upper_limit = mean_value + 3 * std_value # mean + 3 * standard deviation count_by_imsi_filtered = count_by_imsi.filter(count_by_imsi['num_records'] <= upper_limit) agg_by_count = count_by_imsi_filtered.groupBy('num_records').count() stats_collector['imsi_distribution'] = agg_by_count.rdd.map(lambda x: [x['num_records'], x['count']]).collect() print 'IMSI distribution is: %s' % stats_collector['imsi_distribution'] output_rdd = sc.parallelize([json.dumps(stats_collector)]) output_rdd.saveAsTextFile(output_path) else: output_rdd = sc.parallelize([json.dumps(stats_collector)]) output_rdd.saveAsTextFile(output_path) raise ValueError('No record for Jabodetabek Found.')
    if i % 2 == 0:
        det += sum
    else:
        det += (0 - sum)

if __name__ == "__main__":
    global rows
    global mat
    rows = 3
    sc = SparkContext("local", "Determinant")

    # accumulator variable to accumulate the final determinant value
    det = sc.accumulator(0)

    # Matrices.dense stores the matrix in column-major order, so the values are
    # entered in column-major form here so that we end up with the row-major
    # matrix we want to operate on
    dm2 = Matrices.dense(rows, rows, [2, 7, 3, 3, 7, 8, 5, 8, 5])
    print "\n\nEntered matrix:\n", dm2.toArray()

    # Here we divide the work between workers: we split the first row between
    # them (calculate the partial determinant for each item in the first row)
    cols = sc.parallelize([i for i in range(0, rows)])
    mat = dm2.toArray()
    cols.foreach(dist_deter)
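# Only the sign-alternation tail of dist_deter is visible above. A hypothetical
# complete version for the 3x3 case, consistent with that tail and with the
# cofactor expansion along the first row, could look like this:
import numpy as np

def dist_deter(i):
    global det
    # delete row 0 and column i, take the determinant of the 2x2 minor,
    # and scale by the pivot element mat[0][i]
    minor = np.delete(np.delete(mat, 0, axis=0), i, axis=1)
    sum = mat[0][i] * (minor[0][0] * minor[1][1] - minor[0][1] * minor[1][0])
    if i % 2 == 0:
        det += sum
    else:
        det += (0 - sum)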
if len(x[1].split(",")) > 1: return [(x[0], str(len(x[1].split(","))) + "/" + x[1])] else: return [] from pyspark import SparkContext if __name__ == '__main__': # initialize sc = SparkContext("yarn", "labelp") # reduceByKey() uses ',' to collect all followers of a user p_list = sc.textFile("s3://spark-llh/inputfile/edges.csv")\ .coalesce(100).map(divide).reduceByKey(lambda a,b:a+","+b).map(add_plus) # initialize accumulator p_count = sc.accumulator(0) while 1: p_list = p_list.coalesce(100).flatMap(p_check).union( p_list).reduceByKey(p_update) p_list.count() # an action to trigger transformations and accumulator if p_count.value == 0: break p_count.value = 0 n_list = sc.textFile("s3://spark-llh/inputfile/edges.csv")\ .coalesce(100).map(reverse).reduceByKey(lambda a,b:a+","+b).map(add_minus) n_count = sc.accumulator(0) while 1: n_list = n_list.coalesce(100).flatMap(n_check).union( n_list).reduceByKey(n_update) n_list.count() if n_count.value == 0:
from pyspark import SparkContext

sc = SparkContext('spark://master:7077', 'accumulator example')

# accumulators are initialized with an initial value;
# they have an add method to add values to the accumulator
# and a value property that is visible only to the master
accum = sc.accumulator(0)

data = sc.parallelize(range(1, 1000))

# we are going to iterate over our data and add each value to the
# accumulator
data.foreach(lambda value: accum.add(value))

print accum.value
def map_phase(x):
    x = re.sub('--', ' ', x)
    x = re.sub("'", '', x)
    return re.sub('[?!@#$\'",.;:()]', '', x).lower()

def filter_tokyo(line):
    line = map_phase(line)
    global tokyo_count
    if line == 'tokyo':
        tokyo_count += 1
    return (line, 1)

if __name__ == "__main__":
    if len(sys.argv) < 4:
        print >> sys.stderr, "Usage: wordcount <master> <inputfile> <outputfile>"
        exit(-1)

    sc = SparkContext(sys.argv[1], "python_wordcount_sorted in bigdataprogrammiing")
    tokyo_count = sc.accumulator(0)
    lines = sc.textFile(sys.argv[2], 2)
    print(lines.getNumPartitions())  # print the number of partitions

    outRDD = lines.map(filter_tokyo)
    outRDD = outRDD.reduceByKey(add)
    outRDD = outRDD.filter(lambda x: x[0].find('neighborhood') == -1)
    outRDD = outRDD.filter(lambda x: x[0].find('tokyo') == -1)
    outRDD = outRDD.sortBy(lambda x: x[1])
    outRDD.saveAsTextFile(sys.argv[3])
    print("Number of Tokyo : {}".format(tokyo_count.value))
W_rdd = sc.parallelize(range(num_users+1)) W_rdd = W_rdd.map(lambda x: (x, [random.uniform(0, 5) for _ in range(0, num_factors)])).keyBy(lambda entry: entry[0]/users_per_w_block.value) W_rdd = W_rdd.partitionBy(num_workers, lambda key: key).persist() #construct the H array H_rdd = sc.parallelize(range(num_movies+1)) H_rdd = H_rdd.map(lambda x: (x, [random.uniform(0, 5) for _ in range(0, num_factors)])).keyBy(lambda entry: entry[0]/movies_per_h_block.value) #H_rdd = H_rdd.partitionBy(num_workers, partition_h).persist() #broadcast beta and lambda beta_br = sc.broadcast(beta_value) lambda_br = sc.broadcast(lambda_value) total_updates = sc.broadcast(0) curr_stratum = sc.broadcast(0) last_iter_total = sc.accumulator(0) #SGD begins for iter in range(num_iterations): #filter current stratum data stratum_V_rdd = V_rdd.filter(lambda entry: entry[1][1]==curr_stratum.value) #partition H H_rdd = H_rdd.map(lambda entry: (pattern_br.value[curr_stratum.value][(entry[1][0]/movies_per_h_block.value)],entry[1])) H_rdd = H_rdd.partitionBy(num_workers, lambda key: key).persist() #group V, W and H into a stratum stratum_rdd = stratum_V_rdd.groupWith(W_rdd,H_rdd).partitionBy(num_workers, lambda key: key).persist() #parallel SGD on strata stratum_rdd = stratum_rdd.map(sgd_func, True)
qk = quadkey.from_geo((latitude, longitude), 15) acc_num_good_records.add(1) return "{},{},{},{},{},{}".format(record, country, city, latitude, longitude, qk.key) except: acc_num_bad_records.add(1) return "-----" if __name__ == "__main__": sc = SparkContext() outputPath = "hdfs://localhost/user/cloudera/audi_case_study/location_info_added" reader = None acc_num_bad_records = sc.accumulator(0) acc_num_good_records = sc.accumulator(0) records = sc.textFile( "hdfs://localhost/user/cloudera/audi_case_study/data/") records.map(add_location_info) \ .filter(lambda x: x != "-----") \ .saveAsTextFile(outputPath) print("Number of good records: {}, Number of bad records: {}".format( acc_num_good_records.value, acc_num_bad_records.value))
f = inputFiles[i] if subFileCount==filePartitionSize: subDirs.append(subDir) subDirNum += 1 subFileCount = 0 subDir = str(subDirNum) + "/" os.makedirs(input_dir + subDir) shutil.move(input_dir + f,input_dir + subDir) subFileCount += 1 if subFileCount==filePartitionSize: subDirs.append(subDir) sc = SparkContext("local[" + numCores + "]" , "job", pyFiles=[realpath('helper.py')]) eNodeBLoadVec = [] for bs in eNodeBs: v = sc.accumulator([(0,0,0)]*len(intervals), VectorAccumulatorParamTriple()) eNodeBLoadVec.append(v) prev_idx = 0 for i in range(len(subDirs)): d = subDirs[i] end_idx = intervals.index(dirTimeBoundaries[i]) intervalBoundary = (prev_idx+1,end_idx) #both indexes are included prev_idx = end_idx bs2data = sc.textFile(input_dir + d + '*.gz').filter(filterData).map(generateBS2Data).reduceByKey(reduceBS2IMSI2Data) bs2data.foreach(getBearerLoad) resetDirectories(subDirs,input_dir) header = "time "
import sys, getopt
from math import *
from pyspark import SparkContext
from pyspark.accumulators import AccumulatorParam
from pyspark.serializers import MarshalSerializer


# custom accumulator for the boolean variable changevar
class VectorAccumulatorParam(AccumulatorParam):
    def zero(self, value):
        return False

    def addInPlace(self, val1, val2):
        return val1 or val2


sc = SparkContext(appName="quasicliqueEnumeration", serializer=MarshalSerializer())
# changevar checks whether new clusters are formed and decides whether to go to the next iteration
changevar = sc.accumulator(False, VectorAccumulatorParam())
gamma = 0.9  # default value of gamma
k = 3  # default value of k - size of the clique after which gamma should be applied


# turn each edge into an initial clique
def createinitialClusters(input):
    nodes = input.split()
    edge = ()
    if len(nodes) > 1:
        if int(nodes[1]) > int(nodes[0]):
            edge = (int(nodes[0]), int(nodes[1]))
        else:
            edge = (int(nodes[1]), int(nodes[0]))
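# A minimal sketch (hypothetical data and helper, not part of the original file) of how a
# boolean OR accumulator like changevar is typically used: workers raise the flag when new
# clusters are formed, and the driver reads it after an action to decide whether to iterate again.
def mark_change(x):
    if x > 10:                 # stand-in for "a new cluster was formed"
        changevar.add(True)    # addInPlace: existing value OR True

sc.parallelize([1, 5, 12]).foreach(mark_change)
if changevar.value:
    print("new clusters were formed; run another iteration")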
# input data format:
#   id0 id01 id02 ...
#   id1 id11 id12 ...
#   ...
# first id (idn) is the current node
# following ids (idnm) are nodes connected to idn

from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("BFS")
sc = SparkContext(conf=conf)

hitCounter = sc.accumulator(0)
src_id = 1
dst_id = 6


def parseInput(line):
    l = line.split()
    v_id = int(l[0])
    if v_id == src_id:
        v_dist = 0
        v_status = 1
    else:
        v_dist = 9999
        v_status = 0
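# A hedged sketch (toy frontier, not the original BFS map/reduce code) of the role hitCounter
# plays in this kind of BFS driver: workers increment it when the destination node is reached,
# and the driver checks it after each action to decide whether to stop iterating.
def check_for_target(node_id):
    if node_id == dst_id:
        hitCounter.add(1)

frontier = sc.parallelize([src_id])
for iteration in range(10):
    frontier.foreach(check_for_target)        # action: accumulator updates reach the driver
    if hitCounter.value > 0:
        print("reached node {} after {} iteration(s)".format(dst_id, iteration + 1))
        break
    frontier = frontier.map(lambda n: n + 1)  # toy expansion standing in for the real BFS step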
import sys
sys.path.insert(0, '.')
from pyspark import SparkContext, SparkConf
from commons.Utils import Utils

if __name__ == "__main__":
    conf = SparkConf().setAppName('StackOverFlowSurvey').setMaster("local[*]")
    sc = SparkContext(conf=conf)
    total = sc.accumulator(0)
    missingSalaryMidPoint = sc.accumulator(0)
    responseRDD = sc.textFile("in/2016-stack-overflow-survey-responses.csv")

    def filterResponseFromCanada(response):
        splits = Utils.COMMA_DELIMITER.split(response)
        # update accumulators for each record
        total.add(1)
        if not splits[14]:
            missingSalaryMidPoint.add(1)
        return splits[2] == "Canada"

    responseFromCanada = responseRDD.filter(filterResponseFromCanada)
    print("Count of responses from Canada: {}".format(responseFromCanada.count()))
    print("Total count of responses: {}".format(total.value))
    print("Count of responses missing salary middle point: {}"
          .format(missingSalaryMidPoint.value))
#!/usr/bin/python
# -*- coding: utf-8 -*-
from pyspark import SparkContext
from pyspark.accumulators import AccumulatorParam

sc = SparkContext("local", "Simple App")


# Custom accumulator that multiplies values instead of summing them.
class MultiplicadorAccum(AccumulatorParam):
    def zero(self, initialValue):
        return 1  # multiplicative identity

    def addInPlace(self, v1, v2):
        return v1 * v2


acc = sc.accumulator(1, MultiplicadorAccum())
# Each add() multiplies the accumulator by the element, so the final value is the product of the list.
sc.parallelize([1, 2, 3, 4, 5, 5, 6, 7, 7]).foreach(lambda x: acc.add(x))
print("value %d " % acc.value)
    lambda tokens: (int(tokens[0]), int(tokens[1]), int(tokens[2])))

# ----------------------------------- Load Accumulator Object ------------------------------------------------
class VectorAccumulatorParam(AccumulatorParam):
    def zero(self, value):
        return [0.0] * len(value)

    def addInPlace(self, val1, val2):  # val1: list
        val1 += val2
        return val1

# ----------------------------------- Process test RDD object ------------------------------------------------
def test_result(x):  # x[0]: RatingID, x[1]: user, x[2]: item
    global ans
    PredRating = getPredict(x[1], x[2])
    ans += [[x[0], PredRating]]

# ----------------------------------- Get predicted rating of Test.dat ------------------------------------------------
ans = sc.accumulator([], VectorAccumulatorParam())
test_rdd.foreach(test_result)
ans = np.array(ans.value)
# print("RatingID ", ans[:, 0], "Rating", ans[:, 1])

# ----------------------------------- Output File ------------------------------------------------
predictPdsDF = pd.DataFrame({'RatingID': ans[:, 0], 'Rating': ans[:, 1]})
predictPdsDF['RatingID'] = predictPdsDF['RatingID'].astype(int)
predictPdsDF.to_csv("predict.csv", index=False)
SOURCE = options.source
TARGET = options.url
NOOP_WITHIN = options.noop
FIELD = options.field

if options.hostmap[0:24] == 'hdfs://analytics-hadoop/':
    hostMap = json.loads(subprocess.check_output(["hdfs", "dfs", "-cat", options.hostmap[23:]]))
else:
    hostMap = json.load(open(options.hostmap))

print "Transferring from %s to %s" % (SOURCE, TARGET)

if __name__ == "__main__":
    sc = SparkContext(appName="Send To ES: %s" % (TARGET))
    sqlContext = SQLContext(sc)
    broardcastMap = sc.broadcast(hostMap)
    documentCounter = sc.accumulator(0)
    updateCounter = sc.accumulator(0)
    errorCounter = sc.accumulator(0)
    failedDocumentCounter = sc.accumulator(0)

    def documentData(document):
        """Create textual representation of the document data for one document"""
        updateData = {"update": {"_id": document.page_id}}
        if NOOP_WITHIN:
            updateDoc = {"script": {
                "script": "super_detect_noop",
                "lang": "native",
                "params": {
                    "handlers": {FIELD: "within " + NOOP_WITHIN + "%"},
import json
import math
import os
import sys

from pyspark import SparkContext
from pyspark import SparkFiles

sparkMaster = sys.argv[1]
inputFile = sys.argv[2]
outputDir = sys.argv[3]
sc = SparkContext(sparkMaster, appName="ChapterSixExample")
file = sc.textFile(inputFile)

# Count lines with KK6JKQ using accumulators
count = sc.accumulator(0)

def incrementCounter(line):
    global count  # Access the counter
    if "KK6JKQ" in line:
        count += 1

file.foreach(incrementCounter)
print "Lines with KK6JKQ %d" % count.value

# Create Accumulator[Int] initialized to 0
blankLines = sc.accumulator(0)
dataLines = sc.accumulator(0)
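# A plausible continuation (a sketch, not necessarily the original author's code) showing how
# blankLines and dataLines are typically driven from a transformation and read back after an action.
def extractCallSigns(line):
    global blankLines, dataLines
    if line == "":
        blankLines += 1
    else:
        dataLines += 1
    return line.split(" ")

callSigns = file.flatMap(extractCallSigns)
callSigns.saveAsTextFile(outputDir + "/callsigns")  # action: the accumulators are populated here
print "Blank lines: %d, data lines: %d" % (blankLines.value, dataLines.value)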
# # iscount:
# idscore = iscount.map(lambda a: (a[0][0], 0.15 + 0.85 * (a[0][1] + danglinglist.value() / n)))

lines = sc.textFile("s3n://mapreduceadam/data/wikipedia_arcs")
n = lines.flatMap(lambda a: a.encode("utf-8").strip().split("\t")).distinct().count()

# Load all ids from the input file and initialize their neighbor lists.
links = lines.map(lambda a: (a.split("\t")[0], a.split("\t")[1])).distinct().groupByKey().cache()

# Initialize the rank of every id that has outgoing links to one.
ranks = links.map(lambda a: (str(a[0]), 1.0))

# Iteratively update the id ranks using the PageRank algorithm.
for i in range(10):
    danglinglist = sc.accumulator(0.0)
    outjoin = links.rightOuterJoin(ranks)
    contribs = outjoin.flatMap(lambda a: computeContribs(a)).reduceByKey(lambda a, b: a + b)
    print contribs.collect()
    danglingtotal = danglinglist.value
    # Re-calculate URL ranks based on neighbor contributions.
    ranks = contribs.map(lambda a: calltribs(a))

# file2 = sc.textFile("s3n://s15-p42-part2/data/wikipedia_mapping")
file2 = sc.textFile("s3n://mapreduceadam/data/wikipedia_mapping")
namelist = file2.map(lambda a: (a.split("\t")[0].encode("utf-8"), a.split("\t")[1].encode("utf-8")))
# output = namelist.join(ranks).map(lambda a: a[1][0] + "\t" + str(a[1][1])).saveAsTextFile("p2output")
print namelist.join(ranks).map(lambda a: a[1][0] + "\t" + str(a[1][1])).collect()
sc.stop()

# counts = file.flatMap(lambda line: line.split(" ")) \
#     .map(lambda word: (word, 1)) \
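# A hedged sketch of what computeContribs might look like in this script (it is referenced
# above but not defined in this fragment): each id with neighbors contributes rank/len(neighbors)
# to every neighbor, while the rank of a dangling id (no outgoing links, so rightOuterJoin yields
# None for the neighbor list) is added to the danglinglist accumulator instead.
def computeContribs(entry):
    node, (neighbors, rank) = entry
    if neighbors is None:
        danglinglist.add(rank)
        return []
    neighbors = list(neighbors)
    return [(dest, rank / len(neighbors)) for dest in neighbors]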
# Fetching a particular key from the dict and changing its case
res = lookupAndSwapCase("key2")
print("============================================")
print("########## BROADCAST VARIABLE EXAMPLE #######")
print("Value at key2 is:", res)
print("Entire broadcast object is: ", data_broadcast)
print("============================================")

###
# Broadcast variable section ends
###

# Accumulator example begins
# Adds a 3 to the final value
accu = sc.accumulator(3)

def accuFunction(arg):
    global accu
    accu += arg

rdd = sc.parallelize([1, 2, 3])  # Creates an RDD
rdd.foreach(accuFunction)  # Call the function for each rdd element

print("============================================")
print("########## ACCUMULATOR EXAMPLE #######")
print("Final accumulated value is: ", accu.value)
print("============================================")
# Accumulator example ends