from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

def main(self):
    # load configuration parameters (from a config file when working on a project)
    zk, topic, app_name, batch_duration, master = self.setConfiguration()

    # initiate the Spark context / streaming context
    conf = SparkConf().setMaster(master)
    sc = SparkContext(appName=app_name, conf=conf)
    ssc = StreamingContext(sc, batch_duration)

    # read data from Kafka; each record arrives as a (key, message) pair
    kvs = KafkaUtils.createStream(ssc, zk, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    lines.pprint()

    ssc.start()             # start the computation
    ssc.awaitTermination()  # wait for the computation to terminate
    sc.stop()
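Beyond pprint(), which only shows a sample of each micro-batch, the DStream supports the usual transformations before output. A minimal sketch (my addition, not part of the original code) that counts words within each micro-batch of the same lines stream, assuming the Kafka messages are plain text:

# word count per micro-batch; the counts reset every batch_duration seconds
counts = (lines.flatMap(lambda line: line.split())
               .map(lambda word: (word, 1))
               .reduceByKey(lambda a, b: a + b))
counts.pprint()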
from pyspark import SparkContext, SparkConf
import operator

sc = SparkContext(conf=SparkConf().setAppName('App').setMaster('local'))

# read in a local file
raw_data = sc.textFile('/data/twitter/twitter_sample_small.txt')

# define a method to parse each line: user and follower, separated by a tab
def parse_edge(s):
    user, follower = s.split('\t')
    return (int(user), int(follower))

# cache the intermediate RDD after parsing it
edges = raw_data.map(parse_edge).cache()

# apply aggregateByKey - see explanation below the code
fol_agg = edges.aggregateByKey(0, lambda acc, _: acc + 1, operator.add)

# top user/key with the most followers.
# itemgetter(1) ensures the values (aggregated counts), not the keys/user ids,
# are used for the comparison
top_user = fol_agg.top(1, key=operator.itemgetter(1))
print('%d %d' % (top_user[0][0], top_user[0][1]))

sc.stop()
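To make the three aggregateByKey arguments concrete, here is a toy run of the same pattern on hypothetical (user, follower) pairs (not the sample file): the zero value 0 seeds a counter per key in each partition, the seqOp increments it for every record while ignoring the follower id, and the combOp (operator.add) merges the per-partition counters.

from pyspark import SparkContext, SparkConf
import operator

sc = SparkContext(conf=SparkConf().setAppName('AggDemo').setMaster('local'))

# user 1 has three followers, user 2 has one; split across 2 partitions
pairs = sc.parallelize([(1, 10), (1, 11), (1, 12), (2, 10)], 2)

counts = pairs.aggregateByKey(0, lambda acc, _: acc + 1, operator.add)
print(counts.collect())  # e.g. [(2, 1), (1, 3)]
sc.stop()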