def feeder(start_date, end_date, e1, e2, q):
    conf = SparkConf().setAppName("Simple App").setMaster(
        "spark://127.0.0.1:7077").set("spark.cassandra.connection.host", "127.0.0.1")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    a = ""
    l = ['"SP1"', '"SP2"']
    asia = pytz.timezone("Asia/Kolkata")
    # creating a dataframe for the date range and ric names
    rdd = sc.cassandraTable("testkeyspace", "stock_test").select(
        "ric", "time_stamp", "high", "low").where(
        "ric in ?", ["SP1", "SP2", "SP3"]).where(
        "time_stamp > ? and time_stamp < ?",
        datetime(2010, 11, 26, 12, 30, tzinfo=asia),
        datetime(2010, 12, 10, 12, 30, tzinfo=asia)).toDF()
    # making a batch according to the time_stamp
    rdd = rdd.orderBy("time_stamp").groupBy("time_stamp").agg(
        collect_list(struct('ric', 'time_stamp', 'high', 'low'))).collect()
    # sending one batch to the analytical engine
    for gr in rdd:
        e2.clear()
        send = gr[1]
        q.put(send)  # adding the batch to the queue
        e2.set()
        e1.wait()
def run_driver(keyspace, table, cass_host):
    conf = SparkConf().setAppName("PySpark Cassandra Sample Driver")
    conf.set("spark.cassandra.connection.host", cass_host)
    sc = CassandraSparkContext(conf=conf)

    # Read some data from Cassandra
    pixels = sc.cassandraTable(keyspace, table)
    print pixels.first()

    # Count unique visitors, notice that the data returned by Cassandra is
    # a dict-like, you can access partition, clustering keys as well as
    # columns by name. CQL collections: lists, sets and maps are converted
    # to proper Python data types
    visitors = pixels.map(lambda p: (p["data"]["visitor_id"],))\
        .distinct()
    print "Visitors: {:,}".format(visitors.count())

    # Insert some new pixels into the table
    pixels = (
        {
            "customer_id": "example.com",
            "url": "http://example.com/article1/",
            "hour": dt.datetime(2014, 1, 2, 1),
            "ts": dt.datetime(2014, 1, 2, 1, 8, 23),
            "pixel_id": str(uuid4()),
            "data": {"visitor_id": "xyz"}
        },
    )
    saveToCassandra(sc.parallelize(pixels), keyspace, table)
    print "Wrote new pixels to Cassandra {!r}.{!r}".format(keyspace, table)
def main():
    pwords = load_wordlist("./Dataset/positive.txt")
    nwords = load_wordlist("./Dataset/negative.txt")
    conf = SparkConf().\
        setMaster("local[2]").\
        setAppName("TweeStreamer").\
        set("spark.cassandra.connection.host",
            "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")
    sc = CassandraSparkContext(conf=conf)
    sc.setLogLevel("WARN")
    sql = SQLContext(sc)
    # Creating a streaming context with batch interval of 10 sec
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")
    kstream = KafkaUtils.createDirectStream(
        ssc, topics=['twitter-topic1'],
        kafkaParams={"metadata.broker.list": 'localhost:9092'})
    # tweets = kstream.map(lambda x: json.loads(x[1].decode('utf-8')))
    tweets = kstream.map(lambda x: json.loads(x[1]))
    tweetsUsentiment = tweets.map(
        lambda tweet: tweetwithSentiment(tweet, pwords, nwords))
    # searchTermSentiment = tweetsUsentiment.pprint()
    tweetsUsentiment.saveToCassandra("tweetdb", "tweettable")
    ssc.start()
    ssc.awaitTerminationOrTimeout(100)
    ssc.stop(stopGraceFully=True)
def run_driver(keyspace, table):
    conf = SparkConf().setAppName("PySpark Cassandra Sample Driver")
    conf.set("spark.cassandra.connection.host", "127.0.0.1")
    sc = CassandraSparkContext(conf=conf)

    # Read some data from Cassandra
    pixels = sc.cassandraTable(keyspace, table)
    print pixels.first()

    # Count unique visitors, notice that the data returned by Cassandra is
    # a dict-like, you can access partition, clustering keys as well as
    # columns by name. CQL collections: lists, sets and maps are converted
    # to proper Python data types
    visitors = pixels.map(lambda p: (p["data"]["visitor_id"],))\
        .distinct()
    print "Visitors: {:,}".format(visitors.count())

    # Insert some new pixels into the table
    pixels = (
        {
            "customer_id": "example.com",
            "url": "http://example.com/article1/",
            "hour": dt.datetime(2014, 1, 2, 1),
            "ts": dt.datetime(2014, 1, 2, 1, 8, 23),
            "pixel_id": str(uuid4()),
            "data": {"visitor_id": "xyz"}
        },
    )
    saveToCassandra(sc.parallelize(pixels), keyspace, table)
    print "Wrote new pixels to Cassandra {!r}.{!r}".format(keyspace, table)
def streaming_logic():
    """
    :function: initialize the Spark context and run all the streaming logic
    :return: None
    """
    # - read configuration from file
    spark_config, kafka_config, cassandra_config = read_config()

    # - initialize spark context
    conf = SparkConf().setMaster(spark_config['master']).setAppName(
        spark_config['app_name']).set('spark.cassandra.connection.host',
                                      cassandra_config['cluster'])
    csc = CassandraSparkContext(conf=conf)
    csc.setLogLevel(spark_config['log_level'])
    ssc = StreamingContext(sparkContext=csc, batchDuration=spark_config['time_window'])

    # - creating kafka stream
    directKafkaStream = KafkaUtils.createDirectStream(
        ssc, [kafka_config['topic_in']],
        {'metadata.broker.list': kafka_config['cluster']})

    # - start to process data
    # - output data structure: MetaData
    structured_stock_data = directKafkaStream.map(lambda data: preprocess_data(data=data))
    structured_stock_data.pprint(20)
    stock_data_list = structured_stock_data.reduceByKey(lambda a, b: aggregate_list(a, b))
    stock_data_list.pprint(20)

    # - get history data from cassandra
    alert_user_data = stock_data_list.mapValues(
        lambda dictlist: compute_stock_tending_in_window(dict_list=dictlist))
    alert_user_data.pprint(20)

    # - send alert to user
    alert_user_data.foreachRDD(lambda rdd: rdd.foreachPartition(
        lambda iter: send_alert_to_kafka(iterator=iter, kafka_config=kafka_config)))

    ssc.start()
    ssc.awaitTermination()
def define_context():
    conf = SparkConf().setMaster("local[*]").setAppName(
        "twitter-artist-count").set("spark.cassandra.connection.host", "127.0.0.1")
    sc = CassandraSparkContext.getOrCreate(conf=conf)
    sc.setCheckpointDir("./checkpoints")
    ssc = StreamingContext(sc, 30)
    return ssc
def __init__(self):
    self.spark_config = SparkConf()\
        .setMaster("local[4]")\
        .setAppName("Popularity")\
        .set("spark.cassandra.connection.host", "127.0.0.1")
    self.sparkContext = CassandraSparkContext(conf=self.spark_config)
    self.cluster = Cluster()
    self.session = self.cluster.connect("music_recommendation")
    self.raw_data = self.session.execute("SELECT song_id, timestamp "
                                         "FROM user_event "
                                         "WHERE action_type='listen';")
    self.session.execute("DROP TABLE IF EXISTS result_popularity ;")
    self.session.execute("CREATE TABLE IF NOT EXISTS result_popularity ("
                         "sid text PRIMARY KEY,"
                         "rank int);")
    self.current_year = datetime.datetime.now().year
    self.current_month = datetime.datetime.now().month
class Popularity(object):
    def __init__(self):
        self.spark_config = SparkConf()\
            .setMaster("local[4]")\
            .setAppName("Popularity")\
            .set("spark.cassandra.connection.host", "127.0.0.1")
        self.sparkContext = CassandraSparkContext(conf=self.spark_config)
        self.cluster = Cluster()
        self.session = self.cluster.connect("music_recommendation")
        self.raw_data = self.session.execute("SELECT song_id, timestamp "
                                             "FROM user_event "
                                             "WHERE action_type='listen';")
        self.session.execute("DROP TABLE IF EXISTS result_popularity ;")
        self.session.execute("CREATE TABLE IF NOT EXISTS result_popularity ("
                             "sid text PRIMARY KEY,"
                             "rank int);")
        self.current_year = datetime.datetime.now().year
        self.current_month = datetime.datetime.now().month

    def _compare_date_time(self, month, year):
        if self.current_year == year:
            if self.current_month == month:
                return True
            else:
                return False

    def _handle_raw_data(self):
        new_data_set = list()
        for row in self.raw_data:
            month = row.timestamp.month
            year = row.timestamp.year
            if self._compare_date_time(month, year):
                new_tuple = tuple([row.song_id, 1])
                new_data_set.append(new_tuple)
        return new_data_set

    def calculate(self):
        dist_data = self.sparkContext.parallelize(self._handle_raw_data())
        counts = dist_data.reduceByKey(lambda a, b: a + b)
        counts = counts.sortBy(lambda a: a[1], ascending=False).take(10)
        result = self.sparkContext.parallelize(counts)
        result.saveToCassandra("music_recommendation", "result_popularity")
        print result.collect()
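# Hypothetical usage sketch (not part of the original snippet): run the Popularity
# job end-to-end, assuming a local Spark master and a reachable Cassandra node.
if __name__ == "__main__":
    popularity = Popularity()
    popularity.calculate()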
def __init__(self):
    self.spark_config = SparkConf() \
        .setMaster("local[4]") \
        .setAppName("ContentBased") \
        .set("spark.cassandra.connection.host", "127.0.0.1")
    self.sparkContext = CassandraSparkContext(conf=self.spark_config)
    self.cluster = Cluster()
    self.session = self.cluster.connect("music_recommendation")
    cql_cmd = "SELECT * FROM %s"
    cmd = cql_cmd % "i_profile_artist"
    self.i_artists_res = self.session.execute(cmd)
    cmd = cql_cmd % "i_profile_composer"
    self.i_composers_res = self.session.execute(cmd)
    cmd = cql_cmd % "i_profile_genre"
    self.i_genres_res = self.session.execute(cmd)
    cmd = cql_cmd % "u_profile_artist"
    self.u_artists_res = self.session.execute(cmd)
    cmd = cql_cmd % "u_profile_composer"
    self.u_composers_res = self.session.execute(cmd)
    cmd = cql_cmd % "u_profile_genre"
    self.u_genres_res = self.session.execute(cmd)
    cql_cmd = "SELECT uid, song_id FROM %s"
    events = self.session.execute(cql_cmd % "user_event")
    self.events = dict()
    for event in events:
        songs = self.events.get(event.uid)
        if songs is None:
            self.events[event.uid] = [event.song_id]
        else:
            self.events[event.uid].append(event.song_id)
    self.session.execute("CREATE TABLE IF NOT EXISTS "
                         "result_cb_user_item_genre ("
                         "uid text PRIMARY KEY,"
                         "recommendations list<text>);")
    self.session.execute("CREATE TABLE IF NOT EXISTS "
                         "result_cb_user_item_artist ("
                         "uid text PRIMARY KEY,"
                         "recommendations list<text>);")
    self.session.execute("CREATE TABLE IF NOT EXISTS "
                         "result_cb_user_item_composer ("
                         "uid text PRIMARY KEY,"
                         "recommendations list<text>);")
def __init__(self):
    self.sparkConfig = SparkConf()\
        .setMaster("local[4]")\
        .setAppName("MCF")\
        .set("spark.cassandra.connection.host", "127.0.0.1")\
        .set("spark.cassandra.input.consistency.level", "LOCAL_ONE")
    self.sparkContext = CassandraSparkContext(conf=self.sparkConfig)
    self.rank = 10
    self.numIteration = 10
    self.numberOfPreds = 10
    self.cluster = Cluster()
    self.session = self.cluster.connect("music_recommendation")
    self.rawData = self.session.execute("SELECT uid, song_id, payload "
                                        "FROM user_event "
                                        "WHERE action_type='rate'")
    self.session.execute("CREATE TABLE IF NOT EXISTS result_cf ("
                         "uid text PRIMARY KEY,"
                         "recommendations list<text>);")
def setUpClass(cls):
    super(CassandraTestCase, cls).setUpClass()
    cls.sc = CassandraSparkContext(
        conf=SparkConf().setAppName("PySpark Cassandra Test"))
    cls.session = Cluster().connect()
    cls.session.execute('''
        CREATE KEYSPACE IF NOT EXISTS test_pyspark_cassandra
        WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
    ''')
    cls.session.set_keyspace('test_pyspark_cassandra')
def main(): pwords = load_wordlist("../Dataset/positive.txt") nwords = load_wordlist("../Dataset/negative.txt") sterms = load_wordlist("../Dataset/keyWords.txt") conf = SparkConf().\ setMaster("local[2]").\ setAppName("TweeStreamer").\ set("spark.cassandra.connection.host",\ "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84") sc = CassandraSparkContext(conf=conf) sc.setLogLevel("WARN") # Creating a streaming context with batch interval of 10 sec ssc = StreamingContext(sc, 10) ssc.checkpoint("checkpoint") kstream = KafkaUtils.createDirectStream( ssc, topics=['twitter-topic1'], kafkaParams={"metadata.broker.list": 'localhost:9092'}) tweets = kstream.map(lambda x: json.loads(x[1])) tweets.count().map(lambda x: 'Tweets in this batch: %s' % x).pprint() tweetsUsentiment = tweets.map( lambda tweet: tweetwithSentiment(tweet, pwords, nwords, sterms)) searchTermUsentiment = tweetsUsentiment.flatMap( lambda tweet: searchTermFunction(tweet, sterms)).reduceByKey( lambda a, b: a + b) searchTermUsentiment = searchTermUsentiment.map( lambda (key, value): { "searchterm": "_" + key, "insertion_time": datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S'), "sentiment": value }) searchTermUsentiment.pprint() searchTermUsentiment.saveToCassandra("tweetdb", "searchtermtable") # searchTermSentiment = tweetsUsentiment.map(lambda tweet: searchTermFunction(tweet,sterms)) ssc.start() ssc.awaitTerminationOrTimeout(1000) ssc.stop(stopGraceFully=True)
class MusicCollaborativeFiltering:
    def __init__(self):
        self.sparkConfig = SparkConf()\
            .setMaster("local[4]")\
            .setAppName("MCF")\
            .set("spark.cassandra.connection.host", "127.0.0.1")\
            .set("spark.cassandra.input.consistency.level", "LOCAL_ONE")
        self.sparkContext = CassandraSparkContext(conf=self.sparkConfig)
        self.rank = 10
        self.numIteration = 10
        self.numberOfPreds = 10
        self.cluster = Cluster()
        self.session = self.cluster.connect("music_recommendation")
        self.rawData = self.session.execute("SELECT uid, song_id, payload "
                                            "FROM user_event "
                                            "WHERE action_type='rate'")
        self.session.execute("CREATE TABLE IF NOT EXISTS result_cf ("
                             "uid text PRIMARY KEY,"
                             "recommendations list<text>);")

    def run(self):
        """
        This function will run the collaborative filtering algorithm and
        get the predictions for the system.
        """
        userEventData = handle_raw_data(self.rawData)
        dist_data = self.sparkContext.parallelize(userEventData)
        userMapIdBase, musicMapIdBase = get_id_based_maps(dist_data)
        user_event = convert_to_index_based(dist_data, userMapIdBase, musicMapIdBase)
        ratings = convert_to_rating_type(user_event)
        test_data = user_event.map(lambda a: (a[0], a[1]))
        model = ALS.train(ratings, self.rank, self.numIteration)
        predictions = model.predictAll(test_data).map(lambda r: ((r[0], r[1]), r[2]))
        ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
        MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
        self.sparkContext.stop()
        return MSE
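# Hypothetical helpers assumed by MusicCollaborativeFiltering.run() above; they are
# not defined in the original snippet. A minimal sketch, assuming each user_event
# row exposes uid, song_id and a numeric rating in payload.
from pyspark.mllib.recommendation import Rating


def handle_raw_data(raw_rows):
    # Turn cassandra-driver Row objects into plain (uid, song_id, rating) tuples.
    return [(row.uid, row.song_id, float(row.payload)) for row in raw_rows]


def get_id_based_maps(dist_data):
    # ALS needs integer ids, so map every user/song string id to an index.
    user_map = dist_data.map(lambda r: r[0]).distinct().zipWithIndex().collectAsMap()
    music_map = dist_data.map(lambda r: r[1]).distinct().zipWithIndex().collectAsMap()
    return user_map, music_map


def convert_to_index_based(dist_data, user_map, music_map):
    # Replace string ids with their integer indices.
    return dist_data.map(lambda r: (user_map[r[0]], music_map[r[1]], r[2]))


def convert_to_rating_type(user_event):
    # Wrap (user_idx, song_idx, rating) tuples as MLlib Rating objects.
    return user_event.map(lambda r: Rating(int(r[0]), int(r[1]), float(r[2])))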
class SparkCassandra:
    appNameCassandra = "WikiOlapCassandra"
    appNameSQL = "WikiOlapSQL"
    master = "spark://" + socket.gethostname() + ":7077"
    confCassandra = SparkConf() \
        .setAppName(appNameCassandra) \
        .setMaster(master) \
        .set("spark.cassandra.connection.host", os.environ['CASSANDRA_PORT_9042_TCP_ADDR'])
    sc = CassandraSparkContext(conf=confCassandra)
    sqlContext = SQLContext(sc)
class MusicCollaborativeFiltering:
    def __init__(self):
        self.sparkConfig = SparkConf()\
            .setMaster("local[4]")\
            .setAppName("MCF")\
            .set("spark.cassandra.connection.host", "127.0.0.1")\
            .set("spark.cassandra.input.consistency.level", "LOCAL_ONE")
        self.sparkContext = CassandraSparkContext(conf=self.sparkConfig)
        self.rank = 10
        self.numIteration = 10
        self.numberOfPreds = 10
        self.cluster = Cluster()
        self.session = self.cluster.connect("music_recommendation")
        self.rawData = self.session.execute("SELECT uid, song_id, payload "
                                            "FROM user_event "
                                            "WHERE action_type='rate'")
        self.session.execute("CREATE TABLE IF NOT EXISTS result_cf ("
                             "uid text PRIMARY KEY,"
                             "recommendations list<text>);")

    def run(self):
        """
        This function will run the collaborative filtering algorithm and
        get the predictions for the system.
        """
        userEventData = handle_raw_data(self.rawData)
        dist_data = self.sparkContext.parallelize(userEventData)
        userMapIdBase, musicMapIdBase = get_id_based_maps(dist_data)
        user_event = convert_to_index_based(dist_data, userMapIdBase, musicMapIdBase)
        nonRatedUserMusic = get_non_rated_user_music(user_event)
        ratings = convert_to_rating_type(user_event)
        model = ALS.train(ratings, self.rank, self.numIteration)
        predictions = model.predictAll(nonRatedUserMusic)
        predictions = get_final_result(self.numberOfPreds, predictions,
                                       userMapIdBase, musicMapIdBase)
        predictions.saveToCassandra("music_recommendation", "result_cf",
                                    {"uid", "recommendations"})
mongo_client = MongoClient()
mongo_client.drop_database(db_out)
mongo_client.close()
print 'database cleared'

col_tenant_id = 1
col_user_id = 2
col_item_id = 3
num_to_recomm_per_user = 10
num_to_recomm_per_item = 10

conf = SparkConf().setAppName("PysparkCollaborativeFiltering").set(
    "spark.cassandra.connection.host", spark_cassandra_connection_host)
sc = CassandraSparkContext(conf=conf)
sc.setCheckpointDir('checkpoint/')

data = sc.cassandraTable("mykeyspace", "transactions", row_format=1).collect()
# row_format: tuple
# (id, tenant_id, user_id, item_id)
tenant_ids = set(list(map(lambda x: x[col_tenant_id], data)))
data_rdd = sc.parallelize(data)
# data_rdd = sc.parallelize(data).map(list)

all_results_per_user = sc.emptyRDD()
all_results_per_item = sc.emptyRDD()

for t_id in tenant_ids:
    print("\nComputing recommendation for tenant {}...\n".format(t_id))
    per_tenant_rdd = data_rdd.filter(
        lambda x: x[col_tenant_id] == t_id).map(
        lambda l: ((l[col_user_id], l[col_item_id]), 1.0)).reduceByKey(
from pyspark import SparkConf
from pyspark_cassandra import CassandraSparkContext
from pyspark.sql import SQLContext, Row
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
from pyspark.mllib.feature import StandardScaler
import numpy as np

conf = SparkConf().setAppName("Regression on Song Hotness Analysis").setMaster("spark://muziki:7077")
sc = CassandraSparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Make Spark less verbose
logger = sc._jvm.org.apache.log4j
logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)


def parsePoint(data):
    # return LabeledPoint(data[3], np.append(data[0:3], data[4:]))
    return LabeledPoint(data[0], data[1:])


# store the data from cassandra in a data frame and remove the NA values
data = sc.cassandraTable("msd_01", "songs").select(
    "song_hotttnesss", "loudness", "year", "sentiment", "tempo", "unique_words").toDF()
data = data.filter("year>0").na.drop()
print data.count()

# Scale the features with StandardScaler
data2 = data.map(lambda x: [x.song_hotttnesss, x.loudness, x.year,
                            x.sentiment, x.tempo, x.unique_words])  # convert each sql.Row to an array
scaler = StandardScaler(withMean=True, withStd=True).fit(data2)  # fit a scaler on every column
scaledData = scaler.transform(data2)  # transform our data
import json as json
import time

import pyspark_cassandra
from pyspark.sql import SQLContext
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark_cassandra import CassandraSparkContext
from operator import add

start_time = time.time()

if __name__ == "__main__":
    conf = SparkConf().setAppName("Query App").setMaster(
        "spark://spark01.cs.ucr.edu:7077")
    sc = CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # pulling data from cassandra
    x = sc.cassandraTable("census", "test",
                          row_format=pyspark_cassandra.RowFormat.TUPLE).select("zipcode", "pop", "meanincome")\
        .map(lambda x: (x[0], x[1], x[2]))
    y = sc.cassandraTable("hospitals", "test",
                          row_format=pyspark_cassandra.RowFormat.TUPLE).select("zipcode", "numberofdoctors")\
        .map(lambda x: (x[0], x[1]))\
        .reduceByKey(lambda x, y: x + y)

    df_x = sqlContext.createDataFrame(x)
    df_y = sqlContext.createDataFrame(y)

    # (zipcode, pop, meanincome, zipcode, numberofdoctors) -> (zipcode, pop, meanincome, numberofdoctors, pop/numberofdoctors)
    day = result.day
    return int(time.mktime(time.strptime('%s-%s-%s' % (year, month, day), '%Y-%m-%d'))) - time.timezone
    pass


"""
spark-submit --packages anguenot:pyspark-cassandra:0.7.0 spark-calculate-pageview.py
"""
if __name__ == '__main__':
    if len(sys.argv) != 1:
        print("Usage: spark-calculate-pageview", file=sys.stderr)
        exit(-1)
    conf = SparkConf() \
        .setAppName("spark-calculate-pageview") \
        .set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    i = 1511740800
    while i <= 1514505600:
    # while True:
        date_temp = i
        i = i + 86400
        # current_date = getGMT()
        # future_date = getNextGMT()
        rdd = sc.cassandraTable("web_analytic", "fsa_log_visit").select("m_date", "location_path")\
            .filter(lambda x: date_temp <= int(x['m_date']) < i)
        if rdd.isEmpty() == False:
            x = rdd.toDF().groupBy(['location_path']).count()
            # x.show()
            array = []
            for row in x.collect():
    print 'saved to ' + collection


if __name__ == "__main__":
    t0 = time.time()
    mongo_client = MongoClient()
    mongo_client.drop_database(db_out)
    # print 'database cleared'

    num_to_recomm_per_user = 10
    num_to_recomm_per_item = 10

    conf = SparkConf().setAppName("PysparkCollaborativeFiltering")
    print 'conf'
    sc = CassandraSparkContext(conf=conf)
    sc.setCheckpointDir('checkpoint/')

    if LOAD_DATA_FROM_DB:
        data_rdd = sc.cassandraTable(cassandra_keyspace, cassandra_table)  # row_format: Row
        # print data
        t1 = time.time()
        tenant_ids = data_rdd.map(lambda trans: trans[col_tenant_id]).distinct().collect()
        elapsed = (time.time() - t0)
        print ("\nIt took %.2fsec to complete" % elapsed)
        t1 = time.time()

    cluster = Cluster()
    session = cluster.connect(cassandra_keyspace)
# Pop int,
# MeanIncome int,
# TaxonomyCode1 varchar,
# ProviderNumber varchar,
# HospitalName varchar,
# X varchar,
# Y varchar,
# Count int,
# PRIMARY KEY(ZipCode, TaxonomyCode1, ProviderNumber)
# );

if __name__ == "__main__":
    conf = SparkConf().setAppName("Query App").setMaster(
        "spark://spark01.cs.ucr.edu:7077")
    sc = CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # pulling data from cassandra
    x = sc.cassandraTable("census", "test",
                          row_format=pyspark_cassandra.RowFormat.TUPLE).select("zipcode", "pop", "meanincome")\
        .map(lambda x: (x[0], x[1], x[2]))

    # BusinessPracticeLocationPostalCode -> ZipCode in providers.taxonomy_count
    y = sc.cassandraTable("providers", "test",
                          row_format=pyspark_cassandra.RowFormat.TUPLE).select("businesspracticelocationpostalcode", "taxonomycode1")\
        .map(lambda x: (x[0], x[1]))

    df_x = sqlContext.createDataFrame(x)
    df_y = sqlContext.createDataFrame(y)

    # joining x and y
import json
import time
from dateutil import tz
from datetime import datetime, timezone, date, timedelta

"""
spark-submit --packages anguenot:pyspark-cassandra:0.7.0 spark-calculate-device.py
"""
if __name__ == '__main__':
    if len(sys.argv) != 1:
        print("Usage: spark-calculate-device.py ", file=sys.stderr)
        exit(-1)
    conf = SparkConf() \
        .setAppName("spark-calculate-device") \
        .set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    sql = SQLContext(sc)
    while True:
        rdd = sc.cassandraTable("web_analytic", "fsa_log_visit").select(
            "config_device", "fsa")
        if rdd.isEmpty() == False:
            x = rdd.toDF().dropDuplicates(['fsa'])
            x = x.groupBy(['config_device']).count()
            array = []
            for row in x.collect():
                x = {
                    'config_device': row['config_device'],
                    'device_count': row['count'],
                    'bucket': 3
import time
from dateutil import tz
from datetime import datetime, timezone, date, timedelta

"""
spark-submit --packages anguenot:pyspark-cassandra:0.7.0 spark-calculate-last-like.py
"""
if __name__ == '__main__':
    if len(sys.argv) != 1:
        print("Usage: spark-calculate-last-like.py ", file=sys.stderr)
        exit(-1)
    conf = SparkConf() \
        .setAppName("spark-calculate-last-like") \
        .set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    sql = SQLContext(sc)
    while True:
        rdd = sc.cassandraTable("db", "user_event_model").select("config_browser", "m_date")
        if rdd.isEmpty() == False:
            x = rdd.toDF().groupBy(['config_browser']).count()
            array = []
            for row in x.collect():
                x = {
                    'config_browser': row['config_browser'],
                    'browser_count': row['count'],
                    'bucket': 4
                }
                array.append(x)
import json
import time
from dateutil import tz
from datetime import datetime, timezone, date, timedelta

"""
spark-submit --packages anguenot:pyspark-cassandra:0.7.0 spark-calculate-location.py
"""
if __name__ == '__main__':
    if len(sys.argv) != 1:
        print("Usage: spark-calculate-location.py ", file=sys.stderr)
        exit(-1)
    conf = SparkConf() \
        .setAppName("spark-calculate-location") \
        .set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    sql = SQLContext(sc)
    while True:
        rdd = sc.cassandraTable("web_analytic", "fsa_log_visit").select(
            "location_country_name", "location_country_code", "fsa")
        # rdd.toDF().show()
        if rdd.isEmpty() == False:
            x = rdd.toDF().dropDuplicates(['fsa'])
            x = x.groupBy(['location_country_name', 'location_country_code']).count()
            array = []
            for row in x.collect():
                x = {
                    'location_country_name': row['location_country_name'],
import json
import time
from dateutil import tz
from datetime import datetime, timezone, date, timedelta

"""
spark-submit --packages anguenot:pyspark-cassandra:0.7.0 spark-calculate-language.py
"""
if __name__ == '__main__':
    if len(sys.argv) != 1:
        print("Usage: spark-calculate-language.py ", file=sys.stderr)
        exit(-1)
    conf = SparkConf() \
        .setAppName("spark-calculate-language") \
        .set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    sql = SQLContext(sc)
    while True:
        rdd = sc.cassandraTable("web_analytic", "fsa_log_visit").select(
            "location_browser_lan", "fsa")
        if rdd.isEmpty() == False:
            x = rdd.toDF().dropDuplicates(['fsa'])
            x = x.groupBy(['location_browser_lan']).count()
            array = []
            for row in x.collect():
                x = {
                    'browser_language': row['location_browser_lan'],
                    'count': row['count'],
                    'bucket': 6
        # print('=====================================' + s[1])
        vecSec[int(s[0])] = float(s[1])
    for i in range(3000):
        vecFirst[i] = vecFirst[i] + vecSec[i]
    result = ''
    for i in range(3000):
        if vecFirst[0] > 0:
            result = '%s;%d %.4f' % (result, i, vecFirst[i])
    return result


conf = SparkConf()\
    .set("spark.cassandra.connection.host", "localhost")
sc = CassandraSparkContext(conf=conf)

vecSum = sc.cassandraTable('reishi', 'dockmeans')\
    .select("cluster_id", "vector")\
    .where("cluster_id=?", 0)\
    .map(lambda x: (x['cluster_id'], x['vector']))\
    .reduceByKey(lambda x, y: maxVector(x, y))\
    .collect()

vector = []
# print(vecSum)
v = vecSum[0]
v = v[1]
# print('=================================' + v)
splt = v.split(';')
start_time = datetime.datetime(2010, 10, 7, 0, 0)
sample_frequency = datetime.timedelta(minutes=30)
num_tests = 2
list_times = [start_time + x * sample_frequency for x in range(num_tests)]
num_of_meters = 30
window_size = datetime.timedelta(hours=24)
init_model_params = {}
meter_ids = get_meters()
mk = 3
lrate = 0.75
SE = 0
i = 1

program_start_time = t.time()
sc = CassandraSparkContext(appName="PySpark Cassandra Test", master="local[*]")

'''DataFrame Tests'''
# for current_time in list_times:
current_time = list_times[0]
readings = sc \
    .cassandraTable("cer", "readings") \
    .select("meter_id", "date", "measurement") \
    .where("date <= '{}' AND date >= '{}'".format(current_time, current_time - mk * sample_frequency))\
    .map(lambda x: (x["meter_id"], (x["date"], x["measurement"])))\
    .groupByKey()\
    .mapValues(lambda x: pd.Series(list(i[1] for i in x), index=list(i[0] for i in x)))
model_parameters = sc \
    .cassandraTable("cer", "models") \
    .map(lambda x: (x["meter_id"], np.asanyarray(x["w"])))
# import matplotlib.pyplot as plt
from pyspark_cassandra import CassandraSparkContext, Row
from pyspark import SparkContext, SparkConf
from subprocess import call
import subprocess
import commands

aa = commands.getstatusoutput("b=0")

conf = SparkConf() \
    .setAppName("User Food Migration") \
    .setMaster("spark://128.138.202.110:7077") \
    .set("spark.cassandra.connection.host", "128.138.202.117")
sc = CassandraSparkContext(conf=conf)

users = sc.cassandraTable("junk", "trump2")
trump = users.map(lambda x: {"tweet_id": x['tweet_id'], "tweet": x['tweet']})

# to access the Twitter API
consumer_key = "43b4urzsW8nMY3oGzB5tIIM8B"
consumer_secret = "fbGLMhkFyipYbTAz0s0S6yrN6cDGGWnEMmNaciceYjr4sgEdP2"
garbage = 0
    8: 'SUPERIOR COMPLETO',
}

ETNIA = {
    1: 'BRANCA',
    2: 'PRETA',
    3: 'PARDA',
    4: 'AMARELA',
    5: 'INDÍGENA',
}

# Create the SparkContext
conf = SparkConf() \
    .setAppName("Pergunta1") \
    .set("spark.cassandra.connection.host", "10.7.40.94")
csc = CassandraSparkContext(conf=conf)

# Prepare the RDDs for the tables.
candidatos = csc.cassandraTable("eleicoes", "candidatos2014")
resultados = csc.cassandraTable("eleicoes", "resultados2014")

# Fetch the codes of the elected candidates, de-duplicated (to disregard the run-off round).
# There are 3 types of elected candidates (1, 2, 3).
cod_eleitos1 = resultados.select('sq_candidato').where(
    "codigo_sit_cand_tot=? ", 1)
cod_eleitos2 = resultados.select('sq_candidato').where(
    "codigo_sit_cand_tot=? ", 2)
cod_eleitos3 = resultados.select('sq_candidato').where(
    "codigo_sit_cand_tot=? ", 3)
    return (str(val[1]), [str(val[2])])


def produce_song_pairs(song_list):
    song_pairs = combinations(song_list, 2)
    song_pairs_list = map(lambda song_pair: (song_pair, 1), song_pairs)
    return song_pairs_list


def cassandra_row_format(song_pair):
    songs = song_pair[0]
    frequency = song_pair[1]
    return [{"song_id": int(songs[0]), "freq_song_id": int(songs[1]), "frequency": frequency},
            {"song_id": int(songs[1]), "freq_song_id": int(songs[0]), "frequency": frequency}]


if __name__ == "__main__":
    conf = SparkConf().setAppName("FrequentPatternsSongs").setMaster(
        config.SPARK_MASTER).set("spark.cassandra.connection.host", config.CASSANDRA_SEED_NODE_IP)
    sc = CassandraSparkContext(conf=conf)
    frequency_threshold = 3
    filename = datetime.now().strftime("%Y-%m-%d") + "-usersonglog.txt"
    sc.textFile(config.HDFS_URL + ":" + config.HDFS_PORT + config.LOG_FOLDER + filename) \
        .filter(time_range_filter) \
        .map(parse_log_entry) \
        .reduceByKey(lambda song1, song2: song1 + song2) \
        .map(lambda x: sorted(set(x[1]))) \
        .flatMap(produce_song_pairs) \
        .reduceByKey(lambda a, b: a + b) \
        .filter(lambda song_pair: song_pair[1] > frequency_threshold) \
        .flatMap(cassandra_row_format) \
        .saveToCassandra(config.CASSANDRA_KEYSPACE, "frequent_song_pairs")
    day = result.day
    return int(time.mktime(time.strptime('%s-%s-%s' % (year, month, day), '%Y-%m-%d'))) - time.timezone
    pass


"""
spark-submit --packages anguenot:pyspark-cassandra:0.7.0 spark-calculate-pageview-total-user.py
"""
if __name__ == '__main__':
    if len(sys.argv) != 1:
        print("Usage: spark-calculate-pageview-total-user", file=sys.stderr)
        exit(-1)
    conf = SparkConf() \
        .setAppName("spark-calculate-pageview-total-user") \
        .set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    # i = 1514505600
    # while i <= 1514764800:
    while True:
        current_date = getGMT()
        future_date = getNextGMT()
        # date_temp = i
        # i = i + 86400
        raw = sc.cassandraTable("web_analytic", "fsa_log_visit").select(
            "m_date", "userid", "fsa", "fsid", "location_path")
        if raw.isEmpty() == False:
            df = raw.toDF()
            current_day = df.filter(df.m_date >= current_date).filter(
                df.m_date < future_date).dropDuplicates(['fsa', "fsid"]).select('fsa', 'fsid')
            previous_day = df.filter(df.m_date < current_date).select('fsa', 'fsid')
            # current_day = df.filter(df.m_date >= date_temp).filter(df.m_date < i).dropDuplicates(['fsa', "fsid"]).select('fsa', 'fsid')
            # previous_day = df.filter(df.m_date < date_temp).select('fsa', 'fsid')
    year = datetime.now().year
    month = datetime.now().month
    day = datetime.now().day
    result = int(time.mktime(time.strptime('%s-%s-%s' % (year, month, day), '%Y-%m-%d'))) - time.timezone
    return int(datetime.now().timestamp())


if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("Usage: direct_kafka_wordcount.py <broker_list> <topic>", file=sys.stderr)
        exit(-1)
    conf = SparkConf() \
        .setAppName("spark-streaming") \
        .set("spark.cassandra.connection.host", "127.0.0.1")
    sc = CassandraSparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)
    brokers, topic = sys.argv[1:]
    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    # kvs.pprint()
    parsed = kvs.map(lambda x: json.loads(x[1]))
    # parsed.pprint()
    ob = parsed.map(lambda x: {
        "idx_user": x['idx_user'],
        "time": x['time'],
        "idx_movie": x['idx_movie'],
        "movie_id": x['movie_id'],
        "value": x['value'],
        "type_event": x['type_event']
def transfer_time(text):
    # return "2018-06-25"
    return datetime.today().strftime("%Y-%m-%d %H:%M:%S")


def process(rdd):
    spark = getSparkSessionInstance(rdd.context.getConf())
    tweetsDataFrame = spark.read.json(rdd)
    df = tweetsDataFrame.withColumn('hashtag', func(tweetsDataFrame.text))
    df = df.withColumn('time', transfer_time(tweetsDataFrame.time))
    df.createOrReplaceTempView("historicaltweets")
    df = spark.sql("SELECT MAX(time) AS time, hashtag, count(*) AS count FROM historicaltweets WHERE hashtag IS NOT NULL GROUP BY hashtag ORDER BY count DESC")
    rdd = df.rdd.map(tuple)
    rdd.saveToCassandra("twitter", "tweet")
    df.show()


if __name__ == "__main__":
    sc = CassandraSparkContext(appName="tweet")
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 600)
    topic_name = "twitter"
    streamFromKafka = KafkaUtils.createDirectStream(
        ssc, [topic_name], {"metadata.broker.list": '*'})
    lines = streamFromKafka.map(lambda x: x[1])
    lines.count().pprint()
    lines.foreachRDD(process)
    # text_counts = lines.map(lambda tweet: (tweet['hashtag'], 1)).reduceByKey(lambda x, y: x + y)
    ssc.start()
    ssc.awaitTermination()
from pyspark_cassandra import CassandraSparkContext, Row
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

conf = SparkConf().setAppName("NBARetrieval") \
    .set("spark.cassandra.connection.timeout_ms", "20000") \
    .set("spark.cassandra.connection.host", "192.168.0.10") \
    .set("spark.cassandra.auth.username", "mdi") \
    .set("spark.cassandra.auth.password", "W2yIJw6ntl5RYC54VChe3lJoXa")

sc = CassandraSparkContext(conf=conf)

rdd = sc.cassandraTable("test", "kv")
print rdd.first()
import pyspark
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SQLContext  # , HiveContext
# from pyspark.storagelevel import StorageLevel
import atexit
from pyspark_cassandra import CassandraSparkContext
from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()
# conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")

sc = CassandraSparkContext(conf=conf)
atexit.register(lambda: sc.stop())

rdd = sc.cassandraTable("el_test", "cockpit2_testIndexes")

# for( d in range 2015-10-01 ~ 2015-10-10 ) do:
#
#   SELECT url,date,site,cnts,cnt from cockpit2_allTogether where `date` = d and site = giga and tags contains resort:android
#
#   after this query, every row has to be updated with new value for cnts:
#
#   UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes + fb_socialFacebookShares + fb_socialFacebookComments + tw_socialTwitterShares + ga_socialGooglePlusShares + gigya_socialComments


def filterDateRage(_from, _to, col):
    loc = timezone('Europe/Berlin')
from pyspark_cassandra import CassandraSparkContext
from pyspark import SparkConf

conf = SparkConf() \
    .setAppName("ZeusDB") \
    .setMaster("local") \
    .set("spark.cassandra.connection.host", "YOUR_CLUSTER_HOST_NAME")

sc = CassandraSparkContext(conf=conf)

result = sc.cassandraTable("zeus", "edge") \
    .select("destination", "visit_count", "type") \
    .filter(lambda x: x["type"] == "visited") \
    .map(lambda x: (x["destination"], int(x["visit_count"]))) \
    .reduceByKey(lambda a, b: a + b) \
    .top(10, key=lambda x: x[1])

print
print "================================"
print "TOP 10 FREQUENTLY VISITED PLACES"
print "================================"
for row in result:
    print str(row[0]) + "\t\t" + str(row[1])
print
print "================================"
print
            'test_93').on('name').collect())
        self.assertEqual(len(joined), 2)


if __name__ == '__main__':
    try:
        # connect to cassandra and create a keyspace for testing
        CassandraTestCase.session = Cluster().connect()
        CassandraTestCase.session.execute('''
            CREATE KEYSPACE IF NOT EXISTS %s
            WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
        ''' % (CassandraTestCase.keyspace,))
        CassandraTestCase.session.set_keyspace(CassandraTestCase.keyspace)

        # create a cassandra spark context
        CassandraTestCase.sc = CassandraSparkContext(
            conf=SparkConf().setAppName("PySpark Cassandra Test"))

        # perform the unit tests
        unittest.main()
        # suite = unittest.TestLoader().loadTestsFromTestCase(RegressionTest)
        # unittest.TextTestRunner().run(suite)
    finally:
        # stop the spark context and cassandra session
        if hasattr(CassandraTestCase, 'sc'):
            CassandraTestCase.sc.stop()
        if hasattr(CassandraTestCase, 'session'):
            CassandraTestCase.session.shutdown()
from pyspark_cassandra import CassandraSparkContext, Row
from pyspark import SparkContext, SparkConf

conf = SparkConf().setAppName("PySpark Cassandra Test").set(
    "spark.cassandra.connection.host", "127.0.0.1")
sc = CassandraSparkContext(conf=conf)

data = sc.cassandraTable("mykeyspace", "user", row_format=1).collect()
rdd = sc.parallelize(data)
print (rdd.collect())
t0 = time.time()

mongo_client = MongoClient()
mongo_client.drop_database(db_out)
print 'database cleared'

col_tenant_id = 1
col_user_id = 2
col_item_id = 3
num_to_recomm_per_user = 10
num_to_recomm_per_item = 10

conf = SparkConf().setAppName("PysparkCollaborativeFiltering").set(
    "spark.cassandra.connection.host", spark_cassandra_connection_host)
print ('conf')
sc = CassandraSparkContext(conf=conf)
sc.setCheckpointDir('checkpoint/')

if LOAD_DATA_FROM_DB:
    data = sc.cassandraTable(cassandra_keyspace, cassandra_table, row_format=1).collect()
    # row_format: tuple
    # (id, tenant_id, user_id, item_id)
    tenant_ids = set(list(map(lambda x: x[col_tenant_id], data)))
    data_rdd = sc.parallelize(data)
    # data_rdd = sc.parallelize(data).map(list)

all_results_per_user = sc.emptyRDD()
all_results_per_item = sc.emptyRDD()

for t_id in tenant_ids:
    print("\nComputing recommendation for tenant {}...\n".format(t_id))
--conf spark.cassandra.connection.host=52.32.192.156,52.32.200.206,54.70.213.12 \
/home/ubuntu/pipeline/kafka_spark_cass_imageQuery.py localhost:2181 imgSearchRequests

#Opening spark shell with cassandra
$SPARK_HOME/bin/pyspark \
--master spark://ip-172-31-0-173:7077 \
--packages TargetHolding/pyspark-cassandra:0.3.5 \
--conf spark.cassandra.connection.host=52.32.192.156,52.32.200.206,54.70.213.12
"""

db_table = 0  # global rdd
producer = KafkaProducer(
    bootstrap_servers='ec2-52-41-224-1.us-west-2.compute.amazonaws.com:9092',
    value_serializer=lambda v: json.dumps(v).encode('ascii'))

# Kafka and Spark Streaming specific vars
batch_interval = 5  # question: why is a batch interval of 5 so much better than 3? with 3 it seemed to need a long wait
sc = CassandraSparkContext(appName="PythonStreamingVSS")
# http://www.slideshare.net/JonHaddad/intro-to-py-spark-and-cassandra
ssc = StreamingContext(sc, batch_interval)
keyspace = "vss_large"

"""
Example usages:
db_table.select("hashvalue", "partitionby", "videoname").map(lambda x: x['hashvalue']).take(3)
will result in
[u'6daab6a32cb6b209', u'77a888d7aa2f882b', u'571d23371cc358d5']
"""


def main():
    global db_table
    global producer
    if len(sys.argv) != 3:
        # print("Usage: thisfile.py <zk> <topic>", file=sys.stderr)  # i get an error about file=sys.stderr for some reason
from pyspark_cassandra import CassandraSparkContext, Row
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext  # needed for toDF()

conf = SparkConf() \
    .setAppName("User Food Migration") \
    .setMaster("spark://127.0.0.1:7077") \
    .set("spark.cassandra.connection.host", "127.0.0.1")
sc = CassandraSparkContext(conf=conf)
sql = SQLContext(sc)

users = sc.cassandraTable("demo", "user").toDF()
food_count = users.select("favorite_food").groupBy("favorite_food").count()
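# Hypothetical follow-up (not in the original snippet): materialize the per-food
# counts and print them on the driver.
food_count.show()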
import pyspark
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SQLContext  # , HiveContext
# from pyspark.storagelevel import StorageLevel
import atexit
from pyspark_cassandra import CassandraSparkContext
from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()
# conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")

sc = CassandraSparkContext(conf=conf)
atexit.register(lambda: sc.stop())

rdd = sc.cassandraTable("el_test", "cockpit2_testTogether")

# for( d in range 2015-10-01 ~ 2015-10-10 ) do:
#
#   SELECT url,date,site,cnts,cnt from cockpit2_allTogether where `date` = d and site = giga and tags contains resort:android
#
#   after this query, every row has to be updated with new value for cnts:
#
#   UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes + fb_socialFacebookShares + fb_socialFacebookComments + tw_socialTwitterShares + ga_socialGooglePlusShares + gigya_socialComments


def filterDateRage(_from, _to, col):
    loc = timezone('Europe/Berlin')
import time

# Example usage (will get tweets from Jan 22 but not 24):
# *.py "Jan 22" "Jan 24"
startDate = str(sys.argv[1])
endDate = str(sys.argv[2])

conf = (
    SparkConf()
    .setAppName("User Food Migration")
    .setMaster("spark://128.138.202.110:7077")
    .set("spark.cassandra.connection.host", "128.138.202.117")
)
sc = CassandraSparkContext(conf=conf)

if __name__ == "__main__":
    rdd = sc.cassandraTable("junk", "bernie4")
    temp = 0
    # returns list of tweets
    listBernie = (
        rdd.filter(lambda row: row.created_at[4:] > startDate)
        .filter(lambda row: row.created_at[4:] < endDate)
        .collect()
    )
    for tweet in listBernie:
        if tweet.retweet_count > 0:
            print tweet.retweet_count
            temp += 1
from pyspark import SparkConf, SparkContext
import pyspark_cassandra
from pyspark_cassandra import CassandraSparkContext

conf = SparkConf()\
    .setAppName("PySpark Cassandra Test") \
    .setMaster("local[2]") \
    .set("spark.cassandra.connection.host", "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")
    # .set("spark.cassandra.connection.host", "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")

sc = CassandraSparkContext(conf=conf)

print((sc.cassandraTable("tweetdb", "tweettable").select("tweet").map(lambda a: a).collect()))
# sc.pprint()
# rdd = sc.parallelize([{"tweet": "first second third tweet"}])
# rdd.saveToCassandra(
#     "tweetdb",
#     "tweettable")
def time_range_filter(line):
    val = line.split(",")
    if len(val) < 3:
        return False
    return (int(val[0]) > five_weeks_back and int(val[0]) < (now + 1))


def parse_log_entry(line):
    val = line.split(",")
    if len(val) < 3:
        return None
    return (str(val[1]), [str(val[2])])


if __name__ == "__main__":
    conf = SparkConf().setAppName("UserUserRelevance").setMaster(
        config.SPARK_MASTER).set("spark.cassandra.connection.host", config.CASSANDRA_SEED_NODE_IP)
    sc = CassandraSparkContext(conf=conf)
    filename = datetime.now().strftime("%Y-%m-%d") + "-usersonglog.txt"
    users = sc.textFile(config.HDFS_URL + ":" + config.HDFS_PORT + config.LOG_FOLDER + filename) \
        .filter(time_range_filter) \
        .map(parse_log_entry) \
        .keys() \
        .collect()

    song_map = {}  # store song to user mapping for use in later stages
    usersongdb = sc.cassandraTable(config.CASSANDRA_KEYSPACE, "user_to_song")
    songuserdb = sc.cassandraTable(config.CASSANDRA_KEYSPACE, "song_to_user")
    for user in users: