Example #1
def feeder(start_date, end_date, e1, e2, q):

    conf = SparkConf().setAppName("Simple App").setMaster(
        "spark://127.0.0.1:7077").set("spark.cassandra.connection.host",
                                      "127.0.0.1")

    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    a = ""
    l = ['"SP1"', '"SP2"']
    asia = pytz.timezone("Asia/Kolkata")

    # create a DataFrame for the date range and RIC names
    rdd = sc.cassandraTable("testkeyspace", "stock_test").select(
        "ric", "time_stamp", "high",
        "low").where("ric in ?", ["SP1", "SP2", "SP3"]).where(
            "time_stamp > ? and time_stamp < ?",
            datetime(2010, 11, 26, 12, 30, tzinfo=asia),
            datetime(2010, 12, 10, 12, 30, tzinfo=asia)).toDF()
    # making a batch according to the time_stamp
    rdd = rdd.orderBy("time_stamp").groupBy("time_stamp").agg(
        collect_list(struct('ric', 'time_stamp', 'high', 'low'))).collect()
    # sending one batch to analytical engine
    for gr in rdd:
        e2.clear()
        send = gr[1]
        q.put(send)  #adding the batch to the queue
        e2.set()
        e1.wait()
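        # Hand-off protocol implied by the events above: e2 signals the
        # analytical engine that a fresh batch is on the queue, and e1.wait()
        # blocks this feeder until the engine signals it has consumed the batch.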
Example #2
def run_driver(keyspace, table, cass_host):
    conf = SparkConf().setAppName("PySpark Cassandra Sample Driver")
    conf.set("spark.cassandra.connection.host", cass_host)
    sc = CassandraSparkContext(conf=conf)

    # Read some data from Cassandra
    pixels = sc.cassandraTable(keyspace, table)
    print pixels.first()

    # Count unique visitors. Note that each row returned by Cassandra is
    # dict-like: partition keys, clustering keys and regular columns can all
    # be accessed by name. CQL collections (lists, sets and maps) are
    # converted to the corresponding Python data types.
    visitors = pixels.map(lambda p: (p["data"]["visitor_id"],))\
                .distinct()
    print "Visitors: {:,}".format(visitors.count())

    # Insert some new pixels into the table
    pixels = ({
        "customer_id": "example.com",
        "url": "http://example.com/article1/",
        "hour": dt.datetime(2014, 1, 2, 1),
        "ts": dt.datetime(2014, 1, 2, 1, 8, 23),
        "pixel_id": str(uuid4()),
        "data": {
            "visitor_id": "xyz"
        }
    }, )
    saveToCassandra(sc.parallelize(pixels), keyspace, table)
    print "Wrote new pixels to Cassandra {!r}.{!r}".format(keyspace, table)
Example #3
def main():
    pwords = load_wordlist("./Dataset/positive.txt")
    nwords = load_wordlist("./Dataset/negative.txt")

    conf = SparkConf().\
        setMaster("local[2]").\
        setAppName("TweeStreamer").\
        set("spark.cassandra.connection.host",\
        "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")
    sc = CassandraSparkContext(conf=conf)
    sc.setLogLevel("WARN")
    sql = SQLContext(sc)
    # Creating a streaming context with a batch interval of 10 seconds
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")

    kstream = KafkaUtils.createDirectStream(
        ssc,
        topics=['twitter-topic1'],
        kafkaParams={"metadata.broker.list": 'localhost:9092'})

    #tweets = kstream.map(lambda x: json.loads( x[1].decode('utf-8')))
    tweets = kstream.map(lambda x: json.loads(x[1]))
    tweetsUsentiment = tweets.map(
        lambda tweet: tweetwithSentiment(tweet, pwords, nwords))

    #searchTermSentiment =
    tweetsUsentiment.pprint()

    tweetsUsentiment.saveToCassandra("tweetdb", "tweettable")

    ssc.start()
    ssc.awaitTerminationOrTimeout(100)
    ssc.stop(stopGraceFully=True)
def run_driver(keyspace, table):
    conf = SparkConf().setAppName("PySpark Cassandra Sample Driver")
    conf.set("spark.cassandra.connection.host", "127.0.0.1")
    sc = CassandraSparkContext(conf=conf)

    # Read some data from Cassandra
    pixels = sc.cassandraTable(keyspace, table)
    print pixels.first()

    # Count unique visitors. Note that each row returned by Cassandra is
    # dict-like: partition keys, clustering keys and regular columns can all
    # be accessed by name. CQL collections (lists, sets and maps) are
    # converted to the corresponding Python data types.
    visitors = pixels.map(lambda p: (p["data"]["visitor_id"],))\
                .distinct()
    print "Visitors: {:,}".format(visitors.count())

    # Insert some new pixels into the table
    pixels = (
        {
            "customer_id": "example.com",
            "url": "http://example.com/article1/",
            "hour": dt.datetime(2014, 1, 2, 1),
            "ts": dt.datetime(2014, 1, 2, 1, 8, 23),
            "pixel_id": str(uuid4()),
            "data": {"visitor_id": "xyz"}
        },
    )
    saveToCassandra(sc.parallelize(pixels), keyspace, table)
    print "Wrote new pixels to Cassandra {!r}.{!r}".format(keyspace, table)
def streaming_logic():
    """
    :function: initialize the Spark context and run all the streaming logic
    :return: None
    """

    # - read configuration from file
    spark_config, kafka_config, cassandra_config = read_config()

    # - initialize the Spark context
    conf = SparkConf().setMaster(spark_config['master']).setAppName(spark_config['app_name']).set('spark.cassandra.connection.host', cassandra_config['cluster'])
    csc = CassandraSparkContext(conf=conf)
    csc.setLogLevel(spark_config['log_level'])
    ssc = StreamingContext(sparkContext=csc, batchDuration=spark_config['time_window'])

    # - creating kafka stream
    directKafkaStream = KafkaUtils.createDirectStream(ssc, [kafka_config['topic_in']], {'metadata.broker.list': kafka_config['cluster']})

    # - start processing the data
    # - output data structure: MetadData
    structured_stock_data = directKafkaStream.map(lambda data : preprocess_data(data=data))
    structured_stock_data.pprint(20)

    stock_data_list = structured_stock_data.reduceByKey(lambda a,b : aggregate_list(a,b))
    stock_data_list.pprint(20)

    # - get history data from cassandra
    alert_user_data = stock_data_list.mapValues(lambda dictlist : compute_stock_tending_in_window(dict_list=dictlist))
    alert_user_data.pprint(20)

    # - send alert to user
    alert_user_data.foreachRDD(lambda rdd : rdd.foreachPartition(lambda iter : send_alert_to_kafka(iterator=iter,kafka_config=kafka_config)))

    ssc.start()
    ssc.awaitTermination()
Example #6
def define_context():
    conf = SparkConf().setMaster("local[*]").setAppName(
        "twitter-artist-count").set("spark.cassandra.connection.host",
                                    "127.0.0.1")
    sc = CassandraSparkContext.getOrCreate(conf=conf)
    sc.setCheckpointDir("./checkpoints")
    ssc = StreamingContext(sc, 30)
    return ssc
 def __init__(self):
     self.spark_config = SparkConf()\
         .setMaster("local[4]")\
         .setAppName("Popularity")\
         .set("spark.cassandra.connection.host", "127.0.0.1")
     self.sparkContext = CassandraSparkContext(conf=self.spark_config)
     self.cluster = Cluster()
     self.session = self.cluster.connect("music_recommendation")
     self.raw_data = self.session.execute("SELECT song_id, timestamp "
                                          "FROM user_event "
                                          "WHERE action_type='listen';")
     self.session.execute("DROP TABLE IF EXISTS result_popularity ;")
     self.session.execute("CREATE TABLE IF NOT EXISTS result_popularity ("
                          "sid text PRIMARY KEY,"
                          "rank int);")
     self.current_year = datetime.datetime.now().year
     self.current_month = datetime.datetime.now().month
class Popularity(object):
    def __init__(self):
        self.spark_config = SparkConf()\
            .setMaster("local[4]")\
            .setAppName("Popularity")\
            .set("spark.cassandra.connection.host", "127.0.0.1")
        self.sparkContext = CassandraSparkContext(conf=self.spark_config)
        self.cluster = Cluster()
        self.session = self.cluster.connect("music_recommendation")
        self.raw_data = self.session.execute("SELECT song_id, timestamp "
                                             "FROM user_event "
                                             "WHERE action_type='listen';")
        self.session.execute("DROP TABLE IF EXISTS result_popularity ;")
        self.session.execute("CREATE TABLE IF NOT EXISTS result_popularity ("
                             "sid text PRIMARY KEY,"
                             "rank int);")
        self.current_year = datetime.datetime.now().year
        self.current_month = datetime.datetime.now().month

    def _compare_date_time(self, month, year):
        if self.current_year == year and self.current_month == month:
            return True
        return False

    def _handle_raw_data(self):
        new_data_set = list()
        for row in self.raw_data:
            month = row.timestamp.month
            year = row.timestamp.year
            if self._compare_date_time(month, year):
                new_tuple = tuple([row.song_id, 1])
                new_data_set.append(new_tuple)

        return new_data_set

    def calculate(self):
        dist_data = self.sparkContext.parallelize(self._handle_raw_data())
        counts = dist_data.reduceByKey(lambda a, b: a + b)
        counts = counts.sortBy(lambda a: a[1], ascending=False).take(10)
        result = self.sparkContext.parallelize(counts)
        result.saveToCassandra("music_recommendation", "result_popularity")

        print result.collect()
Example #9
    def __init__(self):
        self.spark_config = SparkConf() \
            .setMaster("local[4]") \
            .setAppName("ContentBased") \
            .set("spark.cassandra.connection.host", "127.0.0.1")
        self.sparkContext = CassandraSparkContext(conf=self.spark_config)

        self.cluster = Cluster()
        self.session = self.cluster.connect("music_recommendation")

        cql_cmd = "SELECT * FROM %s"
        cmd = cql_cmd % "i_profile_artist"
        self.i_artists_res = self.session.execute(cmd)
        cmd = cql_cmd % "i_profile_composer"
        self.i_composers_res = self.session.execute(cmd)
        cmd = cql_cmd % "i_profile_genre"
        self.i_genres_res = self.session.execute(cmd)
        cmd = cql_cmd % "u_profile_artist"
        self.u_artists_res = self.session.execute(cmd)
        cmd = cql_cmd % "u_profile_composer"
        self.u_composers_res = self.session.execute(cmd)
        cmd = cql_cmd % "u_profile_genre"
        self.u_genres_res = self.session.execute(cmd)

        cql_cmd = "SELECT uid, song_id FROM %s"
        events = self.session.execute(cql_cmd % "user_event")
        self.events = dict()
        for event in events:
            songs = self.events.get(event.uid)
            if songs is None:
                self.events[event.uid] = [event.song_id]
            else:
                self.events[event.uid].append(event.song_id)

        self.session.execute("CREATE TABLE IF NOT EXISTS "
                             "result_cb_user_item_genre ("
                             "uid text PRIMARY KEY,"
                             "recommendations list<text>);")
        self.session.execute("CREATE TABLE IF NOT EXISTS "
                             "result_cb_user_item_artist ("
                             "uid text PRIMARY KEY,"
                             "recommendations list<text>);")
        self.session.execute("CREATE TABLE IF NOT EXISTS "
                             "result_cb_user_item_composer ("
                             "uid text PRIMARY KEY,"
                             "recommendations list<text>);")
Example #10
 def __init__(self):
     self.sparkConfig = SparkConf()\
         .setMaster("local[4]")\
         .setAppName("MCF")\
         .set("spark.cassandra.connection.host", "127.0.0.1")\
         .set("spark.cassandra.input.consistency.level", "LOCAL_ONE")
     self.sparkContext = CassandraSparkContext(conf=self.sparkConfig)
     self.rank = 10
     self.numIteration = 10
     self.numberOfPreds = 10
     self.cluster = Cluster()
     self.session = self.cluster.connect("music_recommendation")
     self.rawData = self.session.execute("SELECT uid, song_id, payload "
                                         "FROM user_event "
                                         "WHERE action_type='rate'")
     self.session.execute("CREATE TABLE IF NOT EXISTS result_cf ("
                          "uid text PRIMARY KEY,"
                          "recommendations list<text>);")
 @classmethod
 def setUpClass(cls):
     super(CassandraTestCase, cls).setUpClass()
     cls.sc = CassandraSparkContext(conf=SparkConf().setAppName("PySpark Cassandra Test"))
     cls.session = Cluster().connect()
     cls.session.execute('''
         CREATE KEYSPACE IF NOT EXISTS test_pyspark_cassandra
         WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
     ''')
     cls.session.set_keyspace('test_pyspark_cassandra')
Example #12
def main():
    pwords = load_wordlist("../Dataset/positive.txt")
    nwords = load_wordlist("../Dataset/negative.txt")
    sterms = load_wordlist("../Dataset/keyWords.txt")
    conf = SparkConf().\
        setMaster("local[2]").\
        setAppName("TweeStreamer").\
        set("spark.cassandra.connection.host",\
        "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")
    sc = CassandraSparkContext(conf=conf)
    sc.setLogLevel("WARN")

    # Creating a streaming context with batch interval of 10 sec
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")

    kstream = KafkaUtils.createDirectStream(
        ssc,
        topics=['twitter-topic1'],
        kafkaParams={"metadata.broker.list": 'localhost:9092'})

    tweets = kstream.map(lambda x: json.loads(x[1]))
    tweets.count().map(lambda x: 'Tweets in this batch: %s' % x).pprint()
    tweetsUsentiment = tweets.map(
        lambda tweet: tweetwithSentiment(tweet, pwords, nwords, sterms))

    searchTermUsentiment = tweetsUsentiment.flatMap(
        lambda tweet: searchTermFunction(tweet, sterms)).reduceByKey(
            lambda a, b: a + b)
    searchTermUsentiment = searchTermUsentiment.map(
        lambda (key, value): {
            "searchterm": "_" + key,
            "insertion_time": datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S'),
            "sentiment": value
        })
    searchTermUsentiment.pprint()

    searchTermUsentiment.saveToCassandra("tweetdb", "searchtermtable")
    # searchTermSentiment = tweetsUsentiment.map(lambda tweet: searchTermFunction(tweet,sterms))

    ssc.start()
    ssc.awaitTerminationOrTimeout(1000)
    ssc.stop(stopGraceFully=True)
class MusicCollaborativeFiltering:
    def __init__(self):
        self.sparkConfig = SparkConf()\
            .setMaster("local[4]")\
            .setAppName("MCF")\
            .set("spark.cassandra.connection.host", "127.0.0.1")\
            .set("spark.cassandra.input.consistency.level", "LOCAL_ONE")
        self.sparkContext = CassandraSparkContext(conf=self.sparkConfig)
        self.rank = 10
        self.numIteration = 10
        self.numberOfPreds = 10
        self.cluster = Cluster()
        self.session = self.cluster.connect("music_recommendation")
        self.rawData = self.session.execute("SELECT uid, song_id, payload "
                                            "FROM user_event "
                                            "WHERE action_type='rate'")
        self.session.execute("CREATE TABLE IF NOT EXISTS result_cf ("
                             "uid text PRIMARY KEY,"
                             "recommendations list<text>);")

    def run(self):
        """ This function will run the collaborative filtering algorithm and
        get the predictions for the system.
        """
        userEventData = handle_raw_data(self.rawData)
        dist_data = self.sparkContext.parallelize(userEventData)
        userMapIdBase, musicMapIdBase = get_id_based_maps(dist_data)

        user_event = convert_to_index_based(dist_data, userMapIdBase,
                                            musicMapIdBase)

        ratings = convert_to_rating_type(user_event)

        test_data = user_event.map(lambda a: (a[0], a[1]))
        model = ALS.train(ratings, self.rank, self.numIteration)
        predictions = model.predictAll(test_data).map(lambda r:
                                                      ((r[0], r[1]), r[2]))
        ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(
            predictions)
        MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
        self.sparkContext.stop()
        return MSE
Example #14
class SparkCassandra:
    appNameCassandra = "WikiOlapCassandra"
    appNameSQL = "WikiOlapSQL"
    master = "spark://"+socket.gethostname()+":7077"

    confCassandra = SparkConf() \
        .setAppName(appNameCassandra) \
        .setMaster(master) \
        .set("spark.cassandra.connection.host", os.environ['CASSANDRA_PORT_9042_TCP_ADDR'])


    sc = CassandraSparkContext(conf=confCassandra)
    sqlContext = SQLContext(sc)
Example #15
class MusicCollaborativeFiltering:
    def __init__(self):
        self.sparkConfig = SparkConf()\
            .setMaster("local[4]")\
            .setAppName("MCF")\
            .set("spark.cassandra.connection.host", "127.0.0.1")\
            .set("spark.cassandra.input.consistency.level", "LOCAL_ONE")
        self.sparkContext = CassandraSparkContext(conf=self.sparkConfig)
        self.rank = 10
        self.numIteration = 10
        self.numberOfPreds = 10
        self.cluster = Cluster()
        self.session = self.cluster.connect("music_recommendation")
        self.rawData = self.session.execute("SELECT uid, song_id, payload "
                                            "FROM user_event "
                                            "WHERE action_type='rate'")
        self.session.execute("CREATE TABLE IF NOT EXISTS result_cf ("
                             "uid text PRIMARY KEY,"
                             "recommendations list<text>);")

    def run(self):
        """ This function will run the collaborative filtering algorithm and
        get the predictions for the system.
        """
        userEventData = handle_raw_data(self.rawData)
        dist_data = self.sparkContext.parallelize(userEventData)
        userMapIdBase, musicMapIdBase = get_id_based_maps(dist_data)

        user_event = convert_to_index_based(dist_data, userMapIdBase,
                                            musicMapIdBase)

        nonRatedUserMusic = get_non_rated_user_music(user_event)
        ratings = convert_to_rating_type(user_event)

        model = ALS.train(ratings, self.rank, self.numIteration)

        predictions = model.predictAll(nonRatedUserMusic)
        predictions = get_final_result(self.numberOfPreds, predictions,
                                       userMapIdBase, musicMapIdBase)
        predictions.saveToCassandra("music_recommendation", "result_cf",
                                    {"uid", "recommendations"})
    
    mongo_client= MongoClient()
    mongo_client.drop_database(db_out)
    mongo_client.close()
    print 'database cleared'
    
    col_tenant_id = 1
    col_user_id = 2
    col_item_id = 3

    num_to_recomm_per_user = 10
    num_to_recomm_per_item = 10
    
    
    conf = SparkConf().setAppName("PysparkCollaborativeFiltering").set("spark.cassandra.connection.host", spark_cassandra_connection_host)
    sc = CassandraSparkContext(conf=conf)
    sc.setCheckpointDir('checkpoint/')
    data = sc.cassandraTable("mykeyspace", "transactions",row_format=1).collect() # row_format: tuple
    # (id, tenant_id, user_id, item_id)
    tenant_ids = set(list(map(lambda x:x[col_tenant_id],data)))
    data_rdd = sc.parallelize(data)
    # data_rdd = sc.parallelize(data).map(list)
    
    all_results_per_user = sc.emptyRDD()
    all_results_per_item = sc.emptyRDD()
    
    for t_id in tenant_ids:
        print("\nComputing recommendation for tenant {}...\n".format(t_id))
        per_tenant_rdd = data_rdd.filter(
            lambda x: x[col_tenant_id] == t_id).map(
            lambda l: ((l[col_user_id],l[col_item_id]),1.0)).reduceByKey(
from pyspark import SparkConf
from pyspark_cassandra import  CassandraSparkContext
from pyspark.sql import SQLContext,Row
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
from pyspark.mllib.feature import StandardScaler
import numpy as np

conf = SparkConf().setAppName("Regression on Song Hotness Analysis").setMaster("spark://muziki:7077")
sc= CassandraSparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Make Spark less verbose
logger = sc._jvm.org.apache.log4j
logger.LogManager.getLogger("org"). setLevel( logger.Level.ERROR )
logger.LogManager.getLogger("akka").setLevel( logger.Level.ERROR )

def parsePoint(data):
	#return LabeledPoint(data[3],np.append(data[0:3],data[4:]))
	return LabeledPoint(data[0],data[1:])

# load the data from Cassandra into a DataFrame and drop rows with NA values
data=sc.cassandraTable("msd_01", "songs").select("song_hotttnesss","loudness","year","sentiment","tempo","unique_words").toDF()

data=data.filter("year>0").na.drop()
print data.count()


# Scale the features with Standard Scaler
data2 = data.map(lambda x: [x.song_hotttnesss, x.loudness, x.year, x.sentiment, x.tempo, x.unique_words])  # convert each sql.Row to a feature list
scaler = StandardScaler(withMean=True, withStd=True).fit(data2)  # fit a scaler on every column
scaledData = scaler.transform(data2)  # transform the data
Example #18
import json as json
import time
from pyspark.sql import SQLContext
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql.types import *
import pyspark_cassandra
from pyspark_cassandra import CassandraSparkContext

from operator import add

start_time = time.time()

if __name__ == "__main__":

    conf = SparkConf().setAppName("Query App").setMaster(
        "spark://spark01.cs.ucr.edu:7077")
    sc = CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # pulling data from cassandra

    x = sc.cassandraTable("census", "test", row_format=pyspark_cassandra.RowFormat.TUPLE).select("zipcode", "pop", "meanincome")\
        .map(lambda x: (x[0], x[1], x[2]))

    y = sc.cassandraTable("hospitals", "test", row_format=pyspark_cassandra.RowFormat.TUPLE).select("zipcode", "numberofdoctors")\
        .map(lambda x: (x[0], x[1]))\
        .reduceByKey(lambda x, y: x + y)

    df_x = sqlContext.createDataFrame(x)
    df_y = sqlContext.createDataFrame(y)

    # (zipcode, pop, meanincome, zipcode, numberofdoctors) -> (zipcode, pop, meanincome, numberofdoctors, pop/numberofdoctors)
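    # The original example is truncated here. A hedged sketch (not from the
    # source) of the join described in the comment above, assuming the default
    # tuple column names _1, _2, _3 that createDataFrame assigns:
    joined = df_x.join(df_y, df_x._1 == df_y._1) \
        .select(df_x._1.alias("zipcode"), df_x._2.alias("pop"),
                df_x._3.alias("meanincome"), df_y._2.alias("numberofdoctors"))
    joined = joined.withColumn("pop_per_doctor",
                               joined["pop"] / joined["numberofdoctors"])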
    day = result.day
    return int(time.mktime(time.strptime('%s-%s-%s' %(year,month,day), '%Y-%m-%d'))) - time.timezone
    pass

"""
spark-submit --packages anguenot:pyspark-cassandra:0.7.0 spark-calculate-pageview.py
"""

if __name__ == '__main__':
    if len(sys.argv) != 1:
        print("Usage: spark-calculate-pageview", file=sys.stderr)
        exit(-1)
    conf = SparkConf() \
	.setAppName("spark-calculate-pageview") \
	.set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    i = 1511740800
    while i <= 1514505600:
    # while True:
        date_temp = i
        i = i + 86400
        # current_date = getGMT()
        # future_date = getNextGMT()
        rdd = sc.cassandraTable("web_analytic","fsa_log_visit").select("m_date","location_path")\
                .filter(lambda x: date_temp <= int(x['m_date']) < i)
        if rdd.isEmpty() == False:
            x = rdd.toDF().groupBy(['location_path']).count()
            # x.show()
            array = []
            for row in x.collect():
    print 'saved to ' + collection

if __name__ == "__main__":
    
    t0 = time.time()
    
    mongo_client= MongoClient()
    mongo_client.drop_database(db_out)
    # print 'database cleared'


    num_to_recomm_per_user = 10
    num_to_recomm_per_item = 10
    conf = SparkConf().setAppName("PysparkCollaborativeFiltering")
    print 'conf'
    sc = CassandraSparkContext(conf=conf)
    sc.setCheckpointDir('checkpoint/')
    
    if LOAD_DATA_FROM_DB:
        
        data_rdd = sc.cassandraTable(cassandra_keyspace, cassandra_table) # row_format: Row
        # print data

        t1 = time.time()
        tenant_ids = data_rdd.map(lambda trans:trans[col_tenant_id]).distinct().collect()
        elapsed = (time.time() - t0)
        print ("\nIt took %.2fsec to complete" % elapsed)

        t1 = time.time()
        cluster = Cluster()
        session = cluster.connect(cassandra_keyspace)
Example #21
#	Pop int,
#	MeanIncome int,
#	TaxonomyCode1 varchar,
#	ProviderNumber varchar,
#	HospitalName varchar,
#	X varchar,
#	Y varchar,
#	Count int,
#	PRIMARY KEY(ZipCode, TaxonomyCode1, ProviderNumber)
#);

if __name__ == "__main__":

    conf = SparkConf().setAppName("Query App").setMaster(
        "spark://spark01.cs.ucr.edu:7077")
    sc = CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # pulling data from cassandra

    x = sc.cassandraTable("census", "test", row_format=pyspark_cassandra.RowFormat.TUPLE).select("zipcode", "pop", "meanincome")\
        .map(lambda x: (x[0], x[1], x[2]))

    # BusinessPracticeLocationPostalCode -> ZipCode in providers.taxonomy_count
    y = sc.cassandraTable("providers", "test", row_format=pyspark_cassandra.RowFormat.TUPLE).select("businesspracticelocationpostalcode", "taxonomycode1")\
        .map(lambda x: (x[0], x[1]))

    df_x = sqlContext.createDataFrame(x)
    df_y = sqlContext.createDataFrame(y)

    # joining x and y
Example #22
import json
import time
from dateutil import tz
from datetime import datetime, timezone, date, timedelta
"""
spark-submit --packages anguenot:pyspark-cassandra:0.7.0 spark-calculate-device.py
"""

if __name__ == '__main__':
    if len(sys.argv) != 1:
        print("Usage: spark-calculate-device.py ", file=sys.stderr)
        exit(-1)
    conf = SparkConf() \
 .setAppName("spark-calculate-device") \
 .set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    sql = SQLContext(sc)

    while True:
        rdd = sc.cassandraTable("web_analytic", "fsa_log_visit").select(
            "config_device", "fsa")
        if rdd.isEmpty() == False:
            x = rdd.toDF().dropDuplicates(['fsa'])
            x = x.groupBy(['config_device']).count()
            array = []
            for row in x.collect():
                x = {
                    'config_device': row['config_device'],
                    'device_count': row['count'],
                    'bucket': 3
import time
from dateutil import tz
from datetime import datetime, timezone, date, timedelta

"""
spark-submit --packages anguenot:pyspark-cassandra:0.7.0 spark-calculate-last-like.py
"""

if __name__ == '__main__':
    if len(sys.argv) != 1:
        print("Usage: spark-calculate-last-like.py ", file=sys.stderr)
        exit(-1)
    conf = SparkConf() \
	.setAppName("spark-calculate-last-like") \
	.set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    sql = SQLContext(sc)

    while True:
        rdd = sc.cassandraTable("db","user_event_model").select("config_browser","m_date")
        if rdd.isEmpty() == False:
            x = rdd.toDF().groupBy(['config_browser']).count()
            array = []
            for row in x.collect():
                x = {
                    'config_browser': row['config_browser'], 
                    'browser_count': row['count'],
                    'bucket':4
                    }
                array.append(x)
import json
import time
from dateutil import tz
from datetime import datetime, timezone, date, timedelta
"""
spark-submit --packages anguenot:pyspark-cassandra:0.7.0 spark-calculate-location.py
"""

if __name__ == '__main__':
    if len(sys.argv) != 1:
        print("Usage: spark-calculate-location.py ", file=sys.stderr)
        exit(-1)
    conf = SparkConf() \
 .setAppName("spark-calculate-location") \
 .set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    sql = SQLContext(sc)

    while True:
        rdd = sc.cassandraTable("web_analytic", "fsa_log_visit").select(
            "location_country_name", "location_country_code", "fsa")
        # rdd.toDF().show()
        if rdd.isEmpty() == False:
            x = rdd.toDF().dropDuplicates(['fsa'])
            x = x.groupBy(['location_country_name',
                           'location_country_code']).count()
            array = []
            for row in x.collect():
                x = {
                    'location_country_name': row['location_country_name'],
Example #25
import json
import time
from dateutil import tz
from datetime import datetime, timezone, date, timedelta
"""
spark-submit --packages anguenot:pyspark-cassandra:0.7.0 spark-calculate-language.py
"""

if __name__ == '__main__':
    if len(sys.argv) != 1:
        print("Usage: spark-calculate-language.py ", file=sys.stderr)
        exit(-1)
    conf = SparkConf() \
 .setAppName("spark-calculate-language") \
 .set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    sql = SQLContext(sc)

    while True:
        rdd = sc.cassandraTable("web_analytic", "fsa_log_visit").select(
            "location_browser_lan", "fsa")
        if rdd.isEmpty() == False:
            x = rdd.toDF().dropDuplicates(['fsa'])
            x = x.groupBy(['location_browser_lan']).count()
            array = []
            for row in x.collect():
                x = {
                    'browser_language': row['location_browser_lan'],
                    'count': row['count'],
                    'bucket': 6
Example #26
#			print('=====================================' + s[1])
			vecSec[int(s[0])] = float(s[1])
	for i in range(3000):
		vecFirst[i] = vecFirst[i] + vecSec[i]
		
	result = ''
	for i in range(3000):
		if vecFirst[0] > 0:
			result = '%s;%d %.4f' % (result, i, vecFirst[i])
	return result
	

conf = SparkConf()\
	.set("spark.cassandra.connection.host", "localhost")

sc = CassandraSparkContext(conf=conf)

vecSum = sc.cassandraTable('reishi', 'dockmeans')\
	.select("cluster_id", "vector")\
	.where("cluster_id=?", 0)\
	.map(lambda x: (x['cluster_id'], x['vector']))\
	.reduceByKey(lambda x, y: maxVector(x, y))\
	.collect()
	
vector = []
#print(vecSum)

v = vecSum[0]
v = v[1]
#print('=================================' + v)
splt = v.split(';')
Example #27
start_time = datetime.datetime(2010, 10, 7, 0, 0)
sample_frequency = datetime.timedelta(minutes=30)
num_tests = 2
list_times = [start_time + x * sample_frequency for x in range(num_tests)]
num_of_meters = 30
window_size = datetime.timedelta(hours=24)
init_model_params = {}
meter_ids = get_meters()
mk = 3
lrate = 0.75
SE = 0
i = 1

program_start_time = t.time()

sc = CassandraSparkContext(appName="PySpark Cassandra Test", master="local[*]")
'''DataFrame Tests'''
#for current_time in list_times:
current_time = list_times[0]
readings = sc \
    .cassandraTable("cer", "readings") \
    .select("meter_id", "date", "measurement") \
    .where("date <= '{}' AND date >= '{}'".format(current_time, current_time-mk*sample_frequency))\
    .map(lambda x: (x["meter_id"], (x["date"], x["measurement"])))\
    .groupByKey()\
    .mapValues(lambda x: pd.Series(list(i[1] for i in x), index=list(i[0] for i in x)))

model_parameters = sc \
    .cassandraTable("cer", "models") \
    .map(lambda x: (x["meter_id"], np.asanyarray(x["w"])))
#import matplotlib.pyplot as plt

from pyspark_cassandra import CassandraSparkContext, Row
from pyspark import SparkContext, SparkConf
from subprocess import call
import subprocess
import commands
aa=commands.getstatusoutput("b=0")


conf = SparkConf() \
    .setAppName("User Food Migration") \
    .setMaster("spark://128.138.202.110:7077") \
    .set("spark.cassandra.connection.host", "128.138.202.117")

sc = CassandraSparkContext(conf=conf)

users = sc.cassandraTable("junk", "trump2")
trump = users.map(lambda x:
	       {"tweet_id":x['tweet_id'],
		"tweet":x['tweet']} )





#to access Twitter API
consumer_key = "43b4urzsW8nMY3oGzB5tIIM8B"
consumer_secret = "fbGLMhkFyipYbTAz0s0S6yrN6cDGGWnEMmNaciceYjr4sgEdP2"
garbage = 0
Example #29
    8: 'SUPERIOR COMPLETO',
}

ETNIA = {
    1: 'BRANCA',
    2: 'PRETA',
    3: 'PARDA',
    4: 'AMARELA',
    5: 'INDÍGENA',
}

# Create the SparkContext
conf = SparkConf() \
    .setAppName("Pergunta1") \
    .set("spark.cassandra.connection.host", "10.7.40.94")
csc = CassandraSparkContext(conf=conf)

# Prepare the RDDs for the tables.
candidatos = csc.cassandraTable("eleicoes", "candidatos2014")
resultados = csc.cassandraTable("eleicoes", "resultados2014")

# Fetch the codes of the distinct elected candidates (distinct so as to disregard the runoff round).
# There are 3 types of elected candidates (1, 2, 3).

cod_eleitos1 = resultados.select('sq_candidato').where(
    "codigo_sit_cand_tot=? ", 1)
cod_eleitos2 = resultados.select('sq_candidato').where(
    "codigo_sit_cand_tot=? ", 2)
cod_eleitos3 = resultados.select('sq_candidato').where(
    "codigo_sit_cand_tot=? ", 3)
	return (str(val[1]), [str(val[2])])

def produce_song_pairs(song_list):
	song_pairs = combinations(song_list, 2)
	song_pairs_list = map(lambda song_pair: (song_pair, 1), song_pairs)
	return song_pairs_list

def cassandra_row_format(song_pair):
	songs = song_pair[0]
	frequency = song_pair[1]
	return [{"song_id": int(songs[0]), "freq_song_id": int(songs[1]), "frequency": frequency}, {"song_id": int(songs[1]), "freq_song_id": int(songs[0]), "frequency": frequency}]


if __name__ == "__main__":
	conf = SparkConf().setAppName("FrequentPatternsSongs").setMaster(config.SPARK_MASTER).set("spark.cassandra.connection.host", config.CASSANDRA_SEED_NODE_IP)
	sc = CassandraSparkContext(conf=conf)
	frequency_threshold = 3

	filename = datetime.now().strftime("%Y-%m-%d")+"-usersonglog.txt"

	sc.textFile(config.HDFS_URL+":"+config.HDFS_PORT+config.LOG_FOLDER+filename) \
		.filter(time_range_filter) \
		.map(parse_log_entry) \
		.reduceByKey(lambda song1, song2: song1+song2) \
		.map(lambda x: sorted(set(x[1]))) \
		.flatMap(produce_song_pairs) \
		.reduceByKey(lambda a,b: a+b) \
		.filter(lambda song_pair: song_pair[1] > frequency_threshold) \
		.flatMap(cassandra_row_format) \
		.saveToCassandra(config.CASSANDRA_KEYSPACE, "frequent_song_pairs")
Example #31
    day = result.day
    return int(time.mktime(time.strptime('%s-%s-%s' %(year,month,day), '%Y-%m-%d'))) - time.timezone
    pass

"""
spark-submit --packages anguenot:pyspark-cassandra:0.7.0 spark-calculate-pageview-total-user.py
"""

if __name__ == '__main__':
    if len(sys.argv) != 1:
        print("Usage: spark-calculate-pageview-total-user", file=sys.stderr)
        exit(-1)
    conf = SparkConf() \
	.setAppName("spark-calculate-pageview-total-user") \
	.set("spark.cassandra.connection.host", "10.88.113.74")
    sc = CassandraSparkContext(conf=conf)
    spark = SparkSession(sc)
    # i = 1514505600
    # while i <= 1514764800:
    while True:
        current_date = getGMT()
        future_date = getNextGMT()
        # date_temp = i
        # i = i + 86400
        raw = sc.cassandraTable("web_analytic","fsa_log_visit").select("m_date","userid","fsa","fsid","location_path")
        if raw.isEmpty() == False:
            df = raw.toDF()
            current_day = df.filter( df.m_date >= current_date ).filter(df.m_date < future_date).dropDuplicates(['fsa',"fsid"]).select('fsa','fsid')
            previous_day =  df.filter(df.m_date < current_date).select('fsa','fsid')
            # current_day = df.filter( df.m_date >= date_temp ).filter(df.m_date < i).dropDuplicates(['fsa',"fsid"]).select('fsa','fsid')
            # previous_day =  df.filter(df.m_date < date_temp).select('fsa','fsid')
Example #32
    year=datetime.now().year
    month=datetime.now().month
    day=datetime.now().day
    result = int(time.mktime(time.strptime('%s-%s-%s' %(year,month,day), '%Y-%m-%d'))) - time.timezone
    return int(datetime.now().timestamp())

if __name__ == '__main__':

    if len(sys.argv) != 3:
        print("Usage: direct_kafka_wordcount.py <broker_list> <topic>", file=sys.stderr)
        exit(-1)

    conf = SparkConf() \
	.setAppName("spark-streaming") \
	.set("spark.cassandra.connection.host", "127.0.0.1")
    sc = CassandraSparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)
    brokers, topic = sys.argv[1:]
    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    # kvs.pprint()
    parsed = kvs.map(lambda x: json.loads(x[1]))
    # parsed.pprint()
    ob = parsed.map(lambda x: 
        { 
            "idx_user": x['idx_user'],
            "time": x['time'],
            "idx_movie": x['idx_movie'],
            "movie_id": x['movie_id'],
            "value": x['value'],
            "type_event": x['type_event']
            
Example #33
def transfer_time(text):
    #return "2018-06-25"
    return datetime.today().strftime("%Y-%m-%d %H:%M:%S")
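
# Note: getSparkSessionInstance and func (presumably a UDF that extracts the
# hashtag from the tweet text) are assumed to be defined elsewhere in the
# original script; they are not shown in this snippet.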

def process(rdd):
    spark = getSparkSessionInstance(rdd.context.getConf())
    tweetsDataFrame = spark.read.json(rdd)
    df = tweetsDataFrame.withColumn('hashtag', func(tweetsDataFrame.text))
    df = df.withColumn('time',transfer_time(tweetsDataFrame.time))
    df.createOrReplaceTempView("historicaltweets")
    df = spark.sql("SELECT MAX(time) AS time,hashtag, count(*) AS count FROM historicaltweets WHERE hashtag IS NOT NULL GROUP BY hashtag ORDER BY count DESC")
    rdd = df.rdd.map(tuple)
    rdd.saveToCassandra("twitter","tweet")
    df.show()

if __name__ == "__main__":
    sc = CassandraSparkContext(appName="tweet")
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc,600)
    topic_name = "twitter"
    streamFromKafka = KafkaUtils.createDirectStream(ssc, [topic_name],{"metadata.broker.list":'*'})
    lines = streamFromKafka.map(lambda x: x[1])
    lines.count().pprint()
    lines.foreachRDD(process)
    #text_counts = lines.map(lambda tweet: (tweet['hashtag'],1)).reduceByKey(lambda x,y: x + y)
    ssc.start() 
    ssc.awaitTermination()



Example #34
from pyspark_cassandra import CassandraSparkContext, Row
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext


conf = SparkConf().setAppName("NBARetrieval") \
    .set("spark.cassandra.connection.timeout_ms","20000") \
    .set("spark.cassandra.connection.host", "192.168.0.10") \
    .set("spark.cassandra.auth.username", "mdi") \
    .set("spark.cassandra.auth.password", "W2yIJw6ntl5RYC54VChe3lJoXa")


sc = CassandraSparkContext(conf=conf)
rdd = sc.cassandraTable("test", "kv")

print rdd.first()
Example #35
import pyspark
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SQLContext #, HiveContext
#from pyspark.storagelevel import StorageLevel
import atexit
from pyspark_cassandra import CassandraSparkContext
from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()

#conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")

sc = CassandraSparkContext(conf = conf)
atexit.register(lambda: sc.stop())

rdd = sc.cassandraTable("el_test", "cockpit2_testIndexes")


# for( d in range 2015-10-01 ~ 2015-10-10 ) do:
#
#    SELECT url,date,site,cnts,cnt from cockpit2_allTogether where `date` = d and site = giga and tags contains resort:android
#
# after this query, every row has to be updated with new value for cnts:
#
# UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes + fb_socialFacebookShares + fb_socialFacebookComments + tw_socialTwitterShares + ga_socialGooglePlusShares + gigya_socialComments

def filterDateRage(_from, _to, col):
    loc = timezone('Europe/Berlin')
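
# The snippet ends here. A hedged sketch (not part of the original) of the
# recomputation the comments above describe, treating each Cassandra row as a
# dict and assuming the counter columns named in the UPDATE line exist:
def recompute_cnts(row):
    counters = ("ga_videoPlays", "sda_downloads", "fb_socialFacebookLikes",
                "fb_socialFacebookShares", "fb_socialFacebookComments",
                "tw_socialTwitterShares", "ga_socialGooglePlusShares",
                "gigya_socialComments")
    updated = dict(row)
    updated["cnts"] = sum(updated[c] or 0 for c in counters)
    return updated

# Possible usage over the date range from the comment above:
# rdd.filter(lambda r: "2015-10-01" <= str(r["date"]) <= "2015-10-10") \
#    .map(recompute_cnts) \
#    .saveToCassandra("el_test", "cockpit2_testIndexes")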
Example #36
from pyspark_cassandra import CassandraSparkContext
from pyspark import SparkConf

conf = SparkConf() \
    .setAppName("ZeusDB") \
    .setMaster("local") \
    .set("spark.cassandra.connection.host", "YOUR_CLUSTER_HOST_NAME")

sc = CassandraSparkContext(conf=conf)

result = sc.cassandraTable("zeus", "edge") \
    .select("destination", "visit_count", "type") \
    .filter(lambda x: x["type"] == "visited") \
    .map(lambda x: (x["destination"], int(x["visit_count"]))) \
    .reduceByKey(lambda a, b: a + b) \
    .top(10, key=lambda x: x[1])

print
print "================================"
print "TOP 10 FREQUENTLY VISITED PLACES"
print "================================"
for row in result:
    print str(row[0]) + "\t\t" + str(row[1])
print
print "================================"
print
Example #37
                                  'test_93').on('name').collect())

        self.assertEqual(len(joined), 2)


if __name__ == '__main__':
    try:
        # connect to cassandra and create a keyspace for testing
        CassandraTestCase.session = Cluster().connect()
        CassandraTestCase.session.execute('''
            CREATE KEYSPACE IF NOT EXISTS %s WITH
            replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
        ''' % (CassandraTestCase.keyspace, ))
        CassandraTestCase.session.set_keyspace(CassandraTestCase.keyspace)

        # create a cassandra spark context
        CassandraTestCase.sc = CassandraSparkContext(
            conf=SparkConf().setAppName("PySpark Cassandra Test"))

        # perform the unit tests
        unittest.main()
        # suite = unittest.TestLoader().loadTestsFromTestCase(RegressionTest)
        # unittest.TextTestRunner().run(suite)
    finally:
        # stop the spark context and cassandra session
        if hasattr(CassandraTestCase, 'sc'):
            CassandraTestCase.sc.stop()
        if hasattr(CassandraTestCase, 'session'):
            CassandraTestCase.session.shutdown()
from pyspark_cassandra import CassandraSparkContext, Row
from pyspark import SparkContext, SparkConf

conf = SparkConf().setAppName("PySpark Cassandra Test").set("spark.cassandra.connection.host", "127.0.0.1")
sc = CassandraSparkContext(conf=conf)
data = sc.cassandraTable("mykeyspace", "user",row_format = 1).collect()
rdd = sc.parallelize(data)
print (rdd.collect())
    t0 = time.time()
    
    mongo_client= MongoClient()
    mongo_client.drop_database(db_out)
    print 'database cleared'

    
    col_tenant_id = 1
    col_user_id = 2
    col_item_id = 3

    num_to_recomm_per_user = 10
    num_to_recomm_per_item = 10
    conf = SparkConf().setAppName("PysparkCollaborativeFiltering").set("spark.cassandra.connection.host", spark_cassandra_connection_host)
    print ('conf')
    sc = CassandraSparkContext(conf=conf)
    sc.setCheckpointDir('checkpoint/')
    
    if LOAD_DATA_FROM_DB:
        
        data = sc.cassandraTable(cassandra_keyspace, cassandra_table, row_format=1).collect() # row_format: tuple
        # (id, tenant_id, user_id, item_id)
        tenant_ids = set(list(map(lambda x:x[col_tenant_id],data)))
        data_rdd = sc.parallelize(data)
        # data_rdd = sc.parallelize(data).map(list)
        
        all_results_per_user = sc.emptyRDD()
        all_results_per_item = sc.emptyRDD()
        
        for t_id in tenant_ids:
            print("\nComputing recommendation for tenant {}...\n".format(t_id))
--conf spark.cassandra.connection.host=52.32.192.156,52.32.200.206,54.70.213.12 \
/home/ubuntu/pipeline/kafka_spark_cass_imageQuery.py localhost:2181 imgSearchRequests

#Opening spark shell with cassandra
$SPARK_HOME/bin/pyspark \
--master spark://ip-172-31-0-173:7077 \
--packages TargetHolding/pyspark-cassandra:0.3.5 \
--conf spark.cassandra.connection.host=52.32.192.156,52.32.200.206,54.70.213.12
"""


db_table=0 #global rdd
producer = KafkaProducer(bootstrap_servers = 'ec2-52-41-224-1.us-west-2.compute.amazonaws.com:9092', value_serializer=lambda v: json.dumps(v).encode('ascii'))
# Kafka and Spark Streaming specific vars
batch_interval = 5 #question, why is batch interval of 5 so much better than 3? 3 seemed like needed to wait a long time
sc = CassandraSparkContext(appName="PythonStreamingVSS") #http://www.slideshare.net/JonHaddad/intro-to-py-spark-and-cassandra
ssc = StreamingContext(sc, batch_interval)
keyspace="vss_large"

"""
Example usages:
db_table.select("hashvalue", "partitionby","videoname").map(lambda x: x['hashvalue']).take(3)
will result in
[u'6daab6a32cb6b209', u'77a888d7aa2f882b', u'571d23371cc358d5']
"""

def main():
    global db_table;
    global producer;
    if len(sys.argv) != 3:
        #print("Usage: thisfile.py <zk> <topic>", file=sys.stderr) #i get an error about file=sys.stderr for some reason
from pyspark_cassandra import CassandraSparkContext, Row
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext # needed for toDF()

conf = SparkConf() \
    .setAppName("User Food Migration") \
    .setMaster("spark://127.0.0.1:7077") \
    .set("spark.cassandra.connection.host", "127.0.0.1")

sc = CassandraSparkContext(conf=conf)
sql = SQLContext(sc)

users = sc.cassandraTable("demo", "user").toDF()
food_count = users.select("favorite_food").groupBy("favorite_food").count()
Example #42
import pyspark
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SQLContext #, HiveContext
#from pyspark.storagelevel import StorageLevel
import atexit
from pyspark_cassandra import CassandraSparkContext
from datetime import tzinfo, timedelta, datetime
from pytz import timezone

conf = SparkConf()

#conf.setMaster("local")
conf.setAppName("My app")
conf.set("spark.cassandra.connection.host", "10.0.40.42")

sc = CassandraSparkContext(conf = conf)
atexit.register(lambda: sc.stop())

rdd = sc.cassandraTable("el_test", "cockpit2_testTogether")


# for( d in range 2015-10-01 ~ 2015-10-10 ) do:
#
#    SELECT url,date,site,cnts,cnt from cockpit2_allTogether where `date` = d and site = giga and tags contains resort:android
#
# after this query, every row has to be updated with new value for cnts:
#
# UPDATE cnts = ga_videoPlays + sda_downloads + fb_socialFacebookLikes + fb_socialFacebookShares + fb_socialFacebookComments + tw_socialTwitterShares + ga_socialGooglePlusShares + gigya_socialComments

def filterDateRage(_from, _to, col):
    loc = timezone('Europe/Berlin')
import time

# Example Usage(will get tweets from Jan 22 but not 24):
# *.py "Jan 22" "Jan 24"

startDate = str(sys.argv[1])
endDate = str(sys.argv[2])

conf = (
    SparkConf()
    .setAppName("User Food Migration")
    .setMaster("spark://128.138.202.110:7077")
    .set("spark.cassandra.connection.host", "128.138.202.117")
)

sc = CassandraSparkContext(conf=conf)

if __name__ == "__main__":

    rdd = sc.cassandraTable("junk", "bernie4")
    temp = 0
    # returns list of tweets
    listBernie = (
        rdd.filter(lambda row: row.created_at[4:] > startDate)
        .filter(lambda row: row.created_at[4:] < endDate)
        .collect()
    )
    for tweet in listBernie:
        if tweet.retweet_count > 0:
            print tweet.retweet_count
            temp += 1
Example #44
from pyspark import SparkConf, SparkContext
import pyspark_cassandra
from pyspark_cassandra import CassandraSparkContext

conf = SparkConf()\
    .setAppName("PySpark Cassandra Test") \
    .setMaster("local[2]") \
    .set("spark.cassandra.connection.host","52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")
#	.set("spark.cassandra.connection.host","52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")

sc = CassandraSparkContext(conf=conf)
print((sc.cassandraTable(
    "tweetdb", "tweettable").select("tweet").map(lambda a: a).collect()))
#sc.pprint()

#rdd = sc.parallelize([{"tweet":"first second third tweet"}])

#rdd.saveToCassandra(
#	"tweetdb",
#	"tweettable")
def time_range_filter(line):
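	# Note: five_weeks_back and now are assumed to be epoch-second timestamps
	# defined elsewhere in the original script (not shown in this snippet).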
	val = line.split(",")
	if len(val) < 3:
		return False
	return (int(val[0]) > five_weeks_back and int(val[0]) < (now+1))

def parse_log_entry(line):
	val = line.split(",")
	if len(val) < 3:
		return None
	return (str(val[1]), [str(val[2])])

if __name__ == "__main__":
	conf = SparkConf().setAppName("UserUserRelevance").setMaster(config.SPARK_MASTER).set("spark.cassandra.connection.host", config.CASSANDRA_SEED_NODE_IP)
	sc = CassandraSparkContext(conf=conf)
	
	filename = datetime.now().strftime("%Y-%m-%d")+"-usersonglog.txt"

	users = sc.textFile(config.HDFS_URL+":"+config.HDFS_PORT+config.LOG_FOLDER+filename) \
						.filter(time_range_filter) \
						.map(parse_log_entry) \
						.keys() \
						.collect()

	song_map = {} # store song to user mapping for use in later stages

	usersongdb = sc.cassandraTable(config.CASSANDRA_KEYSPACE, "user_to_song")
	songuserdb = sc.cassandraTable(config.CASSANDRA_KEYSPACE, "song_to_user")

	for user in users: