Example #1
from pyspark.mllib.fpm import FPGrowth
from pyspark import SparkContext
from cassandra.cluster import Cluster


if __name__ == "__main__":
    # First connection; this only works when executed with python3
    cluster = Cluster()
    session = cluster.connect()
    results = session.execute("SELECT * FROM dev.facture")
    print(results)
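    # The ResultSet is iterable; a sketch (not part of the original) for printing the
    # individual rows of dev.facture instead of the ResultSet object itself:
    for row in results:
        print(row)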

    # Second approach; this also only works when executed with python3
    sc = SparkContext("spark://127.0.0.1:7077", "First App")
    rdd = sc.cassandraTable("dev", "facture")  # requires the Cassandra connector (DSE / pyspark-cassandra) on the Spark classpath
    transactions = rdd.map(lambda x: list(set(x)))
    model = FPGrowth.train(transactions, minSupport=0.2, numPartitions=10)
    result = model.freqItemsets().collect()
    for fi in result:
        print(fi)

#    output = sqlContext.createDataFrame(rowRDD)
#    output.write \
#        .format("org.apache.spark.sql.cassandra") \
#        .options(table='records1s', keyspace='ad_flow') \
#        .save(mode="append")
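
# A hedged sketch (not part of the original) of how the commented-out writer above
# could persist the FP-Growth results back to Cassandra via the DataFrame API of the
# spark-cassandra-connector; the SQLContext and the target table "frequent_items" in
# keyspace "dev" are assumptions.
from pyspark.sql import SQLContext, Row

sqlContext = SQLContext(sc)
rowRDD = model.freqItemsets().map(lambda fi: Row(items=fi.items, freq=fi.freq))
output = sqlContext.createDataFrame(rowRDD)
output.write \
    .format("org.apache.spark.sql.cassandra") \
    .options(table='frequent_items', keyspace='dev') \
    .save(mode="append")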

import sys
import json

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: kafka_wordcount.py <zk> <EventsTopic> ", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="biddingStream")
    ssc = StreamingContext(sc, 10)
    bidprice = sc.cassandraTable("ad_flow", "bidprice")
    #    bidprice=sqlContext.read\
    #                       .format("org.apache.spark.sql.cassandra")\
    #                       .options(keyspace="ad_flow", table="bidprice")\
    #                       .load().rdd
    tmp = {}
    for item in bidprice.collect():
        tmp[item['pid']] = item['price']
    bidpriceBC = sc.broadcast(tmp)
    print(tmp)
    zkQuorum, topic1 = sys.argv[1:]  # two CLI args, matching the usage message above
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "sparkStreamingGetNewEvents",
                                  {topic1: 1})
    lines = kvs.map(lambda x: json.loads(x[1]))
    #    lines.pprint()
    #    uidVec=lines.map(lambda x: ((x['uid'], x['tick']), np.asarray([float(i) for i in x['topic']])))\
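    # Sketch of how the job could continue (assumption, not in the original): look up
    # each event's 'pid' in the broadcast bid prices, then start the streaming context
    # so the DStream is actually processed.
    priced = lines.map(lambda e: (e.get('pid'), bidpriceBC.value.get(e.get('pid'))))
    priced.pprint()
    ssc.start()
    ssc.awaitTermination()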
rdd = sc.cassandraTable('test', 'kv')

print(rdd.first().value)
print(rdd.first()[0])
print(rdd.first()[1])
print(rdd.collect())
print(rdd.filter(lambda row: row.key > 1).collect())

rdd.saveToCassandra('test', 'kv2', ['key'])

rdd.saveToCassandra('test', 'kv2')

otherRdd = sc.parallelize([{"key": 3, "value": "foobar"}])
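
# Presumably (assumption, not shown in the original) the dict-based RDD above is meant
# to be written back the same way as the table-backed RDD; with pyspark-cassandra:
otherRdd.saveToCassandra('test', 'kv')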

from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("Spark App")
sc = SparkContext(conf=conf)
x = sc.cassandraTable("test", "kv").collect()
print(x)

import os
import sys
from os import getenv
from os.path import join

HOME = getenv("HOME")
DSE_HOME = getenv("DSE_HOME", join(HOME, "dse-4.6.0"))
SPARK_HOME = join(DSE_HOME, "resources", "spark")

os.environ['SPARK_HOME'] = SPARK_HOME

PYSPARK_DIR = join(DSE_HOME, 'resources', 'spark', 'python')
ADD_PATH = [PYSPARK_DIR]
for PATH in ADD_PATH:
    sys.path.insert(1, PATH)
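
# With SPARK_HOME and sys.path set up as above, pyspark should be importable from a
# plain python interpreter (sketch; the app name is an assumption). Depending on the
# DSE version, the bundled py4j may also need to be added to sys.path.
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("DSE PySpark App")
sc = SparkContext(conf=conf)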