from pyspark.mllib.fpm import FPGrowth
from pyspark import SparkContext
from cassandra.cluster import Cluster

if __name__ == "__main__":
    # First approach: a plain CQL query through the DataStax Python driver
    # (works only when executed with python3).
    cluster = Cluster()
    session = cluster.connect()
    results = session.execute("SELECT * FROM dev.facture")
    print(results)

    # Second approach: read the table as an RDD via sc.cassandraTable
    # (pyspark-cassandra); also works only when executed with python3.
    sc = SparkContext("spark://127.0.0.1:7077", "First App")
    rdd = sc.cassandraTable("dev", "facture")

    # Deduplicate each row's items, then mine frequent itemsets with FP-Growth.
    transactions = rdd.map(lambda x: list(set(x)))
    model = FPGrowth.train(transactions, minSupport=0.2, numPartitions=10)
    result = model.freqItemsets().collect()
    for fi in result:
        print(fi)
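    # For comparison, a minimal sketch (an addition, not from the original) of
    # the same read through the Spark Cassandra Connector's DataFrame API,
    # i.e. the org.apache.spark.sql.cassandra format the next script uses in
    # its comments. Assumes the connector jar is on the driver's classpath.
    from pyspark.sql import SQLContext
    sqlContext = SQLContext(sc)
    df = sqlContext.read \
        .format("org.apache.spark.sql.cassandra") \
        .options(keyspace="dev", table="facture") \
        .load()
    print(df.rdd.map(lambda row: list(set(row))).first())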
import sys
import json

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

# output=sqlContext.createDataFrame(rowRDD)
# output.write\
#     .format("org.apache.spark.sql.cassandra")\
#     .options(table='records1s', keyspace='ad_flow')\
#     .save(mode="append")

if __name__ == "__main__":
    # Three values are unpacked from sys.argv below, so argc must be 4.
    if len(sys.argv) != 4:
        print("Usage: kafka_wordcount.py <zk> <topic1> <topic2>", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="biddingStream")
    ssc = StreamingContext(sc, 10)

    # Static bid prices, read from Cassandra via pyspark-cassandra.
    bidprice = sc.cassandraTable("ad_flow", "bidprice")
    # bidprice=sqlContext.read\
    #     .format("org.apache.spark.sql.cassandra")\
    #     .options(keyspace="ad_flow", table="bidprice")\
    #     .load().rdd

    # Broadcast a pid -> price map so executors can look prices up locally.
    tmp = {}
    for item in bidprice.collect():
        tmp[item['pid']] = item['price']
    bidpriceBC = sc.broadcast(tmp)
    print(tmp)

    # Consume JSON events from Kafka; only topic1 is subscribed to here.
    zkQuorum, topic1, topic2 = sys.argv[1:]
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "sparkStreamingGetNewEvents", {topic1: 1})
    lines = kvs.map(lambda x: json.loads(x[1]))
    # lines.pprint()
    # uidVec=lines.map(lambda x: ((x['uid'], x['tick']), np.asarray([float(i) for i in x['topic']])))\
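    # A minimal sketch (an addition, not from the original) of how the
    # truncated pipeline above might continue: price each event through the
    # broadcast map, print each batch, and start the streaming context.
    # The 'pid' lookup field and the 0.0 default are assumptions.
    priced = lines.map(lambda x: (x.get('uid'), bidpriceBC.value.get(x.get('pid'), 0.0)))
    priced.pprint()

    ssc.start()
    ssc.awaitTermination()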
import os
import sys
from os import getenv
from os.path import join

# Put DSE's bundled PySpark on the import path before importing pyspark.
HOME = getenv("HOME")
DSE_HOME = getenv("DSE_HOME", join(HOME, "dse-4.6.0"))
SPARK_HOME = join(DSE_HOME, "resources", "spark")
os.environ['SPARK_HOME'] = SPARK_HOME
PYSPARK_DIR = join(DSE_HOME, 'resources', 'spark', 'python')
ADD_PATH = [PYSPARK_DIR]
for PATH in ADD_PATH:
    sys.path.insert(1, PATH)

from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("Spark App")
sc = SparkContext(conf=conf)

# Read test.kv (sc.cassandraTable comes from pyspark-cassandra / DSE's PySpark).
x = sc.cassandraTable("test", "kv").collect()
print(x)

rdd = sc.cassandraTable('test', 'kv')
print(rdd.first().value)
print(rdd.first()[0])
print(rdd.first()[1])
print(rdd.collect())
print(rdd.filter(lambda row: row.key > 1).collect())

# Write back, selecting only the 'key' column or keeping all columns.
rdd.saveToCassandra('test', 'kv2', ['key'])
rdd.saveToCassandra('test', 'kv2')

otherRdd = sc.parallelize([{"key": 3, "value": "foobar"}])
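# A minimal sketch (an addition, not from the original): dict-based RDDs can
# be persisted the same way, with dict keys mapped to column names.
otherRdd.saveToCassandra('test', 'kv2')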