# Example #1
# 0
def main(argv):
    """Run a Flajolet-Martin distinct-count estimator over a socket text stream.

    Args:
        argv: command-line arguments; argv[0] is the TCP port to read the
              JSON stream from, argv[1] is the CSV output file path.

    Side effects: truncates/creates `out_file`, writes a CSV header, then
    appends one estimation row per window (done inside Flajolet_Martin).
    Blocks forever in awaitTermination().
    """
    port = int(argv[0])
    out_file = argv[1]

    # Truncate any previous output and write the CSV header in one pass
    # (the original opened the file twice: once to truncate, once to append
    # the header). newline="" is required by the csv module on all platforms.
    with open(out_file, "w", newline="") as fout:
        csv.writer(fout).writerow(["Time", "Ground Truth", "Estimation"])

    conf = SparkConf().setMaster("local[*]") \
        .setAppName("Flajolet-Martin") \
        .set("spark.executor.memory", "4g") \
        .set("spark.driver.memory", "4g")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("OFF")

    ssc = StreamingContext(sc, BATCH_DURATION)
    stream = ssc.socketTextStream("localhost", port) \
        .window(WINDOW_LENGTH, SLIDING_INTERVAL) \
        .map(json.loads)

    # Per-function hash parameters for the FM sketch (defined elsewhere).
    hashParams = hashFuncs()

    # Estimate the number of distinct non-empty cities in each window;
    # Flajolet_Martin appends its result row to out_file.
    stream.map(lambda x: x["city"]).filter(lambda x: x != "") \
        .foreachRDD(lambda rdd: Flajolet_Martin(rdd, hashParams, out_file))

    ssc.start()
    ssc.awaitTermination()
# Example #2
# 0
# Section 6.2.4, example 6-12: consuming a Kafka topic two ways
# (receiver-based and direct) with Spark Streaming.
from pyspark import SparkContext, SparkConf, storagelevel  # NOTE(review): storagelevel appears unused here — confirm
from pyspark.streaming.context import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

## When running inside the pyspark shell, do NOT create a SparkContext —
## the shell already provides one!
# ./pyspark --packages org.apache.spark:spark-streaming-kafka-0-8-assembly_2.11:2.0.2
conf = SparkConf()
sc = SparkContext(master="local[*]", appName="KafkaSample", conf=conf)
# 3-second batch interval.
ssc = StreamingContext(sc, 3)

# Receiver-based stream: connects via ZooKeeper at localhost:2181;
# {"test": 3} maps topic name -> number of receiver threads.
ds1 = KafkaUtils.createStream(ssc, "localhost:2181", "test-consumer-group1", {"test": 3})
# Direct (receiver-less) stream: reads topic "test" straight from the broker.
ds2 = KafkaUtils.createDirectStream(ssc, ["test"], {"metadata.broker.list": "localhost:9092"})

# Print a few elements of each batch to stdout.
ds1.pprint()
ds2.pprint()

ssc.start()
ssc.awaitTermination()
# Example #3
# 0
# Section 6.2.3: driving a DStream from an in-memory queue of RDDs.

from pyspark import SparkContext, SparkConf
from pyspark.streaming.context import StreamingContext

conf = SparkConf()
sc = SparkContext(master="local[*]", appName="QueueSample", conf=conf)
ssc = StreamingContext(sc, 3)

# Two small RDDs; queueStream consumes one per batch interval.
first_batch = sc.parallelize(["a", "b", "c"])
second_batch = sc.parallelize(["c", "d", "e"])

rdd_queue = [first_batch, second_batch]

# Build the stream from the queue and print each batch to stdout.
queue_stream = ssc.queueStream(rdd_queue)
queue_stream.pprint()

ssc.start()
ssc.awaitTermination()
# Example #4
# 0
class DeleteFromCassandraStreamingTest(SimpleTypesTestBase):
    """Streaming deleteFromCassandra tests: deleting individual columns
    and whole rows of a Cassandra table from a DStream."""

    size = 10        # rows seeded into the test table
    interval = .1    # streaming batch interval, in seconds

    def setUp(self):
        super(DeleteFromCassandraStreamingTest, self).setUp()
        self.ssc = StreamingContext(self.sc, self.interval)

        # Seed the table with `size` rows where key/int/text all carry
        # the row index.
        self.rdds = [
            self.sc.parallelize(range(0, self.size)).map(lambda i: {
                'key': i,
                'int': i,
                'text': i
            })
        ]
        self.rdds[0].saveToCassandra(self.keyspace, self.table)

        # Sanity-check the seed: row count matches, and `text`/`int` are
        # populated for key 0 before any delete runs.
        data = self.rdd()
        self.assertEqual(len(data.collect()), self.size)
        row = data.select('text', 'int').where('key=?', '0').first()
        self.assertEqual(row.text, u'0')
        self.assertEqual(row.int, 0)

        # Stream that each test feeds into deleteFromCassandra.
        self.stream = self.ssc.queueStream(self.rdds)

    def _drain_stream(self):
        """Run the streaming context long enough to process every queued
        batch, then stop it without tearing down the shared SparkContext.
        (Extracted: this start/await/stop sequence was repeated verbatim
        in all four tests.)"""
        self.ssc.start()
        self.ssc.awaitTermination((self.size + 1) * self.interval)
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

    def test_delete_single_column(self):
        """Deleting only `text` keeps every row and leaves `int` intact."""
        self.stream \
            .deleteFromCassandra(self.keyspace, self.table,
                                 deleteColumns=['text'])
        self._drain_stream()

        data = self.rdd()
        self.assertEqual(len(data.collect()), self.size)

        row = data.select('text', 'int').where('key=?', '0').first()
        self.assertEqual(row.int, 0)
        self.assertIsNone(row.text)

    def test_delete_2_columns(self):
        """Deleting `text` and `int` nulls both columns but keeps the rows."""
        self.stream \
            .deleteFromCassandra(self.keyspace, self.table,
                                 deleteColumns=['text', 'int'])
        self._drain_stream()

        data = self.rdd()
        self.assertEqual(len(data.collect()), self.size)

        row = data.select('text', 'int').where('key=?', '0').first()
        self.assertIsNone(row.int)
        self.assertIsNone(row.text)

    def test_delete_all_rows_default(self):
        """With no column arguments, entire rows are deleted."""
        self.stream \
            .deleteFromCassandra(self.keyspace, self.table)
        self._drain_stream()

        self.assertEqual(len(self.rdd().collect()), 0)

    def test_delete_all_rows_explicit(self):
        """Explicit keyColumns=['key'] also deletes entire rows."""
        self.stream \
            .deleteFromCassandra(self.keyspace, self.table, keyColumns=['key'])
        self._drain_stream()

        self.assertEqual(len(self.rdd().collect()), 0)