Example #1
def main():
    # Create a SparkContext against the standalone cluster master; the 5-second batch interval is set below
    sc = SparkContext("spark://ip-172-31-29-29:7077", "MyKafkaStream")

    # stream interval of 5 seconds
    ssc = StreamingContext(sc, 5)
    kafkaStream = KafkaUtils.createStream(ssc, "52.3.61.194:2181", "GroupNameDoesntMatter", {"parking_sensor_data": 2})
    messages = kafkaStream.flatMap(lambda s: create_tuple(s[1])).reduceByKey(lambda a,b: (int(a)+int(b))/2)
    messages1 = messages.filter(lambda s: s[1] > 0)
    messages1.pprint()

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
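Example #1 assumes a create_tuple helper that is not shown. A minimal sketch, under the assumption that each Kafka message is a newline-delimited list of sensor_id,value records (the actual format is not given in the source):

# Hypothetical helper for Example #1: turn one raw Kafka message into an
# iterable of (key, value) pairs so flatMap can emit zero or more records.
def create_tuple(message):
    pairs = []
    for record in message.strip().split("\n"):
        fields = record.split(",")
        if len(fields) >= 2:
            pairs.append((fields[0], fields[1]))
    return pairs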
Example #2
def createContext(host, port, outputPath):
    # If you do not see this printed, that means the StreamingContext has been loaded
    # from the new checkpoint
    print "Creating new context"
    if os.path.exists(outputPath):
        os.remove(outputPath)
    sc = SparkContext(appName="PythonStreamingRecoverableNetworkWordCount")
    ssc = StreamingContext(sc, 120)

    # Create a socket stream on target ip:port and count the
    # words in input stream of \n delimited text (eg. generated by 'nc')
    lines = ssc.socketTextStream(host, port)
    print '\n\n\nconnectionMade\n\n\n'
    addresses = lines.map(splitLine)
    transactionsum = addresses.map(lambda x: (x[0], (1, x[1]))).reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))

    def echo(time, rdd):
        counts = "Counts at time %s %s" % (time, rdd.collect())
        print counts
        print "Appending to " + os.path.abspath(outputPath)
        with open(outputPath, 'a') as f:
            f.write(counts + "\n")

    transactionsum.foreachRDD(echo)
    return ssc
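createContext is normally driven through checkpoint recovery rather than called directly. A minimal sketch of that wiring, assuming a placeholder checkpoint directory and the host/port/outputPath arguments from above (the function would also need to call ssc.checkpoint(checkpointDir) for recovery to actually work):

from pyspark.streaming import StreamingContext

checkpointDir = "/tmp/recoverable_network_wordcount"  # placeholder
ssc = StreamingContext.getOrCreate(
    checkpointDir, lambda: createContext(host, port, outputPath))
ssc.start()
ssc.awaitTermination()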
def ss_direct_kafka_bucket_counter(brokers, topic, bucket_interval, output_msg, message_parse, valueDecoder=None):
    """Starts a Spark Streaming job from a Kafka input and parses message time

	WARNING!! This function only works for spark 1.4.0+ 

	Args:
		brokers: the kafka broker that we look at for the topic
		topic: the kafka topic for input
		bucket_interval: the time interval in seconds (int) that the job will
			bucket counts by
		output_msg: factory taking (sc, ssc) and returning the function passed
			to foreachRDD
		message_parse: function mapping a raw message to its bucket key
		valueDecoder: optional decoder applied to Kafka message values

	Returns:
		None
		
	"""
    sc = SparkContext(appName="PythonKafkaBucketCounter")
    ssc = StreamingContext(sc, bucket_interval + 5)

    if valueDecoder:
        kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers}, valueDecoder=valueDecoder)
    else:
        kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})

    lines = kvs.map(lambda x: x[1])
    interval_counts = lines.map(lambda line: (message_parse(line), 1)).reduceByKey(lambda a, b: a + b)

    output_msg_func = output_msg(sc, ssc)

    interval_counts.foreachRDD(output_msg_func)

    ssc.start()
    ssc.awaitTermination()
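The output_msg and message_parse callables are passed in by the caller and are not shown. A sketch of the shapes this function expects, assuming JSON messages with a timestamp field:

import json

def message_parse(line):
    # Bucket key for a raw message; here, the timestamp truncated to the minute.
    record = json.loads(line)
    return str(record.get("timestamp", "unknown"))[:16]

def output_msg(sc, ssc):
    # Factory returning the callback handed to foreachRDD.
    def emit(time, rdd):
        for bucket, count in rdd.collect():
            print("%s bucket=%s count=%d" % (time, bucket, count))
    return emit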
class BaseStreamingTestCase(unittest.TestCase):
    """ From https://github.com/apache/spark/blob/
    master/python/pyspark/streaming/tests.py """

    timeout = 10  # seconds
    duration = .5

    def setUp(self):
        self.ssc = StreamingContext(sc, self.duration)

    def tearDown(self):
        self.ssc.stop(False)

    def wait_for(self, result, n):
        start_time = time.time()
        while len(result) < n and time.time() - start_time < self.timeout:
            time.sleep(0.01)
        if len(result) < n:
            print("timeout after", self.timeout)

    def _collect(self, dstream, n):
        result = []

        def get_output(_, rdd):
            if rdd and len(result) < n:
                r = rdd.collect()
                if r:
                    result.append(r)

        dstream.foreachRDD(get_output)

        self.ssc.start()
        self.wait_for(result, n)
        return result
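A sketch of how a test might use BaseStreamingTestCase, assuming sc is the module-level SparkContext the class already relies on:

class WordCountTest(BaseStreamingTestCase):
    def test_word_count(self):
        input_rdds = [sc.parallelize(["a a b", "b c"])]
        dstream = (self.ssc.queueStream(input_rdds)
                   .flatMap(lambda line: line.split(" "))
                   .map(lambda w: (w, 1))
                   .reduceByKey(lambda a, b: a + b))
        result = self._collect(dstream, 1)
        self.assertEqual(dict(result[0]), {"a": 2, "b": 2, "c": 1})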
Example #5
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "localhost:9092"
    topics = ['test']

    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers})

    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # note: the second element of the tuple is the received Kafka message payload

    words = lines1.flatMap(lambda line: line.split(" "))

    pairs = words.map(lambda word: (word, 1))

    wordcounts = pairs.reduceByKey(lambda x, y: x + y)

    print(wordcounts)

    kafkaStreams_lines.transform(storeOffsetRanges).foreachRDD(printOffsetRanges)

    wordcounts.pprint()
    # tally the distribution of the generated random numbers
    ssc.start()  # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
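storeOffsetRanges and printOffsetRanges are not defined in this example; they most likely follow the offset-tracking pattern from the Spark Streaming + Kafka integration guide, sketched here:

offsetRanges = []

def storeOffsetRanges(rdd):
    # Capture the Kafka offset ranges of the batch and pass the RDD through unchanged.
    global offsetRanges
    offsetRanges = rdd.offsetRanges()
    return rdd

def printOffsetRanges(rdd):
    for o in offsetRanges:
        print("%s %s %s %s" % (o.topic, o.partition, o.fromOffset, o.untilOffset))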
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "192.192.0.27:9092"
    topics = ['topic7']

    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers})

    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # note: the second element of the tuple is the received Kafka message payload

    words = lines1.flatMap(lambda line: line.split(" "))

    pairs = words.map(lambda word: (word, 1))

    wordcounts = pairs.reduceByKey(lambda x, y: x + y)

    wordcounts.saveAsTextFiles("/var/lib/hadoop-hdfs/spark-libin/kafka")

    wordcounts.pprint()
    # tally the distribution of the generated random numbers
    ssc.start()  # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
Example #7
def createStreamingContext():

    # Create a StreamingContext against the cluster master with a batch interval of 2 seconds
    sc = SparkContext("spark://%s:7077" % MASTER_NAME, appName="GlutenTweet", pyFiles=PYFILES)
    ssc = StreamingContext(sc, 2)

    # Create a DStream of raw data
    raw = ssc.socketTextStream(MASTER_IP, 9999)

    # Convert into models
    tweets = raw.map(lambda r: Tweet(raw_json=r))

    # Store models
    tweets.foreachRDD(storeTweetsRDD)

    # Sliding window analysis
    window = tweets.window(20*60, 30)
    hashtagCounts = analysisHahtagCount(window)
    streamTop(hashtagCounts).pprint()

    # Keyword extraction - note tweets is immutable
    tweetsKeyword = tweets.map(lambda t: keywordExtraction(t))

    # Update models
    tweetsKeyword.foreachRDD(updateTweetsRDD)

    # Sliding window analysis
    window2 = tweetsKeyword.window(20*60, 30)
    keywordCounts = analysisKeywordCount(window2)
    streamTop(keywordCounts).pprint()

    ssc.checkpoint(CHECKPOINT_DIR)
    return ssc
def main():
    conf = SparkConf().setMaster("local[2]").setAppName("Streamer")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)   # Create a streaming context with batch interval of 10 sec
    ssc.checkpoint("checkpoint")
    geolocator = Nominatim()
    stream(ssc,geolocator,100) 
Example #9
def main():
    if len(sys.argv) not in (3, 4):
        print("Usage: kafka_wordcount.py <zk> <topic> [timeout]",
              file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    ssc = StreamingContext(sc, 1)
    timeout = None
    if len(sys.argv) == 4:
        zk, topic, timeout = sys.argv[1:]
        timeout = int(timeout)
    else:
        zk, topic = sys.argv[1:]
    kvs = KafkaUtils.createStream(
        ssc, zk, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(lambda a, b: a + b)
    counts.pprint()
    kwargs = {}
    if timeout:
        kwargs['timeout'] = timeout
    ssc.start()
    ssc.awaitTermination(**kwargs)
Example #10
def main():
    sc = SparkContext(appName="IntrusionDetector")
    ssc = StreamingContext(sc, batch_durations)

    kvs = KafkaUtils.createDirectStream(ssc, [input_topic], {"metadata.broker.list": broker})
    kvs.foreachRDD(processRDD)
    ssc.start()
    ssc.awaitTermination()
def kafka_spark_streaming_sql_main(app_name, brokers, topic, interval_seconds, sql_function):
    sc = SparkContext(appName=app_name)
    sqlContext = SQLContext(sc)
    # ssc = StreamingContext(sc, interval_seconds)
    ssc = StreamingContext(sc, 10)
    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    kvs.foreachRDD(sql_function)
    ssc.start()
    ssc.awaitTermination()
def main():
    conf = SparkConf().setMaster("local[2]").setAppName("Streamer")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)   # Create a streaming context with batch interval of 10 sec
    ssc.checkpoint("checkpoint")
    pwords = load_wordlist("positive.txt")
    nwords = load_wordlist("negative.txt")
    counts = stream(ssc, pwords, nwords, 100)
    make_plot(counts)
def read_tweets():

    sc = SparkContext(appName="sentimentProducer")
    ssc = StreamingContext(sc, 600)  # test batch interval (seconds)
    brokers = "localhost:9092"
    kvs = KafkaUtils.createDirectStream(ssc, ["test"], {"metadata.broker.list": brokers})
    kvs.foreachRDD(create_format)
    producer.flush()
    ssc.start()
    ssc.awaitTermination()
def functionToCreateContext():
    sc = SparkContext(appName="StreamingExampleWithKafka")
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")
    opts = {"metadata.broker.list": "node1.example.com:6667,node2.example.com:6667"}
    kvs = KafkaUtils.createDirectStream(ssc, ["mytopic"], opts)
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).updateStateByKey(updateFunction)
    counts.pprint()
    return ssc
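updateFunction is assumed but not shown; with updateStateByKey it is almost certainly a running word count. A sketch of that, plus the getOrCreate call that would normally drive this factory:

from pyspark.streaming import StreamingContext

def updateFunction(new_values, running_count):
    return sum(new_values) + (running_count or 0)

ssc = StreamingContext.getOrCreate("checkpoint", functionToCreateContext)
ssc.start()
ssc.awaitTermination()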
Example #15
def setup():
    conf = SparkConf().set("spark.default.parallelism", 1)
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 2)
    dstream = ssc.textFileStream(inputd).map(lambda x: (x, 1))
    wc = dstream.updateStateByKey(updater)
    wc.map(lambda x: "%s,%d" % x).saveAsTextFiles(outputd + "test")
    wc.checkpoint(2)
    self.setupCalled = True
    return ssc
Example #16
        def setup():
            conf = SparkConf().set("spark.default.parallelism", 1)
            sc = SparkContext(conf=conf)
            ssc = StreamingContext(sc, 0.5)

            # A function that cannot be serialized
            def process(time, rdd):
                sc.parallelize(range(1, 10))

            ssc.textFileStream(inputd).foreachRDD(process)
            return ssc
def main():
    conf = SparkConf().setAppName("kafka_source_mongo_sink_pymongo_filtered")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)
    try:
        kafka_streams = KafkaUtils.createStream(ssc, "localhost:2181", "spark-streaming-consumer", {"splash_json": 2})
        kafka_streams.foreachRDD(process_rdd)
    except Exception as e:
        print(e)
    ssc.start()
    ssc.awaitTermination()
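process_rdd is not shown; given the app name, it presumably filters each batch and writes it to MongoDB through pymongo. A minimal sketch, with placeholder database/collection names and the simplifying choice of collecting to the driver:

import json
from pymongo import MongoClient

def process_rdd(rdd):
    docs = rdd.map(lambda kv: json.loads(kv[1])).collect()  # kv[1] is the message value
    if docs:
        client = MongoClient("mongodb://localhost:27017/")
        client["splash"]["events"].insert_many(docs)
        client.close()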
def invoke():
    # object to keep track of offsets
    ConfigInitializer.basic_config()

    # app name
    application_name = "mon_metrics_kafka"

    my_spark_conf = SparkConf().setAppName(application_name)

    spark_context = SparkContext(conf=my_spark_conf)

    # read at the configured interval
    spark_streaming_context = \
        StreamingContext(spark_context, cfg.CONF.service.stream_interval)

    kafka_stream = MonMetricsKafkaProcessor.get_kafka_stream(
        cfg.CONF.messaging.topic,
        spark_streaming_context)

    # transform to recordstore
    MonMetricsKafkaProcessor.transform_to_recordstore(kafka_stream)

    # catch interrupt, stop streaming context gracefully
    # signal.signal(signal.SIGINT, signal_handler)

    # start processing
    spark_streaming_context.start()

    # FIXME: stop spark context to relinquish resources

    # FIXME: specify cores, so as not to use all the resources on the cluster.

    # FIXME: HA deploy multiple masters, may be one on each control node

    try:
        # Wait for the Spark driver to "finish"
        spark_streaming_context.awaitTermination()
    except Exception as e:
        MonMetricsKafkaProcessor.log_debug(
            "Exception raised during Spark execution : " + str(e))
        # One exception that can occur here is the result of the saved
        # kafka offsets being obsolete/out of range.  Delete the saved
        # offsets to improve the chance of success on the next execution.

        # TODO(someone) prevent deleting all offsets for an application,
        # but just the latest revision
        MonMetricsKafkaProcessor.log_debug(
            "Deleting saved offsets for chance of success on next execution")

        MonMetricsKafkaProcessor.reset_kafka_offsets(application_name)

        # delete pre hourly processor offsets
        if cfg.CONF.stage_processors.pre_hourly_processor_enabled:
            PreHourlyProcessor.reset_kafka_offsets()
Example #19
class MLLibStreamingTestCase(unittest.TestCase):
    def setUp(self):
        self.sc = sc
        self.ssc = StreamingContext(self.sc, 1.0)

    def tearDown(self):
        self.ssc.stop(False)

    @staticmethod
    def _ssc_wait(start_time, end_time, sleep_time):
        while time() - start_time < end_time:
            sleep(0.01)
Example #20
def createContext(conf):
    spConf = conf.getSparkConf()
    sc = SparkContext(conf=spConf)
    ssc = StreamingContext(sc, conf.INTERVAL)
    ssc.remember(conf.REMEMBER)
    # get reader
    lines = conf.getReader(ssc)
    # use window
    lines = lines.window(conf.WINDOW, conf.WINDOW)
    lines = lines.map(lambda line: jsonDecode(line))
    deal(lines, conf)
    return ssc
        def createContext():
            uBATCH_INTERVAL = 10
            sc = SparkContext(SPARK_MASTER, appName="StreamingKafka")
            sc.broadcast(batchUserPostDict)
            sc.broadcast(batchPostUserDict)
            #sc = SparkContext("local[*]", appName="StreamingKafka")
            # streaming batch interval of 5 sec first, and reduce later to 1 sec or lower
            ssc = StreamingContext(sc, uBATCH_INTERVAL)
            ssc.checkpoint(CHECKPOINT_DIR)   # set checkpoint directory in HDFS
            #ssc.checkpoint(10 * uBATCH_INTERVAL)
            return ssc

        ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, createContext)
def main():
    conf = SparkConf()
    conf.setAppName("TopAirports")
    conf.set("spark.streaming.kafka.maxRatePerPartition", "0")
    conf.set("spark.dynamicAllocation.enabled", "true")
    sc = SparkContext(conf = conf)
    ssc = StreamingContext(sc, 1) # Stream every 1 second
    ssc.checkpoint("checkpoint")

    # Clear the cassandra table
    init_cassandra().execute('TRUNCATE {}'.format(top_airports_table))

    stream_kafka(ssc)
def main():
    parser = OptionParser()
    parser.add_option('', '--enriched_data_path', action='store', dest='enriched_data_path', help='path to write enriched data')
    parser.add_option('', '--model_path', action='store', dest='model_path', help='path for model data')
    parser.add_option('', '--kafka_zookeeper_hosts', action='store', dest='kafka_zookeeper_hosts', help='list of Zookeeper hosts (host:port)')
    parser.add_option('', '--kafka_broker_list', action='store', dest='kafka_broker_list', help='list of Kafka brokers (host:port)')
    parser.add_option('', '--kafka_message_topic', action='store', dest='kafka_message_topic', help='topic to consume input messages from')
    parser.add_option('', '--kafka_alert_topic', action='store', dest='kafka_alert_topic', help='topic to produce alert messages to')
    parser.add_option('', '--kafka_enriched_data_topic', action='store', dest='kafka_enriched_data_topic', help='topic to produce enriched data to')
    parser.add_option('', '--streaming_batch_duration_sec', type='float', default=15.0,
        action='store', dest='streaming_batch_duration_sec', help='Streaming batch duration in seconds')
    parser.add_option('', '--max_batches', type='int', default=0,
        action='store', dest='max_batches', help='Number of batches to process (0 means forever)')
    options, args = parser.parse_args()

    sc = SparkContext()
    ssc = StreamingContext(sc, options.streaming_batch_duration_sec)
    sqlContext = getSqlContextInstance(sc)

    # Load saved model.
    model = None
    if options.model_path:
        model = RandomForestModel.load(sc, options.model_path)
    else:
        print('No model loaded.')

    # Create Kafka stream to receive new messages.
    kvs = KafkaUtils.createDirectStream(ssc, [options.kafka_message_topic], {
        'metadata.broker.list': options.kafka_broker_list,
        'group.id': 'spark_streaming_processor.py'})

    # Take only the 2nd element of the tuple.
    messages = kvs.map(lambda x: x[1])

    # Convert RDD of JSON strings to RDD of Rows.
    rows = messages.map(json_to_row)

    # Process messages.
    rows.foreachRDD(lambda time, rdd: 
        process_messages(time, rdd,
            ssc=ssc,
            model=model,
            enriched_data_path=options.enriched_data_path,
            zookeeper_hosts=options.kafka_zookeeper_hosts,
            kafka_alert_topic=options.kafka_alert_topic,
            kafka_enriched_data_topic=options.kafka_enriched_data_topic,
            max_batches=options.max_batches))

    ssc.start()
    ssc.awaitTermination()
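json_to_row is assumed above; a minimal sketch, assuming each Kafka message is a flat JSON object:

import json
from pyspark.sql import Row

def json_to_row(message):
    # Expand the JSON object's fields into a Row so it can become a DataFrame later.
    return Row(**json.loads(message))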
Example #24
def main():
    sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    ssc = StreamingContext(sc, 1)

    zkQuorum = "localhost:2181"
    topic = "twitter_raw"
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: pickle.loads(x[1].decode("utf-8"))["text"])  # fetch the text
    count = lines.map(lambda line: len(line.split())).reduce(add)  # split into words and count
    count.foreachRDD(publishToRedis)  # publish to redis
    count.pprint()

    ssc.start()
    ssc.awaitTermination()
def createContext():

        conf = SparkConf().setMaster('spark://{}:7077'.format(MASTER_URL)).set('spark.executor.memory', '2g')
        sc = SparkContext(conf=conf)

        ssc = StreamingContext(sc, STREAMING_INTERVAL)
        lines = ssc.textFileStream('hdfs://{}/data/on_time/streaming/'.format(MASTER_URL))

        ssc.checkpoint(CHECKPOINT_DIR)

        # main split-combine-apply logic put here
        pairs = lines.map(lambda x: x.split(",")).map(lambda x: (x[8], 1))
        runningCounts = pairs.updateStateByKey(updateFunction)

        sortedCounts = runningCounts.transform(lambda rdd: rdd.sortBy(lambda pair: pair[1], ascending=False))  # sort airports by frequency
Example #26
    def start(self):

        sc = SparkContext(appName="PythonStreamingNOTHS")
        ssc = StreamingContext(sc, 10)

        kvs = KafkaUtils.createStream(ssc, self.zkQuorum, "spark-streaming-consumer", {self.topic: 1})
        kvs.pprint()  # print a sample of each batch received in the window

        if self.topic == 'NOTHS-crawler-topic':
            kvs.foreachRDD(self.save_crawler_hbase)
        elif self.topic == 'NOTHS-trends-topic':
            kvs.foreachRDD(self.save_trends_hbase)

        ssc.start()
        ssc.awaitTermination()
class xStreamProcessor:
    ip = socket.gethostbyname(socket.gethostname())
    port = 9999
    dstream = None
    sc = None
    ssc = None

    #def __init__(self,ip=None,port=None,spark_master = 'spark://localhost:7077'):
    def __init__(self,ip=None,port=None,spark_master = 'mesos://10.0.2.85:5050'):
        if ip is not None:
            self.ip = ip
        if port is not None:
            self.port = port
        self.sc = SparkContext(master=spark_master,appName='StreamProcessor')
        self.ssc = StreamingContext(self.sc, 1)
        #self.ssc.checkpoint(directory=None)
        hiveContext = HiveContext(self.sc)
        hiveContext.sql('DROP TABLE IF EXISTS default.tweet_stream')
        hiveContext.sql('CREATE TABLE IF NOT EXISTS default.tweet_stream (ip STRING, port STRING, date_time STRING, user STRING, msg STRING)')

        hiveContext.sql('DROP TABLE IF EXISTS default.email_stream')
        hiveContext.sql('CREATE TABLE IF NOT EXISTS default.email_stream (ip STRING, port STRING, date_time STRING, \
        fr STRING,to STRING, subject STRING, content STRING, subject_sentiment INT, content_sentiment INT, \
        subject_power INT, content_power INT,  subject_topic INT, content_topic INT, fraud_score DOUBLE)')

        hiveContext.sql('DROP TABLE IF EXISTS default.email_graph')
        hiveContext.sql('CREATE TABLE IF NOT EXISTS default.email_graph (fr STRING,to STRING, dt STRING)')

        hiveContext.sql('DROP TABLE IF EXISTS default.trans_stream')
        hiveContext.sql('CREATE TABLE IF NOT EXISTS default.trans_stream (ip STRING,port STRING, date_time STRING, user STRING, amount DOUBLE, \
        big_trans INT, is_in_odd_day INT, is_at_odd_time INT)')

        self.dstream = self.ssc.socketTextStream(self.ip, self.port)


        self.process_stream()

        self.ssc.start()
        self.ssc.awaitTermination()

    def process_stream(self):
        parts = self.dstream.map(lambda line: line.split("|"))
        words = parts.map(lambda p: p[3])  # the user field
        pairs = words.map(lambda word: (word, 1))
        wordCounts = pairs.reduceByKey(lambda x, y: x + y)

        # Print the first ten elements of each RDD generated in this DStream to the console
        wordCounts.pprint()
def main():
    global ssc

    conf = SparkConf()
    conf.setAppName("TopAirports")
    conf.set("spark.streaming.kafka.maxRatePerPartition", "0")
    conf.set('spark.streaming.stopGracefullyOnShutdown', True)

    sc = SparkContext(conf=conf)

    ssc = StreamingContext(sc, 1)  # Stream every 1 second
    ssc.checkpoint("/tmp/checkpoint")

    signal.signal(signal.SIGINT, stop_streaming)

    stream_kafka()
Example #29
def createStreamingContext():
    conf = SparkConf().setMaster("local[2]").setAppName("amqp_temperature")
    conf.set("spark.streaming.receiver.writeAheadLog.enable", "true")

    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint("/tmp/spark-streaming-amqp")

    receiveStream = AMQPUtils.createStream(ssc, "localhost", 5672, "temperature")

    temperature = receiveStream.map(getTemperature)
    max = temperature.reduceByWindow(getMax, None, 5, 5)

    max.pprint()

    return ssc
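getTemperature and getMax are not shown. A sketch under the assumption that each AMQP message body arrives as a JSON string with a temperature field:

import json

def getTemperature(message):
    return json.loads(message)["temperature"]

def getMax(a, b):
    # Reduce function for reduceByWindow: keep the larger reading.
    return a if a > b else b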
def main():
    brokers = 'localhost:9092'
    topic = 'openbmp.parsed.unicast_prefix'
    sc = SparkContext(appName='BGPPrefixOriginValidation')
    ssc = StreamingContext(sc,2)
 
    directKafkaStream = KafkaUtils.createDirectStream(ssc, [topic], {'metadata.broker.list':brokers})
    #directKafkaStream.pprint()

    lines = directKafkaStream.flatMap(lambda x: x[1].splitlines()).filter(lambda line: line.startswith('add'))
    structured_rdd = lines.map(structure_data)
 
    structured_rdd.foreachRDD(lambda rdd: rdd.foreachPartition(validate_bgp_prefix)) 
    
    ssc.start()
    ssc.awaitTermination()
			break
	string = ",".join(Hlist)
	print(string)
    #Hlist=HList[0:-1]
    
        



if __name__ == "__main__":

    window_size, batch_size = int(sys.argv[1]), int(sys.argv[2])
    conf = SparkConf()
    conf.setAppName("BigData")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, int(batch_size))
    # ssc.checkpoint("/home/cdiya/Downloads/checkpoints")
    ssc.checkpoint("/checkpoint_BIGDATA")
    lines = ssc.socketTextStream("localhost", 9009)
    lines = lines.window(int(window_size),1)
    # lines.pprint()
    words = lines.map(lambda line: line.split(";")[7])
    # words.pprint()
    words = words.flatMap(lambda x: x.split(","))
    # words.pprint()
            
    hashtag = words.map(lambda x: (x,1))
    # hashtag.pprint()
    
    #hashtag = hashtag.rdd
    
# Note: credentials will be pulled from IAM role assigned to EMR nodes. Make sure permissions are set properly for access to your Kinesis stream

# define variables
s3_target_bucket_name = 'mattsona-spark-demo'  # replace with your bucket name for target data
aws_region = 'us-west-2'  # replace w/ AWS region used for Kinesis stream
kinesis_stream = 'spark_streaming_kinesis_demo'  # replace with your Kinesis stream name
kinesis_endpoint = 'https://kinesis.' + aws_region + '.amazonaws.com'  # the public endpoint of the AWS region this is executed from
kinesis_app_name = 'alex_test_app'  # app name used to track process through the Kinesis stream
kinesis_initial_position = InitialPositionInStream.LATEST  # InitialPositionInStream.TRIM_HORIZON | InitialPositionInStream.LATEST
kinesis_checkpoint_interval = 10  # define how long to checkpoint when processing through the Kinesis stream
spark_batch_interval = 10  # how many seconds before pulling the next batch of data from the Kinesis stream

# configure spark elements
spark_context = SparkContext(appName=kinesis_app_name)
# spark_streaming_context = StreamingContext(sc, 1) # sc valid for running in pyspark interactive mode
spark_streaming_context = StreamingContext(spark_context, spark_batch_interval)

kinesis_stream = KinesisUtils.createStream(
    spark_streaming_context, kinesis_app_name, kinesis_stream,
    kinesis_endpoint, aws_region, kinesis_initial_position,
    kinesis_checkpoint_interval
)  # previous example had ', StorageLevel.MEMORY_AND_DISK_2' at the end of the call

# take Kinesis stream JSON data and convert to CSV
# (these are still DStreams, not RDDs, so the *_rdd variable names below are inaccurate)
py_dict_rdd = kinesis_stream.map(lambda x: json.loads(x))
# need to convert int (time_stamp & random_int) to string
csv_rdd = py_dict_rdd.map(lambda x: x['user_name'] + ',' + str(
    datetime.datetime.utcfromtimestamp(x['time_stamp'])) + ',' + x[
        'data_string'] + ',' + str(x['random_int']))

# save that rdd to S3
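The listing stops before the S3 write that the comment announces. One way to finish it, sketched with DStream.saveAsTextFiles and the bucket defined above (the prefix is a placeholder):

csv_rdd.saveAsTextFiles('s3://' + s3_target_bucket_name + '/spark_streaming_output/batch')

spark_streaming_context.start()
spark_streaming_context.awaitTermination()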
empty_intervals = sc.accumulator(0)
images = sc.accumulator(0)
correct_preds_tot = sc.accumulator(0)

# Load model trained using BDL_KERAS_CIFAR_CNN.py
model = Model.loadModel(model_defs_path, model_weights_path)
print('%s.%03dZ: Loaded trained model definitions %s and weights %s' %
      (strftime("%Y-%m-%dT%H:%M:%S", gmtime()),
       (time() * 1000) % 1000, model_defs_path, model_weights_path))
print(
    '%s.%03dZ: Starting reading streaming data from %s:%d at interval %s seconds'
    % (strftime("%Y-%m-%dT%H:%M:%S", gmtime()),
       (time() * 1000) % 1000, IP_address, port, reporting_interval))

# Initialize StreamingContext, have it read TextStream through socket
ssc = StreamingContext(sc, reporting_interval)
image_stream = ssc.socketTextStream(IP_address, port)

# Run model on each batch
image_stream.foreachRDD(run_model)

# Start reading streaming data
ssc.start()
start_time = time()
ssc.awaitTermination()
elapsed_time = time(
) - start_time - empty_intervals.value * reporting_interval - 2.4  # Subtract empty intervals and time to shut down stream
print(
    '\n%s.%03dZ: %d images received in %.1f seconds (%d intervals), or %.0f images/second  Correct predictions: %d  Pct correct: %.1f'
    % (strftime("%Y-%m-%dT%H:%M:%S", gmtime()),
       (time() * 1000) % 1000, images.value, elapsed_time, interval.value,
Example #34
def StreamingInit_Old(self):
    # The old Streaming API is simple; there is only one pattern: instantiate a
    # StreamingContext, then pull data incrementally via textFileStream or a socket.
    self.SpkStream = StreamingContext(self.SpkCont, Config.BATCHDUR)
Example #35
#    ascii_encode = lambda x: x.encode('ascii')
#    return dict(map(ascii_encode, pair) for pair in data.items())
def helper(data):
    return data.encode('ascii')


def enc(data):
    result = {k: helper(v) for k, v in data.items()}
    return result


#return dict(map(lambda line: line.encode('ascii'), pair.value) for pair in data.items())

conf = SparkConf().setMaster("local[*]").setAppName("StreamingDirectKafka")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 10)
skQuorum = "localhost:2181"
topic = ["meetup"]
kafkaParams = {"metadata.broker.list": "localhost:9092"}

#, kafkaParams = {"metadata.broker.list":"localhost:9092"}
kafkaStream = KafkaUtils.createDirectStream(ssc, topic, kafkaParams)
#stream = ssc.receiverStream( \
#    MeetupReceiver("https://stream.meetup.com/2/rsvps") \
#)
"""
data = kafkaStream.map(lambda line: json.loads(line)
"""
rsvp = kafkaStream.map(lambda line: line[1])
rsvp2 = rsvp.map(lambda line: json.loads(line.encode("ascii", "ignore")))
Example #36
sc.setLogLevel('ERROR')
codes = sc.parallelize([(1, 'alpha'), (2, 'beta'), (3, 'delta'), (4, 'gamma')])
codes = getSparkSessionInstance(config).createDataFrame(codes, schema = 'id:int, name:string')
codes.createOrReplaceTempView('codes')

print(codes.collect())

def process(time, rdd):
  try:
    spark = getSparkSessionInstance(rdd.context.getConf())
    rdd1 = rdd.map(lambda x : x[1].split(',')) \
              .map(lambda x : (int(x[0]), float(x[1])))
    df = spark.createDataFrame(rdd1, schema='id:int, amount:float')
    df.createOrReplaceTempView('newdata')
    join = spark.sql('select n.id, c.name, n.amount from newdata as n join codes as c on n.id = c.id')
    join.show()
  except:
    print(rdd.collect())

ssc = StreamingContext(sc, 5)
kafkaStream = KafkaUtils.createStream(ssc, '127.0.0.1:2181', 'spark-streaming', {'classroom':1})
#kafkaStream.pprint()
kafkaStream.foreachRDD(process)



ssc.start()
ssc.awaitTerminationOrTimeout(10000)
ssc.stop()

Example #37
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext(master='local[4]')
ssc = StreamingContext(sc, 5)
sts = ssc.socketTextStream('localhost', 9999)

fm = sts.flatMap(lambda x: x.split(' ')).map(lambda y: (y, 1)).reduceByKey(
    lambda x, y: x + y)
fm.pprint()

ssc.start()
ssc.awaitTermination()

# Hello pyspark streaming
    # Extract words
    tokenizer = Tokenizer().setInputCol("message").setOutputCol("words")
    # Remove custom stopwords
    stopwords = StopWordsRemover().getStopWords() + ["-"]
    remover = StopWordsRemover().setStopWords(stopwords).setInputCol("words").setOutputCol("filtered")
    # create features
    hashingTF = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
    pipeline = Pipeline().setStages([tokenizer, remover, hashingTF])

    # transform train and test streams
    featured = pipeline.fit(df).transform(df)
    featured_test = pipeline.fit(df_test).transform(df_test)

    ###########################################
    ssc = StreamingContext(sc, 1)
    # read Dstream from json files in monitored dir for training
    trainingData = ssc.textFileStream(get_hdfs_filepath(file_name="train_stream_json/")).map(parse_json_line)
    trainingData.pprint()

    # read Dstream from json files in monitored dir for prediction
    testData = ssc.textFileStream(get_hdfs_filepath(file_name="test_stream_json/")).map(parse_json_line)
    testData.pprint()

    numFeatures = 10

    # initialize a StreamingLinearRegression model
    model = StreamingLinearRegressionWithSGD()
    model.setInitialWeights([0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0, 0.0, 0.0, 0.0])
    # train the model on training Dstream
    model.trainOn(trainingData)
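The snippet trains the model but never scores the test stream or starts the context. A sketch of the usual follow-up, assuming parse_json_line yields LabeledPoint objects:

    # Score the test stream with the continuously-updated model and run the context.
    model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))).pprint()

    ssc.start()
    ssc.awaitTermination()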
Example #39
0
def extract_url_request(line):
    exp = pattern.match(line)
    if exp:
        request = exp.groupdict()["request"]
        if request:
            request_fields = request.split()
            if len(request_fields) > 1:
                return request_fields[1]


if __name__ == "__main__":
    sc = SparkContext(appName="StreamingFlumeLogAggregator")
    sc.setLogLevel("ERROR")
    batch_interval_s = 1
    ssc = StreamingContext(sc, batch_interval_s)

    flumeStream = FlumeUtils.createStream(ssc, "localhost", 9092)

    lines = flumeStream.map(lambda x: x[1])
    urls = lines.map(extract_url_request)

    # Reduce by URL over a 5-minute window sliding every second
    # Reduce: Count for each distinct URL
    window_interval = 300
    slide_interval = 1
    url_counts = urls.map(lambda x: (x, 1)).reduceByKeyAndWindow(
        lambda x, y: x + y, lambda x, y: x - y, window_interval,
        slide_interval)

    # Sort and print the results
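The listing ends at the comment above; a sketch of the sort-and-print step it refers to:

    # Sort each batch by count, descending, and print the top URLs.
    sorted_urls = url_counts.transform(
        lambda rdd: rdd.sortBy(lambda x: x[1], ascending=False))
    sorted_urls.pprint()

    ssc.start()
    ssc.awaitTermination()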
Example #40
from pyspark.sql import types
import json
import csv
from json import loads
from flatten_json import flatten
from time import sleep
# import pandas as pd


print("PROGRAM START!!!")
print("PROGRAM START!!!")
print("PROGRAM START!!!")
print("PROGRAM START!!!")

sc= SparkContext()
ssc = StreamingContext(sc, 10)
sqlc= SQLContext(sc)
directKafkaStream = KafkaUtils.createDirectStream(ssc, ["kafkaNBA"], {"metadata.broker.list": "localhost:9099"})
lines= directKafkaStream.map(lambda x: x[1])

print("LINES START!!!")
print("LINES START!!!")
print("LINES START!!!")
print("LINES START!!!")

def transformer(rdd):
	my_obj= json.loads(rdd)
	return (my_obj["player"]["weight_pounds"])
transform= lines.map(transformer)
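The transformed stream is built but never printed, and the context is never started. A minimal sketch of the missing tail (the original may have done more with it):

transform.pprint()

ssc.start()
ssc.awaitTermination()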

Example #41
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import Row, SQLContext
import sys
import requests
# create spark configuration
conf = SparkConf()
conf.setAppName("TwitterStreamApp")
# create spark context with the above configuration
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
# create the Streaming Context from the above spark context with interval size 2 seconds
ssc = StreamingContext(sc, 2)
# setting a checkpoint to allow RDD recovery
ssc.checkpoint("checkpoint_TwitterApp")
# read data from port 9009
dataStream = ssc.socketTextStream("localhost", 9009)


def aggregate_tags_count(new_values, total_sum):
    return sum(new_values) + (total_sum or 0)


def get_sql_context_instance(spark_context):
    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(spark_context)
    return globals()['sqlContextSingletonInstance']


def process_rdd(time, rdd):
    print("----------- %s -----------" % str(time))
Example #42
from pyspark.streaming import StreamingContext
dvc = [[-0.1, -0.1], [0.1, 0.1], [1.1, 1.1], [0.75, 0.75], [0.9, 0.9]]
dvc = [sc.parallelize(i, 1) for i in dvc]
ssc = StreamingContext(sc, 2.0)
input_stream = ssc.queueStream(dvc)

def get_output(rdd):
    rdd_data = rdd.collect()
    if 0.75 in rdd_data:
        print "Ending marker found", rdd_data
        ssc.stop()
    else:
        print "Not found ending marker. Continuing"
        print rdd_data

input_stream.foreachRDD(get_output)
ssc.start()
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

if __name__ == '__main__':

    sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    ssc = StreamingContext(sc, 1)

    kstream = KafkaUtils.createDirectStream(ssc, topics = ['CodeSubmission'], \
      kafkaParams = {"metadata.broker.list": '52.53.157.26:9092'})

    data = kstream.map(lambda x: x[1].encode("utf-8"))
    data.pprint()

    ssc.start()
    ssc.awaitTerminationOrTimeout(30)
    ssc.stop(stopGraceFully=True)
Example #44
conf.setAppName("Spark Streaming Examples")

## Initialize SparkContext. Run only once; otherwise you get a
## "multiple SparkContext" error.
# For streaming, create a Spark context with several local threads.
sc = SparkContext('local[4]', conf=conf)

from pyspark.streaming import StreamingContext

#............................................................................
##   Streaming with TCP/IP data
#............................................................................

#Create streaming context with a batch interval of 3 seconds
streamContext = StreamingContext(sc, 3)

totalLines = 0
lines = streamContext.socketTextStream("localhost", 9000)

#Word count within RDD
words = lines.flatMap(lambda line: line.split(" "))
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)
wordCounts.pprint(5)

#Count lines
totalLines = 0
linesCount = 0

Example #45
        print(top[1][0] + str(',') + top[2][0] + str(',') + top[3][0] +
              str(',') + top[4][0] + str(',') + top[5][0])


WindowSize = int(sys.argv[1])
BatchDuration = int(
    sys.argv[2]
)  #pass window size and batch duration as command line arguments

#print(WindowSize, BatchDuration)

conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)

ssc = StreamingContext(sc, BatchDuration)  #passing batch duration
ssc.checkpoint("/checkpoint_BIGDATA")

socket_stream = ssc.socketTextStream("localhost", 9009)  #stream the lines
lines = socket_stream.window(WindowSize)

cols = lines.flatMap(lambda line: [line.split(";")])  #split csv line into cols
#count=cols.reduceByKey(lambda x,y:x+y)
#cols.pprint()

hashtags = cols.flatMap(
    lambda col: col[7].split(","))  #split hashtag col into hashtags

hashtag_pairs = hashtags.map(lambda hashtag:
                             (hashtag, 1))  #make (hashtag, 1) tuple
Example #46
    def __init__(self):

        self.ssc = StreamingContext(sc, 1)
def hasht(x):
	#parts=x.split(',')
	parts=filter(None,x.split(','))
	for i in parts:
		return i
if __name__ == "__main__":
	if len(sys.argv) != 3:
		print("Usage: pagerank <file> <Window Size> <Batch Duration>", file=sys.stderr)
		sys.exit(-1)
	window_size=int(sys.argv[1])	
	batch_durn=int(sys.argv[2])	
	conf=SparkConf()
	conf.setAppName("BigData")
	sc=SparkContext(conf=conf)

	ssc=StreamingContext(sc,batch_durn)
	ssc.checkpoint("~/checkpoint_BIGDATA")

	dataStream=ssc.socketTextStream("localhost",9009)
	# dataStream.pprint()
	tweet=dataStream.map(lambda x:tmp(x))
	#tweet.pprint()
	tweet=tweet.map(lambda x:hasht(x)).filter(lambda x:x!=None)
	#tweet.pprint()
	totalcount=tweet.countByValueAndWindow(window_size,1)
	#totalcount.pprint()

	#To Perform operation on each RDD
	totalcount.foreachRDD(process_rdd)

	ssc.start()
Example #48
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

conf = SparkConf()
conf.set("spark.master", "yarn")
conf.set("spark.app.name", "streamingapp")

sc = SparkContext(conf=conf)

streamc = StreamingContext(sc, batchDuration=15)

r1 = sc.textFile("s3://datasets-spark-learning/flat_files/au-500.csv")

ds1 = streamc.textFileStream(
    "s3://datasets-spark-learning/flat_files/csvfiles/")

ds2 = ds1.transform(lambda rdd: rdd.union(r1).map(lambda x: x + "spark"))
ds2.pprint()

streamc.start()
streamc.awaitTermination()
Example #49
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

def updateFunc (new_values, last_sum):
    return sum(new_values) + (last_sum or 0)

sc = SparkContext(appName="PyStreamNWC", master="local[*]")
ssc = StreamingContext(sc, 5)
ssc.checkpoint("checkpoint")

lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))

counts = lines.flatMap(lambda line: line.split(" ")) \
              .map(lambda word: (word, 1)) \
              .updateStateByKey(updateFunc) \
              .transform(lambda x: x.sortByKey())

counts.pprint()

ssc.start()
ssc.awaitTermination()
            "rank": rank,
            "origin": origin,
            "destination": dest,
            "airlineid": airlineid,
            "airline": airline_lookup.value[str(airlineid)],
            "arrdelay": arrdelay
        })

    # Use LOWER characters
    carriersByPath.saveToCassandra("capstone", "carriersbypath")


#main function
if __name__ == "__main__":
    # Configure Spark. Create a new context or restore from checkpoint
    ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, functionToCreateContext)

    # get this spark context
    sc = ssc.sparkContext

    # http://stackoverflow.com/questions/24686474/shipping-python-modules-in-pyspark-to-other-nodes
    sc.addPyFile("common.py")

    # Create a Transformed DStream. Read Kafka from first offset
    # creating a stream
    # :param ssc:  StreamingContext object
    # :param zkQuorum:  Zookeeper quorum (hostname:port,hostname:port,..).
    # :param groupId:  The group id for this consumer.
    # :param topics:  Dict of (topic_name -> numPartitions) to consume.
    #                 Each partition is consumed in its own thread.
    # :param kafkaParams: Additional params for Kafka
Example #51
File: q22.py Project: sigefried/ccc

def updateFunction(newValues, runningCount):
    current = (sum(newValues), len(newValues))
    if not runningCount:
        runningCount = current
    else:
        runningCount = (runningCount[0] + current[0],
                        runningCount[1] + current[1])
    return runningCount


if __name__ == '__main__':
    # set up
    sc = SparkContext(appName="q22")
    ssc = StreamingContext(sc, TimeOut)
    brokers = BootStarpServers
    topic = TopicName
    sc.setLogLevel("WARN")
    ssc.checkpoint("/tmp/q22")

    kvs = KafkaUtils.createDirectStream(ssc, [topic], KafkaParams)

    # key logic
    def processRDD(rdd):
        print("start processing rdd...")
        rdd.foreachPartition(save_to_dynamoDB)
        print("rdd processed...")
        print("-----------------------------------------------")

    def save_to_dynamoDB(partition):
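The listing is cut off at the save_to_dynamoDB signature. A hypothetical body, assuming boto3 with placeholder region, table and key names, and treating each partition element as a Kafka (key, value) record:

import boto3

def save_to_dynamoDB(partition):
    table = boto3.resource('dynamodb', region_name='us-east-1').Table('q22-results')
    with table.batch_writer() as batch:
        for key, value in partition:
            # String-only attributes keep the sketch free of DynamoDB's Decimal requirements.
            batch.put_item(Item={'record_key': str(key), 'payload': str(value)})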
Example #52
from pyspark.streaming.kafka import KafkaUtils

os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"
os.environ['PYSPARK_SUBMIT_ARGS'] = \
    '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.3.1 ' \
    'pyspark-shell'

spark = SparkSession\
    .builder\
    .appName("word_count")\
    .master("local[*]")\
    .getOrCreate()

sc = spark.sparkContext
ssc = StreamingContext(sparkContext=sc, batchDuration=1)

# the topic to subscribe
topic_to_sub = ["test"]
# the address of kafka, separate with comma if there are many
bootstrap_servers = "localhost:9092"
# kafka config info
kafka_params = {"metadata.broker.list": bootstrap_servers}

# initialize stream to consume data from kafka
kafka_stream = KafkaUtils.createDirectStream(ssc=ssc,
                                             topics=topic_to_sub,
                                             kafkaParams=kafka_params)

kafka_stream.pprint()
r = redis.Redis("127.0.0.1")
    stream = KafkaUtils.createDirectStream(ssc, TOPICS, kafkaParams, offsets)
    stream.foreachRDD(process)

    ssc.checkpoint(CHECKPOINT)
    return ssc


parser = argparse.ArgumentParser()
parser.add_argument('topic', help="Kafka topic")
args = parser.parse_args()

if args.topic is None:
    parser.error("A Kafka topic must be specified!")
    sys.exit(1)

# Process data every 10 seconds
PERIOD = 10
BROKERS = 'localhost:9092'
TOPICS = [args.topic]
GROUP_ID = 'group.1'
APP_NAME = 'TwitterStreamML'
CHECKPOINT = '/tmp/%s' % APP_NAME
STREAM_CONTEXT_TIMEOUT = 70

if __name__ == "__main__":

    context = StreamingContext.getOrCreate(CHECKPOINT, functionToCreateContext)

    context.start()
    context.awaitTermination(timeout=STREAM_CONTEXT_TIMEOUT)
    context.stop()
def streaming(sc, reload: int = 5):
    from pyspark.streaming import StreamingContext
    ssc = StreamingContext(sc, reload)
    return ssc
Counts words in UTF8 encoded, '\n' delimited text received from the network every second.
Usage: network_wordcount.py <hostname> <port>

To run this on your local machine, you need to first run a NetCat server
`$ netcat -l -p 9999`
and then run the example
`$ $SPARK_HOME/bin/spark-submit network_wordcount.py localhost 9999`

"""

from __future__ import print_function
import sys
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: network_wordcount.py <hostname> <port>", file=sys.stderr)
        exit(-1)
    sc = SparkContext("local[2]", "NetworkWordCount")
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 1)
    lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
    counts = lines.flatMap(lambda line: line.split(" "))\
             .map(lambda word: (word, 1)) \
             .reduceByKey(lambda a, b: a+b)
    counts.pprint()
    ssc.start()
    ssc.awaitTermination()
Example #56
                   (9453, 3586, 1200548), (-6172, 7805, 264695806),
                   (-75, -98, 177749096), (-1481, -9319, 515114864),
                   (1538, -4513, 459014124), (-427, 3347, 105394153),
                   (9563, 6773, 346529937), (-9007, 5215, 833170048),
                   (-5316, -3153, 701832096), (-7881, 6554, 211851653),
                   (8047, 5316, 653508160), (3615, -969, 408839209),
                   (1678, -5874, 964232482), (-9603, -7771, 612835737),
                   (-2613, -7682, 999683604), (-3867, -206, 73595183),
                   (-4841, -1371, 4259718), (-2310, 3912, 775274868),
                   (7567, 9614, 646354995), (-8238, 8253, 844226086),
                   (4501, -1611, 498009778), (9240, 2000, 694905063),
                   (7650, 4727, 68326721), (6351, 1386, 280839009),
                   (-6909, 3520, 957821259), (-1581, -8095, 885523760),
                   (1090, 5516, 254267011), (4288, -7581, 325047909),
                   (-4262, 7348, 3784554), (-7613, -3920, 724353002),
                   (-384, -2708, 395489622), (-8840, 4115, 303185341),
                   (6212, -1195, 991066480), (1213, 4812, 498566989),
                   (-640, -7705, 182088090), (-4553, 5934, 452918094),
                   (2513, 6315, 355348464), (3426, 1234, 304757776)]

    ssc = StreamingContext(sc, batch_dur)
    data = (ssc.socketTextStream("localhost", port)
                .map(lambda x: json.loads(x)["city"])
                .transform(lambda x: x.distinct())
                .window(window_length, sliding_interval)
                .transform(lambda x: x.distinct())
                .map(hashtobit)
                .flatMap(lambda x: x)
                .groupByKey()
                .foreachRDD(combine))  # count()
    #inputStream.pprint(10)
    with open(sys.argv[2], 'w') as file:
        file.write("Time,Ground Truth,Estimation\n")
    #print(hash_tables)

    ssc.start()
    ssc.awaitTermination()

def forf(x):
    for i in x:
        yield (i, 1)


def rddprint(rdd):
    print(",".join(rdd.take(5)))


conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)

ssc = StreamingContext(sc, 1)
ssc.checkpoint("/checkpoint_BIGDATA")

#Try in outpu1
inputStream = ssc.socketTextStream("localhost", 9009)
dataStream = inputStream.window(int(sys.argv[1]), int(sys.argv[2]))
tweet = dataStream.map(tmp)
septweet = tweet.flatMap(forf)
count = septweet.reduceByKey(lambda x, y: x + y)
sortcount = count.transform(
    lambda rdd: rdd.sortBy(lambda a: a[0], ascending=True))
sortcount1 = sortcount.transform(
    lambda rdd: rdd.sortBy(lambda a: a[1], ascending=False))
tweet1 = sortcount1.filter(lambda w: w[0] != '')
#tweet1.pprint()
res = tweet1.map(lambda a: a[0])
# return to the pool for future reuse
# ConnectionPool.returnConnection(connection)

# To Run:
# sudo $SPARK_HOME/bin/spark-submit --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0 kafka-spark-test.py
if __name__ == "__main__":

    # To run on cluster:
    # conf = SparkConf().setAppName("Venmo-Graph-Analytics-Dev").setMaster("spark://ip-172-31-0-135:7077")
    # sc = SparkContext(conf=conf)

    # To run locally:
    sc = SparkContext(appName="Venmo-Graph-Analytics-Dev")

    # Set up resources
    ssc = StreamingContext(sc, 1)  # Set Spark Streaming context

    # brokers = "ec2-50-112-19-115.us-west-2.compute.amazonaws.com:9092,ec2-52-33-162-7.us-west-2.compute.amazonaws.com:9092,ec2-52-89-43-209.us-west-2.compute.amazonaws.com:9092"
    brokers = "ec2-52-25-139-222.us-west-2.compute.amazonaws.com:9092"
    topic = 'Venmo-Transactions-Dev'

    kafka_stream = KafkaUtils.createDirectStream(
        ssc, [topic], {"metadata.broker.list": brokers})

    transaction = kafka_stream.map(lambda kafka_response: json.loads(kafka_response[1]))\
        .map(lambda json_body: extract_data(json_body))\
        .foreachRDD(lambda rdd: rdd.foreachPartition(send_partition))
    # transaction.pprint()

    ssc.start()
    ssc.awaitTermination()
Example #59
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext, Row


def getSqlContextInstance(sparkContext):
    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
    return globals()['sqlContextSingletonInstance']


if __name__ == "__main__":
       
    sc = SparkContext()
    ssc = StreamingContext(sc, 5)

    # Create a socket stream on target ip:port and count the
    # words in input stream of \n delimited text (eg. generated by 'nc')
    lines = ssc.socketTextStream("localhost", 9999)
    words = lines.flatMap(lambda line: line.split(" "))

    # Convert RDDs of the words DStream to DataFrame and run SQL query
    def process(time, rdd):
        print("========= %s =========" % str(time))

        try:
            # Get the singleton instance of SQLContext
            sqlContext = getSqlContextInstance(rdd.context)

            # Convert RDD[String] to RDD[Row] to DataFrame
            rowRdd = rdd.map(lambda w: Row(word=w))
Example #60
# Spark Streaming vs. Structured Streaming https://dzone.com/articles/spark-streaming-vs-structured-streaming

"""
Core components
pyspark.streaming.StreamingContext
Main entry point for Spark Streaming functionality.

pyspark.streaming.DStream
A Discretized Stream (DStream), the basic abstraction in Spark Streaming.
"""
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    sc = SparkContext(appName="Streaming ")
    ssc = StreamingContext(sc, 1)

    lines = ssc.textFileStream("03_pyspark.streaming.py")  # returns a DStream; note that textFileStream expects a directory to monitor, not a single file
    counts = lines.flatMap(lambda line: line.split(" "))\
                  .map(lambda x: (x, 1))\
                  .reduceByKey(lambda a, b: a+b)
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()