Example #1
 def test_kinesis_stream_api(self):
     # Don't start the StreamingContext because we cannot test it in Jenkins
     KinesisUtils.createStream(
         self.ssc,
         "myAppNam",
         "mySparkStream",
         "https://kinesis.us-west-2.amazonaws.com",
         "us-west-2",
         InitialPositionInStream.LATEST,
         2,
         MetricsLevel.DETAILED,
         StorageLevel.MEMORY_AND_DISK_2,
     )
     KinesisUtils.createStream(
         self.ssc,
         "myAppNam",
         "mySparkStream",
         "https://kinesis.us-west-2.amazonaws.com",
         "us-west-2",
         InitialPositionInStream.LATEST,
         2,
         MetricsLevel.DETAILED,
         StorageLevel.MEMORY_AND_DISK_2,
         "awsAccessKey",
         "awsSecretKey",
     )
Example #2
 def test_kinesis_stream_api(self):
     # Don't start the StreamingContext because we cannot test it in Jenkins
     kinesisStream1 = KinesisUtils.createStream(
         self.ssc, "myAppNam", "mySparkStream",
         "https://kinesis.us-west-2.amazonaws.com", "us-west-2",
         InitialPositionInStream.LATEST, 2, StorageLevel.MEMORY_AND_DISK_2)
     kinesisStream2 = KinesisUtils.createStream(
         self.ssc, "myAppNam", "mySparkStream",
         "https://kinesis.us-west-2.amazonaws.com", "us-west-2",
         InitialPositionInStream.LATEST, 2, StorageLevel.MEMORY_AND_DISK_2,
         "awsAccessKey", "awsSecretKey")
def consume_records(interval=1,
                    StreamName=None,
                    region_name='us-west-2',
                    Bucket=None):
    """
    Create a local StreamingContext with two worker
    threads and the given batch interval
    """
    assert StreamName is not None

    endpoint = 'https://kinesis.{}.amazonaws.com/'.format(region_name)
    #client = boto3.client('s3')
    #client.upload_file ('kinesis_event_consumer.py',Bucket,'kinesis_event_consumer.py')
    # print('file uploaded')

    sc, stream_context = initialize_context(interval=interval)
    sc.setLogLevel("ERROR")
    print('create stream')
    stream = KinesisUtils.createStream(stream_context, 'EventLKinesisConsumer',
                                       StreamName, endpoint, region_name,
                                       InitialPositionInStream.TRIM_HORIZON,
                                       interval)
    #LATEST

    # counts number of events
    event_counts = aggregate_by_event_type(stream)
    global_counts = update_global_event_counts(event_counts)
    global_counts.pprint()
    # Sends data to S3
    global_counts.foreachRDD(lambda rdd: send_record(rdd, Bucket))
    stream_context.start()
    print('stream started')
    stream_context.awaitTermination()
    stream_context.stop()
    sc.stop()
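
The helpers aggregate_by_event_type, update_global_event_counts and send_record used above are imported elsewhere and are not shown in this listing. As a rough, hedged sketch only (it assumes each Kinesis record is a JSON string with an "event_type" field, which is an assumption, not something the snippet states; send_record, which writes to S3, is left out), they could look like this:

import json

def aggregate_by_event_type(stream):
    # Per-batch counts keyed by event type (assumed "event_type" JSON field).
    return (stream.map(lambda record: (json.loads(record)['event_type'], 1))
                  .reduceByKey(lambda a, b: a + b))

def update_global_event_counts(event_counts):
    # Running totals across batches; updateStateByKey requires a checkpoint
    # directory to be configured on the StreamingContext.
    def update(new_values, running_count):
        return sum(new_values) + (running_count or 0)
    return event_counts.updateStateByKey(update)

This mirrors the standard stateful counting pattern from the Spark Streaming programming guide.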
def consume_records(interval=1,
                    StreamName=None,
                    region_name='us-west-2',
                    Bucket=None):
    """
    Create a local StreamingContext with two worker
    threads and the given batch interval
    """
    assert StreamName is not None

    endpoint = 'https://kinesis.{}.amazonaws.com/'.format(region_name)

    sc, stream_context = initialize_context(interval=interval)
    sc.setLogLevel("INFO")
    stream = KinesisUtils.createStream(stream_context, 'EventLKinesisConsumer',
                                       StreamName, endpoint, region_name,
                                       InitialPositionInStream.LATEST,
                                       interval)

    # counts number of events
    event_counts = aggregate_by_event_type(stream)
    global_counts = update_global_event_counts(event_counts)
    global_counts.pprint()
    # Sends data to S3
    global_counts.foreachRDD(lambda rdd: send_record(rdd, Bucket))
    stream_context.start()
    stream_context.awaitTermination()
    def test_kinesis_stream(self):
        if not are_kinesis_tests_enabled:
            sys.stderr.write(
                "Skipped test_kinesis_stream (enable by setting environment variable %s=1" % kinesis_test_environ_var
            )
            return

        import random

        kinesisAppName = "KinesisStreamTests-%d" % abs(random.randint(0, 10000000))
        kinesisTestUtilsClz = (
            self.sc._jvm.java.lang.Thread.currentThread()
            .getContextClassLoader()
            .loadClass("org.apache.spark.streaming.kinesis.KinesisTestUtils")
        )
        kinesisTestUtils = kinesisTestUtilsClz.newInstance()
        try:
            kinesisTestUtils.createStream()
            aWSCredentials = kinesisTestUtils.getAWSCredentials()
            stream = KinesisUtils.createStream(
                self.ssc,
                kinesisAppName,
                kinesisTestUtils.streamName(),
                kinesisTestUtils.endpointUrl(),
                kinesisTestUtils.regionName(),
                InitialPositionInStream.LATEST,
                10,
                StorageLevel.MEMORY_ONLY,
                aWSCredentials.getAWSAccessKeyId(),
                aWSCredentials.getAWSSecretKey(),
            )

            outputBuffer = []

            def get_output(_, rdd):
                for e in rdd.collect():
                    outputBuffer.append(e)

            stream.foreachRDD(get_output)
            self.ssc.start()

            testData = [i for i in range(1, 11)]
            expectedOutput = set([str(i) for i in testData])
            start_time = time.time()
            while time.time() - start_time < 120:
                kinesisTestUtils.pushData(testData)
                if expectedOutput == set(outputBuffer):
                    break
                time.sleep(10)
            self.assertEqual(expectedOutput, set(outputBuffer))
        except:
            import traceback

            traceback.print_exc()
            raise
        finally:
            self.ssc.stop(False)
            kinesisTestUtils.deleteStream()
            kinesisTestUtils.deleteDynamoDBTable(kinesisAppName)
Example #6
def main(appName, streamName, endpointUrl, regionName):
    sc = SparkContext(appName="BestApp")
    ssc = StreamingContext(sc, 10)
    data = KinesisUtils.createStream(ssc, appName, streamName, endpointUrl,
                                     regionName,
                                     InitialPositionInStream.LATEST, 10)
    result = data.window(60, 20).foreachRDD(computeGridVal)
    ssc.start()
    ssc.awaitTermination()
Example #7
 def run(self, appName, streamName, endpointUrl, region_name, anomaly_stream_name):
     sc = SparkContext(appName="PythonStreamingKinesisAnomalyDetection")
     print("Initialised SC")
     #TODO: log warn and above only
     logger = sc._jvm.org.apache.log4j
     logger.LogManager.getLogger("org").setLevel(logger.Level.WARN)
     ssc = StreamingContext(sc, 1)
     dstreamRecords = KinesisUtils.createStream(
         ssc, appName, streamName, endpointUrl, region_name, InitialPositionInStream.LATEST, 2)
     CloudTrailLogProcessor( anomaly_stream_name=anomaly_stream_name, region=region_name)\
         .process(sc, ssc, dstreamRecords)
     ssc.start()
     ssc.awaitTermination()
Example #8
    def test_kinesis_stream(self):
        import random

        kinesisAppName = "KinesisStreamTests-%d" % abs(
            random.randint(0, 10000000))
        kinesisTestUtils = self.ssc._jvm.org.apache.spark.streaming.kinesis.KinesisTestUtils(
            2)
        try:
            kinesisTestUtils.createStream()
            aWSCredentials = kinesisTestUtils.getAWSCredentials()
            stream = KinesisUtils.createStream(
                self.ssc,
                kinesisAppName,
                kinesisTestUtils.streamName(),
                kinesisTestUtils.endpointUrl(),
                kinesisTestUtils.regionName(),
                InitialPositionInStream.LATEST,
                10,
                MetricsLevel.DETAILED,
                StorageLevel.MEMORY_ONLY,
                aWSCredentials.getAWSAccessKeyId(),
                aWSCredentials.getAWSSecretKey(),
            )

            outputBuffer = []

            def get_output(_, rdd):
                for e in rdd.collect():
                    outputBuffer.append(e)

            stream.foreachRDD(get_output)
            self.ssc.start()

            testData = [i for i in range(1, 11)]
            expectedOutput = set([str(i) for i in testData])
            start_time = time.time()
            while time.time() - start_time < 120:
                kinesisTestUtils.pushData(testData)
                if expectedOutput == set(outputBuffer):
                    break
                time.sleep(10)
            self.assertEqual(expectedOutput, set(outputBuffer))
        except BaseException:
            import traceback

            traceback.print_exc()
            raise
        finally:
            self.ssc.stop(False)
            kinesisTestUtils.deleteStream()
            kinesisTestUtils.deleteDynamoDBTable(kinesisAppName)
Example #9
    def test_kinesis_stream(self):
        if not are_kinesis_tests_enabled:
            sys.stderr.write(
                "Skipped test_kinesis_stream (enable by setting environment variable %s=1"
                % kinesis_test_environ_var)
            return

        import random
        kinesisAppName = ("KinesisStreamTests-%d" %
                          abs(random.randint(0, 10000000)))
        kinesisTestUtilsClz = \
            self.sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
                .loadClass("org.apache.spark.streaming.kinesis.KinesisTestUtils")
        kinesisTestUtils = kinesisTestUtilsClz.newInstance()
        try:
            kinesisTestUtils.createStream()
            aWSCredentials = kinesisTestUtils.getAWSCredentials()
            stream = KinesisUtils.createStream(
                self.ssc, kinesisAppName, kinesisTestUtils.streamName(),
                kinesisTestUtils.endpointUrl(), kinesisTestUtils.regionName(),
                InitialPositionInStream.LATEST, 10, StorageLevel.MEMORY_ONLY,
                aWSCredentials.getAWSAccessKeyId(),
                aWSCredentials.getAWSSecretKey())

            outputBuffer = []

            def get_output(_, rdd):
                for e in rdd.collect():
                    outputBuffer.append(e)

            stream.foreachRDD(get_output)
            self.ssc.start()

            testData = [i for i in range(1, 11)]
            expectedOutput = set([str(i) for i in testData])
            start_time = time.time()
            while time.time() - start_time < 120:
                kinesisTestUtils.pushData(testData)
                if expectedOutput == set(outputBuffer):
                    break
                time.sleep(10)
            self.assertEqual(expectedOutput, set(outputBuffer))
        except:
            import traceback
            traceback.print_exc()
            raise
        finally:
            self.ssc.stop(False)
            kinesisTestUtils.deleteStream()
            kinesisTestUtils.deleteDynamoDBTable(kinesisAppName)
Example #10
def creatingfunc():
    # create streaming context
    ssc = StreamingContext(sc, batchIntervalSeconds)
    LogToKinesis("creatingfunc", "StreamingContext", str(dir(ssc)))
    ssc.remember(10 * batchIntervalSeconds)

    # setup streams
    try:
        #paxRecords = ssc.textFileStream(SOURCE).map(ParsePassengerRecord)  # parse and enrich pax data
        kinesisStream = KinesisUtils.createStream(
            ssc, KINESIS_APPNAME, KINESIS_STREAM, KINESIS_ENDPOINT_URL,
            KINESIS_REGION, InitialPositionInStream.TRIM_HORIZON, 10,
            StorageLevel.MEMORY_AND_DISK_2, ACCESS_KEY, SECRET_KEY)
        LogToKinesis("kinesisStream", "KinesisUtils.createStream",
                     str(dir(kinesisStream)))

        # track total boarding and alighting per train/ownmoduleno
        # Note: rdd returned by updateStateByKey is (ownmoduleno, (alight, board))
    # for easy conversion to dataframe we map this rdd to (ownmoduleno, alight, board). (Not sure why the following did not work: map(lambda k,v: (k,v[0],v[1])) )
        """
    noOfPassengersOwnModuleToday = paxRecords.map(lambda record: (record[OWN_MODULE_NO],(record[TOTAL_ALIGHTING], record[TOTAL_BOARDING])))  \
                              .updateStateByKey(updatePassengerCount) \
                              .map(lambda v: (v[0],v[1][0],v[1][1]))  
        
    paxRecordsWindowStationLine = paxRecords.window(1800,20)  # compute aggregates on a 30 min window updated every 20 sec
    paxRecordsTable = paxRecords.window(900,900) # save to permanent storage every 15 min (how large/small amounts of data is optimal to save at a time?)
    LogToKinesis("creatingfunc", "Streams set up OK")
    """
    except Exception as e:
        LogToKinesis("creatingfunc", "EXCEPTION", str(e))

    # output streams
    try:
        #paxRecords.foreachRDD(processPax)
        #noOfPassengersOwnModuleToday.foreachRDD(processOwnModuleState) # send sum of alightings and boardings and pax present onboard for each train to Kinesis
        #paxRecordsWindowStationLine.foreachRDD(processStationLineWindow) #send aggregates to Kinesis periodically, i.e. last 30 mins updated every 20 secs
        #paxRecordsTable.foreachRDD(processTable) #save to permanent table periodically
        kinesisStream.foreachRDD(processKinesisPax)
    except Exception as e:
        LogToKinesis("mainLoop", "EXCEPTION", str(e))

    ssc.checkpoint(CHECKPOINTDIR)
    return ssc
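
creatingfunc checkpoints the context and returns it, which is the factory shape that StreamingContext.getOrCreate expects for driver recovery. A minimal, hedged driver sketch (reusing the CHECKPOINTDIR constant and the creatingfunc above; not part of the original snippet):

# Recover from the checkpoint if one exists, otherwise build a fresh context.
ssc = StreamingContext.getOrCreate(CHECKPOINTDIR, creatingfunc)
ssc.start()
ssc.awaitTermination()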
def run():
    APP_NAME = 'kinesis-stream-test'
    STREAM_NAME = 'MY-TEST-STREAM'
    ENDPOINT_URL = 'https://kinesis.us-east-1.amazonaws.com'
    REGION = 'us-east-1'

    # The time interval to get a new RDD in seconds
    batchInterval = 5
    kinesisCheckpointInterval = batchInterval

    sc = SparkContext(appName=APP_NAME)
    sc.setLogLevel('ERROR')
    ssc = StreamingContext(sc, batchInterval)

    stream = KinesisUtils.createStream(
        ssc=ssc,
        kinesisAppName=APP_NAME,
        streamName=STREAM_NAME,
        endpointUrl=ENDPOINT_URL,
        regionName=REGION,
        initialPositionInStream=InitialPositionInStream.LATEST,
        checkpointInterval=kinesisCheckpointInterval,
        storageLevel=StorageLevel.MEMORY_AND_DISK_2,
    )

    def get_output(_, rdd):
        if (len(rdd.take(1)) == 0):
            return

        print('New RDD is coming ...')
        data = rdd.collect()
        for e in data:
            print(e)

        print(f'Data entry count = {len(data)}')

    stream.foreachRDD(get_output)

    ssc.start()
    ssc.awaitTermination()
def consume_records(
        interval=1, StreamName=None, region_name='us-west-2', port=9876):
    """
    Create a local StreamingContext with two worker
    threads and the given batch interval
    """
    assert StreamName is not None

    endpoint = 'https://kinesis.{}.amazonaws.com/'.format(region_name)

    sc, stream_context = initialize_context(interval=interval)
    sc.setLogLevel("INFO")
    kinesis_stream = KinesisUtils.createStream(
        stream_context, 'EventLKinesisConsumer', StreamName, endpoint,
        region_name, InitialPositionInStream.LATEST, interval)

    tcp_stream = stream_context.socketTextStream('localhost', port)

    join_aggregation(kinesis_stream, tcp_stream)

    stream_context.start()
    stream_context.awaitTermination()
def creatingfunc():
  # create streaming context
  ssc = StreamingContext(sc, batchIntervalSeconds)
  LogToKinesis("creatingfunc", "StreamingContext", str(dir(ssc)))
  ssc.remember(10*batchIntervalSeconds)
  
  # setup streams
  try: 
    #paxRecords = ssc.textFileStream(SOURCE).map(ParsePassengerRecord)  # parse and enrich pax data
    kinesisStream = KinesisUtils.createStream(ssc, KINESIS_APPNAME, KINESIS_STREAM, KINESIS_ENDPOINT_URL, KINESIS_REGION, InitialPositionInStream.TRIM_HORIZON, 10, StorageLevel.MEMORY_AND_DISK_2, ACCESS_KEY, SECRET_KEY)
    LogToKinesis("kinesisStream", "KinesisUtils.createStream", str(dir(kinesisStream)))
    
    # track total boarding and alighting per train/ownmoduleno
    # Note: rdd returned by updateStateByKey is (ownmoduleno, (alight, board))
    # for easy conversion to dataframe we map this rdd to (ownmoduleno, alight, board). (Not sure why the following did not work: map(lambda k,v: (k,v[0],v[1])) )
    """
    noOfPassengersOwnModuleToday = paxRecords.map(lambda record: (record[OWN_MODULE_NO],(record[TOTAL_ALIGHTING], record[TOTAL_BOARDING])))  \
                              .updateStateByKey(updatePassengerCount) \
                              .map(lambda v: (v[0],v[1][0],v[1][1]))  
        
    paxRecordsWindowStationLine = paxRecords.window(1800,20)  # compute aggregates on a 30 min window updated every 20 sec
    paxRecordsTable = paxRecords.window(900,900) # save to permanent storage every 15 min (how large/small amounts of data is optimal to save at a time?)
    LogToKinesis("creatingfunc", "Streams set up OK")
    """
  except Exception as e:
    LogToKinesis("creatingfunc", "EXCEPTION", str(e))
 
  # output streams
  try: 
    #paxRecords.foreachRDD(processPax)
    #noOfPassengersOwnModuleToday.foreachRDD(processOwnModuleState) # send sum of alightings and boardings and pax present onboard for each train to Kinesis
    #paxRecordsWindowStationLine.foreachRDD(processStationLineWindow) #send aggregates to Kinesis periodically, i.e. last 30 mins updated every 20 secs
    #paxRecordsTable.foreachRDD(processTable) #save to permanent table periodically
    kinesisStream.foreachRDD(processKinesisPax)
  except Exception as e:
    LogToKinesis("mainLoop", "EXCEPTION", str(e))

  ssc.checkpoint(CHECKPOINTDIR)
  return ssc
Example #14
    def test_kinesis_stream(self):
        import random
        kinesisAppName = ("KinesisStreamTests-%d" % abs(random.randint(0, 10000000)))
        kinesisTestUtils = self.ssc._jvm.org.apache.spark.streaming.kinesis.KinesisTestUtils(2)
        try:
            kinesisTestUtils.createStream()
            aWSCredentials = kinesisTestUtils.getAWSCredentials()
            stream = KinesisUtils.createStream(
                self.ssc, kinesisAppName, kinesisTestUtils.streamName(),
                kinesisTestUtils.endpointUrl(), kinesisTestUtils.regionName(),
                InitialPositionInStream.LATEST, 10, StorageLevel.MEMORY_ONLY,
                aWSCredentials.getAWSAccessKeyId(), aWSCredentials.getAWSSecretKey())

            outputBuffer = []

            def get_output(_, rdd):
                for e in rdd.collect():
                    outputBuffer.append(e)

            stream.foreachRDD(get_output)
            self.ssc.start()

            testData = [i for i in range(1, 11)]
            expectedOutput = set([str(i) for i in testData])
            start_time = time.time()
            while time.time() - start_time < 120:
                kinesisTestUtils.pushData(testData)
                if expectedOutput == set(outputBuffer):
                    break
                time.sleep(10)
            self.assertEqual(expectedOutput, set(outputBuffer))
        except:
            import traceback
            traceback.print_exc()
            raise
        finally:
            self.ssc.stop(False)
            kinesisTestUtils.deleteStream()
            kinesisTestUtils.deleteDynamoDBTable(kinesisAppName)
Example #15
    sc = SparkContext()

    # Connect to the hive context of our spark context.
    sqlContext = HiveContext(sc)

    # Define an external hive table from the PARQUET files stored in S3 to be used to retrieve the schema of the data.
    # The schema will be used to parse the messages coming from the Kinesis stream and thus must match it.
    sqlContext.sql(
        "CREATE EXTERNAL TABLE IF NOT EXISTS yellow_trips_schema( pickup_timestamp BIGINT, dropoff_timestamp BIGINT, vendor_id STRING, pickup_datetime TIMESTAMP, dropoff_datetime TIMESTAMP, pickup_longitude FLOAT, pickup_latitude FLOAT, dropoff_longitude FLOAT, dropoff_latitude FLOAT, passenger_count INT, trip_distance FLOAT, payment_type STRING, fare_amount FLOAT, extra FLOAT, mta_tax FLOAT, tip_amount FLOAT, tolls_amount FLOAT, total_amount FLOAT, store_and_fwd_flag STRING) STORED AS PARQUET "
        + "LOCATION 's3://<YOUR_BUCKET_NAME>/kinesis-parquet/'")

    ssc = StreamingContext(sc, 1)

    # Create an RDD of a single row just to get the schema. No data will be actually read except for the schema.
    table = sqlContext.sql("select * from yellow_trips_schema limit 1")

    # Connect to the Kinesis stream - create an RDD of stream messages
    lines = KinesisUtils.createStream(
        ssc, appName, kinesisStreamName,
        'https://kinesis.us-east-1.amazonaws.com', 'us-east-1',
        InitialPositionInStream.LATEST, 2)
    # Iterate over messages as they arrive.
    lines.foreachRDD(write_lines)

    # Since we are using a streaming context we need tell the streaming context to start polling for new stream events.
    ssc.start()

    # The line below will keep the job running until it is explicitly stopped.
    ssc.awaitTermination()
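
write_lines is referenced above but never shown. A hedged sketch of what such a handler could look like, reusing the schema recovered from yellow_trips_schema (the append mode and the bucket placeholder are assumptions, not the original implementation):

def write_lines(rdd):
    # Skip empty micro-batches.
    if rdd.isEmpty():
        return
    # Parse the Kinesis JSON messages with the schema pulled from the external
    # table and append them to the same Parquet location.
    df = sqlContext.read.json(rdd, schema=table.schema)
    df.write.mode("append").parquet("s3://<YOUR_BUCKET_NAME>/kinesis-parquet/")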
    print("--------------------------------------------------------")
    for record in iter:
        temperature = json.loads(record)["value"]
        if(temperature > 25):
            sendEmail()
    print("--------------------------------------------------------")

sc = SparkContext()
ssc = StreamingContext(sc, 1)

streamName = 'lynf-datastream'
appName = 'lynf_data'
endpointUrl = '<kinesis endpointUrl>'
regionName = 'region_name'

dstream = KinesisUtils.createStream(ssc, appName, streamName, endpointUrl, regionName, InitialPositionInStream.TRIM_HORIZON, 5)
# py_rdd = dstream.map(lambda x: json.loads(x))
dstream.foreachRDD(lambda rdd: rdd.foreachPartition(sendPartition))

# py_rdd.pprint(10)
# py_rdd.saveAsTextFiles("s3n://maweijun-test4/lynf_data/output.txt")


ssc.start()
ssc.awaitTermination()
# ssc.stop()

------------------------------------------------------------------------------------------
# Submit the job
pyspark:
  spark-submit --packages org.apache.spark:spark-streaming-kinesis-asl_2.11:2.4.2 spark-streaming.py
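
For exercising this consumer, a hedged sketch of a matching producer: it pushes JSON records with a numeric "value" field (the only field the sendPartition handler above reads) into the lynf-datastream stream via boto3. The region and the rest of the payload shape are assumptions, since the snippet leaves endpointUrl and regionName as placeholders.

import json
import random
import time

import boto3

# Region is an assumption; adjust to where lynf-datastream actually lives.
kinesis = boto3.client('kinesis', region_name='us-east-1')

while True:
    payload = {"value": random.uniform(15.0, 35.0)}  # simulated temperature
    kinesis.put_record(
        StreamName='lynf-datastream',
        Data=json.dumps(payload).encode('utf-8'),
        PartitionKey='temperature',
    )
    time.sleep(1)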
client = boto3.client('kinesis')

sc = SparkContext()
ssc = StreamingContext(sc, 10)
sqlc = SQLContext(sc)

appName = "Lufthansa_1"
streamName = "Lufthansa"
endpointUrl = "https://kinesis.us-east-2.amazonaws.com"
regionName = "us-east-2"
awsAccessKeyId = "AKIAS5TGRVYITEV4Z4MH"
awsSecretKey = "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"

lines = KinesisUtils.createStream(ssc, appName, streamName, endpointUrl,
                                  regionName, InitialPositionInStream.LATEST,
                                  2, StorageLevel.MEMORY_AND_DISK_2,
                                  awsAccessKeyId, awsSecretKey)


def transformer(rdd):
    my_obj = json.loads(rdd)
    return (my_obj["Departure"]["AirportCode"],
            my_obj["Departure"]["ScheduledTimeLocal"]["DateTime"],
            my_obj["Departure"]["ScheduledTimeUTC"]["DateTime"],
            my_obj["Departure"]["TimeStatus"]["Code"],
            my_obj["Arrival"]["AirportCode"],
            my_obj["Arrival"]["ScheduledTimeLocal"]["DateTime"],
            my_obj["Arrival"]["ScheduledTimeUTC"]["DateTime"],
            my_obj["Arrival"]["TimeStatus"]["Code"],
            my_obj["OperatingCarrier"]["AirlineID"],
            my_obj["OperatingCarrier"]["FlightNumber"],
aws_region = 'us-west-2'  # replace w/ AWS region used for Kinesis stream
kinesis_stream = 'spark_streaming_kinesis_demo'  # replace with your Kinesis stream name
kinesis_endpoint = 'https://kinesis.' + aws_region + '.amazonaws.com'  # the public endpoint of the AWS region this is executed from
kinesis_app_name = 'alex_test_app'  # app name used to track process through the Kinesis stream
kinesis_initial_position = InitialPositionInStream.LATEST  # InitialPositionInStream.TRIM_HORIZON | InitialPositionInStream.LATEST
kinesis_checkpoint_interval = 10  # define how long to checkpoint when processing through the Kinesis stream
spark_batch_interval = 10  # how many seconds before pulling the next batch of data from the Kinesis stream

# configure spark elements
spark_context = SparkContext(appName=kinesis_app_name)
# spark_streaming_context = StreamingContext(sc, 1) # sc valid for running in pyspark interactive mode
spark_streaming_context = StreamingContext(spark_context, spark_batch_interval)

kinesis_stream = KinesisUtils.createStream(
    spark_streaming_context, kinesis_app_name, kinesis_stream,
    kinesis_endpoint, aws_region, kinesis_initial_position,
    kinesis_checkpoint_interval
)  # previous example had ', StorageLevel.MEMORY_AND_DISK_2' at the end of the call

# take Kinesis stream JSON data and convert to CSV (note: these are DStreams, not plain RDDs, so the _rdd variable names below are slightly inaccurate)
py_dict_rdd = kinesis_stream.map(lambda x: json.loads(x))
# need to convert int (time_stamp & random_int) to string
csv_rdd = py_dict_rdd.map(lambda x: x['user_name'] + ',' + str(
    datetime.datetime.utcfromtimestamp(x['time_stamp'])) + ',' + x[
        'data_string'] + ',' + str(x['random_int']))

# save that rdd to S3
commit_to_s3 = csv_rdd.saveAsTextFiles(
    's3://' + s3_target_bucket_name + '/spark_streaming_processing/ ' +
    datetime.datetime.isoformat(datetime.datetime.now()).replace(':', '_'))
# commit_to_s3 = kinesis_stream.saveAsTextFiles('s3://mattsona-public/' + datetime.datetime.isoformat(datetime.datetime.now()).replace(':','_'))
  See http://spark.apache.org/docs/latest/streaming-kinesis-integration.html for more details on
  the Kinesis Spark Streaming integration.
"""
from __future__ import print_function
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream

if __name__ == "__main__":
    if len(sys.argv) != 5:
        print(
            "Usage: kinesis_wordcount_asl.py <app-name> <stream-name> <endpoint-url> <region-name>",
            file=sys.stderr)
        sys.exit(-1)

    sc = SparkContext(appName="PythonStreamingKinesisWordCountAsl")
    ssc = StreamingContext(sc, 1)
    appName, streamName, endpointUrl, regionName = sys.argv[1:]
    lines = KinesisUtils.createStream(
        ssc, appName, streamName, endpointUrl, regionName, InitialPositionInStream.TRIM_HORIZON, 2)
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a+b)
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()
Example #20
    sc.addPyFile(CODE_PATH + '/constants.py')

    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", S3ACCESSID)
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", S3SECRETKEY)

    sqlContext = SQLContext(sc)
    registerUDF(sqlContext)

    printOnConsole('Streaming started')

    kinesisStream = [
        KinesisUtils.createStream(ssc,
                                  APPLICATION_NAME,
                                  STREAM_NAME,
                                  ENDPOINT,
                                  REGION_NAME,
                                  INITIAL_POS,
                                  CHECKPOINT_INTERVAL,
                                  awsAccessKeyId=AWSACCESSID,
                                  awsSecretKey=AWSSECRETKEY,
                                  storageLevel=STORAGE_LEVEL)
        for _ in range(NUM_STREAMS)
    ]

    unifiedStream = ssc.union(*kinesisStream)

    print('Started running')
    #unikinesisStream.reduceByKey(lambda x,y: x+y)
    #unifiedStream.count().pprint()

    unifiedStream.foreachRDD(processRdd)
Example #21
    return final_path


def handle_rdd(rdd):
    print("---------> Processing new RDD")
    rdd_count = rdd.count()
    print('---------> Count of Initial RDD {}'.format(rdd_count))
    if rdd_count > 0:
        lang = 'en'
        rdd_transformed = rdd.map(lambda e: process_event(e))
        print('---------> Count of Transformed RDD {}'.format(
            rdd_transformed.count()))
        rdd_filtered = rdd_transformed.filter(lambda e: e['lang'] == lang)
        # just a simple example to filter the RDD
        print('---------> Count of Filtered RDD {}'.format(
            rdd_filtered.count()))
        rdd.saveAsTextFile(build_path(lang=lang))


dstream = KinesisUtils.createStream(streaming_ctx, app_name,
                                    kinesis_stream_name, kinesis_endpoint_url,
                                    region_name,
                                    InitialPositionInStream.LATEST,
                                    windows_size_secs,
                                    StorageLevel.MEMORY_AND_DISK_2)
dstream.foreachRDD(handle_rdd)
streaming_ctx.start()
streaming_ctx.awaitTermination()

# streaming_ctx.stop()
Example #22
        conf.set("spark.mongodb.output.uri",
                 consumer_conf["MONGO_CONNECTION_STRING"])

        spark_session = SparkSession.builder.config(conf=conf).getOrCreate()
        spark_context = spark_session.sparkContext

        ## Streaming context
        spark_streaming_context = StreamingContext(spark_context,
                                                   spark_batch_interval)

        sql_context = SQLContext(spark_context)
        #gsdmm = spark_context.broadcast(model)

        ## Create Kinesis Stream
        kinesis_stream = KinesisUtils.createStream(
            spark_streaming_context, kinesis_app_name, kinesis_stream,
            kinesis_endpoint, aws_region, kinesis_initial_position,
            kinesis_checkpoint_interval)

        ## Convert strings to objects
        myrdd = kinesis_stream.map(convert_json)

        ## Process entry data point
        myrdd.foreachRDD(process)

        ## Start process and awaits
        spark_streaming_context.start()
        spark_streaming_context.awaitTermination()
        spark_streaming_context.stop()
    except Exception as e:
        print(e)
        pass
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream

appName="PythonKinesisApp"
sc = SparkContext(appName=appName)
ssc = StreamingContext(sc, 1)


streamName = 'DemoStream'
endpointUrl = 'https://kinesis.us-east-1.amazonaws.com'
regionName = 'us-east-1'
AWS_ACCESS_KEY_ID = ''
SECRET_ACCESS_KEY = ''
checkpointInterval = 5
kinesisstream = KinesisUtils.createStream(ssc, appName, 
                                    streamName, endpointUrl, regionName, 
                                    InitialPositionInStream.LATEST, 
                                    checkpointInterval, 
                                    awsAccessKeyId=AWS_ACCESS_KEY_ID, 
                                    awsSecretKey=SECRET_ACCESS_KEY)
lines = kinesisstream.map(lambda x: x[1])
counts = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a+b)
counts.pprint()

ssc.start()
time.sleep(600) # Run stream for 10 minutes just in case no detection of producer
# ssc.awaitTermination()
ssc.stop(stopSparkContext=True,stopGraceFully=True)


# ## References
# 1. https://spark.apache.org/docs/latest/streaming-kinesis-integration.html
# 2. https://spark.apache.org/docs/latest/streaming-programming-guide.html#performance-tuning
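
As an alternative to the time.sleep(600) pattern above, StreamingContext exposes awaitTerminationOrTimeout, which bounds the wait directly. A hedged equivalent of the last few lines:

ssc.start()
# Block for up to 10 minutes, returning early if the context is stopped.
ssc.awaitTerminationOrTimeout(600)
ssc.stop(stopSparkContext=True, stopGraceFully=True)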
    except:
        pass


if __name__ == "__main__":
    if len(sys.argv) != 5:
        print(
            "Usage: kinesis_wordcount_asl.py <app-name> <stream-name> <endpoint-url> <region-name>",
            file=sys.stderr)
        sys.exit(-1)
    num_streams = 4
    sc = SparkContext(appName="Spark Streaming App")
    ssc = StreamingContext(sc, 60)
    appName, streamName, endpointUrl, regionName = sys.argv[1:]
    kinesis_streams = [
        KinesisUtils.createStream(ssc, appName, streamName, endpointUrl,
                                  regionName, InitialPositionInStream.LATEST,
                                  10) for _ in range(num_streams)
    ]

    unioned_streams = ssc.union(*kinesis_streams)

    # Split the spark context lines by the newline delimiter
    lines = unioned_streams.flatMap(lambda x: x.split("\n"))

    # For each dstream RDD, apply the processing
    lines.foreachRDD(process)

    ssc.start()
    ssc.awaitTermination()
Example #25
            else:
                process_dataframe_global(lines, connect, spark, schema)

    else:
        conf = SparkConf().setAppName(args.app_name)
        sc = SparkContext(conf=conf)
        spark = SparkSession.builder \
                                 .config(conf=conf) \
                                 .getOrCreate()
        ssc = StreamingContext(sc, args.batch_duration)
        sql = SQLContext(sc)
        lines = KinesisUtils.createStream(
            ssc,
            args.app_name,
            args.stream_name_kinesis,
            args.endpoint_url_kinesis,
            args.region_name,
            InitialPositionInStream.LATEST,
            awsAccessKeyId=args.aws_access_key_id,
            awsSecretKey=args.aws_secret_access_key,
            checkpointInterval=args.checkpoint_interval)
        lines.pprint()
        if args.type == "rdd":
            process_rdd(lines, connect, spark, schema)
        else:
            if args.all_or_batch == "batch":
                process_dataframe(lines, connect, sql, schema)
            else:
                process_dataframe_global(lines, connect, sql, schema)

    ssc.start()
    ssc.awaitTermination()
  See http://spark.apache.org/docs/latest/streaming-kinesis-integration.html for more details on
  the Kinesis Spark Streaming integration.
"""
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream

if __name__ == "__main__":
    if len(sys.argv) != 5:
        print(
            "Usage: kinesis_wordcount_asl.py <app-name> <stream-name> <endpoint-url> <region-name>",
            file=sys.stderr)
        sys.exit(-1)

    sc = SparkContext(appName="PythonStreamingKinesisWordCountAsl")
    ssc = StreamingContext(sc, 1)
    appName, streamName, endpointUrl, regionName = sys.argv[1:]
    lines = KinesisUtils.createStream(ssc, appName, streamName, endpointUrl,
                                      regionName,
                                      InitialPositionInStream.LATEST, 2)
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a+b)
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()
s3_target_bucket_name = 'mattsona-spark-demo' # replace with your bucket name for target data
aws_region = 'us-west-2' # replace w/ AWS region used for Kinesis stream
kinesis_stream = 'spark_streaming_kinesis_demo' # replace with your Kinesis stream name
kinesis_endpoint = 'https://kinesis.' + aws_region + '.amazonaws.com' # the public endpoint of the AWS region this is executed from
kinesis_app_name = 'alex_test_app' # app name used to track process through the Kinesis stream
kinesis_initial_position = InitialPositionInStream.LATEST # InitialPositionInStream.TRIM_HORIZON | InitialPositionInStream.LATEST
kinesis_checkpoint_interval = 10 # define how long to checkpoint when processing through the Kinesis stream
spark_batch_interval = 10 # how many seconds before pulling the next batch of data from the Kinesis stream

# configure spark elements
spark_context = SparkContext(appName=kinesis_app_name)
# spark_streaming_context = StreamingContext(sc, 1) # sc valid for running in pyspark interactive mode
spark_streaming_context = StreamingContext(spark_context, spark_batch_interval)

kinesis_stream = KinesisUtils.createStream(
    spark_streaming_context, kinesis_app_name, kinesis_stream, kinesis_endpoint,
    aws_region, kinesis_initial_position, kinesis_checkpoint_interval) # previous example had ', StorageLevel.MEMORY_AND_DISK_2' at the end of the call

# take Kinesis stream JSON data and convert to CSV (note: these are DStreams, not plain RDDs, so the _rdd variable names below are slightly inaccurate)
py_dict_rdd = kinesis_stream.map(lambda x: json.loads(x))
# need to convert int (time_stamp & random_int) to string
csv_rdd = py_dict_rdd.map(lambda x: x['user_name'] + ',' + str(datetime.datetime.utcfromtimestamp(x['time_stamp'])) + ',' + x['data_string'] + ',' + str(x['random_int']))

# save that rdd to S3
commit_to_s3 = csv_rdd.saveAsTextFiles('s3://' + s3_target_bucket_name + '/spark_streaming_processing/ '+ datetime.datetime.isoformat(datetime.datetime.now()).replace(':','_'))
# commit_to_s3 = kinesis_stream.saveAsTextFiles('s3://mattsona-public/' + datetime.datetime.isoformat(datetime.datetime.now()).replace(':','_'))

spark_streaming_context.start()

spark_streaming_context.awaitTermination()
Example #28
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, SPARK_STREAM_BATCH)

    sc.addPyFile(CODE_PATH + '/pyspark_csv.py')
    sc.addPyFile(CODE_PATH + '/constants.py')

    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", S3ACCESSID)
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", S3SECRETKEY)

    sqlContext = SQLContext(sc)
    registerUDF(sqlContext)

    printOnConsole('Streaming started')

    kinesisStream = [
        KinesisUtils.createStream(ssc, APPLICATION_NAME, STREAM_NAME, ENDPOINT,
                                  REGION_NAME, INITIAL_POS, CHECKPOINT_INTERVAL,
                                  awsAccessKeyId=AWSACCESSID,
                                  awsSecretKey=AWSSECRETKEY,
                                  storageLevel=STORAGE_LEVEL)
        for _ in range(NUM_STREAMS)
    ]

    unifiedStream = ssc.union(*kinesisStream)

    print('Started running')
    #unikinesisStream.reduceByKey(lambda x,y: x+y)
    #unifiedStream.count().pprint()

    unifiedStream.foreachRDD(processRdd)

    ssc.start()
    ssc.awaitTermination()
    printOnConsole('Streaming suspended')


Example #29
    return israel_negative_sentences


if __name__ == '__main__':
    if len(sys.argv) != 10:
        print(
            "Usage: <app-name> <stream-name> <endpoint-url> <region-name> <aws-result-bucket> <aws-access-key> <aws-secret-key>"
        )
        sys.exit(-1)

    app_Name, streamName, end_point_url, region_name, aws_result_bucket, kinesis_key, kinesis_secret, bucket_key, bucket_secret = sys.argv[
        1:]
    sparkContext = SparkContext(appName=app_Name)
    streamingContext = StreamingContext(sparkContext, 2)
    dstream = KinesisUtils.createStream(streamingContext,
                                        app_Name,
                                        streamName,
                                        end_point_url,
                                        region_name,
                                        InitialPositionInStream.TRIM_HORIZON,
                                        10,
                                        awsAccessKeyId=kinesis_key,
                                        awsSecretKey=kinesis_secret)
    dstream\
        .flatMap(tokenize_text)\
        .map(analyze_sentence)\
        .foreachRDD(lambda x: upload_records_step(x, aws_result_bucket, region_name, bucket_key, bucket_secret))
    streamingContext.start()
    streamingContext.awaitTermination()
Example #30
from pyspark.sql import SparkSession
from pyspark.streaming.context import StreamingContext
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream

spark = SparkSession.builder.appName("test").master("local[*]").getOrCreate()
ssc = StreamingContext(spark.sparkContext, 10)

lines = KinesisUtils.createStream(
    ssc,
    "test",
    "test_s",
    "https://kinesis.eu-north-1.amazonaws.com",
    "eu-north-1",
    InitialPositionInStream.LATEST,
    awsAccessKeyId="AKIAJ5V6NEAI3YNTWGDA",
    awsSecretKey="xdyXL4jP1SYhiKO9OGhOLYijVbG0BwPnq7J6oRDZ",
    checkpointInterval=2)
Example #31
  See http://spark.apache.org/docs/latest/streaming-kinesis-integration.html for more details on
  the Kinesis Spark Streaming integration.
"""
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream

if __name__ == "__main__":
    if len(sys.argv) != 5:
        print(
            "Usage: kinesis_wordcount_asl.py <app-name> <stream-name> <endpoint-url> <region-name>",
            file=sys.stderr)
        sys.exit(-1)

    sc = SparkContext(appName="PythonStreamingKinesisWordCountAsl")
    ssc = StreamingContext(sc, 1)
    appName, streamName, endpointUrl, regionName = sys.argv[1:]
    lines = KinesisUtils.createStream(
        ssc, appName, streamName, endpointUrl, regionName, InitialPositionInStream.LATEST, 2)
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a+b)
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()
Example #32
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

interval = 1
spark_context = SparkContext(appName='base.py')
stream_context = StreamingContext(spark_context, interval)

StreamName = 'test'
region_name = 'us-west-2'
endpoint = 'https://kinesis.{}.amazonaws.com/'.format(region_name)

from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream

stream = KinesisUtils.createStream(stream_context, 'EventLKinesisConsumer',
                                   StreamName, endpoint, region_name,
                                   InitialPositionInStream.LATEST, interval)
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream

sc = SparkContext(appName="SparkKinesisApp")
ssc = StreamingContext(sc, 1)
lines = KinesisUtils.createStream(ssc, "SparkKinesisApp", "myStream",
                                  "[https://kinesis.us-east-1.amazonaws.com",
                                  "us-east-1", InitialPositionInStream.LATEST,
                                  2)

#lines.saveAsTextFiles('/home/zh/streaming_logsout.txt')
lines.pprint()
counts = lines.flatMap(lambda line: line.split(" ")).map(
    lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
counts.pprint()
ssc.start()
ssc.awaitTermination()