Example #1
def saveAsPickleFile(rdd, time):
    """
    Closure that saves the elements of each RDD in the DStream as pickled
    data files. It is called by the py4j callback server once per batch.
    """
    # `prefix` and `suffix` are captured from the enclosing scope.
    path = rddToFileName(prefix, suffix, time)
    rdd.saveAsPickleFile(path)
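
For context, rddToFileName (from pyspark.streaming.util) simply joins the prefix, the batch time, and an optional suffix into a per-batch path. A rough illustration, assuming a millisecond batch timestamp:

from pyspark.streaming.util import rddToFileName

# The batch time becomes part of the name, so every micro-batch gets its own output path.
print(rddToFileName("out", "pickle", 1469388000000))  # out-1469388000000.pickle
print(rddToFileName("out", None, 1469388000000))      # out-1469388000000
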
Example #2
def dosth(time, rdd, spark):
    '''
    # change schema
    https://stackoverflow.com/questions/46432789/how-to-change-pyspark-data-frame-column-data-type
    '''
    if rdd.isEmpty():
        return
    sqlContext = getSqlContextInstance(rdd.context)
    df = sqlContext.createDataFrame(rdd)
    df.show()
    df.printSchema()
    # df.groupBy("user").count().show()

    df.createOrReplaceTempView('firewall')
    sqlDF = spark.sql(
        "select server,app,action,count(*) as cnt from firewall group by server, app, action order by cnt desc"
    )
    sqlDF.show()

    # output as parquet file
    if 1:
        sqlDF.write.parquet("data/firewall.parquet")

    # read from parquet
    if 0:
        pqtDF = spark.read.parquet("data/firewall.parquet")
        pqtDF.createOrReplaceTempView("pqt_firewall")
        pqtv2DF = spark.sql("SELECT * FROM pqt_firewall")
        pqtv2DF.show()

    # output as json
    if 1:
        enriched_data_path = 'data/firewall_df.json'
        path = rddToFileName(enriched_data_path, None, time)
        sqlDF.write.json(path, mode='error')
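
The getSqlContextInstance helper used above is not shown in this snippet. A minimal sketch, assuming the lazily-initialized singleton pattern from the Spark Streaming programming guide:

from pyspark.sql import SQLContext

def getSqlContextInstance(sparkContext):
    # Lazily create a single SQLContext and reuse it across batches.
    if 'sqlContextSingletonInstance' not in globals():
        globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
    return globals()['sqlContextSingletonInstance']
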
Example #3
def saveAsTextFile(t, rdd):
    path = rddToFileName(prefix, suffix, t)
    try:
        rdd.saveAsTextFile(path)
    except Py4JJavaError as e:
        # After recovering from a checkpoint, foreachRDD may be called
        # twice for the same batch, so ignore "file already exists" errors.
        if 'FileAlreadyExistsException' not in str(e):
            raise
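
The prefix and suffix referenced above are captured from an enclosing helper that registers the closure for every batch. A minimal wiring sketch (helper name assumed; it mirrors DStream.saveAsTextFiles in pyspark.streaming.dstream):

from py4j.protocol import Py4JJavaError
from pyspark.streaming.util import rddToFileName

def saveAsTextFiles(dstream, prefix, suffix=None):
    def saveAsTextFile(t, rdd):
        path = rddToFileName(prefix, suffix, t)
        try:
            rdd.saveAsTextFile(path)
        except Py4JJavaError as e:
            # foreachRDD can fire twice for a batch after checkpoint recovery.
            if "FileAlreadyExistsException" not in str(e):
                raise
    # foreachRDD passes (batch time, rdd) to two-argument callbacks.
    return dstream.foreachRDD(saveAsTextFile)
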
Example #4
def saveAsTextFile(t: Optional[datetime], rdd: RDD[T]) -> None:
    path = rddToFileName(prefix, suffix, t)
    try:
        rdd.saveAsTextFile(path)
    except Py4JJavaError as e:
        # After recovering from a checkpoint, foreachRDD may be called
        # twice for the same batch, so ignore "file already exists" errors.
        if "FileAlreadyExistsException" not in str(e):
            raise
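
The annotated variant above assumes imports roughly along these lines, with T standing for the RDD element type:

from datetime import datetime
from typing import Optional, TypeVar

from py4j.protocol import Py4JJavaError
from pyspark import RDD
from pyspark.streaming.util import rddToFileName

T = TypeVar("T")
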
Example #5
def process_messages(time, rdd, ssc, model, enriched_data_path,
                     zookeeper_hosts, kafka_alert_topic,
                     kafka_enriched_data_topic, max_batches):
    global BATCH_COUNTER

    if rdd.isEmpty():
        return

    sqlContext = getSqlContextInstance(rdd.context)
    df = sqlContext.createDataFrame(rdd)

    # Enrich the data to build the preprocessed dataframe.
    df = enrich_data(df)

    # Persist enriched data to storage (directly from Spark to HDFS).
    # This will create a file per partition per batch.
    if enriched_data_path:
        path = rddToFileName(enriched_data_path, None, time)
        df.write.json(path, mode='error')
    
    # Send all enriched data to a Kafka topic.
    # Note that each worker sends its own partitions directly to Kafka. The driver is not in the data path.
    # This can be consumed by Flume to write to HDFS allowing multiple batches to be appended to the same file.
    if kafka_enriched_data_topic:
        df.foreachPartition(lambda d: write_partition_to_kafka(d, zookeeper_hosts=zookeeper_hosts, kafka_topic=kafka_enriched_data_topic))

    # Build feature vector.
    df = build_features_vector(df)

    # Show 10 records of the dataframe.
    # df.select(['duration','src_bytes','dst_bytes','features','label']).show(10)

    # Predict anomalies with model.
    # We must use RDDs, not dataframes, because we can't save/load the pipelined ML model using PySpark yet.
    if model:
        features_rdd = extract_features(df)
        predictions_rdd = model.predict(features_rdd)
        features_and_predictions_rdd = df.rdd.zip(predictions_rdd)
        anomalies_rdd = features_and_predictions_rdd.filter(lambda x: x[1] <= 0).map(lambda x: x[0])
        anomalies = anomalies_rdd.collect()
        print('Predicted %d anomalies' % len(anomalies))

        # For demo purposes, only alert on the first 5 anomalies.
        anomalies = anomalies[:5]

        # Send anomalies to Kafka.
        # Note that since we expect very few anomalies, the records are brought into the driver which
        # then sends to Kafka.
        if anomalies:
            client = KafkaClient(zookeeper_hosts=zookeeper_hosts)
            topic = client.topics[kafka_alert_topic]
            with topic.get_producer(delivery_reports=False) as producer:
                for row in anomalies:
                    alert = row.asDict()
                    del alert['features']       # remove features vector because we can't serialize it to JSON
                    alert['alert_text'] = 'predicted to be an anomaly'
                    msg = json.dumps(alert)
                    producer.produce(msg)
                    print('Sent alert: %s' % msg)

    # Stop after specified number of batches. This is used for development only.
    BATCH_COUNTER += 1
    if max_batches > 0 and BATCH_COUNTER >= max_batches:
        print('Reached maximum number of batches.')
        ssc.stop(True, False)  # also stop the SparkContext; do not wait for pending batches
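
write_partition_to_kafka is not defined in this snippet. A minimal sketch, assuming pykafka (the same client used for the driver-side alert path above), where each executor publishes its own partition's rows as JSON:

import json

from pykafka import KafkaClient

def write_partition_to_kafka(rows, zookeeper_hosts, kafka_topic):
    # Runs on the executors: one producer per partition of enriched rows.
    client = KafkaClient(zookeeper_hosts=zookeeper_hosts)
    topic = client.topics[kafka_topic]
    with topic.get_producer(delivery_reports=False) as producer:
        for row in rows:
            # pykafka expects a bytes payload under Python 3.
            producer.produce(json.dumps(row.asDict()).encode('utf-8'))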