Example #1
    def load_flume(self, ssc: StreamingContext) -> DStream:
        # Stream that receives input pushed from Flume; adjust the host
        # name if needed.
        print("LOADING FLUME")
        input_stream = FlumeUtils.createStream(ssc, self.__flume_host,
                                               self.__flume_port)
        d_stream = input_stream.map(self.__parse_json).transform(
            lambda rdd: self.__convert_service_format(rdd))
        return d_stream
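A minimal usage sketch for the method above (the surrounding class, its host/port fields, and the loader instance are assumptions, not shown in the original):

# Hypothetical wiring of load_flume into a streaming job.
sc = SparkContext(appName="FlumeLoader")
ssc = StreamingContext(sc, 5)        # 5-second batch interval, chosen for illustration
d_stream = loader.load_flume(ssc)    # loader is an instance of the class above
d_stream.pprint()
ssc.start()
ssc.awaitTermination()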
Example #2
    def _startContext(self, n, compressed):
        # Start the StreamingContext and also collect the result
        dstream = FlumeUtils.createStream(self.ssc, "localhost", self._utils.getTestPort(),
                                          enableDecompression=compressed)
        result = []

        def get_output(_, rdd):
            for event in rdd.collect():
                if len(result) < n:
                    result.append(event)
        dstream.foreachRDD(get_output)
        self.ssc.start()
        return result
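A usage sketch for the helper above, written as a method of the same (assumed) test class: the returned list fills asynchronously as batches arrive, so the caller polls it. The step that pushes events through Flume is elided, and the timeout is illustrative.

    def _collect_events(self, n):
        import time
        result = self._startContext(n, compressed=False)
        # ... push n events through the Flume source here (elided) ...
        deadline = time.time() + 30            # illustrative timeout
        while len(result) < n and time.time() < deadline:
            time.sleep(0.1)
        return result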
Example #4
def main():
    sc = SparkContext(appName="News_Steam_Analysis")

    # Create the Flume stream. The batch interval (300 seconds here) determines
    # how often the top stories are recomputed.
    ssc = StreamingContext(sc, 300)
    flume_strm = FlumeUtils.createStream(ssc,
                                         "localhost",
                                         9999,
                                         bodyDecoder=lambda x: x)

    lines = flume_strm.map(lambda kv: json.loads(kv[1]))

    lines.foreachRDD(get_trending_news)

    ssc.start()
    ssc.awaitTermination()
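Note that bodyDecoder=lambda x: x overrides FlumeUtils' default UTF-8 decoding, so each event body reaches json.loads as raw bytes. A sketch of the parsing step with explicit decoding (parse_event is an illustrative helper, not part of the original):

def parse_event(kv):
    headers, body = kv                  # a Flume event arrives as (headers, body)
    if isinstance(body, bytes):
        body = body.decode("utf-8")     # decode only if the body is still raw bytes
    return json.loads(body)

# lines = flume_strm.map(parse_event)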
Example #5
def extractURLRequest(line):
    # pattern is the access-log regex defined earlier in the script.
    exp = pattern.match(line)
    if exp:
        request = exp.groupdict()["request"]
        if request:
            requestFields = request.split()
            if (len(requestFields) > 1):
                return requestFields[1]


if __name__ == "__main__":

    sc = SparkContext(appName="StreamingFlumeLogAggregator")
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, 1)

    flumeStream = FlumeUtils.createStream(
        ssc, "localhost", 9092)  # DStream Object named flumeStream

    lines = flumeStream.map(lambda x: x[1])
    urls = lines.map(extractURLRequest)

    # Reduce by URL over a 5-minute window sliding every second
    urlCounts = urls.map(lambda x: (x, 1)).reduceByKeyAndWindow(
        lambda x, y: x + y, lambda x, y: x - y, 300, 1)

    # Sort and print the results
    sortedResults = urlCounts.transform(
        lambda rdd: rdd.sortBy(lambda x: x[1], False))
    sortedResults.pprint()

    ssc.checkpoint("/home/maria_dev/checkpoint")
    ssc.start()
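The windowed count above passes an inverse reduce function (x - y), which is why the checkpoint directory is mandatory. A sketch of the simpler form without the inverse function, which recomputes the whole window on every slide instead of updating it incrementally:

    # Same 5-minute window sliding every second, without incremental updates.
    urlCounts = urls.map(lambda x: (x, 1)).reduceByKeyAndWindow(
        lambda x, y: x + y, None, 300, 1)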
Example #6
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.flume import FlumeUtils
import uuid
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: flume_wordcount.py <hostname> <port>", file=sys.stderr)
        sys.exit(-1)
    uid = str(uuid.uuid4())
    sc = SparkContext(appName="PythonStreamingFlumeWordCount")
    ssc = StreamingContext(sc, 1)

    hostname, port = sys.argv[1:]
    kvs = FlumeUtils.createStream(ssc, hostname, int(port))
    lines = kvs.map(lambda x: x[1])
    counts = lines.filter(lambda line: "sales" in line.lower()) \
                  .map(lambda word: (uid, word)) \
                  .reduceByKey(lambda a, b: a + b)
    counts.saveAsTextFiles('hdfs://0.0.0.0:8020/weblogs/sales_', 'txt')

    ssc.start()
    ssc.awaitTermination()
Example #8
    sys.stdout.flush()
    productSum = NewProductSum(pair[0])
    for it in pair[1]:
        productSum['revenue'] += Decimal(it['revenue'])
        if it['type'] == 'view':
            productSum['views'] += 1
        else:
            productSum['purchases'] += 1

    return result

def ProcessInput(rdd):
    rdd.groupBy(lambda record: record['product_id']).map(Test).foreachPartition(WriteData)


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: flume_wordcount.py <hostname> <port>", file = sys.stderr)
        sys.exit(-1)

    sparkContext = SparkContext(appName = "SparkFlumeStreaming")
    sparkContext.setLogLevel('ERROR')

    streamingContext = StreamingContext(sparkContext, 1)

    hostname, port = sys.argv[1:]
    print('Start listening at {}:{}'.format(hostname, port))
    stream = FlumeUtils.createStream(streamingContext, hostname, int(port))

    stream.map(lambda x: x[0]).window(60, 60).foreachRDD(ProcessInput)

    streamingContext.start()
    streamingContext.awaitTermination()
Example #9
        request = exp.groupdict()["request"]
        if request:
            requestFields = request.split()
            if (len(requestFields) > 1):
                return requestFields[1]


if __name__ == "__main__":

    # Set up the SparkContext object, set the log level, and create a
    # StreamingContext with a 1-second batch interval.
    sc = SparkContext(appName="StreamingFlumeLogAggregator")
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, 1)

    # Use the Flume utility library (push model: Flume pushes events to Spark).
    flumeStream = FlumeUtils.createStream(ssc, "192.168.1.59", 9092)

    #map operation
    lines = flumeStream.map(lambda x: x[1])
    urls = lines.map(extractURLRequest)

    # Reduce by URL over a 5-minute window sliding every second
    urlCounts = urls.map(lambda x: (x, 1)).reduceByKeyAndWindow(
        lambda x, y: x + y, lambda x, y: x - y, 300, 1)

    # Sort and print the results
    sortedResults = urlCounts.transform(
        lambda rdd: rdd.sortBy(lambda x: x[1], False))
    sortedResults.pprint()

    # Create the checkpoint directory
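For reference, the comment earlier in this example describes the push model (Flume pushes events to a Spark receiver). The pull-based alternative, where Spark polls a Flume agent configured with the Spark sink, is a one-line change; host and port here are placeholders:

    # Pull-based alternative (sketch, not part of the original snippet).
    flumeStream = FlumeUtils.createPollingStream(ssc, [("192.168.1.59", 9092)])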
Example #10
def salvaResultado(rdd):
    # Put the entries in the format
    # (request_time, responder, response_time)
    linhas = rdd.map(lambda linha: (converte_data_redis(linha[6][1:len(linha[6])-1]),
                                    linha[8].split("/")[1],
                                    linha[9].split("/")[3]))

    for log in linhas.collect():
      salva_tempo_mais_recente(log[0])
      salva_req_redis(1, log[0])
      salva_t_srv_queue_redis(log[2], log[0])

# Create a local StreamingContext with two worker threads and a batch interval of 20 seconds

sc = SparkContext("local[2]", "acessos")
ssc = StreamingContext(sc, 20)
stream_flume_logs = FlumeUtils.createStream(ssc, "10.125.8.253", 44444)

# Take each line of the log
linha_log = stream_flume_logs.map(lambda a: a[1]).filter(lambda a: "haproxy" in a)

#words = linha_log.flatMap(lambda line: line.split(" "))
words = linha_log.map(lambda line: line.split())
#words.pprint()

# Process the data and save it to the InfluxDB database
words.foreachRDD(salvaResultado)

ssc.start()             # Start the computation

ssc.awaitTermination()  # Wait for the computation to terminate
Example #11
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.flume import FlumeUtils

# Local SparkContext and StreamingContext (batch interval of 1 second)
sc = SparkContext(master="local[*]",
                  appName="Flume-DStream-StdOut",
                  conf=SparkConf()
                  .set("spark.jars.packages", "org.apache.spark:spark-streaming-flume_2.11:2.4.7"))
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, 1)

# 1. Input data: create a DStream from Apache Flume
stream = FlumeUtils.createStream(ssc, "localhost", 4444)

# 2. Data processing: extract the event body (the second element of each (headers, body) tuple)
lines = stream.map(lambda x: x[1])

# 3. Output data: show result in the console
lines.pprint()

ssc.start()
ssc.awaitTermination()
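Note: instead of setting spark.jars.packages in code, the same connector can be supplied at launch time with spark-submit --packages org.apache.spark:spark-streaming-flume_2.11:2.4.7.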
Example #12
    def load_flume(self, ssc: StreamingContext) -> DStream:
        stream = FlumeUtils.createStream(ssc, self.__flume_host,
                                         self.__flume_port)
        # map applies an operation to each element in the stream, whereas
        # transform applies an operation at the RDD level.
        return stream.map(self.__parse_json) \
            .transform(lambda rdd: self.__convert_service_format(rdd))
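To make the comment above concrete, a small illustrative sketch (stream is a Flume DStream as created above; the lambdas are placeholders, not part of the original class):

# map: applied to each individual element, here each (headers, body) event
bodies = stream.map(lambda event: event[1])
# transform: applied to the whole RDD backing each micro-batch
repartitioned = stream.transform(lambda rdd: rdd.repartition(4))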
Example #13
from pyspark.streaming.flume import FlumeUtils
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
import numpy as np


def salvaResultado(rdd):
    #a=rdd.foreach(lambda dado: dado.sortByKey())
    a=rdd.sortByKey().collect()
    np.savetxt('/home/marceloca/teste',a, fmt='%s')
    #rdd.foreach(lambda dado: np.savetxt('/home/marceloca/teste', dado, fmt='%s'))

# Create a local StreamingContext with two worker threads and a batch interval of 10 seconds

sc = SparkContext("local[2]", "acessos")
ssc = StreamingContext(sc, 10)
stream_flume_logs = FlumeUtils.createStream(ssc, "192.168.0.13", 44444) 

# Define a dict for date conversion (month abbreviation to number)

cal = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr' : '04', \
'May' : '05', 'Jun' : '06', 'Jul' : '07', 'Aug' : '08', 'Sep' : '09', \
'Oct' : '10', 'Nov' : '11', 'Dec' : '12' }


# Take each line of the log
linha_log = stream_flume_logs.map(lambda a: a[1])
#words = linha_log.flatMap(lambda line: line.split(" "))
words = linha_log.map(lambda line: line.split(" "))

# Extract the date of each access
datas = words.map(lambda data: str(cal[data[0]]) + str(data[1])) 
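As an aside, the month-abbreviation lookup can also be done with the standard library; a minimal sketch (assumes an English locale, not part of the original):

from datetime import datetime

def month_number(abbrev):
    # Convert an abbreviated month name (e.g. "Jan") to its two-digit number ("01").
    return datetime.strptime(abbrev, "%b").strftime("%m")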
Example #14
        print("Usage: flume_wordcount.py <hostname> <port>", file=sys.stderr)
        sys.exit(-1)
    sc = SparkContext(appName="PythonStreamingFlumeWordCount")
    ssc = StreamingContext(sc, 1)
    # Create a regular expression pattern to parse the main body of log messages.
    PATTERN = r'^(\S*\s\S*\s\S*)(.*)'

    def parseLogLine(logline):
        match = re.search(PATTERN, logline)
        return (Row(
            date_time=match.group(1),
            mainbody=match.group(2),
        ), 1)
    # Count each line as a (line, 1) pair, then print the results.
    hostname, port = sys.argv[1:]
    kvs = FlumeUtils.createStream(ssc, hostname, int(port))
    lines = kvs.map(lambda x: x[1])
    Errorcounts = (lines.map(parseLogLine)
                   .filter(lambda s: s[1] == 1)
                   .map(lambda s: s[0].mainbody)
                   .filter(lambda s: "ERROR" in s)
                   .map(lambda log: (log, 1))
                   .reduceByKey(lambda a, b: a + b))
    Warningcounts = (lines.map(parseLogLine)
                     .filter(lambda s: s[1] == 1)
                     .map(lambda s: s[0].mainbody)
                     .filter(lambda s: "WARNING" in s)
                     .map(lambda log: (log, 1))
                     .reduceByKey(lambda a, b: a + b))
    Errorcounts.pprint()
    Warningcounts.pprint()
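Both pipelines above parse every line twice; a behavior-preserving sketch that parses once and shares the result (names reused from the example):

    # Parse once, then branch into the ERROR and WARNING counts.
    mainbodies = lines.map(parseLogLine).map(lambda s: s[0].mainbody)
    Errorcounts = (mainbodies.filter(lambda s: "ERROR" in s)
                             .map(lambda log: (log, 1))
                             .reduceByKey(lambda a, b: a + b))
    Warningcounts = (mainbodies.filter(lambda s: "WARNING" in s)
                               .map(lambda log: (log, 1))
                               .reduceByKey(lambda a, b: a + b))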
Example #15
# from pyspark.streaming import StreamingContext
# from pyspark import SparkContext
# from pyspark.streaming.flume import FlumeUtils

# sc = SparkContext()
# ssc = StreamingContext(sc, 10)
# flumeStream = FlumeUtils.createStream(ssc, "localhost", 6669)

# result = flumeStream.map(lambda x: json.loads(x[1]))

# result.pprint()

# ssc.start()
# ssc.awaitTermination()

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.flume import FlumeUtils

sc = SparkContext(appName="PythonStreamingFlumeWordCount")
ssc = StreamingContext(sc, 10)

kvs = FlumeUtils.createStream(ssc, "localhost", 6669)
lines = kvs.map(lambda x: x[1])
counts = lines.flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b)
counts.pprint()

ssc.start()
ssc.awaitTermination()
Example #16
#https://issues.apache.org/jira/browse/PARQUET-222 - Parquet writer memory allocation
def process_proxy(time, rdd):
    output_rdd = rdd.map(lambda x: str(time) + ' ' + x[0]['host'] + ' ' + x[1]) \
        .filter(lambda x: '-net-bc' in x).map(parse) \
        .filter(lambda x: isinstance(x, Row)).repartition(10)
    return output_rdd


'''Main function'''
if __name__ == '__main__':
    appConfig = conf.Config()
    sc = SparkContext(conf=appConfig.setSparkConf())
    ssc = StreamingContext(sc, 600)
    logParser = Parser(type='flume')

    flumeStream = FlumeUtils.createStream(ssc, '10.129.4.182', 5141)
    flumeStream1 = FlumeUtils.createStream(ssc, '10.129.4.175', 5141)
    flumeStream2 = FlumeUtils.createStream(ssc, '10.129.4.174', 5141)
    flumeStream3 = FlumeUtils.createStream(ssc, '10.129.4.178', 5141)

    unionStream = flumeStream.union(flumeStream1).union(flumeStream2).union(flumeStream3)

    #fwDStream = flumeStream.transform(process_fw)
    proxyDStream = unionStream.transform(process_proxy)

    #fwDStream.foreachRDD(save_fw)
    proxyDStream.foreachRDD(save_proxy)
    #proxyDStream.saveAsTextFiles("sg_")

    '''
    genericRDD = rdd.filter(lambda x: any(y in x[0]['host'] for y in ['msr-off-fw', '-net-bc']) == False)
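The chained union above can also be written against the StreamingContext itself; a sketch reusing the hosts from the example:

    # Union all four receivers in one call (equivalent to chaining union()).
    hosts = ['10.129.4.182', '10.129.4.175', '10.129.4.174', '10.129.4.178']
    streams = [FlumeUtils.createStream(ssc, host, 5141) for host in hosts]
    unionStream = ssc.union(*streams)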
Example #17
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import pyspark_init as pi
from pyspark.streaming.flume import FlumeUtils
import pyspark

ssc = pi.streaming_init('streaming_flume1', 'local[2]', 3)
host = 'localhost'
port = 44444
dsm = FlumeUtils.createStream(ssc, host, port,
                              pyspark.StorageLevel.MEMORY_AND_DISK_SER_2)
dsm.count().map(lambda x: 'Receive ' + str(x) + ' Flume events!!!!').pprint()
ssc.start()
ssc.awaitTerminationOrTimeout(120)
ssc.stop()
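One note on the shutdown above: by default ssc.stop() also stops the underlying SparkContext. A sketch of stopping only the streaming side, if the SparkContext is still needed afterwards:

# Keep the SparkContext alive for further (non-streaming) work.
ssc.stop(stopSparkContext=False, stopGraceFully=True)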
# -*- coding: UTF-8 -*-
### Spark Streaming && Flume
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.flume import FlumeUtils

sc = SparkContext("yarn", "FlumeWordCount")

# Processing (batch) interval of 2 seconds
ssc = StreamingContext(sc, 2)

# Open a TCP socket on the given IP & port
lines = FlumeUtils.createStream(ssc, "1.1.1.1", 12345)
lines1 = lines.map(lambda x: x[1])

# Split the strings received within each 2-second batch
words = lines1.flatMap(lambda line: line.split(" "))

# Word count
pairs = words.map(lambda word: (word, 1))
wordcounts = pairs.reduceByKey(lambda x, y: x + y)

# Output files to HDFS, in the format /tmp/flume-<date>
wordcounts.saveAsTextFiles("/tmp/flume")

# Check the file contents
wordcounts.pprint()

# Start Spark Streaming
ssc.start()

# Wait for the computation to terminate
ssc.awaitTermination()
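A note on the output above: saveAsTextFiles(prefix) writes one directory per batch named prefix-<batch time in milliseconds>, which is what the /tmp/flume-<date> naming in the comment refers to. An optional suffix can be appended as a second argument:

# Each batch goes to a directory named /tmp/flume-<batch time in ms>.txt
wordcounts.saveAsTextFiles("/tmp/flume", "txt")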