Example #1
import json
import re

from pyspark import SparkContext


def parse(line):
    """Parse one Apache access-log line into a dict of the fields we index."""
    m = p.match(line)
    d = {}
    d['ip'] = m.group(1)
    d['date'] = m.group(4)
    d['operation'] = m.group(5)
    d['uri'] = m.group(6)
    return d


def addId(data):
    # Assumption: the original addId helper is not shown in the source.
    # It must set the field named by "es.mapping.id" below and return a
    # (key, json_string) pair, since "es.input.json" is "yes". The doc_id
    # scheme here is only illustrative.
    data['doc_id'] = '{0}-{1}'.format(data['ip'], data['date'])
    return (data['doc_id'], json.dumps(data))


# Raw string so the backslashes in the pattern are not treated as string escapes.
regex = r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+)\s?(\S+)?\s?(\S+)?" (\d{3}|-) (\d+|-)\s?"?([^"]*)"?\s?"?([^"]*)?"?$'

p = re.compile(regex)

# Assumption: create the SparkContext here; in the pyspark shell `sc` already exists.
sc = SparkContext(appName="apacheLogsToEs")

rdd = sc.textFile("/home/ubuntu/walker/apache_logs")

rdd2 = rdd.map(parse)

rdd3 = rdd2.map(addId)

es_write_conf = {
    "es.nodes": "localhost",
    "es.port": "9200",
    "es.resource": 'walker/apache',
    "es.input.json": "yes",
    "es.mapping.id": "doc_id"
}

# Write each (doc_id, json) pair to the walker/apache index through the
# elasticsearch-hadoop output format.
rdd3.saveAsNewAPIHadoopFile(
    path='-',
    outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
    keyClass="org.apache.hadoop.io.NullWritable",
    valueClass="org.elasticsearch.hadoop.mr.LinkableMapWritable",
    conf=es_write_conf)
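For reference, the same elasticsearch-hadoop connector can read the index back into an RDD. The following is a minimal sketch and not part of the original example: es_read_conf and docs are illustrative names, and it assumes the elasticsearch-hadoop jar is on the Spark classpath, as the write above already requires.

# Read the documents back from Elasticsearch into an RDD of
# (document id, field dict) pairs.
es_read_conf = {
    "es.nodes": "localhost",
    "es.port": "9200",
    "es.resource": "walker/apache"
}

docs = sc.newAPIHadoopRDD(
    inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat",
    keyClass="org.apache.hadoop.io.NullWritable",
    valueClass="org.elasticsearch.hadoop.mr.LinkableMapWritable",
    conf=es_read_conf)

print(docs.first())
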
Example #2
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("wordCount")
sc = SparkContext(conf=conf)

rdd = sc.textFile('./data/data.txt')

# Split each line into words, then count occurrences of each word.
words = rdd.flatMap(lambda x: x.split(" "))

countRdd = words.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)

for count in countRdd.collect():
    print(count[0] + " " + str(count[1]))
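As a side note, the same count can be written without building the (word, 1) pairs by hand: countByValue() returns the counts to the driver as a dict, which is fine for a small input like this. A short sketch under the same data-path assumption:

# Equivalent word count using countByValue(), which collects {word: count}
# on the driver instead of producing a counts RDD.
counts = rdd.flatMap(lambda x: x.split(" ")).countByValue()
for word, n in counts.items():
    print(word + " " + str(n))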