Ejemplo n.º 1
0
"""
i = 1

for domain in top_domain_list:
    top_domain_dict[domain[0]] = i

    i = i + 1

print top_domain_dict
"""

# Load the nginx raw-log data stored under this HDFS directory through the
# `hc` context's JSON reader.
# NOTE(review): the path is hard-coded to one date/hour partition
# (2015_10_22/09) — confirm that is intended.
jsonRDD = hc.jsonFile(
    "hdfs://dip.cdh5.dev:8020/user/hdfs/rawlog/app_saesinacomkafka12345_nginx/2015_10_22/09"
)

# Make the loaded data addressable from SQL under the table name "temp_schema".
hc.registerRDDAsTable(jsonRDD, "temp_schema")


def if_in_top_10_domain(domain):
    """Return the value stored for ``domain`` in ``top_domain_dict``, or ``'no'``.

    Args:
        domain: Domain name to look up; may be ``None`` or an empty string.

    Returns:
        The value ``top_domain_dict`` holds for ``domain`` when it is a key
        of that dictionary. The string ``'no'`` when ``domain`` is ``None``,
        shorter than three characters (which includes the empty string), or
        not a key of ``top_domain_dict``.
    """
    # The None test must come first: len(None) would raise TypeError.
    if domain is None or len(domain) < 3:
        return 'no'
    # dict.has_key() no longer exists on Python 3; dict.get() behaves the
    # same on Python 2 and 3 and needs only one lookup.
    return top_domain_dict.get(domain, 'no')


# Expose if_in_top_10_domain to SQL queries under the name
# "temp_if_in_top_10_domain".
# NOTE(review): no return type is passed here while the function returns
# either a dictionary value or the string 'no' — confirm how the context
# converts the result.
hc.registerFunction("temp_if_in_top_10_domain", if_in_top_10_domain)

spark_sql = '''select domain,url,cast(sum(body_bytes_sent) as bigint) as flow from (
Ejemplo n.º 2
0
# Build a small (id, name, age) table from in-memory strings, register it
# as the SQL table "people", and query it through a registered function.

# Each record is "<id> <name> <age>", separated by single spaces.
datas = ["1 a 28", "2 b 29", "3 c 30"]

source = sc.parallelize(datas)

splits = source.map(lambda line: line.split(" "))

# Convert the id and age columns to int; the name stays a string.
rows = splits.map(lambda words: (int(words[0]), words[1], int(words[2])))

# The third StructField argument must be separated by a comma. The original
# "IntegerType(). True" is a syntax error on Python 3 and an attribute
# lookup on Python 2, so the schema was never built.
fields = [
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
]

schema = StructType(fields)

people = hc.applySchema(rows, schema)

hc.registerRDDAsTable(people, "people")

# SQL-callable function that upper-cases the name column.
hc.registerFunction("myfunc", lambda name: name.upper())

# Of the three sample rows, only age 29 satisfies 28 < age < 30.
# `rows` is rebound here to the collected query result.
rows = hc.sql(
    "select myfunc(name) from people where age>28 and age<30").collect()

# The result is already collected, so the context can be stopped before
# the rows are printed.
sc.stop()

for row in rows:
    print(row)
Ejemplo n.º 3
0
}

"""
i = 1

for domain in top_domain_list:
    top_domain_dict[domain[0]] = i

    i = i + 1

print top_domain_dict
"""

# Load the nginx raw-log data stored under this HDFS directory through the
# `hc` context's JSON reader.
# NOTE(review): the path is hard-coded to one date/hour partition
# (2015_10_22/09) — confirm that is intended.
jsonRDD = hc.jsonFile("hdfs://dip.cdh5.dev:8020/user/hdfs/rawlog/app_saesinacomkafka12345_nginx/2015_10_22/09")

# Make the loaded data addressable from SQL under the table name "temp_schema".
hc.registerRDDAsTable(jsonRDD, "temp_schema")


def if_in_top_10_domain(domain):
    """Return the value stored for ``domain`` in ``top_domain_dict``, or ``"no"``.

    Args:
        domain: Domain name to look up; may be ``None`` or an empty string.

    Returns:
        The value ``top_domain_dict`` holds for ``domain`` when it is a key
        of that dictionary. The string ``"no"`` when ``domain`` is ``None``,
        shorter than three characters (which includes the empty string), or
        not a key of ``top_domain_dict``.
    """
    # The None test must come first: len(None) would raise TypeError.
    if domain is None or len(domain) < 3:
        return "no"
    # dict.has_key() no longer exists on Python 3; dict.get() behaves the
    # same on Python 2 and 3 and needs only one lookup.
    return top_domain_dict.get(domain, "no")


# Expose if_in_top_10_domain to SQL queries under the name
# "temp_if_in_top_10_domain".
# NOTE(review): no return type is passed here while the function returns
# either a dictionary value or the string "no" — confirm how the context
# converts the result.
hc.registerFunction("temp_if_in_top_10_domain", if_in_top_10_domain)

spark_sql = """select domain,url,cast(sum(body_bytes_sent) as bigint) as flow from (