""" i = 1 for domain in top_domain_list: top_domain_dict[domain[0]] = i i = i + 1 print top_domain_dict """ jsonRDD = hc.jsonFile( "hdfs://dip.cdh5.dev:8020/user/hdfs/rawlog/app_saesinacomkafka12345_nginx/2015_10_22/09" ) hc.registerRDDAsTable(jsonRDD, "temp_schema") def if_in_top_10_domain(domain): if domain == '' or domain == None or len(domain) < 3: return 'no' else: if top_domain_dict.has_key(domain): return top_domain_dict[domain] else: return 'no' hc.registerFunction("temp_if_in_top_10_domain", if_in_top_10_domain) spark_sql = '''select domain,url,cast(sum(body_bytes_sent) as bigint) as flow from (
# Minimal Spark SQL demo: build a 3-row "people" table from whitespace
# records, register an upper-casing UDF, and query it.
datas = ["1 a 28", "2 b 29", "3 c 30"]
source = sc.parallelize(datas)
splits = source.map(lambda line: line.split(" "))
rows = splits.map(lambda words: (int(words[0]), words[1], int(words[2])))

# Schema: (id int, name string, age int), every column nullable.
# FIX: the original wrote StructField("id", IntegerType(). True) -- a
# period instead of a comma before the nullable flag, which is a
# SyntaxError ("True" cannot be an attribute name).
fields = [
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
]
schema = StructType(fields)

people = hc.applySchema(rows, schema)
hc.registerRDDAsTable(people, "people")
hc.registerFunction("myfunc", lambda name: name.upper())

# Upper-case the name of everyone aged strictly between 28 and 30
# (only the age-29 row matches the sample data above).
rows = hc.sql(
    "select myfunc(name) from people where age>28 and age<30").collect()
sc.stop()

for row in rows:
    print(row)
} """ i = 1 for domain in top_domain_list: top_domain_dict[domain[0]] = i i = i + 1 print top_domain_dict """ jsonRDD = hc.jsonFile("hdfs://dip.cdh5.dev:8020/user/hdfs/rawlog/app_saesinacomkafka12345_nginx/2015_10_22/09") hc.registerRDDAsTable(jsonRDD, "temp_schema") def if_in_top_10_domain(domain): if domain == "" or domain == None or len(domain) < 3: return "no" else: if top_domain_dict.has_key(domain): return top_domain_dict[domain] else: return "no" hc.registerFunction("temp_if_in_top_10_domain", if_in_top_10_domain) spark_sql = """select domain,url,cast(sum(body_bytes_sent) as bigint) as flow from (