return name + "," + "good" if __name__ == "__main__": conf = SparkConf().setMaster("local[2]").setAppName("sql_udf") sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) json_path = os.path.abspath("../doc/book.json") # json读取并隐射 json_df = sqlContext.read.json(json_path) json_df.registerTempTable("json_book") # UDF自定义函数 sqlContext.registerFunction("name_place", name_place) evalRDD = sqlContext.sql("SELECT name_place(name, place, price,evaluation) AS book_eval FROM json_book") #bookMap = lengthRDD.map(lambda books: (books.name, books.author, books.price, books.publish, books.place)) evalRDD.show() # 查询结果进行隐射 bookMap = evalRDD.map(lambda books: (books.book_eval)) general_list = [] good_list = [] for book in bookMap.collect(): book = book.encode("utf-8").split(',')
import os

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row


def analysis_email(email):
    """Extract the mail provider from an email address."""
    return email.split("@")[1].split(".")[0]


if __name__ == "__main__":
    conf = SparkConf().setAppName("analysis_demo").setMaster("local[2]")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    # register the custom UDF
    sqlContext.registerFunction("analysis_email", analysis_email)
    file_path = os.path.abspath("../doc/analysis.txt")
    lines = sc.textFile(file_path)
    info = lines.map(lambda line: line.split("----")) \
        .map(lambda info: Row(email=info[0], username=info[1], realname=info[2],
                              idcard=info[3], password=info[4], phone=info[5]))
    schemaInfo = sqlContext.createDataFrame(info)
    schemaInfo.registerTempTable("information")
    # cache / uncache the table
    # sqlContext.cacheTable("information")
    # sqlContext.uncacheTable("information")
    """
    Email analysis and statistics:
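
# The docstring above announces an email analysis, but the snippet is cut off
# before the queries themselves. A hedged illustration of how the registered UDF
# could drive that analysis (this query is an assumption, not the original code):
provider_stats = sqlContext.sql(
    "SELECT analysis_email(email) AS provider, COUNT(*) AS cnt "
    "FROM information GROUP BY analysis_email(email)")
provider_stats.show()
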
master = "spark://hadoop:7077" appName = "spark_loginflowlog" #input = "/impala/parquet/back/back-portal-loginflowlog/dat=%s*" % ym input = '/input/loginfowlog/*' spark_home = '/opt/cloud/spark' os.environ['SPARK_HOME'] = spark_home conf = (SparkConf() .setMaster(master) .setAppName(appName) .set("spark.sql.parquet.binaryAsString","true") ) sc = SparkContext(conf = conf) sql_context = SQLContext(sc) sql_context.registerFunction("to_mac", lambda x: normal_mac(x), StringType()) parquet_df = sql_context.read.parquet(input) sql_context.registerDataFrameAsTable(parquet_df, "loginflowlog") #_sql = "select to_mac(upper(usermac)),count(distinct dat) days from loginflowlog group by to_mac(upper(usermac))" _sql = "select to_mac(upper(usermac)),count(distinct logtime) days from loginflowlog group by to_mac(upper(usermac))" rs_df = sql_context.sql(_sql) rs = rs_df.collect() logger.info("---->" + str(len(rs))) lists = [] for r in rs: usermac = r[0] days = r[1] t = (usermac,days) lists.append(t)
master = "local[*]" spark_home = '/opt/cloud/spark' os.environ['SPARK_HOME'] = spark_home # logFile = 'hdfs://master:8020/impala/parquet/back/back-portal-loginflowlog/dat=' + day logFile = "/input/loginfowlog/02*" conf = (SparkConf() .setMaster(master) .setAppName("loginflowlog2mysql") # .set("spark.kryoserializer.buffer.mb", "256") .set("spark.sql.parquet.binaryAsString", "true")) sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) sqlContext.registerFunction("to_datestr", lambda x: longTime2str(x), StringType()) df = sqlContext.read.parquet(logFile) rdd = df.select('logintype', 'logtype', 'hosid', 'suppid', 'logtime', 'usermac') fields = [ StructField('logintype', StringType(), True), StructField('logtype', StringType(), True), StructField('hosid', StringType(), True), StructField('suppid', StringType(), True), StructField('logtime', LongType(), True), StructField('usermac', StringType(), True) ] schema = StructType(fields)
                    p[5].strip(), p[6].strip(), p[7].strip(), p[8].strip(), p[9].strip(),
                    p[10].strip(), p[11].strip(), p[12].strip(), p[13].strip(), p[14].strip(),
                    p[15].strip(), p[16].strip(), gwid_hosid_dict.get(p[1].strip(), "")))
logger.debug('-->users:' + str(users.count()))

schema_string = "id gw_id supp_id user_id user_type " \
                "user_name login_time logout_time mac ip " \
                "user_agent download_flow upload_flow os browser " \
                "ratio batch_no hos_id"
fields = [StructField(field_name, StringType(), True) for field_name in schema_string.split(' ')]
schema = StructType(fields)
schema_users = sql_context.applySchema(users, schema)
schema_users.registerTempTable("wxcity_userlogin_info")

# register UDFs
sql_context.registerFunction("get_date", lambda x: DateUtil.str_to_date(x).date(), DateType())
sql_context.registerFunction("date_diff", lambda x, k: DateUtil.date_diff(x, k), IntegerType())
sql_context.registerFunction("get_hour", lambda x: DateUtil.str_to_date(x).hour(), IntegerType())
sql_context.registerFunction("to_int", lambda x: int(x), IntegerType())
sql_context.registerFunction("timestamp_diff", lambda x, k: DateUtil.timestamp_diff(x, k), IntegerType())

lines_list = UserLoginRepeatService().exec_file(sql_context, time_begin, time_end)
# group by day, hosid, (mac), 2, 5, 10, 30, 60
# repeat_list = sc.textFile(ConfigSparkPath.userlogin_repeat_path % time_begin).map(lambda line: line.split('\t')).filter(lambda x: len(x) == 8)
repeat_list = sc.parallelize(lines_list).map(lambda line: line.split('\t'))

schema_string = "day hos_id mac t2 t5 " \
                "t10 t30 t60"
fields = [StructField(field_name, StringType(), True) for field_name in schema_string.split(' ')]
schema = StructType(fields)
schema_repeat_list = sql_context.applySchema(repeat_list, schema)
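
# DateUtil is an external helper that is not part of this snippet. A rough sketch
# of the calls used above, under the assumption that login_time is a
# "yyyy-MM-dd HH:mm:ss" string; the real class may differ (for example, the code
# above calls .hour() as a method rather than using datetime's .hour attribute).
import datetime

class DateUtil(object):
    @staticmethod
    def str_to_date(s):
        return datetime.datetime.strptime(s, "%Y-%m-%d %H:%M:%S")

    @staticmethod
    def date_diff(end, begin):
        return (DateUtil.str_to_date(end) - DateUtil.str_to_date(begin)).days

    @staticmethod
    def timestamp_diff(end, begin):
        return int((DateUtil.str_to_date(end) - DateUtil.str_to_date(begin)).total_seconds())
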
# --set datetime
DAY_OFFSET = 1
now = datetime.datetime.now()
pro_time = now - datetime.timedelta(days=DAY_OFFSET)
day = pro_time.strftime("%Y%m%d")

master = "spark://hadoop:7077"
appName = "spark_pageflow_outflow"
input = "/impala/parquet/site/site-pageflowv1/dat=%s" % day
spark_home = '/opt/cloud/spark'
os.environ['SPARK_HOME'] = spark_home

sc = SparkContext(master, appName)
sql_context = SQLContext(sc)
sql_context.registerFunction("to_day", lambda x: mill_date_str(x), StringType())
sql_context.registerFunction("to_str", lambda x: bytearray_str(x), StringType())

parquet_df = sql_context.read.parquet(input)
sql_context.registerDataFrameAsTable(parquet_df, "site_pageflowv1")

_sql = "select to_str(url),to_day(createtime) day,count(1) pv,count(distinct to_str(guuid)) uv " \
       "from site_pageflowv1 where dat= %s and to_str(name)='outflow' " \
       "group by to_str(url),to_day(createtime)" % day
rs_df = sql_context.sql(_sql)
rs = rs_df.collect()
logger.info("---->" + str(len(rs)))

list = []
for r in rs:
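
# mill_date_str() and bytearray_str() are not defined in this snippet; the
# sketches below are assumptions about their intent (millisecond epoch to a
# yyyyMMdd string, and a Parquet binary value to a utf-8 string).
import time

def mill_date_str(millis):
    return time.strftime("%Y%m%d", time.localtime(int(millis) // 1000))

def bytearray_str(value):
    return bytes(value).decode("utf-8") if value is not None else ""
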
_adLoadDF = sqlContext.createDataFrame([
    {'uid': '1', 'adid': 'a', 'guuid': 'aa', 'guuidctime': 1, 'url': '', 'referer': '', 'hosid': '132', 'gwid': '', 'ua': '', 'ip': '', 'createtime': 1450823568766},
    {'uid': '2', 'adid': 'b', 'guuid': 'aa', 'guuidctime': 1, 'url': '', 'referer': '', 'hosid': '132', 'gwid': '', 'ua': '', 'ip': '', 'createtime': 1450823569766},
    {'uid': '3', 'adid': 'c', 'guuid': 'aa', 'guuidctime': 1, 'url': '', 'referer': '', 'hosid': '132', 'gwid': '', 'ua': '', 'ip': '', 'createtime': 1450823550766},
    {'uid': '4', 'adid': 'd', 'guuid': 'bb', 'guuidctime': 1, 'url': '', 'referer': '', 'hosid': '133', 'gwid': '', 'ua': '', 'ip': '', 'createtime': 1450823268766},
]).registerAsTable("adload")

_adPlayDF = sqlContext.createDataFrame([
    {'uid': '1', 'adid': 'a', 'guuid': 'aa', 'createtime': 1450823568766},
    {'uid': '2', 'adid': 'b', 'guuid': 'aa', 'createtime': 1450823569766},
    {'uid': '4', 'adid': 'd', 'guuid': 'bb', 'createtime': 1450823268766},
]).registerAsTable("adplay")

_adClickDF = sqlContext.createDataFrame([
    {'uid': '1', 'adid': 'a', 'guuid': 'aa', 'createtime': 1450823580766},
]).registerAsTable("adclick")
'''

sqlContext.registerFunction("dateformat", lambda x: longTime2str(x), StringType())

adLoadDf = sqlContext.sql('select hosid,dateformat(createtime) day,adid,count(guuid) pv,count(distinct guuid) uv '
                          'from adload where createtime is not null and dateformat(createtime)=%s '
                          'group by adid,hosid,dateformat(createtime)' % (lastdate)).registerAsTable("radload")
adPlayDf = sqlContext.sql('select gh.hosid,dateformat(ap.createtime) day,adid,count(ap.guuid) pv,count(distinct ap.guuid) uv '
                          'from adplay ap left join ghid gh on ap.guuid=gh.guuid where dateformat(ap.createtime)=%s '
                          'group by ap.adid,gh.hosid,dateformat(ap.createtime)' % (lastdate)).registerAsTable("radplay")
# sqlContext.sql('select sum(pv) from radplay').foreach(printx)
adClick = sqlContext.sql('select gh.hosid,dateformat(ac.createtime) day,ac.adid,count(ac.guuid) pv,count(distinct ac.guuid) uv '
                         'from adclick ac left join ghid gh on ac.guuid=gh.guuid where dateformat(ac.createtime)=%s '
                         'group by ac.adid,gh.hosid,dateformat(ac.createtime)' % (lastdate)).registerAsTable("radclick")
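
# A possible follow-up step, not shown in the original snippet: join the three
# per-ad aggregates registered above into one load/play/click summary per hosid,
# day and adid. The query is an illustrative sketch based on the column aliases
# used in the registrations above.
funnel_df = sqlContext.sql(
    'select l.hosid, l.day, l.adid, '
    'l.pv load_pv, l.uv load_uv, p.pv play_pv, p.uv play_uv, c.pv click_pv, c.uv click_uv '
    'from radload l '
    'left join radplay p on l.hosid = p.hosid and l.day = p.day and l.adid = p.adid '
    'left join radclick c on l.hosid = c.hosid and l.day = c.day and l.adid = c.adid')
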
data = [[2, 3, 4], [1, 2, 3], [7, 6, 5]]
data_df = spark.createDataFrame(data, list('abc'))  # create a DF with column names
data_df2 = spark.createDataFrame(data)              # create a DF without column names
data = [[2, 3, 4], [1, 2, 3], [7, 6, 5]]

sqlContext.registerDataFrameAsTable(data_df2, "test_table")  # register a temp table
test_data = spark.sql('select * from test_table')
# sqlContext.dropTempTable("test_table")

# register a function for SQL
sqlContext.udf.register("stringLengthInt", lambda x: len(str(x)), IntegerType())
sqlContext.registerFunction("stringLengthInt", lambda x: len(str(x)), IntegerType())
sqlContext.sql("SELECT stringLengthInt('test') as len").show()
sqlContext.sql("SELECT stringLengthInt(a) as len from test_table").show()

df_as1 = data_df.alias("df_as1")  # alias
df_as2 = data_df.alias("df_as2")
joined_df = df_as1.join(df_as2, col("df_as1.a") == col("df_as2.a"), 'inner')  # all column names are preserved
joined_df.select("df_as1.a", "df_as2.a", "df_as2.b", "df_as2.c").show()
print(data_df.columns)

# ---------------------------------------------------------------------------------
data1 = [[2, u'Alice'], [5, u'Bob']]
data2 = [[u'Tom', 80], [u'Bob', 85]]
data3 = [[2, 2, u'Alice'], [5, 5, u'Bob'], [5, 53, u'Bob'], [7, 1, u'Alice']]
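
# The same length UDF can also be applied through the DataFrame API instead of
# SQL; a small hedged addition using pyspark.sql.functions.udf on the data_df
# defined above.
from pyspark.sql.functions import udf

string_length = udf(lambda x: len(str(x)), IntegerType())
data_df.withColumn("a_len", string_length(data_df["a"])).show()
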
sqlContext = SQLContext(sc)

# path to hillary/enron avro
enr = sqlContext.read.format(
    "com.databricks.spark.avro").load(
    "s3n://datasets-396316040607/enron_data/*.avro").repartition(16)
hil = sqlContext.read.format(
    "com.databricks.spark.avro").load(
    "s3n://datasets-396316040607/hillary/*.avro").repartition(16)

# register tables
sqlContext.registerDataFrameAsTable(hil, "hillary")
sqlContext.registerDataFrameAsTable(enr, "enron")

# register udf
sqlContext.registerFunction(
    "getCos", lambda x, y: get_cosine(text_to_vector(x), text_to_vector(y))
)

# do the cosine similarity on the text, get the top 1000 matches
out = sqlContext.sql("SELECT h.author h_auth, e.author e_auth, "
                     "e.contents e_mail, h.contents h_mail, "
                     "getCos(e.contents, h.contents) as cos_sim "
                     "from hillary as h join enron as e order by cos_sim "
                     "desc limit 1000")

# write back out to s3
out.save("s3n://datasets-396316040607/cos_sim/", format="json")
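
# get_cosine() and text_to_vector() are referenced above but defined elsewhere; a
# common word-count / cosine-similarity implementation is sketched here as an
# assumption about what the getCos UDF computes.
import math
import re
from collections import Counter

WORD = re.compile(r"\w+")

def text_to_vector(text):
    return Counter(WORD.findall(text or ""))

def get_cosine(vec1, vec2):
    common = set(vec1.keys()) & set(vec2.keys())
    numerator = sum(vec1[x] * vec2[x] for x in common)
    denominator = math.sqrt(sum(v * v for v in vec1.values())) * \
                  math.sqrt(sum(v * v for v in vec2.values()))
    return float(numerator) / denominator if denominator else 0.0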