def getUserInfo(user_id_list):
    if len(user_id_list) > 0:
        # The IDs are concatenated directly into the IN (...) clause of the query.
        user_id_str = ",".join(user_id_list)
        sql = "select hn.hnuserid,hn.linkman,hnaccount.useraccount," \
              "hnaccount.mobile,hn.risklevel,hn.createtime,hnaccount.lastlogontime," \
              "hnblack.status from risk_business_source.hnuser hn " \
              "left join risk_business_source.hnuseraccount hnaccount on hn.hnuserid = hnaccount.hnuserid " \
              "left join risk_business_source.hn_user_black hnblack on hn.hnuserid = hnblack.hn_user_id " \
              "where hn.hnuserid in(" + user_id_str + ")"
        sqlDF = spark.sql(sql)
        rows = sqlDF.collect()
        result = []
        # Display labels (Chinese): user ID, user name, user account, mobile number,
        # risk level, registration time, last login time, blacklist status.
        cols = [
            '用户ID', '用户名', '用户账号', '用户手机号', '风险等级', '注册时间',
            '最后一次登录时间', '是否在黑名单'
        ]
        for row in rows:
            result.append([
                row['hnuserid'], row['linkman'], row['useraccount'], row['mobile'],
                row['risklevel'], row['createtime'], row['lastlogontime'], row['status']
            ])
        return cols, result
    else:
        print("the user list is empty")
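# A minimal alternative sketch (not part of the original): because the ID list above is
# concatenated straight into the SQL text, one safer option is to register the IDs as a
# temporary view and join against it. getUserInfoSafe and the wanted_ids view are
# hypothetical names; the spark session and table names are assumed to match the function
# above, and this variant returns a DataFrame rather than the (cols, result) pair.
from pyspark.sql import Row

def getUserInfoSafe(user_id_list):
    ids_df = spark.createDataFrame([Row(hnuserid=uid) for uid in user_id_list])
    ids_df.createOrReplaceTempView("wanted_ids")
    return spark.sql(
        "select hn.hnuserid, hn.linkman, hnaccount.useraccount, hnaccount.mobile, "
        "hn.risklevel, hn.createtime, hnaccount.lastlogontime, hnblack.status "
        "from risk_business_source.hnuser hn "
        "join wanted_ids w on hn.hnuserid = w.hnuserid "
        "left join risk_business_source.hnuseraccount hnaccount on hn.hnuserid = hnaccount.hnuserid "
        "left join risk_business_source.hn_user_black hnblack on hn.hnuserid = hnblack.hn_user_id")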
def use_case_1(which_sql):
    """
    Statistics on vehicle service transactions:
    count records by time (year, month), brigade, and business type.
    :param which_sql:
    :return:
    """
    st = datetime.now()
    # from pyspark import SparkConf
    from pyspark import SparkContext
    from pyspark import SQLContext

    st_1 = datetime.now()
    sql = get_sql(which_sql)
    st_2 = datetime.now()
    print("Got sql: {sql}".format(sql=sql))

    # conf = SparkConf().setAppName("testing").setMaster("local[2]")
    sc = SparkContext.getOrCreate()
    sqlcontext = SQLContext(sc)
    st_3 = datetime.now()

    # To use SQL, a table or view must be created first.
    # Earlier attempts on the vio_violation data, kept for reference:
    # df_vio.cre('vio_violation_temp_view_001')
    #   -> "Table or view doesn't exist": apparently views also have a lifetime and access scope to consider.
    # df_vio.registerTempTable()  # ???
    # df_vio = SQLContext.table("vio_violation_temp_view_001")

    # Load the data; a file path must be given.
    df_veh_flow = sqlcontext.read.load(
        '/srv/BigData/dbdata_service/ffk/veh_flow_to_190805_allf')
    st_4 = datetime.now()
    df_veh_flow.createOrReplaceTempView('veh_flow_temp_view_002')  # Create a view
    st_5 = datetime.now()

    # Execute the SQL
    from pyspark.shell import spark
    df_veh_flow_key1 = spark.sql(sql)
    st_6 = datetime.now()
    df_veh_flow_key1.show()
    st_7 = datetime.now()

    print('Timing summary:\nLoad sql: {0} '
          '\nGet sqlcontext: {1}'
          '\nLoad data: {2}'
          '\nCreate table or view: {3}'
          '\nExecute sql: {4}'
          '\nShow result: {5}'.format(st_2 - st_1, st_3 - st_2, st_4 - st_3,
                                      st_5 - st_4, st_6 - st_5, st_7 - st_6))
    keep_task_run_time(start_time=st)
def main():
    """
    :return: Place and magnitude, where magnitude is greater than 1.0.
    """
    df = sqlContext.read.json(load_json_data(url))
    df.createOrReplaceTempView('earthquakes')
    earthquakes_df = spark.sql("SELECT properties.mag, properties.place "
                               "FROM earthquakes "
                               "WHERE properties.mag > 1.0")
    earthquakes_df.show()
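# load_json_data and url are defined elsewhere in this script. Purely as an illustration
# (not the original helper), one plausible implementation, assuming url points at a
# USGS-style GeoJSON feed: return the feed's "features" array as an RDD of JSON strings,
# which sqlContext.read.json accepts directly.
import json
from urllib.request import urlopen

def load_json_data(feed_url):
    feed = json.loads(urlopen(feed_url).read().decode("utf-8"))
    features = [json.dumps(feature) for feature in feed.get("features", [])]
    return spark.sparkContext.parallelize(features)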
def save_data(rdd):
    """
    Parsing the JSON value in each RDD,
    creating a Spark SQL DataFrame from the RDD,
    and writing the DataFrame to HDFS and Oracle DB.
    """
    global flag
    flag = False
    if not rdd.isEmpty():
        rdd = rdd.map(lambda m: parse(m[1]))
        df = sqlContext.createDataFrame(rdd)
        df.createOrReplaceTempView("t")
        result = spark.sql(
            '''select event_id, event_type
               from (select row_number() over (partition by _1 order by _2) as RN,
                            _1 as event_id, _2 as event_type
                     from t)
               where RN = 1''')
        count = result.count()
        try:
            # Writing to HDFS
            result.write \
                .format("csv") \
                .mode("append") \
                .option("header", "true") \
                .save(HDFS_OUTPUT_PATH)
            # Writing to Oracle DB
            result.write \
                .format("jdbc") \
                .mode("append") \
                .option("driver", DRIVER) \
                .option("url", URL_TARGET_DB) \
                .option("dbtable", TARGET_DB_TABLE_NAME) \
                .option("user", TARGET_DB_USER_NAME) \
                .option("password", TARGET_DB_USER_PASSWORD) \
                .save()
            write_log('INFO', 'Consumer_dim_event_type.py', 'main',
                      '{} rows inserted successfully'.format(count))
        except Exception as e:
            print('--> It seems an Error occurred: {}'.format(e))
            write_log('ERROR', 'Consumer_dim_event_type.py', 'main', str(e)[:1000])
            flag = True
    else:
        ssc.stop()
    return rdd
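# parse() is defined elsewhere in this consumer. A minimal hypothetical sketch, assuming
# the Kafka message value is a JSON object whose fields map to the positional columns
# (_1 = event_id, _2 = event_type) that the row_number() query above relies on; the real
# field names may differ.
import json

def parse(value):
    record = json.loads(value)
    return (record["event_id"], record["event_type"])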
def save_data(rdd): """ Function for saving data in window """ global NAMES if not rdd.isEmpty(): # parsing data in RDD rdd = rdd \ .map(lambda x: parser.parse(x[1])) \ .map(lambda data: collect(data)) \ .reduceByKey(lambda rec1, rec2: max(rec1, rec2, key=last_record)) NAMES = dict(rdd.collect()) print( "************************************> NAMES <************************************" ) print(NAMES) print( "************************************> NAMES <************************************" ) rdd = rdd \ .map(lambda rec: (rec[0], rec[1][0], rec[1][1], rec[1][2])) # create DataFrame and View df = sqlContext.createDataFrame(rdd) df.createOrReplaceTempView("t") # query for getting result res = spark.sql( 'select t._1 as NAME, t._2 as COUNT_NAME, t._3 as AVG_TRAFFIC, t._4 as AVG_SUCCESS_SELL from t' ) res.show(40) # res.printSchema() # res = spark.sql('select count(*) KEY, sum(t._2) VALUE from t') res \ .write \ .format("jdbc") \ .mode("overwrite") \ .option("driver", 'oracle.jdbc.OracleDriver') \ .option("url", "jdbc:oracle:thin:@{0}:{1}:orcl".format(IP_DB, PORT_DB)) \ .option("dbtable", "tmp_kafka") \ .option("user", "kozyar") \ .option("password", "usertest") \ .save() # spark.catalog.dropTempView("t") return rdd
def use_case_2(which_sql):
    """
    :return:
    """
    # todo
    st = datetime.now()
    # from pyspark import SparkConf
    from pyspark import SparkContext
    from pyspark import SQLContext

    st_1 = datetime.now()
    sql = get_sql(which_sql)
    st_2 = datetime.now()
    print("Got sql: {sql}".format(sql=sql))

    # conf = SparkConf().setAppName("testing").setMaster("local[2]")
    sc = SparkContext.getOrCreate()
    sqlcontext = SQLContext(sc)
    st_3 = datetime.now()

    df_veh = sqlcontext.read.load('/srv/BigData/dbdata_service/ffk/vehicle_to_190805_allf')
    df_vio_op_check = sqlcontext.read.load('/srv/BigData/dbdata_service/ffk/vio_operate_check_to_190805_allf_parquet')
    # Note: the files below are csv, not parquet
    # df_veh = sqlcontext.read.load('/srv/BigData/dbdata_service/ffk/vehicle_to190804_allf', format="csv", sep=",", inferSchema="true", header="true")
    # df_vio_op_check = sqlcontext.read.load('/srv/BigData/dbdata_service/ffk/vio_operate_check_to_190805_allf', format="csv", sep=",", inferSchema="true", header="true")
    st_4 = datetime.now()

    df_veh.createOrReplaceTempView('veh_temp_view_002')  # Create a view
    df_vio_op_check.createOrReplaceTempView('vio_operate_check_view_002')  # Create a view
    # df_veh.createGlobalTempView('veh_temp_view_002')
    # df_vio_op_check.createGlobalTempView('vio_operate_check_view_002')
    st_5 = datetime.now()

    # Execute the SQL
    from pyspark.shell import spark
    df_veh_flow_key1 = spark.sql(sql)
    st_6 = datetime.now()
    df_veh_flow_key1.show()

    # Save the DataFrame result and return it to the client
    # new_rdd = df_veh_flow_key1.rdd.map(lambda x: (x[0], x))
    # dict = new_rdd.collectAsMap()
    # def print_rows(row):
    #     data = json.loads(row)
    #     for key in data:
    #         print("{key}:{value}".format(key=key, value=data[key]))
    # results = df_veh_flow_key1.toJSON()
    # results.foreach(print_rows)
    print('turn to json')
    results = df_veh_flow_key1.toJSON().map(lambda j: json.loads(j)).collect()
    print('got json', results)
    for i in results:
        print(i)
        # print(i["c1"], i["c6"])

    # data = df.toPandas().to_csv('mycsv.csv')
    # Spark 1.3
    # df_veh_flow_key1.save('mycsv.csv', 'com.databricks.spark.csv')
    # Spark 1.4+
    # df_veh_flow_key1.write.format('com.databricks.spark.csv').save('mycsv.csv')
    # Spark 2.0+
    # df_veh_flow_key1.write.csv('mycsv.csv')
    # pyspark.sql.utils.AnalysisException: u'path hdfs://hacluster/user/TestUser001/mycsv.csv already exists.;'

    # Send the result over HTTP
    import requests
    resp = requests.post('http://10.57.98.251:8008/some/url/404/', json=results)
    print(resp)
    st_7 = datetime.now()

    print('Timing summary:\nLoad sql: {0} '
          '\nGet sqlcontext: {1}'
          '\nLoad data: {2}'
          '\nCreate table or view: {3}'
          '\nExecute sql: {4}'
          '\nShow result: {5}'.format(st_2 - st_1, st_3 - st_2, st_4 - st_3,
                                      st_5 - st_4, st_6 - st_5, st_7 - st_6))
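# A hedged sketch related to the commented-out CSV saves in use_case_2 above: in
# Spark 2.0+ the "path already exists" AnalysisException noted in the comment can be
# avoided by writing with an explicit overwrite mode (illustration only, assuming a
# DataFrame named df_veh_flow_key1 as in that function).
df_veh_flow_key1.write.mode('overwrite').csv('mycsv.csv', header=True)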
from pyspark.shell import spark
from pyspark.sql import Row

sc = spark.sparkContext

# Load a text file and convert each line to a Row.
lines = sc.textFile("examples/src/main/resources/people.txt")
print('*****************')
print(type(lines))
parts = lines.map(lambda l: l.split(","))
people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))
print(type(people))
print(people)

# Infer the schema, and register the DataFrame as a table.
schemaPeople = spark.createDataFrame(people)
schemaPeople.createOrReplaceTempView("people")

# SQL can be run over DataFrames that have been registered as a table.
teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")

# The results of SQL queries are DataFrame objects.
# rdd returns the content as a :class:`pyspark.RDD` of :class:`Row`.
teenNames = teenagers.rdd.map(lambda p: "Name: " + p.name).collect()
for name in teenNames:
    print(name)
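# For comparison, a brief sketch of the same teenagers query expressed with the
# DataFrame API instead of SQL over the registered view (uses schemaPeople from above).
from pyspark.sql.functions import col

teenagersDF = schemaPeople.filter((col("age") >= 13) & (col("age") <= 19)).select("name")
for name in teenagersDF.rdd.map(lambda p: "Name: " + p.name).collect():
    print(name)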
flightData2015.sort("count").explain() # Play with partition configuration spark.conf.set("spark.sql.shuffle.partitions", "5") print(flightData2015.sort("count").take(2), "\n") # RDD.explain() will show the execution plan flightData2015.sort("count").explain() # Register a DF into a table flightData2015.createOrReplaceTempView("flight_data_2015") # Spark SQL way: a new DF is created by using Spark SQL sqlWay = spark.sql(""" SELECT DEST_COUNTRY_NAME, count(1) FROM flight_data_2015 GROUP BY DEST_COUNTRY_NAME """) # Spark DF way: a new DF dataFrameWay is created with transformation dataFrameWay = flightData2015\ .groupBy("DEST_COUNTRY_NAME")\ .count() # Both explain() shows the same plan of transformation sqlWay.explain() dataFrameWay.explain() # Spark DF way: to find the max value of column 'count' print(flightData2015.select(max("count")).take(1)) print('\n')
def save_data(rdd):
    """
    Parsing the JSON value in each RDD,
    creating a Spark SQL DataFrame from the RDD,
    and writing the DataFrame to HDFS and Oracle DB.
    """
    global flag
    flag = False
    if not rdd.isEmpty():
        # Create df for duplicate handling
        df_max_id = spark.read \
            .format("jdbc") \
            .option("driver", DRIVER) \
            .option("url", URL_TARGET_DB) \
            .option("dbtable", TARGET_DB_TABLE_NAME) \
            .option("user", TARGET_DB_USER_NAME) \
            .option("password", TARGET_DB_USER_PASSWORD) \
            .load()
        max_id = df_max_id.agg({'product_id': 'max'}).collect()[0][0]
        if max_id is None:
            max_id = 0

        rdd = rdd.map(lambda m: parse(m[1]))
        df = sqlContext.createDataFrame(rdd)
        df.createOrReplaceTempView("t")
        result = spark.sql(
            '''select product_id, category_id, brand, description, name, price, last_update_date
               from (select row_number() over (partition by _1 order by _7) as RN,
                            _1 as product_id, _2 as category_id, _3 as brand, _4 as description,
                            _5 as name, _6 as price, to_timestamp(_7) as last_update_date
                     from t
                     where _1 > ''' + str(max_id) + ''')
               where RN = 1''')
        count = result.count()
        try:
            # Writing to HDFS
            result.write \
                .format("csv") \
                .mode("append") \
                .option("header", "true") \
                .save(HDFS_OUTPUT_PATH)
            # Writing to Oracle DB
            result.write \
                .format("jdbc") \
                .mode("append") \
                .option("driver", DRIVER) \
                .option("url", URL_TARGET_DB) \
                .option("dbtable", TARGET_DB_TABLE_NAME) \
                .option("user", TARGET_DB_USER_NAME) \
                .option("password", TARGET_DB_USER_PASSWORD) \
                .save()
            write_log('INFO', 'Consumer_dim_products.py', 'main',
                      '{} rows inserted successfully'.format(count))
        except Exception as e:
            print('--> It seems an Error occurred: {}'.format(e))
            write_log('ERROR', 'Consumer_dim_products.py', 'main', str(e)[:1000])
            flag = True
    else:
        ssc.stop()
    return rdd
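# A hedged sketch (not from the original): instead of loading the whole target table just
# to find its maximum id, the aggregation can be pushed down to Oracle as a subquery in
# the dbtable option, the same pattern the fact-table consumer below uses. The MAX_ID
# alias and the product_id column name are assumptions.
df_max_id = spark.read \
    .format("jdbc") \
    .option("driver", DRIVER) \
    .option("url", URL_TARGET_DB) \
    .option("dbtable", "(SELECT max(product_id) MAX_ID FROM " + TARGET_DB_TABLE_NAME + ")") \
    .option("user", TARGET_DB_USER_NAME) \
    .option("password", TARGET_DB_USER_PASSWORD) \
    .load()
max_id = df_max_id.collect()[0][0] or 0  # treat an empty table (NULL max) as 0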
"sep": args.file2_sep, "quote": args.file2_quote, "escape": args.file2_escape, "inferSchema": "false", "header": args.file2_header, "schema": args.file2_schema, } print(f"File 2 dataframe parameters:\n{json.dumps(file2_df_params, indent=2)}") file2_df = spark.read.csv(**file2_df_params) file2_df.createOrReplaceTempView("file2") sql = f""" SELECT {args.output_columns} FROM file1 JOIN file2 ON file1.{args.file1_join_column} = file2.{args.file2_join_column} """ print(f"SQL query:\n{sql}") output = spark.sql(sql) output.show(50) if args.output_path: output_params = { "sep": args.output_sep, "header": args.output_header, "mode": args.output_mode, "path": args.output_path, } output.write.csv(**output_params)
    numFeatures=20)
featurizedData = hashingTF.transform(wordsData)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("query", "StartTime", "RunTime", "features")

numOfClusters = 5
kmeans = KMeans(k=numOfClusters, seed=1)
model = kmeans.fit(rescaledData.select('features'))
transformed = model.transform(rescaledData)
# transformed.show()
transformed.createOrReplaceTempView("df")

timesDf = spark.sql("SELECT StartTime,cast(RunTime as int),prediction FROM df")
sumDf = spark.sql(
    "SELECT StartTime,prediction,count(prediction) FROM df group by StartTime,prediction"
)

# PIVOT
pivotDf1 = sumDf.groupby('StartTime') \
    .pivot('prediction') \
    .max('count(prediction)')
pivotDf1 = pivotDf1.fillna(0)
pivotDf1.createOrReplaceTempView('pivotDf')
pivotDf1.show()

# PIVOT
pivotDf = timesDf.groupby('StartTime') \
    .pivot('prediction') \
    .sum('RunTime')
.withColumn("use", lit("usual")) \ .withColumn("type", coding_lr) \ .withColumn("value", col("PatientID")) \ .withColumn("assigner", struct(col("PatientIDTypeDSC").alias("display"))) \ .select("EDWPatientID", "use", "type", "value", "assigner") identifier = id_edwpatient.unionAll(id_mrn).unionAll(id_patient) \ .groupBy("EDWPatientID").agg(collect_list(struct("use", "type", "value", "assigner")).alias("identifier")) # COMMAND ---------- # build patient from pyspark.sql.functions import coalesce, col, concat, expr, lit, split, to_date, when # work around an oddity with Spark 3 date parsing spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY") patient = patient_raw \ .join(address, ["EDWPatientID"]) \ .join(telecom, ["EDWPatientID"]) \ .join(identifier, ["EDWPatientID"]) \ .withColumn("name", struct( \ col("PatientFullNM").alias("text"), \ split(col("PatientFullNM"), ", ")[0].alias("family"), \ split(split(col("PatientFullNM"), ", ")[1], " ").alias("given"))) \ .withColumn("resourceType", lit("Patient")) \ .withColumnRenamed("EDWPatientID", "id") \ .withColumn("active", lit("true")) \ .withColumn("gender", when(col("GenderCD") == 1, "female").when(col("GenderCD") == 2, "male").otherwise("unknown")) \ .withColumn("birthDate", to_date(col("BirthDTS"), "yyyy-MM-dd")) \ .withColumn("deceasedBoolean", expr("DeathDTS IS NOT NULL")) \
def getSourceData(time_param):
    # SQL for querying the source data
    # (note: the rf2 subquery needs a comma after mobile_supply, otherwise the column
    # list concatenates into "mobile_supplypercard")
    sql = "select tt.hnuserid, tt.totalscore, rf2.name_deceive,rf2.user_new,rf2.mobile_virtual," \
          "rf2.app_correlation, rf2.card_different, rf2.supply_different, rf2.supply_risk," \
          "rf2.mobile_risk_area,rf2.supply_risk_area,rf2.mobile_supply,rf2.percard,rf2.rvicard,rf2.devNums," \
          "rf2.registerlen,rf2.mActives,rf2.dwActives,rf2.wActives,tt.isblack " \
          "from( select t.hnuserid,t.totalscore,case when hn.status is null then '0' else hn.status end as isblack " \
          "from ( select hnuserid, totalscore from risk_user_portrait.risk_up_featuresScore_total " \
          "where day='" + time_param + "'" \
          ")t " \
          "left outer join risk_business_source.hn_user_black hn on t.hnuserid = hn.hn_user_id " \
          "left outer join risk_business_source.hnuser hu on t.hnuserid = hu.hnuserid " \
          ") tt join( select name_deceive, user_new, mobile_virtual, app_correlation, card_different, " \
          "supply_different, supply_risk, mobile_risk_area, supply_risk_area, mobile_supply, hnuserid " \
          "from risk_user_portrait.risk_up_featuresscore_more " \
          "where day='" + time_param + "'" \
          ") rf on tt.hnuserid = rf.hnuserid " \
          "left join(select hnuserid," \
          "name_deceive, user_new, mobile_virtual, app_correlation, card_different, " \
          "supply_different, supply_risk, mobile_risk_area, supply_risk_area, mobile_supply," \
          "percard,rvicard,devNums,registerlen,mActives,dwActives,wActives " \
          "from risk_user_portrait.risk_up_featuresscore_more2 " \
          "where day='" + time_param + "'" \
          ")rf2 on tt.hnuserid = rf2.hnuserid"
    # The results of SQL queries are themselves DataFrames and support all normal functions.
    # Run the SQL statement
    sqlDF = spark.sql(sql)
    # Fetch the Hive data
    rows = sqlDF.collect()
    rows_count = len(rows)
    # Convert the rows into a numeric matrix
    data = np.zeros((rows_count, 15))
    for i in range(rows_count):
        data[i][0] = rows[i]['hnuserid']
        data[i][1] = rows[i]['totalscore']
        data[i][2] = rows[i]['name_deceive']
        data[i][3] = rows[i]['user_new']
        data[i][4] = rows[i]['mobile_virtual']
        data[i][5] = rows[i]['app_correlation']
        data[i][6] = rows[i]['card_different']
        data[i][7] = rows[i]['supply_different']
        data[i][8] = rows[i]['supply_risk']
        data[i][9] = rows[i]['mobile_risk_area']
        data[i][10] = rows[i]['supply_risk_area']
        data[i][11] = rows[i]['mobile_supply']
        data[i][12] = rows[i]['percard']
        data[i][13] = rows[i]['rvicard']
        data[i][14] = rows[i]['isblack']
    labels = [
        'hnuserid', 'totalscore', 'name_deceive', 'user_new', 'mobile_virtual',
        'app_correlation', 'card_different', 'supply_different', 'supply_risk',
        'mobile_risk_area', 'supply_risk_area', 'mobile_supply', 'percard',
        'rvicard', 'isblack',
    ]
    return data, labels
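# A hedged alternative sketch (not from the original): the manual row-by-row copy into a
# numpy array can be replaced with a single toPandas() call, assuming the driver can hold
# the result (the collect() above already requires that). getSourceDataAsFrame and
# build_sql are hypothetical names; build_sql stands in for the inline SQL construction.
def getSourceDataAsFrame(time_param):
    sqlDF = spark.sql(build_sql(time_param))
    pdf = sqlDF.select('hnuserid', 'totalscore', 'name_deceive', 'user_new',
                       'mobile_virtual', 'app_correlation', 'card_different',
                       'supply_different', 'supply_risk', 'mobile_risk_area',
                       'supply_risk_area', 'mobile_supply', 'percard',
                       'rvicard', 'isblack').toPandas()
    return pdf.values.astype(float), list(pdf.columns)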
def save_data(rdd):
    """
    Parsing the JSON value in each RDD,
    creating a Spark SQL DataFrame from the RDD,
    and writing the DataFrame to HDFS and Oracle DB.
    """
    global flag
    flag = False
    if not rdd.isEmpty():
        # Create df for duplicate handling
        write_log('INFO', 'Consumer_fct_prod.py', 'main', 'Executing max_id')
        df_max_id = spark.read \
            .format("jdbc") \
            .option("driver", DRIVER) \
            .option("url", URL_TARGET_DB) \
            .option("dbtable", "(SELECT max(ID) ID from " + TARGET_DB_TABLE_NAME + ")") \
            .option("user", TARGET_DB_USER_NAME) \
            .option("password", TARGET_DB_USER_PASSWORD) \
            .load()
        max_id = df_max_id.agg({'ID': 'max'}).collect()[0][0]
        if max_id is None:
            max_id = 0
        write_log('INFO', 'Consumer_fct_prod.py', 'main',
                  'Max id executed successfully max_id = {}'.format(max_id))

        rdd = rdd.map(lambda m: parse(m[1]))
        df_fct_prod = sqlContext.createDataFrame(rdd)
        df_fct_prod.createOrReplaceTempView("t")
        result = spark.sql(
            '''select id, event_id, event_time, product_id, customer_id
               from (select row_number() over (partition by _1 order by _3) as RN,
                            _1 as id, _2 as event_id, to_timestamp(_3) as event_time,
                            _4 as product_id, _5 as customer_id
                     from t
                     where _1 > ''' + str(max_id) + ''')
               where RN = 1''')
        count = result.count()
        try:
            write_log('INFO', 'Consumer_fct_prod.py', 'main',
                      'Consumer is inserting {} rows to DB'.format(count))
            # Writing to HDFS
            result.write \
                .format("csv") \
                .mode("append") \
                .option("header", "true") \
                .save(HDFS_OUTPUT_PATH)
            # Writing to Oracle DB
            result.write \
                .format("jdbc") \
                .mode("append") \
                .option("driver", DRIVER) \
                .option("url", URL_TARGET_DB) \
                .option("dbtable", TARGET_DB_TABLE_NAME) \
                .option("user", TARGET_DB_USER_NAME) \
                .option("password", TARGET_DB_USER_PASSWORD) \
                .save()
            write_log('INFO', 'Consumer_fct_prod.py', 'main',
                      '{} rows inserted to DB successfully'.format(count))
        except Exception as e:
            print('--> It seems an Error occurred: {}'.format(e))
            write_log('ERROR', 'Consumer_fct_prod.py', 'main', str(e)[:1000])
            flag = True
    else:
        ssc.stop()
    return rdd