# Perform an INNER JOIN on the two data frames on the EMP_NO column.
# As of Spark 1.4 you don't have to worry about a duplicate join column in the result.
df_emp_sal_join = df_employees.join(df_salaries, "emp_no") \
    .select("emp_no", "birth_date", "first_name", "last_name", "gender",
            "hire_date", "salary", "from_date", "to_date")

# Add a 'year' column to the data frame for partitioning the Hive table
df_add_year = df_emp_sal_join.withColumn('year', F.year(df_emp_sal_join.to_date))

# Add a load date column to the data frame
df_final = df_add_year.withColumn('Load_date', F.current_date())
# repartition() returns a new DataFrame, so keep the result
df_final = df_final.repartition(10)

# Register the data frame as a temp table for Spark SQL
hive_ctx.registerDataFrameAsTable(df_final, "EMP_TEMP")

# Target Type : APACHE HIVE
# Database    : EMPLOYEES
# Table Name  : EMPLOYEE_DIM
# +-------------+--------+-----------+
# | COLUMN NAME | TYPE   | PARTITION |
# +-------------+--------+-----------+
# | EMP_NO      | INT    |           |
# | BIRTH_DATE  | DATE   |           |
# | FIRST_NAME  | STRING |           |
# | LAST_NAME   | STRING |           |
# | GENDER      | STRING |           |
# | HIRE_DATE   | DATE   |           |
# | SALARY      | INT    |           |
# | FROM_DATE   | DATE   |           |
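# Not part of the original snippet: a minimal sketch of what the target DDL and the
# load from EMP_TEMP could look like, assuming the column layout listed above and a
# table partitioned by 'year'; the exact EMPLOYEE_DIM schema is an assumption.
hive_ctx.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
hive_ctx.sql("""
    CREATE TABLE IF NOT EXISTS EMPLOYEES.EMPLOYEE_DIM (
        emp_no INT, birth_date DATE, first_name STRING, last_name STRING,
        gender STRING, hire_date DATE, salary INT, from_date DATE,
        to_date DATE, Load_date DATE)
    PARTITIONED BY (year INT)
    STORED AS ORC
""")
hive_ctx.sql("""
    INSERT OVERWRITE TABLE EMPLOYEES.EMPLOYEE_DIM PARTITION (year)
    SELECT emp_no, birth_date, first_name, last_name, gender, hire_date,
           salary, from_date, to_date, Load_date, year
    FROM EMP_TEMP
""")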
new_column = from_shop_total_switch.from_brand_code.cast("string")
from_shop_total_switch = from_shop_total_switch.withColumn('from_brand_code', new_column)

# Build the substitution result based on page views
switch = switch.join(from_shop_total_switch, 'from_brand_code', 'inner')
switch = switch.withColumn('switch_prob', switch.page_views_switch / switch.from_shop_total_switch)
switch = switch[['from_brand_code', 'to_brand_code', 'page_views_switch',
                 'from_shop_total_switch', 'switch_prob']]
#switch_rank = Window.partitionBy('from_brand_code').orderBy(switch.switch_prob.desc())
#switch = switch.withColumn('switch_rank', rank().over(switch_rank))
#switch = switch[['from_brand_code', 'to_brand_code',
#                 'page_views_switch', 'from_shop_total_switch', 'switch_prob', 'switch_rank']]

# Save the result
hc.registerDataFrameAsTable(switch, "table1")
insert_sql = '''insert overwrite table dev.dev_open_brand_similarity_replacement partition(dt='%s')
                select * from table1''' % (dt)
hc.sql(insert_sql)

CREATE TABLE IF NOT EXISTS dev.dev_open_brand_similarity_replacement(
    from_brand_code STRING,
    to_brand_code STRING,
    page_views_switch INT,
    from_shop_total_switch INT,
    switch_prob FLOAT)
PARTITIONED BY (
    `dt` string)
ROW FORMAT DELIMITED
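# Not part of the original: the ranking step above is left commented out; a minimal
# sketch of enabling it (before the save step), assuming Window and rank are imported
# from pyspark.sql:
from pyspark.sql import Window
from pyspark.sql.functions import rank

switch_rank = Window.partitionBy('from_brand_code').orderBy(switch.switch_prob.desc())
switch = switch.withColumn('switch_rank', rank().over(switch_rank))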
    result = y
    return result
    # return [valid_jsontxt(ln) for ln in result]

s1 = "/commit/iteminfo/shopid_128287536/item.info." + today
rdd_c = sc.textFile(s1).map(lambda x: f(x)).filter(lambda x: x is not None)
rdd = rdd_c.groupByKey().mapValues(list).map(lambda kv: quchong(kv[0], kv[1]))

schema = StructType([
    StructField("item_id", StringType(), True),
    StructField("title", StringType(), True),
    StructField("ts", StringType(), True)
])
df = hiveContext.createDataFrame(rdd, schema)
hiveContext.registerDataFrameAsTable(df, 'qianxing_iteminfo')

# hiveContext.sql("insert overwrite table wl_base.t_base_qianxing_iteminfo partition (ds =" + today + ")\
#                  select * from qianxing_iteminfo")

# Merge today's data with yesterday's partition via a full join and COALESCE
sql_merge = '''
    insert overwrite table wl_base.t_base_qianxing_iteminfo partition (ds =''' + today + ''')
    select COALESCE(t1.item_id, t2.item_id),
           COALESCE(t1.title, t2.title),
           COALESCE(t1.ts, t2.ts)
    from qianxing_iteminfo t1
    full join (select * from wl_base.t_base_qianxing_iteminfo where ds =''' + yesterday + ''') t2
    on t1.item_id = t2.item_id
'''
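# Not part of the original: the body of quchong is cut off above. Purely as a labeled
# assumption about what a dedupe ("quchong") helper of this shape typically does, it
# keeps one record per item_id, e.g. the value with the largest ts field:
def quchong_sketch(item_id, values):
    # values: the list of (title, ts) tuples collected by groupByKey().mapValues(list)
    title, ts = max(values, key=lambda v: v[1])
    return (item_id, title, ts)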
    and cid3 in (11924,11925,6739,11922,13550,11923)
    group by cid3, brand_code''' % (begin_dt, dt)
brand_sku_num = hc.sql(brand_sku_num_query).coalesce(100).cache()

# Count the SPUs with a match relationship for each shop
#shop_sku_num = all_sku_match.groupby('item_third_cate_cd1','brand_code1').agg(countDistinct(all_sku_match.main_sku_id1).alias('sku_num'))

# Compute the overlap ratio
final_sku_match = sku_match.join(brand_sku_num, ['item_third_cate_cd1', 'brand_code1'], 'inner')
final_sku_match = final_sku_match.withColumn('overlap_ratio', final_sku_match.match_sku_num / final_sku_match.sku_num)
final_sku_match = final_sku_match.withColumnRenamed('item_third_cate_cd1', 'cid3')
final_sku_match = final_sku_match[['cid3', 'brand_code1', 'brand_name1', 'brand_code2', 'brand_name2',
                                   'match_sku_num', 'sku_num', 'overlap_ratio']]

# Save to the Hive table
hc.registerDataFrameAsTable(final_sku_match, "table1")
insert_sql = '''insert overwrite table dev.dev_open_brand_similarity_spu_overlap_da partition(dt="%s")
                select * from table1''' % (dt)
hc.sql(insert_sql)

CREATE TABLE IF NOT EXISTS dev.dev_open_brand_similarity_spu_overlap_da(
    cid3 STRING COMMENT "level-3 category of item 1",
    brand_code1 STRING COMMENT "id of self-operated brand 1",
    brand_name1 STRING COMMENT "name of self-operated brand 1",
    brand_code2 STRING COMMENT "id of self-operated brand 2",
    brand_name2 STRING COMMENT "name of self-operated brand 2",
    match_sku_num INT COMMENT "number of main SKUs matched between brand 1 and brand 2",
    sku_num INT COMMENT "number of SKUs with sales under brand 1",
    overlap_ratio FLOAT COMMENT "overlap ratio")
from pyspark.sql import SparkSession
from pyspark.context import SparkContext

sc = SparkContext.getOrCreate()
spark = SparkSession.builder.enableHiveSupport().getOrCreate()  # enableHiveSupport is optional here

#df = spark.read.csv("file:///home/bellapukondar1/ggtext.csv")
df = spark.read.csv("ggtext.csv")
print(type(df))  # DataFrame
df.show()

q = "select * from tablename"
out = spark.sql(q)
data = out.collect()

from pyspark.sql import HiveContext
hive_context = HiveContext(sc)

prod = [(1, 'a1', '11'), (2, 'a2', '22'), (3, 'a3', '33'), (4, 'a4', '44'),
        (5, 'a5', '55'), (6, 'a6', '66'), (7, 'a7', '77'), (8, 'a8', '88'),
        (9, 'a9', '99'), (10, 'a10', '1010')]
dd = sc.parallelize(prod)
df = dd.toDF(['id', 'value', 'label'])

hive_context.registerDataFrameAsTable(df, "table1")
q = "select * from table1"
out = hive_context.sql(q)
out.show()

# Write to S3
df.write.save("s3://s3-........", format='csv', header=True)
df.write.mode("append").format("csv").save(
    "s3://s3-emr-test-stg/crn_analysis/output_files/ab1.csv")
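# Not part of the original: "tablename" above is a placeholder, and the query only
# works once a table with that name exists. A minimal sketch, assuming the CSV has a
# header row, of registering the file as a temp view first:
df = spark.read.csv("ggtext.csv", header=True, inferSchema=True)
df.createOrReplaceTempView("tablename")
out = spark.sql("select * from tablename")
out.show()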
def main(args):
    """
    Main code for relevance computation
    """
    start_time = time.time()

    # iq (the code snippets that set the properties below have been removed)
    driver =
    url =
    username =
    password =
    inputs = [driver, url, username, password]

    filename = str(args[0])
    if os.path.exists(filename):
        pass
    else:
        sys.exit("Input file %s not found" % filename)

    file = open(filename, 'r')
    for line in file:
        key, val = line.split(",")
        if str(key).strip() == "dbalias":
            dbalias = str(val).strip()
        elif str(key).strip() == "numpartitions":
            numpartitions = int(val)
        elif str(key).strip() == "datadir":
            datadir = str(val).strip()
        else:
            print("Invalid key not set: %s" % str(key))

    # Need to make sure that the datadir variable is set.
    try:
        print("datadir = '%s' " % datadir)
    except NameError:
        sys.exit("'datadir' variable not set. Check input file '%s'" % filename)

    # Spark and Hive contexts
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    df = utils.returnSparkDF(SQLContext(sc), inputs, "traffic")
    if df is None:
        sys.exit("'traffic' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "uniquedata")
    df = None

    df = utils.returnSparkDF(SQLContext(sc), inputs, "fbtraffic")
    if df is None:
        sys.exit("'fbtraffic' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "uniqueFBdata")
    df = None

    # Keep only ad views that have no Facebook-traffic counterpart
    statement = "Select ud.loginid, ud.adid, ud.Type, ufd.Type as FBType "\
                "from uniquedata ud left outer join uniqueFBdata ufd "\
                "on ud.loginid = ufd.loginid and ud.adid = ufd.adid"
    adswithFBjoined = sqlContext.sql(statement)
    adswithFBjoined_cleaned = adswithFBjoined[adswithFBjoined['FBType'].isNull()]
    adswithFBjoined_cleaned = adswithFBjoined_cleaned.drop('FBType')
    sqlContext.registerDataFrameAsTable(adswithFBjoined_cleaned, "data")

    statement = "Select loginid, count(loginid) as viewcount from data group by loginid"
    temp = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(temp, "viewdata")

    statement = "Select d.* from data d, viewdata vd where d.loginid = vd.loginid and vd.viewcount > 1"
    temp2 = sqlContext.sql(statement)
    sqlContext.sql("drop table data")
    sqlContext.registerDataFrameAsTable(temp2, "data")
    temp, temp2 = (None, None)

    df = utils.returnSparkDF(SQLContext(sc), inputs, "agent")
    if df is None:
        sys.exit("'agent' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "agentdata")

    statement = "select loginid, adid, Type, count(adid) as counter from agentdata group by loginid, adid, Type"
    unique_adid_per_loginid = sqlContext.sql(statement)
    unique_adid_per_loginid = unique_adid_per_loginid.drop('counter')
    sqlContext.registerDataFrameAsTable(unique_adid_per_loginid, "agentdata")

    df = utils.returnSparkDF(SQLContext(sc), inputs, "favorite")
    if df is None:
        sys.exit("'favorite' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "favdata")
    df = None

    statement = "select * from data union all select * from agentdata union all select * from favdata"
    df2 = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(df2, "uniondata")
    df2 = None

    statement = "select loginid, max(Type) as UserMaxConversion from uniondata group by loginid"
    maxtype = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(maxtype, "maxconversiondata")

    statement = "select uniondata.loginid, uniondata.adid, uniondata.Type "\
                "from uniondata, maxconversiondata where uniondata.loginid = maxconversiondata.loginid "\
                "and uniondata.Type = maxconversiondata.UserMaxConversion"
    data = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(data, "data")
    # Delete temporary tables
    tables = ["uniquedata", "FBdata", "uniqueFBdata", "agentdata", "favdata",
              "uniondata", "maxconversiondata"]
    for table in tables:
        sqlContext.sql("drop table if exists %s" % str(table))

    df = utils.returnSparkDF(SQLContext(sc), inputs, "adclassified")
    if df is None:
        sys.exit("'adclassified' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "addata")
    df = None

    df = utils.returnSparkDF(SQLContext(sc), inputs, "geo")
    if df is None:
        sys.exit("'geo' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "geodata")
    df = None

    statement = "select addata.adid, addata.AskingPrice, addata.CollectiveDebt, "\
                "addata.PageViewCount, geodata.Municipal, geodata.CityPart "\
                "from addata, geodata where addata.locationkey = geodata.locationkey"
    addata_for_join = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(addata_for_join, "adtemp")

    statement = "select * from adtemp where PageViewCount < 10000"
    addata_for_join = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(addata_for_join, "addata_for_join")
    data, addata_for_join = (None, None)

    sqlContext.sql("drop table if exists addata")

    statement = "select a.*, b.AskingPrice, b.CollectiveDebt, b.PageViewCount, b.Municipal, b.CityPart "\
                "from data a, addata_for_join b where a.adid = b.adid"
    data = sqlContext.sql(statement)
    data = data.fillna(0)
    data = data.repartition(numpartitions)

    # Save the files as csv using spark-csv from Databricks
    try:
        st = time.time()
        data.write.format("com.databricks.spark.csv").save(datadir, mode="overwrite", codec="bzip2")
        et = time.time()
        print("File save time was: %.2f mins." % ((et - st) / 60.))
    except:
        sys.exit("Could not save files to dir '%s'. \n\nError = %s" % (datadir, sys.exc_info()[1]))
    finally:
        end_time = time.time()
        print("Spark ETL execution time = %.2f mins." % ((end_time - start_time) / 60.))

    # Stop Spark and continue using in-memory computation (another script)
    sc.stop()
    return
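# Side note, not part of the original script: on Spark 2.x and later the CSV writer is
# built in, so the com.databricks.spark.csv package is not needed. A minimal sketch,
# assuming the same `data` DataFrame and `datadir` as above:
data.write.csv(datadir, mode="overwrite", compression="bzip2")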
def sql_hive_context_example(spark):
    # create hive context object.
    hive_ctx = HiveContext(spark.sparkContext)

    # createDataFrame
    l = [('Alice', 18), ('Bob', 20), ('Charley', 22)]
    df = hive_ctx.createDataFrame(l, ('name', 'age'))
    print("createDataFrame API finished")

    # registerDataFrameAsTable
    hive_ctx.registerDataFrameAsTable(df, "table1")
    print("registerDataFrameAsTable API finished")

    # sql
    tmp_df = hive_ctx.sql("select * from table1")
    tmp_df.show()
    print("sql API finished")

    # table
    tmp_df = hive_ctx.table("table1")
    tmp_df.show()
    print("table API finished")

    # tableNames
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("tableNames API finished")

    # tables
    tables = hive_ctx.tables()
    print(tables)
    print("tables API finished")

    # range
    tmp_df = hive_ctx.range(1, 10, 2)
    tmp_df.show()
    print("range API finished")

    # dropTempTable
    hive_ctx.dropTempTable("table1")
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("dropTempTable API finished")

    # cacheTable & uncacheTable & clearCache
    df = hive_ctx.range(1, 10, 2)
    hive_ctx.registerDataFrameAsTable(df, "table")
    hive_ctx.cacheTable("table")
    hive_ctx.uncacheTable("table")
    hive_ctx.clearCache()
    print("cacheTable & uncacheTable & clearCache API finished")

    # createExternalTable
    # newSession
    # registerFunction
    #   Deprecated in 2.3.0. Use :func:`spark.udf.register` instead.
    # registerJavaFunction
    #   Deprecated in 2.3.0. Use :func:`spark.udf.registerJavaFunction` instead.

    # setConf & getConf
    hive_ctx.setConf("key1", "value1")
    value = hive_ctx.getConf("key1")
    print(value)
    print("setConf & getConf API finished")

    # refreshTable
    #   Exception: An error occurred while calling o26.refreshTable:
    #   Method refreshTable([class java.lang.String]) does not exist

    print("Finish running HiveContext API")
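# Not part of the original example: since registerFunction is deprecated, a minimal
# sketch of the replacement mentioned above, spark.udf.register, assuming the same
# `spark` session; the UDF name and body are illustrative only.
from pyspark.sql.types import IntegerType

spark.udf.register("str_len", lambda s: len(s) if s is not None else 0, IntegerType())
spark.sql("select str_len('Alice') as name_len").show()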
else:
    sys.exit(gstrDefaultError)

strSQL = SelectString(strTable1, strTable2)
print(strSQL)

# get data
sc = SparkContext()
sqlContext = HiveContext(sc)
sqlContext.sql("use accident_project")
df = sqlContext.sql(strSQL)

# register as table
sqlContext.registerDataFrameAsTable(df, "Results")

# To CSV. Must loop over years to limit resource use (not enough resources to do everything at once).
# get years
df_years = sqlContext.sql("SELECT distinct Year from Results")
print("Collecting Data")
df_years.collect()

first = True
for row in df_years.rdd.toLocalIterator():
    print("Processing year " + row.Year)
    df = sqlContext.sql("SELECT * FROM Results WHERE Year = " + row.Year)
    # to pandas data frame
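# Not part of the original: if cluster resources allow, the per-year loop could be
# replaced by a single partitioned write. A minimal sketch, assuming Spark 2.x's
# built-in CSV writer and a placeholder output path:
df_all = sqlContext.sql("SELECT * FROM Results")
df_all.write.partitionBy("Year").mode("overwrite").csv("/tmp/results_by_year")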
# Brand GMV ranking
gmv_rank = Window.partitionBy('cid3').orderBy(top_brand_gmv.brand_gmv.desc())
top_brand_gmv = top_brand_gmv.filter(top_brand_gmv.brand_gmv > 0).withColumn('gmv_rank', rank().over(gmv_rank))

similar_top_brand_gmv = top_brand_gmv.withColumnRenamed('cid3', 'item_third_cate_cd') \
    .withColumnRenamed('brand_code', 'similar_brand_code') \
    .withColumnRenamed('gmv_rank', 'similar_gmv_rank') \
    .withColumnRenamed('brand_gmv', 'similar_brand_gmv')

# Pair each brand with the brands ranked within 10 places of it in the same category
top_brand_gmv = top_brand_gmv.crossJoin(similar_top_brand_gmv) \
    .filter("(similar_gmv_rank > gmv_rank and similar_gmv_rank <= gmv_rank + 10) "
            "or (similar_gmv_rank >= gmv_rank - 10 and similar_gmv_rank < gmv_rank)")
top_brand_gmv = top_brand_gmv.filter("cid3 == item_third_cate_cd")
top_brand_gmv = top_brand_gmv[['cid3', 'brand_code', 'brand_gmv', 'gmv_rank',
                               'item_third_cate_cd', 'similar_brand_code',
                               'similar_brand_gmv', 'similar_gmv_rank']]
top_brand_gmv = top_brand_gmv.select('cid3', 'brand_code', 'brand_gmv', 'gmv_rank',
                                     'similar_brand_code', 'similar_brand_gmv', 'similar_gmv_rank')

# Save the result
top_brand_gmv = top_brand_gmv[top_brand_gmv.brand_code != top_brand_gmv.similar_brand_code]
hc.registerDataFrameAsTable(top_brand_gmv, "table1")
insert_sql = '''insert overwrite table dev.dev_open_brand_similarity_volumn_da partition(dt='%s')
                select * from table1''' % (dt)
hc.sql(insert_sql)

CREATE TABLE IF NOT EXISTS dev.dev_open_brand_similarity_volumn_da(
    cid3 STRING,
    brand_code STRING,
    brand_gmv FLOAT,
    gmv_rank INT,
    similar_brand_code STRING,
    similar_brand_gmv FLOAT,
    similar_gmv_rank INT)
PARTITIONED BY (
    lambda x: [x.user_id, x.avg_price])
rdd = rdd1.map(lambda x: x[1]).repartition(100)
data = rdd.filter(lambda x: x < 30000).map(lambda x: array(x))

model = KMeans.train(data, 5, maxIterations=20, runs=50, initializationMode="random",
                     seed=50, initializationSteps=5, epsilon=1e-4)
model.centers = sorted(model.centers)

userlevel_rdd = rdd1.map(lambda x: (x[0], x[1], model.predict([x[1]])))
schema = StructType([
    StructField("uid", StringType(), True),
    StructField("avg_price", FloatType(), True),
    StructField("ulevel", IntegerType(), True)
])
df = hiveContext.createDataFrame(userlevel_rdd, schema)
# sqlContext.registerDataFrameAsTable(df, 'userlevel')

# Save
hiveContext.registerDataFrameAsTable(df, 'userlevel')
hiveContext.sql('drop table if EXISTS t_zlj_perfer_user_level')
hiveContext.sql(
    'create table wlbase_dev.t_zlj_perfer_user_level as select * from userlevel'
)
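# Not part of the original: KMeans.train with the `runs` argument is the legacy
# RDD-based MLlib API (removed in Spark 2.0). A rough sketch of the same clustering
# with the DataFrame-based API, assuming a hypothetical `price_df` with uid and
# avg_price columns:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans as MLKMeans

assembler = VectorAssembler(inputCols=["avg_price"], outputCol="features")
features_df = assembler.transform(price_df)
kmeans = MLKMeans(k=5, maxIter=20, seed=50)
clustered = kmeans.fit(features_df).transform(features_df)  # adds a 'prediction' column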
df_date_dim = df_FILE_1_FILE_2_join.withColumn('date_id', F.date_id(df_FILE_1_FILE_2_join.published_date))
df_media_dim = df_FILE_1_FILE_2_join.withColumn('media_id', F.media_id(df_FILE_1_FILE_2_join.published_date))
df_geo_dim = df_FILE_1_FILE_2_join.withColumn('geo_area_id', F.geo_id(df_FILE_1_FILE_2_join.published_date))
df_session_dim = df_FILE_1_FILE_2_join.withColumn('session_id', F.session_id(df_FILE_1_FILE_2_join.published_date))
# (date_id, media_id, geo_id, session_id and current_vendor are not built-in
#  pyspark.sql.functions; they are presumably project-specific helpers or UDFs.)

# Joining vendor data with dimensions
df_fact = df_customer_dim.withColumn('vendor_id', F.current_vendor())
df_fact = df_date_dim.withColumn('vendor_id', F.current_vendor())
df_fact = df_media_dim.withColumn('vendor_id', F.current_vendor())
df_fact = df_geo_dim.withColumn('vendor_id', F.current_vendor())
df_fact = df_session_dim.withColumn('vendor_id', F.current_vendor())
df_fact = df_fact.repartition(5)

# Registering each data frame as a temp table for Spark SQL
# (every call below reuses the name "MEDIA_TEMP", so each registration replaces the previous one)
hive_ctx.registerDataFrameAsTable(df_customer_dim, "MEDIA_TEMP")
hive_ctx.registerDataFrameAsTable(df_date_dim, "MEDIA_TEMP")
hive_ctx.registerDataFrameAsTable(df_media_dim, "MEDIA_TEMP")
hive_ctx.registerDataFrameAsTable(df_geo_dim, "MEDIA_TEMP")
hive_ctx.registerDataFrameAsTable(df_session_dim, "MEDIA_TEMP")
hive_ctx.registerDataFrameAsTable(df_fact, "MEDIA_TEMP")

# Target Type     : APACHE HIVE
## Source         : FILE_1, FILE_2
## Table Name     : Customer_Dimension
## Storage Format : ORC
##
# -------------------------------------------------------
# COLUMN NAME             DATA TYPE
# -------------------------------------------------------
# customer_id             varchar(35)
# anonymized_person_id    integer(8)
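# Not part of the original: the target column list above is cut off; for just the two
# columns shown, an illustrative Hive DDL for the ORC table could be sketched as
# (database name and any remaining columns are unknown):
CREATE TABLE IF NOT EXISTS Customer_Dimension (
    customer_id VARCHAR(35),
    anonymized_person_id INT)
STORED AS ORC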
# transform RDD, using the Row function
cats = rdd.map(lambda x: Row(name=x[0], age=int(x[1])))

# In[8]:
cats

# In[9]:
# Create dataframe
schemaCats = sqlContext.createDataFrame(cats)

# In[10]:
# Register it as a temp table
sqlContext.registerDataFrameAsTable(schemaCats, "cat_table")
# Show Hive tables
sqlContext.sql("show tables").show()

# In[11]:
# Using the default HiveContext to select columns
sqlContext.sql("Select * from cat_table").show()

# In[12]:
# Use a where clause
sqlContext.sql("Select * from cat_table where age > 20").show()

# In[13]:
result = volumn.union(switch).union(overlap)
result = result[result.brand_code_origin != result.brand_code_similar]

hc.registerDataFrameAsTable(result, "table1")
insert_sql = '''insert overwrite table dev.dev_open_brand_similarity_da partition(dt='%s')
                select * from table1''' % (dt)
hc.sql(insert_sql)

CREATE TABLE IF NOT EXISTS dev.dev_open_brand_similarity_da(
    cid3 STRING COMMENT "level-3 category",
    brand_code_origin STRING COMMENT "id of the self-operated brand being replaced",
    brand_code_similar STRING COMMENT "id of the brand that replaces from_brand_code",
    similarity_type STRING COMMENT "which similarity algorithm produced this pair")
COMMENT "similar brands under each level-3 category, merged from the three similarity algorithms"
PARTITIONED BY (
    `dt` string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
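# Not part of the original: a quick sanity check of the merged output for the current
# partition, assuming the same HiveContext `hc` and partition value `dt`:
hc.sql("select similarity_type, count(*) as pairs "
       "from dev.dev_open_brand_similarity_da where dt='%s' "
       "group by similarity_type" % dt).show()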