Example 1
# Perform an INNER JOIN between the two data frames on the EMP_NO column
# As of Spark 1.4, joining on a column name keeps a single emp_no column in the result,
# so there is no duplicate join column to deal with
df_emp_sal_join = df_employees.join(df_salaries, "emp_no").select("emp_no", "birth_date", "first_name",
                                                             "last_name", "gender", "hire_date",
                                                             "salary", "from_date", "to_date")

# Adding a column 'year' to the data frame for partitioning the hive table
df_add_year = df_emp_sal_join.withColumn('year', F.year(df_emp_sal_join.to_date))

# Adding a load date column to the data frame
df_final = df_add_year.withColumn('Load_date', F.current_date())

df_final = df_final.repartition(10)  # reassign: repartition returns a new DataFrame

# Registering data frame as a temp table for SparkSQL
hive_ctx.registerDataFrameAsTable(df_final, "EMP_TEMP")

# Target Type: APACHE HIVE
# Database   : EMPLOYEES
# Table Name : EMPLOYEE_DIM
# + ------------------------------- +
# | COLUMN NAME| TYPE   | PARTITION |
# + ------------------------------- +
# | EMP_NO     | INT    |           |
# | BIRTH_DATE | DATE   |           |
# | FIRST_NAME | STRING |           |
# | LAST_NAME  | STRING |           |
# | GENDER     | STRING |           |
# | HIRE_DATE  | DATE   |           |
# | SALARY     | INT    |           |
# | FROM_DATE  | DATE   |           |
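The snippet stops before the actual load into the EMPLOYEE_DIM target described above. Below is a minimal, hypothetical sketch of the DDL and the load from the EMP_TEMP temp table; the TO_DATE, LOAD_DATE and YEAR (partition) columns are inferred from df_final, since the column listing above is truncated.

# Hypothetical DDL and load for the target table described above (not part of the original snippet)
hive_ctx.sql("""
    CREATE TABLE IF NOT EXISTS EMPLOYEES.EMPLOYEE_DIM (
        emp_no INT, birth_date DATE, first_name STRING, last_name STRING,
        gender STRING, hire_date DATE, salary INT, from_date DATE,
        to_date DATE, load_date DATE)
    PARTITIONED BY (year INT)
""")

# Let the year column drive the target partition (dynamic partitioning)
hive_ctx.setConf("hive.exec.dynamic.partition", "true")
hive_ctx.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
hive_ctx.sql("""
    INSERT OVERWRITE TABLE EMPLOYEES.EMPLOYEE_DIM PARTITION (year)
    SELECT emp_no, birth_date, first_name, last_name, gender, hire_date,
           salary, from_date, to_date, Load_date, year
    FROM EMP_TEMP
""")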
Example 2
new_column = from_shop_total_switch.from_brand_code.cast("string")
from_shop_total_switch = from_shop_total_switch.withColumn('from_brand_code',new_column)

# Compute substitution results based on page views
switch = switch.join(from_shop_total_switch,'from_brand_code','inner')
switch = switch.withColumn('switch_prob', switch.page_views_switch/switch.from_shop_total_switch)
switch = switch[['from_brand_code','to_brand_code','page_views_switch','from_shop_total_switch','switch_prob']]

#switch_rank = Window.partitionBy('from_brand_code').orderBy(switch.switch_prob.desc())
#switch = switch.withColumn('switch_rank',rank().over(switch_rank))
#
#switch = switch[['from_brand_code','to_brand_code',
#                 'page_views_switch','from_shop_total_switch','switch_prob','switch_rank']]
# Save the results
hc.registerDataFrameAsTable(switch, "table1")
insert_sql = '''insert overwrite table dev.dev_open_brand_similarity_replacement partition(dt='%s')
                select * from table1'''%(dt)
hc.sql(insert_sql)



CREATE TABLE IF NOT EXISTS dev.dev_open_brand_similarity_replacement(
from_brand_code STRING,
to_brand_code STRING,
page_views_switch INT,
from_shop_total_switch INT,
switch_prob FLOAT) 
PARTITIONED BY ( 
  `dt` string)
ROW FORMAT DELIMITED  
Example 3
    result = y
    return result
    # return [valid_jsontxt(ln) for ln in result]


s1 = "/commit/iteminfo/shopid_128287536/item.info." + today
rdd_c = sc.textFile(s1).map(lambda x: f(x)).filter(lambda x: x != None)
rdd = rdd_c.groupByKey().mapValues(list).map(lambda (x, y): quchong(x, y))
schema = StructType([
    StructField("item_id", StringType(), True),
    StructField("title", StringType(), True),
    StructField("ts", StringType(), True)
])

df = hiveContext.createDataFrame(rdd, schema)
hiveContext.registerDataFrameAsTable(df, 'qianxing_iteminfo')
# hiveContext.sql("insert overwrite table wl_base.t_base_qianxing_iteminfo partition (ds =" + today + ")\
# select * from qianxing_iteminfo")
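# Merge today's snapshot with yesterday's partition: full join on item_id, with
# COALESCE preferring today's record (t1) when the item exists in both.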
sql_merge = '''
insert overwrite table wl_base.t_base_qianxing_iteminfo partition (ds =''' + today + ''')
select
COALESCE(t1.item_id,t2.item_id),
COALESCE(t1.title,t2.title),
COALESCE(t1.ts,t2.ts)
from
qianxing_iteminfo t1
full join
(select * from  wl_base.t_base_qianxing_iteminfo where ds =''' + yesterday + ''')t2
on
t1.item_id = t2.item_id
'''
Example 4
and cid3 in (11924,11925,6739,11922,13550,11923) group by cid3,brand_code''' % (begin_dt,dt)
brand_sku_num = hc.sql(brand_sku_num_query).coalesce(100).cache()

# Count the number of SPUs with a match for each shop
#shop_sku_num = all_sku_match.groupby('item_third_cate_cd1','brand_code1').agg(countDistinct(all_sku_match.main_sku_id1).alias('sku_num'))


# Compute the overlap ratio
final_sku_match = sku_match.join(brand_sku_num,['item_third_cate_cd1','brand_code1'],'inner')
final_sku_match = final_sku_match.withColumn('overlap_ratio',final_sku_match.match_sku_num/final_sku_match.sku_num)

final_sku_match = final_sku_match.withColumnRenamed('item_third_cate_cd1','cid3')
final_sku_match= final_sku_match[['cid3','brand_code1','brand_name1','brand_code2','brand_name2',
                'match_sku_num','sku_num','overlap_ratio']]
# Save to the Hive table
hc.registerDataFrameAsTable(final_sku_match, "table1")
insert_sql = '''insert overwrite table dev.dev_open_brand_similarity_spu_overlap_da partition(dt="%s") 
                select * from table1'''%(dt)
hc.sql(insert_sql)



CREATE TABLE IF NOT EXISTS dev.dev_open_brand_similarity_spu_overlap_da(
cid3 STRING COMMENT "third-level category of product 1",
brand_code1 STRING COMMENT "ID of first-party brand 1",
brand_name1 STRING COMMENT "name of first-party brand 1",
brand_code2 STRING COMMENT "ID of first-party brand 2",
brand_name2 STRING COMMENT "name of first-party brand 2",
match_sku_num INT COMMENT "number of main products matched between brand 1 and brand 2",
sku_num INT COMMENT "number of SKUs with sales for brand 1",
overlap_ratio FLOAT COMMENT "overlap ratio")
Example 5
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
sc = SparkContext.getOrCreate()

spark = SparkSession.builder.enableHiveSupport().getOrCreate()  # enableHiveSupport is optional
#df=spark.read.csv("file:///home/bellapukondar1/ggtext.csv")
df = spark.read.csv("ggtext.csv")
print(type(df))  #Dataframe
df.show()

q = "select * from tablename"
out = spark.sql(q)
data = out.collect()

from pyspark.sql import HiveContext
hive_context = HiveContext(sc)
prod = [(1, 'a1', '11'), (2, 'a2', '22'), (3, 'a3', '33'), (4, 'a4', '44'),
        (5, 'a5', '55'), (6, 'a6', '66'), (7, 'a7', '77'), (8, 'a8', '88'),
        (9, 'a9', '99'), (10, 'a10', '1010')]
dd = sc.parallelize(prod)
df = dd.toDF(['id', 'value', 'label'])
hive_context.registerDataFrameAsTable(df, "table1")
q = "select * from table1"
out = hive_context.sql(q)
out.show()

#writing to s3
df.write.save("s3://s3-........", format='csv', header=True)
df.write.mode("append").format("csv").save(
    "s3://s3-emr-test-stg/crn_analysis/output_files/ab1.csv")
Example 6
def main(args):
    """ Main code for relevance computation """
    
    start_time = time.time()
    
    # iq (code snippets that set below properties have been removed)
    driver   = 
    url      = 
    username = 
    password = 
    inputs = [driver, url, username, password]

    
    filename = str(args[0])
    if os.path.exists(filename):
        pass
    else:
        sys.exit("Input file %s not found" % filename)
    file = open(filename, 'r')
    for line in file:
        key, val = line.split(",")
        if str(key).strip() == "dbalias":
            dbalias = str(val).strip()
        elif str(key).strip() == "numpartitions":
            numpartitions = int(val)
        elif str(key).strip() == "datadir":
            datadir = str(val).strip()
        else:
            print("Invalid key not set: %s" % str(key))
    # Need to make sure that the datadir variable is set.
    try:
        print("datadir = '%s' " % datadir)
    except NameError:
        sys.exit("'datadir' variable not set. Check inputfile '%s'" 
                 % (datadir, filename))
            
    # Spark and Hive contexts
    conf = SparkConf()
    sc = SparkContext(conf = conf)
    sqlContext = HiveContext(sc)

    
    df = utils.returnSparkDF(SQLContext(sc), inputs, "traffic")
    if df is None: sys.exit("'traffic' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "uniquedata")
    df = None
    
    df = utils.returnSparkDF(SQLContext(sc), inputs, "fbtraffic")
    if df is None: sys.exit("'fbtraffic' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "uniqueFBdata")
    df = None    

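    # Left outer join to the Facebook-traffic rows, then keep only the rows with no
    # match (FBType is null) and drop the helper column: effectively an anti-join.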
    statement = "Select ud.loginid, ud.adid, ud.Type, ufd.Type as FBType "\
                "from uniquedata ud left outer join uniqueFBdata ufd "\
                "on ud.loginid = ufd.loginid and ud.adid = ufd.adid"
    adswithFBjoined = sqlContext.sql(statement)
    adswithFBjoined_cleaned = adswithFBjoined[adswithFBjoined['FBType'].isNull()]
    adswithFBjoined_cleaned = adswithFBjoined_cleaned.drop('FBType')

    sqlContext.registerDataFrameAsTable(adswithFBjoined_cleaned, "data")

    statement = "Select loginid, count(loginid) as viewcount from data group by loginid"
    temp = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(temp, "viewdata")
    
    statement = "Select d.* from data d, viewdata vd where d.loginid = vd.loginid and vd.viewcount > 1"
    temp2 = sqlContext.sql(statement)
    
    sqlContext.sql("drop table data")
    sqlContext.registerDataFrameAsTable(temp2, "data")
        
    temp, temp2  = (None, None)

    df = utils.returnSparkDF(SQLContext(sc), inputs, "agent")
    if df is None: sys.exit("'agent' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "agentdata")

    statement = "select loginid, adid, Type, count(adid) as counter from agentdata group by loginid, adid, Type"
    unique_adid_per_loginid = sqlContext.sql(statement)
    unique_adid_per_loginid = unique_adid_per_loginid.drop('counter')
    sqlContext.registerDataFrameAsTable(unique_adid_per_loginid, "agentdata")
    
    df = utils.returnSparkDF(SQLContext(sc), inputs, "favorite")
    if df is None: sys.exit("'favorite' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "favdata")
    df = None
    
    statement = "select * from data union all select * from agentdata union all select * from favdata"
    df2 = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(df2, "uniondata")
    df2 = None
    
    statement = "select loginid, max(Type) as UserMaxConversion from uniondata group by loginid"
    maxtype = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(maxtype, "maxconversiondata")

    statement = "select uniondata.loginid, uniondata.adid, uniondata.Type "\
                "from uniondata, maxconversiondata where uniondata.loginid = maxconversiondata.loginid "\
                "and uniondata.Type = maxconversiondata.UserMaxConversion"
    data = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(data, "data")
       
    # Delete tables
    tables = ["uniquedata", "FBdata", "uniqueFBdata", "agentdata", 
              "favdata", "uniondata", "maxconversiondata"]
    for table in tables:
        sqlContext.sql("drop table if exists %s" % str(table))

    df = utils.returnSparkDF(SQLContext(sc), inputs, "adclassified")
    if df is None: sys.exit("'adclassified' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "addata")
    df = None
    
    df = utils.returnSparkDF(SQLContext(sc), inputs, "geo")
    if df is None: sys.exit("'geo' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "geodata")
    df = None
    
    statement = "select addata.adid, addata.AskingPrice, addata.CollectiveDebt, "\
                "addata.PageViewCount, geodata.Municipal, geodata.CityPart "\
                "from addata, geodata where addata.locationkey = geodata.locationkey"
    addata_for_join = sqlContext.sql(statement)

    statement = "select addata.adid, addata.AskingPrice, addata.CollectiveDebt, "\
                "addata.PageViewCount, geodata.Municipal, geodata.CityPart "\
                "from addata, geodata where addata.locationkey = geodata.locationkey"
    addata_for_join = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(addata_for_join, "adtemp")

    statement = "select * from adtemp where PageViewCount < 10000"
    addata_for_join = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(addata_for_join, "addata_for_join")                   
    
    data, addata_for_join = (None, None)
    sqlContext.sql("drop table if exists addata")

    statement = "select a.*, b.AskingPrice, b.CollectiveDebt, b.PageViewCount, b.Municipal, b.CityPart "\
                "from data a, addata_for_join b where a.adid = b.adid"
    data = sqlContext.sql(statement)
    data = data.fillna(0)
    data = data.repartition(numpartitions)

    
    # Save the files as csv using spark-csv from databricks
    try:
        st = time.time()
        data.write.format("com.databricks.spark.csv").save(datadir, mode="overwrite", codec="bzip2")
        et = time.time()
        print("File save time was: %.2f mins." % ((et-st)/60.))
    except:
        sys.exit("Could not save files to dir '%s'. \n\nError = %s" % (datadir, sys.exc_info()[1]))
    finally:            
        end_time = time.time()    
        print("Spark ETL execution time = %.2f mins." % ((end_time-start_time)/60.))
    
    
    # Stop spark and continue using in-memory computation (another script)
    sc.stop()
        
    return
Example 7
def sql_hive_context_example(spark):
    
    # create hive context object.
    hive_ctx = HiveContext(spark.sparkContext)

    # createDataFrame
    l = [('Alice', 18), ('Bob', 20), ('Charley', 22)]
    df = hive_ctx.createDataFrame(l, ('name', 'age'))
    print("createDataFrame API finished")

    # registerDataFrameAsTable 
    hive_ctx.registerDataFrameAsTable(df, "table1")
    print("registerDataFrameAsTable API finished")

    # sql
    tmp_df = hive_ctx.sql("select * from table1")
    tmp_df.show()
    print("sql API finished")

    # table
    tmp_df = hive_ctx.table("table1")
    tmp_df.show()
    print("table API finished")

    # tableNames
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("tableNames API finished")

    # tables
    tables = hive_ctx.tables()
    print(tables)
    print("tables API finished")

    # range
    tmp_df = hive_ctx.range(1,10,2)
    tmp_df.show()
    print("range API finished")

    # dropTempTable
    hive_ctx.dropTempTable("table1")
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("dropTempTable API finished")

    # cacheTable & uncacheTable & clearCache
    df = hive_ctx.range(1,10,2)
    hive_ctx.registerDataFrameAsTable(df, "table")
    hive_ctx.cacheTable("table")
    hive_ctx.uncacheTable("table")
    hive_ctx.clearCache()
    print("cacheTable & uncacheTable & clearCache API finished")

    # createExternalTable

    # newSession

    # registerFunction
    # Deprecated in 2.3.0. Use :func:`spark.udf.register` instead
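    # A minimal sketch of the replacement API (assumptions: Spark >= 2.3 and `spark`
    # is the SparkSession this function receives; "squared" is an illustrative name).
    spark.udf.register("squared", lambda x: x * x, "long")
    hive_ctx.sql("select squared(id) as id_squared from table").show()
    print("spark.udf.register sketch finished")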

    # registerJavaFunction
    # Deprecated in 2.3.0. Use :func:`spark.udf.registerJavaFunction` instead

    # setConf & getConf
    hive_ctx.setConf("key1", "value1")
    value = hive_ctx.getConf("key1")
    print(value)
    print("setConf & getConf API finished")

    # refreshTable
    # Exception: An error occurred while calling o26.refreshTable:
    # Method refreshTable([class java.lang.String]) does not exist
    
    print("Finish running HiveContext API")
Example 8
    else:
        sys.exit(gstrDefaultError)

    strSQL = SelectString(strTable1, strTable2)

    print strSQL

    #get data
    sc = SparkContext()
    sqlContext = HiveContext(sc)
    sqlContext.sql("use accident_project")

    df = sqlContext.sql(strSQL)

    #register as table
    sqlContext.registerDataFrameAsTable(df, "Results")

    # To CSV: must loop per year because resources are not enough to process everything at once.

    #get years
    df_years = sqlContext.sql("SELECT distinct Year from Results")

    print "Collecting Data"
    df_years.collect()

    first = True
    for row in df_years.rdd.toLocalIterator():
        print "Processing year " + row.Year
        df = sqlContext.sql("SELECT * FROM Results WHERE Year = " + row.Year)

        # to pandas data frame
Example 9
# Brand GMV ranking: rank brands by GMV within each third-level category (cid3)
gmv_rank = Window.partitionBy('cid3').orderBy(top_brand_gmv.brand_gmv.desc())
top_brand_gmv = top_brand_gmv.filter(top_brand_gmv.brand_gmv > 0).withColumn('gmv_rank',rank().over(gmv_rank))


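# Pair each brand with the brands whose GMV rank in the same cid3 is within 10
# positions above or below its own (self-pairs are removed before saving).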
similar_top_brand_gmv = (top_brand_gmv
                         .withColumnRenamed('cid3', 'item_third_cate_cd')
                         .withColumnRenamed('brand_code', 'similar_brand_code')
                         .withColumnRenamed('gmv_rank', 'similar_gmv_rank')
                         .withColumnRenamed('brand_gmv', 'similar_brand_gmv'))
top_brand_gmv = top_brand_gmv.crossJoin(similar_top_brand_gmv).filter("(similar_gmv_rank>gmv_rank and similar_gmv_rank<=gmv_rank+10) or (similar_gmv_rank>=gmv_rank-10 and similar_gmv_rank<gmv_rank)")
top_brand_gmv = top_brand_gmv.filter("cid3 == item_third_cate_cd")
top_brand_gmv = top_brand_gmv[['cid3','brand_code','brand_gmv','gmv_rank','item_third_cate_cd','similar_brand_code','similar_brand_gmv','similar_gmv_rank']]
top_brand_gmv = top_brand_gmv.select('cid3','brand_code','brand_gmv','gmv_rank','similar_brand_code','similar_brand_gmv','similar_gmv_rank')


# Save the results
top_brand_gmv = top_brand_gmv[top_brand_gmv.brand_code != top_brand_gmv.similar_brand_code]
hc.registerDataFrameAsTable(top_brand_gmv, "table1")
insert_sql = '''insert overwrite table dev.dev_open_brand_similarity_volumn_da  
                partition(dt='%s') select * from table1'''%(dt)
hc.sql(insert_sql)



CREATE TABLE IF NOT EXISTS dev.dev_open_brand_similarity_volumn_da(
cid3 STRING,
brand_code STRING,
brand_gmv FLOAT,
gmv_rank INT,
similar_brand_code STRING,
similar_brand_gmv FLOAT,
similar_gmv_rank INT) 
PARTITIONED BY ( 
Example 10
        lambda x: [x.user_id, x.avg_price])
rdd = rdd1.map(lambda x: x[1]).repartition(100)

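# Cluster users into 5 groups by average price, filtering out values of 30000 and above;
# array() here is assumed to be numpy.array (imports are not shown in this truncated snippet).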
data = rdd.filter(lambda x: x < 30000).map(lambda x: array(x))
model = KMeans.train(data,
                     5,
                     maxIterations=20,
                     runs=50,
                     initializationMode="random",
                     seed=50,
                     initializationSteps=5,
                     epsilon=1e-4)
model.centers = sorted(model.centers)

userlevel_rdd = rdd1.map(lambda x: (x[0], x[1], model.predict([x[1]])))

schema = StructType([
    StructField("uid", StringType(), True),
    StructField("avg_price", FloatType(), True),
    StructField("ulevel", IntegerType(), True)
])
df = hiveContext.createDataFrame(userlevel_rdd, schema)
# sqlContext.registerDataFrameAsTable(df,'userlevel')

# Save the results
hiveContext.registerDataFrameAsTable(df, 'userlevel')
hiveContext.sql('drop table if EXISTS t_zlj_perfer_user_level ')
hiveContext.sql(
    'create table wlbase_dev.t_zlj_perfer_user_level as select * from userlevel'
)
Example 11
df_date_dim = df_FILE_1_FILE_2_join.withColumn('date_id', F.date_id(df_FILE_1_FILE_2_join.published_date))
df_media_dim = df_FILE_1_FILE_2_join.withColumn('media_id', F.media_id(df_FILE_1_FILE_2_join.published_date))
df_geo_dim = df_FILE_1_FILE_2_join.withColumn('geo_area_id', F.geo_id(df_FILE_1_FILE_2_join.published_date))
df_session_dim = df_FILE_1_FILE_2_join.withColumn('session_id', F.session_id(df_FILE_1_FILE_2_join.published_date))

# Joining Vendor data with dimensions
df_fact = df_customer_dim.withColumn('vendor_id', F.current_vendor())
df_fact = df_date_dim.withColumn('vendor_id', F.current_vendor())
df_fact = df_media_dim.withColumn('vendor_id', F.current_vendor())
df_fact = df_geo_dim.withColumn('vendor_id', F.current_vendor())
df_fact = df_session_dim.withColumn('vendor_id', F.current_vendor())

df_fact = df_fact.repartition(5)  # reassign: repartition returns a new DataFrame

# Registering data frame as a temp table for SparkSQL
hive_ctx.registerDataFrameAsTable(df_customer_dim, "MEDIA_TEMP")
hive_ctx.registerDataFrameAsTable(df_date_dim, "MEDIA_TEMP")
hive_ctx.registerDataFrameAsTable(df_media_dim, "MEDIA_TEMP")
hive_ctx.registerDataFrameAsTable(df_geo_dim, "MEDIA_TEMP")
hive_ctx.registerDataFrameAsTable(df_session_dim, "MEDIA_TEMP")
hive_ctx.registerDataFrameAsTable(df_fact, "MEDIA_TEMP")
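# Note: each registerDataFrameAsTable call above reuses the name MEDIA_TEMP, so only
# the last registration (df_fact) remains visible under that temp table name.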

# Target Type: APACHE HIVE
## Source   : FILE_1, FILE_2
## Table Name : Customer_Dimension
## Storage Format: ORC
## # -------------------------------------------------------
## # COLUMN NAME			DATA TYPE
## # -------------------------------------------------------
#	customer_id 		varchar(35) 	
#	anonymized_person_id 	integer(8) 	
Example 12
# transform RDD, using Row function
cats = rdd.map(lambda x: Row(name=x[0], age=int(x[1])))

# In[8]:

cats

# In[9]:

# Create dataframe
schemaCats = sqlContext.createDataFrame(cats)

# In[10]:

# Register it as a temp table
sqlContext.registerDataFrameAsTable(schemaCats, "cat_table")
# Show HIVE table
sqlContext.sql("show tables").show()

# In[11]:

# Using default HiveContext to select columns
sqlContext.sql("Select * from cat_table").show()

# In[12]:

# USE where clause
sqlContext.sql("Select * from cat_table where age > 20").show()

# In[13]:
Example 13
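# Combine the three similarity signals (GMV volume, click-based substitution, SPU
# overlap), drop self-pairs, and write the result into the daily dt partition.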
result = volumn.union(switch).union(overlap)
result = result[result.brand_code_origin != result.brand_code_similar]
hc.registerDataFrameAsTable(result, "table1")
insert_sql = '''insert overwrite table dev.dev_open_brand_similarity_da 
                partition(dt='%s') select * from table1'''%(dt)
hc.sql(insert_sql)


CREATE TABLE IF NOT EXISTS dev.dev_open_brand_similarity_da(
cid3 STRING COMMENT "third-level category",
brand_code_origin STRING COMMENT "ID of the first-party brand being substituted",
brand_code_similar STRING COMMENT "ID of the brand that substitutes for from_brand_code",
similarity_type STRING COMMENT "which similarity algorithm produced the row")
COMMENT "Similar brands per third-level category, merged from three similarity algorithms"
PARTITIONED BY ( 
  `dt` string)
ROW FORMAT DELIMITED  
  FIELDS TERMINATED BY '\t'