Example #1
def get_spark_test():
    conf = SparkConf()
    sc = SparkContext("local[4]", appName="youzan-algrithm", conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql(""" use fex_test """)
    sql_context.setConf("spark.sql.shuffle.partitions", "1")

    return sc, sql_context
Example #2
def get_spark(num=4, cores=4, mem="32g"):
    conf = SparkConf()
    conf.set("spark.executor.instances", "%d" % num)
    conf.set("spark.executor.cores", "%d" % cores)
    conf.set("spark.executor.memory", "%s" % mem)
    sc = SparkContext(appName="youzan-algrithm", conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql(""" use fex """)
    sql_context.setConf("spark.sql.shuffle.partitions", "16")

    return sc, sql_context
def main():
    if len(sys.argv) > 2:
        scriptPath = sys.argv[1]
        resultPath = sys.argv[2]
    else:
        print "Missing Arguments"
        sys.exit(1)

    sc = SparkContext("local", "Test sql queries from pyspark")
    try:
        hsc = HiveContext(sc)
        scriptRaw = str(sc.textFile(scriptPath, use_unicode=False).cache().collect())
        print scriptRaw
        result = open(resultPath, 'w')

        def printSeparator(cols):
            print 'inside print' + str(cols)
            for j in range(0, cols):
                print j
                result.write("+----")
            result.write("+--+")

        for i in scriptRaw.split(';'):
            i = i.replace('[\'', '')
            i = i.replace('\']', '')
            print i
            if i == "":
                continue
            df = hsc.sql(i.strip())
            df.show()
            printHeader = True
            printFooter = False
            cols = df.columns
            print cols
            for row in df.collect():
                print str(row)
                if printHeader:
                    print str(len(cols))
                    printSeparator(len(cols))
                    for col in cols:
                        result.write("| " + col)
                    result.write("|")
                    printSeparator(len(cols))
                    printHeader = False
                    printFooter = True
                for v in row:
                    print str(v)
                    # valueToString is defined elsewhere in the original script
                    result.write("|" + valueToString(v))
                result.write("|")
            if printFooter:
                printSeparator(len(cols))
        result.close()
    finally:
        sc.stop()
Example #4
 def setUpClass(cls):
     ReusedPySparkTestCase.setUpClass()
     cls.tempdir = tempfile.NamedTemporaryFile(delete=False)
     cls.hive_available = True
     try:
         cls.sc._jvm.org.apache.hadoop.hive.conf.HiveConf()
     except py4j.protocol.Py4JError:
         cls.hive_available = False
     except TypeError:
         cls.hive_available = False
     os.unlink(cls.tempdir.name)
     if cls.hive_available:
         cls.spark = HiveContext._createForTesting(cls.sc)
         cls.testData = [Row(key=i, value=str(i)) for i in range(100)]
         cls.df = cls.sc.parallelize(cls.testData).toDF()
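 # A sketch of a test method that could live in the same ReusedPySparkTestCase
 # subclass (hypothetical; not part of the original fixture): it skips when the
 # Hive classes are unavailable and checks the seeded DataFrame.
 def test_seeded_dataframe(self):
     if not self.hive_available:
         return  # nothing to check without Hive on the classpath
     self.assertEqual(self.df.count(), 100)
     self.assertEqual(self.df.filter(self.df.key == 0).first().value, "0")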
Example #5
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_A_SUBJECT_D003018').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) +
           timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) +
            timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
Example #6
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_A_TRADE_OPPONENT').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) +
           timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) +
            timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
Example #7
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_O_WBK_WXYH_PLOAN').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) +
           timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) +
            timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
Example #8
sys.path.append(local_path + "/../")
sys.path.append(local_path)

from pyspark import SQLContext, SparkConf, HiveContext
from pyspark import SparkContext

from post import post_run
from ta import mat_close
from ta import adx
from ml import diff_feature_cls as feature
from ml import diff_train_cls_pos_ml as train


def main(sc, sql_context, is_hive):
    #post_run.main(sc, sql_context, is_hive = True)
    #mat_close.main(sc, sql_context, is_hive = True)
    #adx.main(sc, sql_context, is_hive = True)
    feature.main(10, 1, sc, sql_context, is_hive = True)
    train.main("2010-01-01", "2010-04-30", "2010-05-01", "9999-99-99", sc, sql_context, is_hive=True)
if __name__ == "__main__":
    conf = SparkConf()
    #conf.set("spark.executor.instances", "4")
    #conf.set("spark.executor.cores", "4")
    #conf.set("spark.executor.memory", "8g")

    sc = SparkContext(appName="bintrade_candidate", master="local[2]", conf=conf)
    sqlContext = HiveContext(sc)
    sqlContext.setConf("spark.sql.shuffle.partitions", "1")
    sqlContext.sql("use fex_test")
    main(sc, sqlContext, is_hive=True)
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_O_CEN_CBOD_CMCURCUR').setMaster(
    sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) +
           timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) +
            timedelta(-1)).strftime("%Y%m%d")
Example #10
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_F_CI_CUST_SIMILARLST').setMaster(sys.argv[2])
sc = SparkContext(conf = conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
V_DT10 = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8]))).strftime("%Y-%m-%d")
V_STEP = 0
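# Worked example of the date variables above (a sketch, not part of the original
# script): with a hard-coded etl_date of "20160131" the derivations evaluate to
# the values asserted below.
assert (date(2016, 1, 31) + timedelta(-1)).strftime("%Y%m%d") == "20160130"  # V_DT_LD
assert date(2016, 1, 1).strftime("%Y%m%d") == "20160101"                     # V_DT_FMD
assert (date(2016, 1, 1) + timedelta(-1)).strftime("%Y%m%d") == "20151231"   # V_DT_LMD
assert date(2016, 1, 31).strftime("%Y-%m-%d") == "2016-01-31"                # V_DT10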
Example #11
                    "lateral view explode(a.rowkeys) tab as rowkey   ")
    get_id.cache()
    users = hc.sql("select rowkey,collect_set(name) name,collect_set(phone) phone ,collect_set(mail)  mail  "
                   "from new_type.user  group by rowkey").alias('a')
    contacts = hc.sql('select u_uid,collect_set(c_uid) contacts from new_type.contact group by u_uid  ')
    user_compare = get_id.join(users, 'rowkey', 'inner') \
        .select('a.rowkey', 'gid', 'name', 'phone', 'mail') \
        .join(contacts, 'rowkey', 'inner') \
        .select('a.rowkey', 'gid', 'name', 'phone', 'mail', 'contacts')

    def func(iter):
        result = {}
        for i in range(0, iter.__len__()):
            
            pass

    user_compare.rdd.groupBy(lambda x:x.gid).map(func)


if __name__ == '__main__':
    # user1 = {'phone': ['123'], 'name': ['zy'], 'mail': ['*****@*****.**'], 'contact_list': ['asd', 'dasdas']
    #
    #          }
    # user2 = {'phone': ['123'], 'name': ['zy', 'zy3'], 'mail': ['*****@*****.**'], 'contact_list': ['dasdas']
    #
    #          }
    #
    # a = judge_similarity(user1=user1, user2=user2)
    # print a
    sc = SparkContext(conf=SparkConf())
    hc = HiveContext(sc)
Example #12
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_ACRM_A_INOUTCOME').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) +
           timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) +
            timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
Example #13
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_F_CI_CCARD').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) +
           timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) +
            timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
Example #14
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_A_R_INCOME_TOP').setMaster(sys.argv[2])
sc = SparkContext(conf = conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
V_DT10 = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8]))).strftime("%Y-%m-%d")
V_STEP = 0
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_F_CI_CUST_FAMILY_MEMBER').setMaster(
    sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) +
           timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) +
            timedelta(-1)).strftime("%Y%m%d")
Example #16
""" wordcount example using the rdd api, we'll write a test for this """
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark import HiveContext


def do_json_counts(df, target_name):
    """ count of records where name=target_name in a dataframe with column 'name' """

    return df.filter(df.name == target_name).count()


if __name__ == "__main__":
    
    if len(sys.argv) != 2:
        sys.exit("Usage: json file}")
    
    sc = SparkContext(appName="PythonJsonCount")
    hc = HiveContext.getOrCreate(sc)
    df = hc.read.json(sys.argv[1])
    
    print("Name vikas found %d times" % do_json_counts(df, 'vikas'))
    

Example #17
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_O_CRE_CUSTR2').setMaster(sys.argv[2])
sc = SparkContext(conf = conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
V_DT10 = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8]))).strftime("%Y-%m-%d")
V_STEP = 0

def save(lp,path, sc):
    lp.saveAsTextFile(path)


def main(sc, sql_context, is_hive = True):
    df_train =  get_train(sc, sql_context, is_hive)
    df_check = get_check(sc, sql_context, is_hive)
    lp_train = cal_feature(df_train, 60,3)
    lp_check = cal_feature(df_check, 60,3)

    os.system("""
    source ~/.bashrc; hadoop fs -rm -r bintrade.ml.diff.label_point.train.cls; hadoop fs -rm -r bintrade.ml.diff.label_point.check.cls
    """)
    save(lp_train, "bintrade.ml.diff.label_point.train.cls", sc)
    save(lp_check, "bintrade.ml.diff.label_point.check.cls", sc)


if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "8g")
    sc = SparkContext(appName="bintrade.ml.diff_feature", conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql(""" use fex """)
    main(sc, sql_context, is_hive=True)
    sc.stop()

#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_O_CEN_CBOD_CMMISMIS').setMaster(
    sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) +
           timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) +
            timedelta(-1)).strftime("%Y%m%d")
from pyspark import SparkContext, SparkConf
from pyspark import HiveContext
'''
Problem statement: find the 5 most expensive orders per person per day using Data Frames
'''

conf = SparkConf().setAppName("mostExpensiveOrderPerDayPerPersonRDD")
sc = SparkContext(conf=conf)
hiveContext = HiveContext(sc)
#makes sure that the 'retail_db' hive database will be used
hiveContext.sql("use retail_db")

#loading the data from hive into dataframes
orders = hiveContext.sql("select order_id, order_date, order_customer_id from orders")
customers = hiveContext.sql("select customer_id, customer_fname, customer_lname from customers")
order_items = hiveContext.sql("select order_item_order_id, order_item_subtotal from order_items")

#joining the customers with orders on customer_id.  Orders and customers are the smaller tables
#so I try to join small tables with other small tables before joining to a big table.
orders_join_customers = orders.join(customers, orders.order_customer_id == customers.customer_id)

#joining on order_id so that I get rows with a customer and their purchases
orders_customers_join_order_items = \
	orders_join_customers.join(order_items, orders_join_customers.order_id == \
							order_items.order_item_order_id)
#aggregating by order_date and customer_id with the sum aggregation.
#This finds how much a person spent on a single day
aggResult = orders_customers_join_order_items.groupBy(['order_date','customer_id']).agg({"order_item_subtotal": "sum"}).withColumnRenamed("sum(order_item_subtotal)", "subtotal_sum")

#because in the aggregation the order_date, customer, and sum were generated in a data frame,
#I must unfortunately rejoin to the customers table to display the purchases with names
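# A possible final step (a sketch; the original snippet ends here): rejoin the daily
# totals to customers so names can be displayed, then take the 5 largest totals,
# mirroring the SparkSQL version of this exercise elsewhere in these examples.
named_totals = aggResult.join(customers,
                              aggResult.customer_id == customers.customer_id) \
    .select('order_date', 'customer_fname', 'customer_lname', 'subtotal_sum')
top5 = named_totals.orderBy(named_totals.subtotal_sum.desc()).take(5)
print("\n".join([str(r) for r in top5]))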
from pyspark import SparkContext, SparkConf, HiveContext

conf = SparkConf().setAppName("most expensive product using SQL")
sc = SparkContext(conf=conf)
hiveContext = HiveContext(sc)

sqlString = 	"SELECT p.product_name, p.product_price \
		FROM retail_db.products p \
		JOIN 	(SELECT max(products.product_price) max_id \
			FROM retail_db.products) the_max \
			ON \
			p.product_price = the_max.max_id"

result = hiveContext.sql(sqlString)

print("***********************\n{0}".format(str(result.take(1))))
Example #22
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author [email protected]
import os
import sys

from pyspark import SQLContext, HiveContext
from pyspark import SparkContext

local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")

def main(sc, sqlContext, isHive = True):
    dfSymMDate = sqlContext.sql("""
        SELECT
            symbol, max(date) as max, count(date) as c
        FROM
            eod2
        GROUP BY
            symbol
        ORDER BY
            symbol
    """)
    for each in dfSymMDate.collect():
        print "%s\t%s\t%d" % (each.symbol, each.max, each.c)
if __name__ == "__main__":
    sc = SparkContext(appName="bintrade_candidate")
    sqlContext = HiveContext(sc)
    sqlContext.sql("use fex")
    main(sc, sqlContext)
    sc.stop()
Example #23
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_O_ZDH_ZZDH_HISTORY_LS').setMaster(
    sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) +
           timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) +
            timedelta(-1)).strftime("%Y%m%d")
from pyspark import SparkConf, SparkContext, HiveContext

conf = SparkConf().setAppName("Revenue per category")
sc = SparkContext(conf=conf)
hiveContext = HiveContext(sc)

hiveContext.sql("use retail_db")

sqlString = 	"SELECT	first_value(cat.category_name), \
			round(sum(oi.order_item_subtotal), 2)  category_revenue\
		FROM 	categories cat, products prod, order_items oi \
		WHERE 	cat.category_id = prod.product_category_id  \
			AND \
			prod.product_id = oi.order_item_product_id \
		GROUP BY cat.category_id \
		ORDER BY category_revenue DESC"

result = hiveContext.sql(sqlString)
collected = result.collect()
print "*****************\n{0}".format("\n".join([str(x) for x in collected]))










Example #25
    fg_count = int(sys.argv[2])

print("Parameters%s" % (len(sys.argv) - 1))
while (arguments >= position):
    print("Parameter %i: %s" % (position, sys.argv[position]))
    position = position + 1

from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark import SparkContext, HiveContext
import hsfs
import numpy as np

spark = SparkSession.builder.appName(
    "create_synthetic_fg").enableHiveSupport().getOrCreate()
sqlContext = HiveContext(spark.sparkContext)
connection = hsfs.connection()
fs = connection.get_feature_store()

size = 10

for i in list(range(0, fg_count)):
    fg_data = []
    for j in list(range(1, size)):
        fg_data.append((j, np.random.normal(), np.random.normal()))
    fg_col_1 = 'fg' + str(i) + "_col1"
    fg_col_2 = 'fg' + str(i) + "_col2"
    fg_name = fg_prefix + str(i)
    fg_spark_df = spark.createDataFrame(fg_data, ['id', fg_col_1, fg_col_2])
    fg_description = "synthetic " + fg_name
    fg = fs.create_feature_group(fg_name,
Example #26
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_F_CI_GROUP_MEMBER').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) +
           timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) +
            timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_O_CEN_CBOD_LNLNSJRN0').setMaster(sys.argv[2])
sc = SparkContext(conf = conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
V_DT10 = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8]))).strftime("%Y-%m-%d")
V_STEP = 0
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_A_SUBJECT_D004009').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) +
           timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) +
            timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
Example #29
from pyspark import SparkContext, HiveContext
sc = SparkContext(appName = "test")
sqlContext = HiveContext(sc)
sqlContext.sql("ADD JAR hdfs:///spark/auxjar/geometry-api.jar")
sqlContext.sql("ADD JAR hdfs:///spark/auxjar/geowave-csv-driver-0.80.jar")
sqlContext.sql("ADD JAR hdfs:///spark/auxjar/geowave-driver-0.80.jar")
sqlContext.sql("ADD JAR hdfs:///spark/auxjar/geowave-geotools-datastore-0.80.jar")
sqlContext.sql("ADD JAR hdfs:///spark/auxjar/geowave-raster-driver-0.80.jar")
sqlContext.sql("ADD JAR hdfs:///spark/auxjar/geowave-share-0.80-jar-with-dependencies.jar")
sqlContext.sql("ADD JAR hdfs:///spark/auxjar/geowave-util-0.80.jar")
sqlContext.sql("ADD JAR hdfs:///spark/auxjar/geowave-vector-driver-0.80.jar")
sqlContext.sql("ADD JAR hdfs:///spark/auxjar/gt-api-16.0.jar")
sqlContext.sql("ADD JAR hdfs:///spark/auxjar/json-serde-1.3.6.jar")
sqlContext.sql("ADD JAR hdfs:///spark/auxjar/oushive.jar")
sqlContext.sql("ADD JAR hdfs:///spark/auxjar/spatial-sdk-hadoop.jar")
sqlContext.sql("set geowave.server.ip=j11.forcewave.co.kr")
sqlContext.sql("set geowave.server.port=54555")
df = sqlContext.sql("select gizscore from htmrres limit 100")
df.show()
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_O_LNA_XDXT_IND_INFO').setMaster(
    sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) +
           timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) +
            timedelta(-1)).strftime("%Y%m%d")
Example #31
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_O_ZDH_ZZDH_SHOP').setMaster(sys.argv[2])
sc = SparkContext(conf = conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
V_DT10 = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8]))).strftime("%Y-%m-%d")
V_STEP = 0
from pyspark import SparkContext, SparkConf
from pyspark import HiveContext
'''
Problem statement: find the 5 most expensive orders per person per day using SparkSQL.
'''
conf = SparkConf().setAppName("mostExpensiveOrderPerDayPerPersonSQL")
sc = SparkContext(conf=conf)
hiveContext = HiveContext(sc)
#makes sure that the 'retail_db' hive database will be used
hiveContext.sql("use retail_db")

#first_value() must be used because of a bug.
#Without it, columns that are not in the group by or the aggregation part cannot be shown.
sqlString = "SELECT \
			first_value(customers.customer_fname), \
			first_value(customers.customer_lname), \
			orders.order_date, \
			ROUND(SUM(order_items.order_item_subtotal), 2) the_total\
		FROM customers, orders, order_items \
		WHERE 	orders.order_id = order_items.order_item_order_id \
			AND \
			customers.customer_id = orders.order_customer_id \
		GROUP BY orders.order_date, customers.customer_id \
		ORDER BY the_total DESC"

result = hiveContext.sql(sqlString).rdd #rdd used because this is certification practice
top_records = result.take(5)
print "*****************\n{0}".format(str(top_records))


#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_A_R_FINANCING_TOP').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) +
           timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) +
            timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
Example #34
import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

filename = sys.argv[1]
saveto = sys.argv[2]
jobName = sys.argv[3]
allMVs = sys.argv[4]

print "=> filename %s" % filename
print "=> saveto %s" % saveto
print "=> jobName %s" % jobName
print "=> allMVs %s" % allMVs

conf = SparkConf().setAppName(jobName)
sc = SparkContext(conf=conf)
sc.setLogLevel("WARN")
sqlc = HiveContext(sc)

T_XRD_HDFS = sqlc.read.format("json").load(filename)
T_XRD_HDFS.registerTempTable("T_XRD_HDFS")
print "=> rowsCounter T_XRD_HDFS: %d" % T_XRD_HDFS.count()
# T_XRD_HDFS.write.format("com.databricks.spark.csv").option("header", "false").save(saveto + "/T_XRD_HDFS")

T_XRD_RAW_FILE = sqlc.sql("SELECT from_unixtime(end_time, 'yyyy/MM/dd') as TDay, start_time as ots, end_time as cts, file_lfn, client_host, if(server_username = '', 'unknown', server_username) as server_username, (end_time - start_time) as proctime, read_bytes_at_close as readbytes FROM T_XRD_HDFS WHERE (end_time - start_time) > 0 AND read_bytes_at_close > 0 AND `_corrupt_record` IS NULL")
T_XRD_RAW_FILE.registerTempTable("T_XRD_RAW_FILE")
print "=> rowsCounter T_XRD_RAW_FILE: %d" % T_XRD_RAW_FILE.count()
if allMVs == "1":
    T_XRD_RAW_FILE.write.format("com.databricks.spark.csv").option("header", "false").save(saveto + "/T_XRD_RAW_FILE")
 
T_XRD_LFC = sqlc.read.format("com.databricks.spark.csv").load("/project/awg/cms/phedex/catalog/csv/merged/").toDF("dataset_name", "dataset_id", "dataset_is_open", "dataset_time_create", "block_name", "block_id", "block_time_create", "block_is_open", "file_lfn", "file_id", "filesize", "usernameXX", "checksum", "file_time_create")
T_XRD_LFC.registerTempTable("T_XRD_LFC")
print "=> rowsCounter T_XRD_LFC: %d" % T_XRD_LFC.count()
Example #35
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_F_CI_CUST_RALE').setMaster(sys.argv[2])
sc = SparkContext(conf = conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
V_DT10 = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8]))).strftime("%Y-%m-%d")
V_STEP = 0
from pyspark import SparkConf, SparkContext 
from pyspark.mllib.regression import LabeledPoint
import numpy as np
import string 
from pyspark import HiveContext
conf = SparkConf().setMaster('local').setAppName('SparkMLib_FinalProject') 
sc = SparkContext(conf = conf) 

RDD = HiveContext(sc).sql('select * from finalproject_merged2')
RDD.count()

RDD.cache()
def get_mapping(rdd, idx):
	return rdd.map(lambda fields: fields[idx]).distinct().zipWithIndex().collectAsMap()
print "Mapping of the categorical feature column: %s" % get_mapping(RDD, 12) # count from 0
print "Mapping of the categorical feature column: %s" % get_mapping(RDD, 13) # count from 0
print "Mapping of the categorical feature column: %s" % get_mapping(RDD, 14) # count from 0

mappings = [get_mapping(RDD, i) for i in [12,13,14]]
def extract_features_dt(record):
	record_num_vec = [record[1],record[2],record[3],record[4],record[5],record[6],record[7],record[8],record[9],record[10],record[11],record[15],record[16],record[18],record[19]]
	record_cat_vec = [record[12],record[13],record[14]] # because we cannot directly use record[12,13,14]
	numvalues = np.array([float(field) for field in record_num_vec])
	cat_vec = np.zeros(3)
	i=0
	for field in record_cat_vec: 
		m_countrycat = mappings[i] # instead of directly call get_mapping(record,3), we create dict
		idx = m_countrycat[field]
		cat_vec[i]=idx
		i=i+1
	return np.concatenate((numvalues,cat_vec))
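# A usage sketch (an assumption: record[0] is taken as the target value here; adjust
# the index to whichever column actually holds the label): build LabeledPoints from
# the extracted feature vectors for MLlib training.
data_dt = RDD.rdd.map(lambda record: LabeledPoint(float(record[0]),
                                                  extract_features_dt(record)))
print data_dt.first()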
Example #37
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_A_CUST_ASSIGN_COM_SAVE').setMaster(
    sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) +
           timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) +
            timedelta(-1)).strftime("%Y%m%d")
Example #38
    sqlstr = sqlstr[: len(sqlstr)-2]
    sqlstr += "\n) stored as orc"
    print sqlstr

    sql_context.sql(sqlstr)
    df.insertInto(tableName, overwrite)



if __name__ == '__main__':
    #log.debug("debug")
    #a = eval("(1,[2,3])")
    #print "xxxxxxx",a[1][0]
    #a = {1: 1.0, 3: 5.5}
    #str_a = str(a)
    #a = eval(str_a)
    #print a[1]

    #print json.loads("""{1:1}""")
    sc = SparkContext("local[1]", appName="bintrade.ml.diff_feature")
    sql_context = HiveContext(sc)
    sql_context.sql(""" use fex_test """)
    sql_context.setConf("spark.sql.shuffle.partitions", "1")


    ldict = [{"symbol":"AAA", "date":"2010-01-01", "close":1.0}, {"symbol":"AAA","date":"2010-01-01", "close":1.0}]

    df = sql_context.createDataFrame(ldict)
    dfToTableWithPar(sql_context, df,  "test_eod_AAA")

Example #39
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author [email protected]
import os
import sys

local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")
sys.path.append(local_path + "/../")

from pyspark import SQLContext, HiveContext
from pyspark import SparkContext

import eod





if __name__ == "__main__":
    sc = SparkContext(appName="bintrade_candidate", master="yarn-client")
    sc.setSystemProperty("spark.driver.memory",     "1g")
    sc.setSystemProperty("spark.executor.memory",   "8g")
    sc.setSystemProperty("spark.executor.cores",    "2")

    sqlContext = HiveContext(sc)
    sqlContext.setConf("spark.sql.shuffle.partitions", "16")
    sqlContext.sql("use fex")

    eod.run(sc, sqlContext, isHive=True)
Example #40
    sql_context.sql(sqlstr)
    df.insertInto(tableName, overwrite)


if __name__ == '__main__':
    #log.debug("debug")
    #a = eval("(1,[2,3])")
    #print "xxxxxxx",a[1][0]
    #a = {1: 1.0, 3: 5.5}
    #str_a = str(a)
    #a = eval(str_a)
    #print a[1]

    #print json.loads("""{1:1}""")
    sc = SparkContext("local[1]", appName="bintrade.ml.diff_feature")
    sql_context = HiveContext(sc)
    sql_context.sql(""" use fex_test """)
    sql_context.setConf("spark.sql.shuffle.partitions", "1")

    ldict = [{
        "symbol": "AAA",
        "date": "2010-01-01",
        "close": 1.0
    }, {
        "symbol": "AAA",
        "date": "2010-01-01",
        "close": 1.0
    }]

    df = sql_context.createDataFrame(ldict)
    dfToTableWithPar(sql_context, df, "test_eod_AAA")
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_O_LNA_XDXT_VILLAGE_INFO').setMaster(
    sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) +
           timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) +
            timedelta(-1)).strftime("%Y%m%d")
from pyspark.sql import SQLContext, Row
from pyspark import SparkContext
from pyspark import HiveContext
import unittest
import os

sc = SparkContext("local", "Test sql queries from pyspark")

#Change this variable to point to your spak 1.5 example resources
examplefiles_path="/Users/gayathrimurali/spark-1.5.1/examples/src/main/resources/"


#Test 1: Sqlcontext and Hivecontext are created
sqlContext = SQLContext(sc)
hivecontext = HiveContext(sc)
   
#Test 2: Read from a parquet file using sql and hive context into a dataframe. Display and do some filter operations on the dataframe
df_sql=sqlContext.read.load(examplefiles_path + "users.parquet")
df_hive=hivecontext.read.load(examplefiles_path + "users.parquet")

df_sql.show()
df_hive.show()

df_hive.printSchema()

df_hive.filter(df_hive['favorite_color']=='red').show()

#Test 3: Write selected columns from dataframe into a parquet file

if not os.path.exists(examplefiles_path + "nameAndFavColors.parquet"):
   df_hive.select("name","favorite_color").write.save(examplefiles_path + "nameAndFavColors.parquet")
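#Test 4 (a sketch, not part of the original): read back the parquet written above and
#verify the selected columns survived the round trip
if os.path.exists(examplefiles_path + "nameAndFavColors.parquet"):
   df_saved = hivecontext.read.load(examplefiles_path + "nameAndFavColors.parquet")
   df_saved.printSchema()
   df_saved.show()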
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_A_SUBJECT_D002021').setMaster(sys.argv[2])
sc = SparkContext(conf = conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
V_DT10 = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8]))).strftime("%Y-%m-%d")
V_STEP = 0
Example #44
#execfile('/data/w205Project/spark/getLinks.py') <-- don't use. use spark-submit instead.

from pyspark import SparkContext, HiveContext
sc = SparkContext()
sqlContext = HiveContext(sc)


from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.window import Window

sqlContext.sql("ADD JAR /data/w205Project/load/hive-serdes-1.0-SNAPSHOT.jar");
# sqlContext.sql("ADD JAR /usr/lib/hadoop/hadoop-aws.jar");
# sqlContext.sql("ADD JAR /usr/lib/hadoop/lib/aws-java-sdk-1.7.14.jar");

###############################################
#    EXTRACT ALL THE LINKS INDISCRIMINATELY   #
###############################################

'''
links = sqlContext.sql("select entities.urls.url[0] as tco, entities.urls.expanded_url[0] as link from tweets where entities.urls.url[0] IS NOT NULL");
uniqueLInks = links.dropDuplicates(['tco', 'link'])
uniqueLInks.repartition(1).save("s3n://w205twitterproject/links5","json")
'''


###############################################
#                  ANALYZE                    #
###############################################
Example #45
    y_pred = lr_model.predict(train_data)
    y_prob = lr_model.predict_proba(train_data)[:, 1]

    print("evaluation model")
    pr = float(
        np.sum([
            1 if y_pred[i] == train_label[i] else 0
            for i in range(len(train_label))
        ])) / float(len(train_label))
    print("prediction precision: " + str(pr))


def train(rating_file_path, user_file_path, item_file_path, k):
    data_sample = sample(sc, rating_file_path, user_file_path, item_file_path,
                         k)
    train_data, train_label = extract_feature_label(data_sample)
    leaf = gbdt_train(train_data, train_label)
    leaf_transform = transfromed_feature(leaf, leaf.max())
    lr_train(leaf_transform, train_label)


if __name__ == "__main__":
    sc = SparkContext('local', 'traing')
    sqlcontext = HiveContext(sc)
    sc.setLogLevel("ERROR")
    rating_file_path = "E:/data/ml-100k/u.data"
    user_file_path = "E:/data/ml-100k/u.user"
    item_file_path = "E:/data/ml-100k/u.item"
    k = 5
    train(rating_file_path, user_file_path, item_file_path, k)
from pyspark import SparkContext, SparkConf
from pyspark import HiveContext

conf = SparkConf().setAppName("revenueByDaySQL")
sc = SparkContext(conf=conf)
hiveContext = HiveContext(sc)
hiveContext.sql("use retail_db")
sqlString = "SELECT 	orders.order_date, \
			ROUND(SUM(order_items.order_item_subtotal), 2) the_sum, \
			COUNT(DISTINCT orders.order_id) the_count\
		FROM orders, order_items \
		WHERE orders.order_id = order_items.order_item_order_id \
		GROUP BY orders.order_date \
		ORDER BY the_sum"

joinded_aggregate_data = hiveContext.sql(sqlString)

print str(joinded_aggregate_data.take(5))
Example #47
def load_data():
    # load data from files
    # and return query results / aggregates.

    hiveContext = HiveContext(sc)
    # 1027
    # path = '/home/brandon/PycharmProjects/markov_chain/data/raw_tx/'
    # path = '/home/brandon/PycharmProjects/markov_chain/data/raw_tx_fraud/train/'

    # AMAZON AWS EMR
    path = 'hdfs:///tmp/files/'    #HDFS


    # new segement files

    tx_files = [path + 'l_adults_2550_female_rural.csv', path + 'l_adults_2550_female_urban.csv', path + 'l_adults_2550_male_rural.csv', \
    path + 'l_adults_2550_male_urban.csv', path + 'l_young_adults_female_rural.csv', path + 'l_young_adults_female_urban.csv',\
    path + 'l_young_adults_male_rural.csv', path + 'l_young_adults_male_urban.csv', path + 'l_adults_50up_female_rural.csv', \
    path + 'l_adults_50up_female_urban.csv', path + 'l_adults_50up_male_rural.csv', path + 'l_adults_50up_male_urban.csv' ]

    # small file for debugging
    # 1027

    # tx_files = [path + 's_l_male_30_40_smaller_cities.csv']
    # tx_files = [path + 'sorted_fraud_male_30_40_smaller_cities.csv']

    # tx_files = [path+'40_60_bigger_cities.csv',path+'40_60_smaller_cities.csv',path+'all_60_up.csv'\
    #         ,path+'female_30_40_bigger_cities.csv',path+'female_30_40_smaller_cities.csv'\
    #         ,path+'male_30_40_bigger_cities.csv',path+'male_30_40_smaller_cities.csv'\
    #         ,path+'millenials.csv',path+'young_adults.csv']

    # 1027
    # tx_files = [path+'l_40_60_bigger_cities.csv',path+'l_40_60_smaller_cities.csv',path+'l_all_60_up.csv'\
    #         ,path+'l_female_30_40_bigger_cities.csv',path+'l_female_30_40_smaller_cities.csv'\
    #         ,path+'l_male_30_40_bigger_cities.csv',path+'l_male_30_40_smaller_cities.csv'\
    #         ,path+'l_millenials.csv',path+'l_young_adults.csv']



    all_tx = sc.textFile(','.join(tx_files),600)

    # 1027
    # txSchemaString = 'ssn|cc_num|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|acct_num|profile|trans_num|trans_date|trans_time|unix_time|category|amt|merchant|merch_lat|merch_long'
    txSchemaString = 'ssn|cc_num|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|acct_num|profile|trans_num|trans_date|trans_time|unix_time|category|amt|is_fraud|merchant|merch_lat|merch_long'
    txFields = [StructField(field_name, StringType(), True) for field_name in txSchemaString.split('|')]
    txFields[17] = StructField('trans_date', DateType(), True)

    txSchema = StructType(txFields)
    # ssn|cc_num|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|acct_num|profile|trans_num|trans_date|trans_time|unix_time|category|amt|merchant|merch_lat|merch_long
    txHeader = all_tx.filter(lambda l: "ssn|" in l)
    txNoHeader = all_tx.subtract(txHeader)

    temp_tx = txNoHeader.map(lambda k: k.split("|")).map(lambda p: (
    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], p[16],
    datetime.datetime.strptime(p[17], '%Y-%m-%d').date(), p[18], p[19], p[20], p[21], p[22], p[23], p[24], p[25]))

    h_tx_df = hiveContext.createDataFrame(temp_tx, txSchema)
    h_tx_df.registerTempTable("htx")

    hiveContext.cacheTable("htx")

    # HBASE CODE HERE
    # create dataframe with all records
    # map using hbase_process to extract record into individual components
    # and create a dictionary to store in hbase
    #h_data = hiveContext.sql("SELECT * FROM htx")
    #h_data.map(hbase_process).foreachPartition(store_full_data)

    # get cust mean time between transactions
    time_lag_eval = hiveContext.sql(
    "SELECT cc_num, unix_time, LAG(htx.unix_time) OVER (PARTITION BY htx.cc_num ORDER BY  htx.unix_time) as lag_time from htx order by cc_num, unix_time asc")
    time_lag_eval.registerTempTable("ts_lag")

    user_avg_time = hiveContext.sql("SELECT cc_num, AVG(unix_time - lag_time) as time_diff, percentile_approx((unix_time - lag_time),0.1) as low_bound, percentile_approx((unix_time - lag_time),0.90) as high_bound from ts_lag where lag_time is not null group by cc_num")
    user_avg_time.registerTempTable("avg_time")


    # get cust mean per category
    mean_per_cat = hiveContext.sql("SELECT cc_num, category, avg(amt) as mean_exp, (avg(amt)-2*(stddev_pop(amt))) as low_bound, (avg(amt)+2*(stddev_pop(amt))) as high_bound from htx group by cc_num, category")
    mean_per_cat.registerTempTable("mean_per_cat")

    # evaluate amount for HML and time of purchase for normal/abnormal
    test = hiveContext.sql(
    # #    "SELECT htx.cc_num, profile, htx.category, htx.trans_date,htx.trans_time, htx.unix_time,IF(htx.amt>(2 * m.mean_exp),'H',(IF(htx.amt<(0.5 * m.mean_exp),'L','N'))) as EXP, IF(htx.category like '%_net%','N','P') as CNP, htx.amt, LAG(htx.unix_time) OVER (PARTITION BY htx.cc_num ORDER BY  htx.unix_time) as lag_time from htx join mean_per_cat m on htx.cc_num=m.cc_num and m.category =htx.category")
    "SELECT htx.cc_num, profile, htx.category, htx.trans_date,htx.trans_time, htx.unix_time,IF(htx.amt>m.high_bound,'H',(IF(htx.amt < m.low_bound,'L','N'))) as EXP, IF(cast(SUBSTR(htx.trans_time,0,2) as int)<05,'A',IF(cast(SUBSTR(htx.trans_time,0,2) as int)>21,'A','N')) as NAT, htx.amt, LAG(htx.unix_time) OVER (PARTITION BY htx.cc_num ORDER BY  htx.unix_time) as lag_time from htx join mean_per_cat m on htx.cc_num=m.cc_num and m.category =htx.category")
    test.registerTempTable("full_table")

    # evaluate for transaction time (HML)
    full_data = hiveContext.sql(
         "SELECT full_table.cc_num, profile, category, trans_date, trans_time, unix_time,lag_time,IF(lag_time is null,100000,unix_time-lag_time) as time_since,amt, EXP,NAT,IF((unix_time-lag_time)<avg_time.low_bound,'H',IF((unix_time-lag_time)>avg_time.high_bound,'L','N')) as VEL from full_table left join avg_time on avg_time.cc_num = full_table.cc_num")
    full_data.registerTempTable("full_data")


    # return full tx data for user with reduced HML/AN/HML variables
    per_cust_transactions = hiveContext.sql(
        "SELECT cc_num as cust_id,concat(EXP,NAT, VEL) as trans_list from full_data order by cc_num, unix_time asc")

    # return full tx data for profile with reduced HML/NP/HML variables in sorted order
    #pre_sort_
    per_profile_transactions = hiveContext.sql(
        "SELECT profile as cust_id,concat(EXP,NAT,VEL) as trans_list from full_data order by profile, unix_time asc")
    #pre_sort_per_profile_transactions.registerTempTable("pre_sort")



    # we only need cust_id (really profile name here) and trans_list, but we had to include cc_num above in our sort
    #per_profile_transactions = hiveContext.sql("SELECT cust_id,trans_list from pre_sort")

    # gets pre-computed reference values for each customer and stores in redis
    #   avg spent per category
    #   n transactions
    #   last unix time stamp
    agg_info = hiveContext.sql(
        "SELECT CONCAT(category, '_', cc_num) as cust_id, category, concat(low_bound,',',high_bound) as low_high from mean_per_cat")
    avg_cat_data = agg_info.rdd.map(lambda x: [str(x.cust_id), str(x.low_high)])


    agg_n_tx = hiveContext.sql(
        "SELECT CONCAT('count_', cc_num) as cust_id, count(cc_num) as tx_count from full_data group by cc_num")
    n_tx = agg_n_tx.rdd.map(lambda x: [str(x.cust_id), str(x.tx_count)])

    agg_unix_ts = hiveContext.sql(
        "SELECT CONCAT('timestamp_', cc_num) as cust_id, max(unix_time) as last_unix_time from full_data group by cc_num")
    n_ts = agg_unix_ts.rdd.map(lambda x: [str(x.cust_id), str(x.last_unix_time)])

    agg_vel_info = hiveContext.sql(
        "SELECT CONCAT('velocity_', cc_num) as cust_id, concat(low_bound,',',high_bound) as low_high from avg_time")
    avg_vel_data = agg_vel_info.rdd.map(lambda x: [str(x.cust_id), str(x.low_high)])




    # compile our final string per customer for all tx's
    per_cust_transactions_r = per_cust_transactions.map(lambda p: (str(p.cust_id), str(p.trans_list))) \
        .reduceByKey(lambda y, z: y + ',' + z).map(lambda x: ''.join(x[0]) + ',' + x[1])



    # compile our final string per profile for all tx's
    per_profile_transactions_r = per_profile_transactions.map(lambda p: (str(p.cust_id), str(p.trans_list))) \
        .reduceByKey(lambda y, z: y + ',' + z).map(lambda x: ''.join(x[0]) + ',' + x[1])


    # return tx data and aggregates
    return_dict = {}
    return_dict['profile'] = per_profile_transactions_r
    return_dict['customer'] = per_cust_transactions_r

    return avg_cat_data, n_tx, n_ts, return_dict, avg_vel_data
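# A usage sketch (hypothetical driver code, not part of the original): unpack the
# aggregates and the per-customer / per-profile transaction strings returned above.
avg_cat_data, n_tx, n_ts, tx_strings, avg_vel_data = load_data()
print tx_strings['customer'].first()
print tx_strings['profile'].first()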
import sys
from pyspark import SparkContext, SparkConf, HiveContext, SQLContext

if __name__ == '__main__':
    conf = SparkConf().setAppName("Plotly Exports")
    sc = SparkContext(conf=conf)
    hive_context = HiveContext(sc)
    print '=== Creating Database ==='
    hive_context.sql('CREATE DATABASE PLOTLY')
    hive_context.sql('USE PLOTLY')

    print '=== Creating Table ==='
    hive_context.sql("CREATE TABLE ALCOHOL_CONSUMPTION_BY_COUNTRY_2010 "
                     "(LOCATION STRING, ALCOHOL FLOAT) ROW FORMAT "
                     "DELIMITED FIELDS TERMINATED BY ',' "
                     "TBLPROPERTIES (\"skip.header.line.count\"=\"1\")")
    print "=== loading data into table ==="
    hive_context.sql("LOAD DATA LOCAL INPATH "
                     "'/plotly_datasets/2010_alcohol_consumption_by_country.csv' "
                     "OVERWRITE INTO TABLE ALCOHOL_CONSUMPTION_BY_COUNTRY_2010")
    sys.exit()
Example #49
    df_customers.show(2)


def write_products():
    product_sql =  " SELECT od.productCode, o.orderDate, SUM(quantityOrdered) AS quantity" \
                   " FROM myorderdetails od" \
                   " JOIN myorders o ON od.orderNumber=o.orderNumber " \
                   " GROUP BY od.productCode, o.orderDate"
    df_product = hiveContext.sql(product_sql)
    df_product.show(5)

    df_product.registerTempTable('myproducts')

    hiveContext.sql("CREATE DATABASE IF NOT EXISTS " + hive_db)
    hiveContext.sql("DROP TABLE " + hive_db + "." + hive_table)
    hiveContext.sql("CREATE TABLE " + hive_db + "." + hive_table +
                    " AS SELECT * FROM myproducts")


if __name__ == "__main__":
    conf = SparkConf().setAppName("Spark Products")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    hiveContext = HiveContext(sc)

    read_customers(hdfs_customers_path)
    read_orders(hdfs_orders_path)
    read_orders_details(hdfs_orders_details_path)

    write_products()
Example #50
# author [email protected]
import os
import sys

local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")
sys.path.append(local_path + "/../")
sys.path.append(local_path)

from pyspark import SQLContext, SparkConf, HiveContext
from pyspark import SparkContext

from ml import diff_feature_reg, diff_train

def run(sc, sql_context, is_hive):
    diff_feature_reg.main(sc, sql_context, is_hive = True)
    diff_train.main(sc, sql_context, is_hive = True)

if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "32g")

    sc = SparkContext(appName="bintrade_candidate", master="yarn-client", conf=conf)
    sqlContext = HiveContext(sc)
    sqlContext.setConf("spark.sql.shuffle.partitions", "32")

    sqlContext.sql("use fex")

    run(sc, sqlContext, is_hive=True)
Example #51
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_O_CEN_CBOD_CRCUPDTL').setMaster(
    sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# Previous day's date
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) +
           timedelta(-1)).strftime("%Y%m%d")
# First day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) +
            timedelta(-1)).strftime("%Y%m%d")