import gc
import json
import time
import urllib

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext


def main():
    conf = SparkConf().setAppName("transform")
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)
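    # Build the MongoDB connection URI; the password is URL-encoded so special characters are safe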
    conn = "mongodb://{mongo_user}:{mongo_pass}@{mongo_host}:{mongo_port}/{mongo_db}.{mongo_collection}".format(
        mongo_user=MONGO_USER,
        mongo_pass=urllib.quote_plus(MONGO_PASSWORD),
        mongo_host=MONGO_HOST,
        mongo_port=MONGO_PORT,
        mongo_db=DB_NAME,
        mongo_collection=COLLECTION_NAME)
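    # sc.mongoRDD is not part of core PySpark; it is assumed here to be the method added by
    # the mongo-hadoop connector's pymongo_spark helper (pymongo_spark.activate())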
    rdd = sc.mongoRDD(conn)

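    # Drop Mongo's non-JSON-serializable _id field, dump each document as ASCII JSON,
    # then strip escaped newline sequences so every record sits on a single line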
    new_rdd = (rdd
               .map(lambda x: dict((i, x[i]) for i in x if i != '_id'))
               .map(lambda x: json.dumps(x, ensure_ascii=False).encode('ascii', 'replace'))
               .map(lambda x: "".join(x.split("\\n"))))
    df = sqlContext.jsonRDD(new_rdd)
    df.registerTempTable('events_temp')
    sqlContext.sql('DROP TABLE IF EXISTS default.events')
    sqlContext.sql('DROP TABLE IF EXISTS default.clean_table')
    sqlContext.sql('CREATE TABLE events AS SELECT * FROM events_temp')
    sqlContext.sql("CREATE TABLE clean_table AS SELECT description AS event_desc, id AS event_id, yes_rsvp_count, group.category.name AS cat_name, group.category.shortname AS cat_short, group.category.id AS cat_id, group.name AS group_name, group.topics.name AS topic_name, name AS event_name, time AS start_time, utc_offset AS timezone_offset, venue.state AS venue_state, venue.city AS venue_city, venue.zip AS venue_zip, fee.amount AS fee_amt, fee.required AS req_fee FROM events")
def run(inpath, outpath, mode='append'):
    
    gc.disable()
    print("===== Checking if Log Exists =====")
    check_log(inpath)
    print("===== Pass Log Checking =====")
    
    # initialize SparkContext and HiveContext
    conf = SparkConf().setAppName("Forgate Log Parser")
    sc = SparkContext(conf=conf)
    sqlCtx = HiveContext(sc)
    start_time = time.time()
    print("===== INPUT FILE PATH: %s =====" % (str(inpath)))
    print("===== OUTPUT FILE PATH: %s =====" % (str(outpath)))
    print("===== %s Reading Data From HDFS" % (now()))
    distFile = sc.textFile(inpath)
    cnt_raw = distFile.count()
    print("===== Count of Input Data: %s =====" % (str(cnt_raw)))
    
    print("===== %s Parsing Data" % (now()))
    parsedData = parse_data(sc, distFile)
    print("===== Count of Parsed Data: %s =====" % (str(parsedData.count())))
    
    print("===== %s Saving Data" % (now()))
    jsonData = sqlCtx.jsonRDD(parsedData)
    old_col=['time','date']
    new_col=['time_','dt']
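    # Rename 'time' -> 'time_' and 'date' -> 'dt', likely to avoid reserved-word clashes
    # and to get a short partition column name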
    jsonData = rename_column(jsonData, old_col, new_col)
    jsonData.write.partitionBy('dt').parquet(outpath, mode=mode)
    
    print("===== %s Checking Data" % (now()))
    confirm_row(sqlCtx, outpath)
    write_log(inpath)
    print("---Total took %s seconds ---" % (time.time() - start_time))
    
    sc.stop()
    gc.enable()
Example #3
# coding=utf-8

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row

conf = SparkConf().setAppName("spark_sql_cache")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([
    '{"col1": "row1_col1","col2":"row1_col2","col3":"row1_col3"}',
    '{"col1": "row2_col1","col2":"row2_col2","col3":"row2_col3"}',
    '{"col1": "row3_col1","col2":"row3_col2","col3":"row3_col3"}'
])


sourceRDD = hc.jsonRDD(source)

sourceRDD.registerTempTable("temp_source")

"""
def convert(row):
    mydict = row.asDict()

    mydict["col1"] = mydict["col1"].upper()

    return Row(**mydict)

convertRDD = hc.sql(
    "select col1, col2, col3 from temp_source").map(convert)

mytable = hc.inferSchema(convertRDD)
"""
Example #4
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row
import re

conf = SparkConf().setAppName("spark_sql_json")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([
    '{"col1": "row1_col1","col2":"row1_col2","col3":"row1_col3"}',
    '{"col1": "row2_col1","col2":"row2_col2","col3":"row2_col3"}',
    '{"col1": "row3_col1","col2":"row3_col2","col3":"row3_col3"}'
])


table = hc.jsonRDD(source)

table.registerAsTable("temp_mytable")

datas = hc.sql("select * from temp_mytable").collect()

sc.stop()

if datas:
    for data in datas:
        print data.col1, data.col2, data.col3
Example #5
    header_stores = ['store_id', 'avg_hhi', 'avg_traffic']
    df_stores.repartition(1).write.parquet(df_stores_loc, mode='overwrite')
    df_stores_join = df_stores.select(header_stores + ['group_val'])

# In[7]:

################# 2.1.3 sales data
## parsing sales json file and construct df_sales
sales_dates = [(processed_time_d[0] + timedelta(i + 1)).strftime('%Y_%m_%d')
               for i in range((processed_time_d[1] - processed_time_d[0]).days)
               ]
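# Glob matching one sales file per store for each day in the window,
# e.g. sales_store<id>_<YYYY_MM_DD>_00_00_00.json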
sales_raw_file_name = sales_and_pc_raw_d_loc + 'sales_store[0-9]*_{' + ','.join(
    sales_dates) + '}_00_00_00.json'
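# wholeTextFiles yields (path, content) pairs; keep only the JSON content and strip all
# whitespace before parsing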
sales_jsonRDD = sc.wholeTextFiles(sales_raw_file_name).map(lambda x: x[1])
sales_js = sales_jsonRDD.map(lambda x: re.sub(r"\s+", "", x, flags=re.UNICODE))
sales_js = sqlContext.jsonRDD(sales_js)
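# Flatten the nested JSON: explode Transactions into one row per transaction, then explode
# each transaction's Products into one row per product line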
sales_js = sales_js.select(col('SalesLogDateTime'), col('StoreID'),
                           explode(col('Transactions')).alias('Transaction'))
sales_js = sales_js.select([
    'SalesLogDateTime', 'StoreID', 'Transaction.Products',
    'Transaction.Subtotal', 'Transaction.Tax', 'Transaction.Total',
    'Transaction.TransactionDateTime'
])
sales_js = sales_js.select(col('Subtotal'), col('StoreID'), col('Tax'),
                           col('SalesLogDateTime'), col('Total'),
                           col('TransactionDateTime'),
                           explode(col('Products')).alias('Product'))
sales_js = sales_js.select([
    'Subtotal', 'StoreID', 'Tax', 'SalesLogDateTime', 'Total',
    'TransactionDateTime', 'Product.Price', 'Product.ProductID'
])
Example #6
# Output of values: Row(_c0=u'1', _c1=u'2', _c2=u'3.0'); the data types are all inferred as
# "int", i.e. all elements of the array must have a consistent data type, otherwise an
# exception may be raised
"""
source = sc.parallelize(['{"key" : [1, 2 , 3.0]}'])

jsonRDD = hc.jsonRDD(source)

jsonRDD.registerTempTable("temp_table")

values = hc.sql("select key[0], key[1], key[2] from temp_table").collect()

# Output of values: Row(_c0=1.0, _c1=2.0, _c2=3.0); the data types are all inferred as "float"
"""

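# Nested arrays of structs can be addressed by index and field name, e.g. key[0].key3[0].key5[1]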
source = sc.parallelize([
    '{"key" : [{"key1" : "value1", "key2" : [1, 2, 3], "key3" : [{"key4" : "value4", "key5" : [4, 5.0, 6]}]}]}'
])

jsonRDD = hc.jsonRDD(source)

jsonRDD.registerTempTable("temp_table")

values = hc.sql(
    "select key[0].key1, key[0].key2[0], key[0].key3[0].key4, key[0].key3[0].key5[1] from temp_table"
).collect()

sc.stop()

for value in values:
    print value
Example #7
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row
import re

conf = SparkConf().setAppName("spark_sql_json")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([
    '{"col1": "row1_col1","col2":"row1_col2","col3":"row1_col3"}',
    '{"col1": "row2_col1","col2":"row2_col2","col3":"row2_col3"}',
    '{"col1": "row3_col1","col2":"row3_col2","col3":"row3_col3"}'
])

table = hc.jsonRDD(source)

table.registerAsTable("temp_mytable")

datas = hc.sql("select * from temp_mytable").collect()

sc.stop()

if datas:
    for data in datas:
        print data.col1, data.col2, data.col3