Code Example #1
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext, HiveContext
from pyspark.sql.functions import *

inputs = sys.argv[1]
conf = SparkConf().setAppName('Month Wise Top 3 Crime')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
hiveContext = HiveContext(sc)

crime = sqlContext.read.parquet(inputs)
crime.registerTempTable('Crime')

Crime_month = sqlContext.sql('''select  Month,Category,count(Category) as cnt
    from Crime group by Month,Category order by Month
    ''')
Crime_month = Crime_month.na.replace(
    ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'], [
        'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct',
        'Nov', 'Dec'
    ], 'Month')

Crime_month.registerTempTable('Crime_month')
# save only the aggregated records to CSV so that Hive has fewer records to query
Crime_month.coalesce(1).write.format('com.databricks.spark.csv').save(
    'MonthCategory')
# Using Hive and creating table from csv
hiveContext.sql("DROP TABLE IF EXISTS Crime_month")
hiveContext.sql(
    "CREATE TABLE Crime_month (Month STRING, Category STRING, counts int) row format delimited fields terminated by ',' stored as textfile"
)
Code Example #2
from pyspark import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.context import HiveContext
from pyspark.sql.functions import explode, least
from pyspark.sql.types import *
from decimal import Decimal

import json


conf = SparkConf().setMaster("local").setAppName("My application").set("spark.executor.memory", "1g")
sc = SparkContext()

# sc = SparkContext(conf=conf)      
sqlContext  = HiveContext(sc)
 
df = sqlContext.read.format("json").load("/home/mihai/ArhivaDateLema/somedata/temp/testDB.json")
# print df.count()

# a = df.filter(max(df['temp_apa_r']['S1'] , df['temp_apa_r']['S2']) > 1 ).count()
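# Note (not part of the original snippet): Python's built-in max() above does not work
# column-wise, so the filter would not behave as intended; the likely intent, using
# pyspark.sql.functions.greatest, is sketched below (still commented out, as above).
# from pyspark.sql.functions import greatest
# a = df.filter(greatest(df['temp_apa_r']['S1'], df['temp_apa_r']['S2']) > 1).count()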





# df.select(df.data,df.bloc_soft.valoare).show()
# df.filter(df.temp_apa_r.S1 != '-').show()

# print  df.take(1)[0].asDict()
#     print k
Code Example #3
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext, HiveContext
from pyspark.sql.functions import *

inputs = sys.argv[1]
conf = SparkConf().setAppName('Time-wise Top 3 Crimes')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
hiveContext = HiveContext(sc)

crime = sqlContext.read.parquet(inputs)
crime.registerTempTable('Crime')

Crime_Time = sqlContext.sql(
    '''select  SUBSTR(Time,1,2) as hour,Category,count(Category) as cnt
    from Crime group by SUBSTR(Time,1,2),Category order by SUBSTR(Time,1,2)
    ''')

Crime_Time.registerTempTable('Crime_Time')
# save only the aggregated records to CSV so that Hive has fewer records to query
Crime_Time.coalesce(1).write.format('com.databricks.spark.csv').save(
    'TimeCategory')

# Using Hive and creating table from csv
hiveContext.sql("DROP TABLE IF EXISTS TimeCategory")
hiveContext.sql(
    "CREATE TABLE TimeCategory (Hour STRING, Category STRING, counts int) row format delimited fields terminated by ',' stored as textfile"
)
# load the CSV contents into the Hive table (the original statement is truncated here;
# the INPATH below simply mirrors the save() path above and is an assumption)
hiveContext.sql(
    "LOAD DATA INPATH 'TimeCategory' INTO TABLE TimeCategory")
Code Example #4
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext,HiveContext
from pyspark.sql.functions import *

inputs = sys.argv[1]
conf = SparkConf().setAppName('District wise Analysis')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
hiveContext = HiveContext(sc)



crime = sqlContext.read.parquet(inputs)
crime.registerTempTable('Crime')

Crime_District = sqlContext.sql('''select PdDistrict, Category, count(Category) as cnt
    from Crime group by PdDistrict, Category order by PdDistrict
    ''')

Crime_District.registerTempTable('Crime_District')

Crime_count = sqlContext.sql('''select * from Crime_District where cnt<>1''')
# save only the aggregated records to CSV so that Hive has fewer records to query
Crime_count.coalesce(1).write.format('com.databricks.spark.csv').save('District_top')

# Using Hive and creating table from csv
hiveContext.sql("DROP TABLE IF EXISTS district")
hiveContext.sql("CREATE TABLE district (PdDistrict STRING, Category STRING, counts int) row format delimited fields terminated by ',' stored as textfile")
# load the CSV contents into the Hive table
hiveContext.sql("LOAD DATA INPATH '/user/chandras/District_top' INTO TABLE district")
Code Example #5
from pyspark import SparkContext
from pyspark.sql.context import SQLContext, HiveContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql.types import StructType, StructField, StringType

# ZooKeeper address
zkHost = '192.168.10.130:2181'
# Kafka topic that user-behavior events are written to
topic = 'userBehavior'
# application name for the SparkContext (assumed; the original snippet references
# appName without defining it)
appName = 'kafka-streaming-hive'

# behavior types to process
behaviorsSet = {'click', 'buy', 'addCart'}

if __name__ == '__main__':
    sparkContext = SparkContext(appName=appName)
    streamingContext = StreamingContext(sparkContext, 15)
    # Spark SQL support
    sqlContext = SQLContext(sparkContext)
    # add Hive support
    hiveContext = HiveContext(sparkContext)
    # create the Kafka stream
    kafkaStream = KafkaUtils.createStream(streamingContext, zkHost,
                                          "kafka-streaming-hive", {topic: 1})

    # update the user's recommendation list for click or add-to-cart events
    def updateUserRecommendList(msg):
        if (msg[2] == "click" or msg[2] == 'addCart'):
            print 'click---->' + msg[3]
        return msg

    # schema of the table that stores the behavior records
    schemaString = "time uid type content"
    fields = list(map(lambda fieldName: StructField(fieldName, StringType(), nullable=True), \
                      schemaString.split(" ")))
    schema = StructType(fields)
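
    # A minimal sketch (not part of the original snippet) of how the pieces above could
    # be wired together: parse each Kafka message, apply the recommendation hook, and
    # append each micro-batch to a Hive table. The comma-separated "time,uid,type,content"
    # message format and the table name 'user_behavior' are assumptions.
    def saveBehaviors(rdd):
        if not rdd.isEmpty():
            hiveContext.createDataFrame(rdd, schema) \
                .write.mode("append").saveAsTable("user_behavior")

    kafkaStream.map(lambda kv: kv[1].split(',')) \
        .filter(lambda msg: len(msg) == 4 and msg[2] in behaviorsSet) \
        .map(updateUserRecommendList) \
        .foreachRDD(saveBehaviors)

    streamingContext.start()
    streamingContext.awaitTermination()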
Code Example #6
import json

from pyspark import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.context import HiveContext
from pyspark.mllib.stat import Statistics
import pandas as pd
import numpy as np


conf = SparkConf().setMaster("local").setAppName("My application").set("spark.executor.memory", "1g")
sc = SparkContext()

# sc = SparkContext(conf=conf)      
sqlContext  = HiveContext(sc)
 
df = sqlContext.read.format("json").load("/home/mihai/ArhivaDateLema/somedata/temp/testDB.json")

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, BooleanType

import operator          
    

def is_value_in_marime(row):
    # the argument arrives as a Row (e.g. a struct column passed to a UDF);
    # check whether any of its fields holds the value "0x30"
    d = row.asDict()
    return "0x30" in d.values()
        

Code Example #7
from pyspark.sql.context import SQLContext, HiveContext
from pyspark.context import SparkContext
from pyspark.sql.functions import explode, least
from decimal import Decimal
from pyspark.sql.types import *

import json

from pyspark import SparkConf

conf = SparkConf().setMaster("local").setAppName("My application").set(
    "spark.executor.memory", "1g")
sc = SparkContext()

# sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)

df = sqlContext.read.format("json").load(
    "/home/mihai/ArhivaDateLema/somedata/temp/testDB.json")
# print df.count()

# a = df.filter(max(df['temp_apa_r']['S1'] , df['temp_apa_r']['S2']) > 1 ).count()

# df.select(df.data,df.bloc_soft.valoare).show()
# df.filter(df.temp_apa_r.S1 != '-').show()

# print  df.take(1)[0].asDict()
#     print k

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, BooleanType
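
# A sketch (not part of the original snippet) of a UDF built with the imports above,
# mirroring the commented-out '-' check earlier in this snippet; the temp_apa_r.S1
# field comes from those comments and is otherwise an assumption about testDB.json.
is_dash = udf(lambda value: value == '-', BooleanType())
df.filter(is_dash(df['temp_apa_r']['S1'])).show()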
Code Example #8
import os
import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql.context import HiveContext

# ROOT_DIR is referenced but never defined in the original snippet; the value below
# is an assumption (the directory containing this script)
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

if __name__ == '__main__':
    # Read input parameter from command line
    if len(sys.argv) == 2:

        print "Base path " + ROOT_DIR + os.sep

        file_name = sys.argv[1]
        print "performing analysis on file_name " + file_name

        #Read input file using spark-csv and create spark dataframe

        conf = SparkConf().setAppName('Spark-Assignment').setMaster('local[*]')

        # create spark context and sql context
        sc = SparkContext(conf=conf)
        hive_context = HiveContext(sc)

        # read the input data file and create spark dataframe using com.databricks.spark.csv library
        input_dataframe = hive_context.read.format("com.databricks.spark.csv")\
        .option("header", "false") \
        .option("inferschema", "true") \
        .option("delimiter", ",") \
        .option("mode", "DROPMALFORMED") \
        .load("file://" + file_name)

        # column names for the dataframe, based on the data layout
        column_list = [
            "date_value", "date", "register", "private_agency", "state",
            "district", "sub_district", "pincode", "gender", "age",
            "aadhaar_generated", "rejected", "mobile_number", "email_id"
        ]
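
        # A sketch (not part of the original snippet): spark-csv was read with
        # header=false, so the columns arrive as _c0.._c13; rename them using the
        # column_list above so later queries can use meaningful names.
        for old_name, new_name in zip(input_dataframe.columns, column_list):
            input_dataframe = input_dataframe.withColumnRenamed(old_name, new_name)
        input_dataframe.printSchema()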
Code Example #9
from time import time

from pyspark import SparkConf, SparkContext
from pyspark.sql.context import HiveContext
'''
How to run:

/usr/local/spark-2.0.2/bin/spark-submit  /home/mandar/Downloads/Spark_Example/pyspark/example/dataframe/SparkExample3.py

'''

if __name__ == '__main__':

    # Set Spark properties which will be used to create the SparkContext
    conf = SparkConf().setAppName('SparkExample1').setMaster('local[*]')

    # create spark context and sql context
    sc = SparkContext(conf=conf)
    hive_context = HiveContext(sc)

    # read the input data file and create spark dataframe
    record_dataframe = hive_context.read.format("com.databricks.spark.csv")\
    .option("header", "false") \
    .option("inferschema", "true") \
    .option("delimiter", "\n") \
    .load("file:///home/mandar/Downloads/Spark_Example/resources/1").withColumnRenamed("_c0", "record")

    # meta config dataframe
    metaconfig_dataframe = hive_context.read.format("com.databricks.spark.csv")\
    .option("header", "true") \
    .option("inferschema", "true") \
    .option("delimiter", "\t") \
    .load("file:///home/mandar/Downloads/Spark_Example/resources/meta_config")
Code Example #10
from subprocess import call
import math
from collections import OrderedDict

from pyspark import SparkConf, SparkContext
from pyspark.sql.context import HiveContext
from pyspark.sql.functions import monotonically_increasing_id

from time import time

# /usr/local/spark-2.0.2/bin/spark-submit /home/mandar/ProjectWorkspace/Example/com/spark/example/DataDifferenceSpark.py

if __name__ == '__main__':
    # Set Spark properties which will be used to create the SparkContext
    conf = SparkConf().setAppName('SparkExample1').setMaster('local[*]')
    
    # create spark context and sql context 
    sc = SparkContext(conf=conf)
    hive_context = HiveContext(sc)
    
    # read the input data file and create a Spark dataframe
    type_2_dataframe = hive_context.read.format("com.databricks.spark.csv")\
    .option("header", "false") \
    .option("inferschema", "true") \
    .option("delimiter", "|") \
    .option("mode", "DROPMALFORMED") \
    .load("/home/mandar/ProjectWorkspace/Example/resources/data_difference_input")

    type_2_dataframe = type_2_dataframe.withColumnRenamed('_c0', 'date_value')
    
    # register dataframe to perform sql DSL    
    type_2_dataframe.registerTempTable("date_table")
    
    # get hash value for column first and second
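    # A sketch (not part of the original snippet) of the hash computation hinted at by
    # the comment above, using Spark's built-in hash() function (available in Spark 2.0+,
    # which the run command above targets). Which columns the original hashed is unknown;
    # 'date_value' is an assumption.
    from pyspark.sql.functions import hash as spark_hash
    hashed_dataframe = type_2_dataframe.withColumn("row_hash", spark_hash("date_value"))
    hashed_dataframe.show(5)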
Code Example #11
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import HiveContext


def initSparkConf(isLocal, appName):
    conf = SparkConf()
    conf.setAppName(appName)
    if isLocal is True:
        conf.setMaster("local[*]")
    return conf


def initSparkContext(conf):
    return SparkContext(conf=conf)


conf = initSparkConf(False, "HiveDataSource")
sc = initSparkContext(conf)
hiveContext = HiveContext(sc)
hiveContext.sql("DROP TABLE IF EXISTS student_infos")
hiveContext.sql(
    "CREATE TABLE IF NOT EXISTS student_infos (name STRING, age INT) row format delimited fields terminated by '\t'"
)
hiveContext.sql("LOAD DATA " + "LOCAL INPATH '/root/resource/student_infos' " +
                "INTO TABLE student_infos")

hiveContext.sql("DROP TABLE IF EXISTS student_scores")
hiveContext.sql(
    "CREATE TABLE IF NOT EXISTS student_scores (name STRING, score INT) row format delimited fields terminated by '\t'"
)
hiveContext.sql("LOAD DATA " +
                "LOCAL INPATH '/root/resource/student_scores' " +
                "INTO TABLE student_scores")