class MainApp(object): def __init__(self): pass def init(self): os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2" # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY> # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY> conf = SparkConf() conf.setMaster("local[10]") conf.setAppName("PySparkShell") conf.set("spark.executor.memory", "2g") conf.set("spark.driver.memory", "1g") self.sc = SparkContext(conf=conf) self.sqlContext = SQLContext(self.sc) def loadData(self): category_list = self.sc.textFile("/Users/abhinavrungta/Desktop/uf-study/snc/github/SNC-WEB/src/yahoo/ydata-ymovies-user-movie-ratings-train-v1_0.txt").map(lambda line: (int(line.split(',')[0]), int(line.split(',')[1]), float(line.split(',')[2]), long(line.split(',')[3]))) category_schema = StructType([ StructField("userid", IntegerType(), True), StructField("movieid", IntegerType(), True), StructField("rating", FloatType(), True), StructField("time", LongType(), True) ]) category_list = self.sqlContext.createDataFrame(category_list, category_schema) category_list.registerTempTable("data") movie_list = self.sqlContext.sql("SELECT movieid, COUNT(movieid) AS ct FROM data GROUP BY movieid") movie_list.registerTempTable("movie") movieid = movie_list.sort(movie_list.ct.desc()).first().movieid # movieid = category_list.first().movieid category_list = self.sqlContext.sql("SELECT * FROM data WHERE movieid = {0}".format(movieid)) category_list.registerTempTable("data") user_list = self.sqlContext.sql("SELECT DISTINCT userid FROM data LIMIT 50") print(user_list.count()) user_list.show() user_list.registerTempTable("users") category_list = self.sqlContext.sql("SELECT d.userid AS userid, d.movieid AS movieid, d.rating AS rating, d.time AS time FROM data d, users u WHERE d.userid = u.userid").repartition(1) #category_list = self.sqlContext.createDataFrame(category_list, category_schema) category_list = category_list.map(lambda line: str(line.userid) + "," + str(line.movieid) + "," + str(line.rating) + "," + str(line.time)) category_list = category_list.repartition(1) category_list.saveAsTextFile("data.txt")
def read_existing_parquet(): sc = get_configured_context() sql_context = SQLContext(sparkContext=sc) # Loads parquet file located in AWS S3 / minio into RDD Data Frame parquet_file = sql_context.read.parquet("s3a://testparquet/nation.parquet") parquet_file.registerTempTable("parquet_file") # Run standard SQL queries against temporary table nations_all_sql = sql_context.sql("SELECT * FROM parquet_file") nations_all = nations_all_sql.rdd.map( lambda p: "Country: {0:15} Ipsum Comment: {1}".format( p.N_NAME, p.N_COMMENT)) for idx, nation in enumerate(nations_all.collect()): if idx == 0: print("All Nations and Comments -- `SELECT * FROM parquet_file`") print_horizontal() print(nation) else: print_horizontal() # Use standard SQL to filter nations_filtered_sql = sql_context.sql( "SELECT N_NAME FROM parquet_file WHERE N_NAME LIKE '%IND%'") nations_filtered = nations_filtered_sql.rdd.map( lambda p: "Country: {0:20}".format(p.N_NAME)) for idx, nation in enumerate(nations_filtered.collect()): if idx == 0: print( "Nations Filtered -- `SELECT name FROM parquet_file WHERE name LIKE '%IND%'`" ) print_horizontal() print(nation) else: print_horizontal()
# District - particular Time -Crime -Max Count import sys from pyspark import SparkConf, SparkContext from pyspark.sql.context import SQLContext input = sys.argv[1] conf = SparkConf().setAppName('Crime Analysis') sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) #loading data into parquet file crime = sqlContext.read.parquet(input).cache() crime.registerTempTable('Crime') Crime_Month = sqlContext.sql('''select Month,count(Category) from Crime group by Month ''') Crime_Time = sqlContext.sql('''select substr(Time,1,2),count(Category) from Crime group by substr(Time,1,2) ''') # caching the output as it will be required Crime_Time_Category = sqlContext.sql( '''select substr(Time,1,2) as Hour,PdDistrict,Category,count(Category) as cat_cnt from Crime where Category<>'OTHER OFFENSES' group by substr(Time,1,2),PdDistrict,Category ''').cache() Crime_Time_Category.registerTempTable('Crime_Time_Category') Crime_Time_Category_Max = sqlContext.sql(
from pyspark.sql.context import SQLContext input = sys.argv[1] conf = SparkConf().setAppName('Crime Analysis') sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) #loading crime data from parquet files crime = sqlContext.read.parquet(input) crime.registerTempTable('Crime') #loading distinct district and their distinct addresses from crime data Crime_District = sqlContext.sql('''select distinct PdDistrict,Address from Crime ''') Crime_District.registerTempTable('Crime_District') #assigning number one for distinct address and also calculating crimes occured in that address and district Crime_Add_Cnt = sqlContext.sql( '''select PdDistrict,Address,'1' as one ,count(Category) as Crime_count from Crime group by PdDistrict,Address order by PdDistrict''' ) Crime_Add_Cnt.registerTempTable('Crime_Address_count') Crime_Address_count = sqlContext.sql( '''select PdDistrict,sum(one) as x,sum(Crime_count) as y from Crime_Address_count where Crime_count<>'1' and PdDistrict in ('NORTHERN','SOUTHERN','CENTRAL') group by PdDistrict ''' )
class MainApp(object): def __init__(self): pass def init(self): os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2" # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY> # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY> conf = SparkConf() conf.setMaster("local") conf.setAppName("PySparkShell") conf.set("spark.executor.memory", "2g") # conf.set("spark.driver.memory", "1g") self.sc = SparkContext(conf=conf) self.sqlContext = SQLContext(self.sc) def loadData(self): self.df_review = self.sqlContext.read.json( "../yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json" ).cache() # self.df_review = self.sqlContext.read.json("s3n://ds-emr-spark/data/yelp_academic_dataset_review.json").cache() self.df_business = self.sqlContext.read.json( "../yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json" ).cache() # self.df_business = self.sqlContext.read.json("s3n://ds-emr-spark/data/yelp_academic_dataset_business.json").cache() self.df_review.registerTempTable("reviews") self.df_business.registerTempTable("business") def createCheckInDataPerUser(self): review_user = self.sqlContext.sql("SELECT business_id, user_id FROM reviews") business_loc = self.sqlContext.sql("SELECT business_id, latitude, longitude FROM business") review_user.registerTempTable("reviews_user") business_loc.registerTempTable("business_loc") self.df_join_reviewAndBusiness = self.sqlContext.sql( "SELECT r.user_id, b.latitude, b.longitude FROM reviews_user r JOIN business_loc b ON r.business_id = b.business_id" ).cache() self.df_join_reviewAndBusiness.registerTempTable("userBusiness") self.df_unique_users = self.sqlContext.sql( 'SELECT DISTINCT user_id FROM userBusiness where user_id = "SIfJLNMv7vBwo-fSipxNgg"' ) self.df_unique_users.registerTempTable("users") pd = self.df_join_reviewAndBusiness.toPandas() global_db = self.sc.broadcast(pd) schema = StructType([StructField("latitude", FloatType()), StructField("longitude", FloatType())]) partialFunc = partial(getLocationsOfUser, business_db=global_db.value) self.get_locations = udf(partialFunc, ArrayType(schema)) self.get_centers = udf(getCentersOfUser, ArrayType(schema)) self.df_unique_users = self.df_unique_users.withColumn( "user_locations", self.get_locations(self.df_unique_users["user_id"]) ) self.df_unique_users.registerTempTable("users") self.df_unique_users.repartition(1).write.save("user.json", "json", "overwrite") print(getCentersOfUser(self.df_unique_users.toPandas().iloc[0]["user_locations"])) self.df_unique_users = self.df_unique_users.withColumn( "user_centers", self.get_centers(self.df_unique_users["user_locations"]) ) self.df_unique_users.registerTempTable("users") self.df_unique_users.repartition(1).write.save("center.json", "json", "overwrite") self.df_unique_users.show() def distanceCalc(self): self.df_unique_users = self.sqlContext.read.json( "user.json/part-r-00000-23a1b514-f5fe-4f61-9a64-01ebbc88c146" ).cache() print(len(getCentersOfUser(self.df_unique_users.toPandas().iloc[0]["user_locations"])))
# register udf to calculate hash value of column using md5 algorithm(Using SQL DSL) sql_context.udf.register("get_hash_value", get_hash_value, StringType()) # register udf to check column value is null not sql_context.udf.register("is_null", isNull, StringType()) # register dataframe to perform sql DSL type_2_dataframe.registerTempTable("type2_data") # remove null value from column #null_value_query = "select _c3,is_null(_c3) as new_c3 from type2_data" #null_value_query_df = sql_context.sql(null_value_query) #null_value_query_df.show() # get hash value for column first and second hash_record_query = "select get_hash_value(CONCAT(_c1,_c2)) as record_hashvalue,*,get_hash_value(CONCAT(_c0,_c1,_c2,_c3,_c4,_c8,_c9,_c10)) as record_hash_row,from_unixtime(unix_timestamp(_c5, 'MMM dd yyyy hh:mm:ss.SSSaaa')) as init_effective_date,from_unixtime(unix_timestamp(_c6, 'MMM dd yyyy hh:mm:ss.SSSaaa')) as init_expiration_date from type2_data" hash_record_query_df = sql_context.sql(hash_record_query) hash_record_query_df.show() #populate effective date based on init_expiration_date and init_expiration_date and init_effective_date effective_date_df = hash_record_query_df.withColumn( "effective_date", when( hash_record_query_df.init_expiration_date == '1991-01-01 00:00:00', hash_record_query_df.init_effective_date).otherwise( hash_record_query_df.init_expiration_date)) effective_date_df.show() # register dataframe to perform sql DSL effective_date_df.registerTempTable("date_table")
import sys from pyspark import SparkConf, SparkContext from pyspark.sql.context import SQLContext, HiveContext from pyspark.sql.functions import * inputs = sys.argv[1] conf = SparkConf().setAppName('Month Wise Top 3 Crime') sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) hiveContext = HiveContext(sc) crime = sqlContext.read.parquet(inputs) crime.registerTempTable('Crime') Crime_month = sqlContext.sql('''select Month,Category,count(Category) as cnt from Crime group by Month,Category order by Month ''') Crime_month = Crime_month.na.replace( ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'], [ 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec' ], 'Month') Crime_month.registerTempTable('Crime_month') #loading only aggregated records to save in csv so that hive can Query only less amount of records . Crime_month.coalesce(1).write.format('com.databricks.spark.csv').save( 'MonthCategory') # Using Hive and creating table from csv hiveContext.sql("DROP TABLE IF EXISTS Crime_month") hiveContext.sql( "CREATE TABLE Crime_month (Month STRING, Category STRING, counts int) row format delimited fields terminated by ',' stored as textfile"
lambda x: x is not None).flatMap(lambda x: x).coalesce(1) graphedges = graphedges_rdd.toDF(['source', 'destination']).cache() graphedges.registerTempTable('SourceDestTable') initial_node = source_node initial_row = KnownRow(initial_node, initial_node, 0) knownpaths = sqlContext.createDataFrame([initial_row], schema=schema) part_knownpaths = knownpaths for i in range(6): part_knownpaths.registerTempTable('PartKnownPathTable') newpaths = sqlContext.sql(""" SELECT destination AS node, t1.source AS source, (distance+1) AS distance FROM SourceDestTable t1 JOIN PartKnownPathTable t2 ON (t1.source = t2.node) """).show() newpaths.registerTempTable('NewPathTable') knownpaths.registerTempTable('KnowPathTable') duplicate_df = sqlContext.sql(""" SELECT t1.node AS node, t1.source as source, t1.distance as distance FROM NewPathTable t1 JOIN KnowPathTable t2 ON (t1.node = t2.node) """).show() if duplicate_df.count() != 0:
# for front-end purposes displaying 2016 year with most number of crimes import sys from pyspark import SparkConf, SparkContext from pyspark.sql.context import SQLContext input = sys.argv[1] conf = SparkConf().setAppName('Crime Analysis') sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) crime=sqlContext.read.parquet(input) crime.registerTempTable('Crime') District_WeekCrime=sqlContext.sql('select Year,count(Category) w_cnt from Crime where Year=2016 and Month=11 and Day>=3 group by Year order by w_cnt desc') District_WeekCrime.registerTempTable('Week_2016') #District_WeekCrime=sqlContext.sql('select Year,max(cnt) cnt from Week_2016 Group by Year ') District_WeekCrime.show() District_MonthCrime=sqlContext.sql('select Year,count(Category) m_cnt from Crime where Year=2016 and Month=11 Group by Year order by m_cnt desc') District_MonthCrime.registerTempTable('Month_2016') District_MonthCrime.show() District_YearCrime=sqlContext.sql('select Year,count(Category) y_cnt from Crime where Year=2016 Group by Year order by y_cnt desc') District_YearCrime.registerTempTable('Year_2016') District_YearCrime.show() District_Overall=sqlContext.sql('select w_cnt as week_crime_count,m_cnt as month_crime_count,y_cnt as Year_crime_count from Week_2016 W,Month_2016 M,Year_2016 Y where W.Year=M.Year and M.Year=Y.Year') District_Overall.coalesce(1).write.mode('append').json("Crimes_Overall")
.option("mode", "DROPMALFORMED") \ .load("/home/mandar/Downloads/YouTube-Spam-Collection-v1/Youtube05-Shakira.csv") you_tube_dataframe.printSchema() print "--------------------------Total number of record -------------------------" print you_tube_dataframe.count() you_tube_dataframe.select('CLASS').show(10) you_tube_dataframe.registerTempTable("you_tube_data") # Number of comment per user comment_per_user = "******" comment_per_user_df = sql_context.sql(comment_per_user) comment_per_user_df.show() # select author data whose value =Leonel Hernandez select_author = "select *,CONCAT(YEAR(DATE), '-', MONTH(DATE),'-',DAY(DATE),DAYOFMONTH(DATE),'-',DAYOFYEAR(DATE)) as date_value from you_tube_data where AUTHOR = 'Sabina Pearson-Smith'" select_author_df = sql_context.sql(select_author) select_author_df.show() # select author data whose value =Leonel Hernandez # select_comment_per_date = "select count(*) as total_comment_count,CONCAT(YEAR(DATE), '-', MONTH(DATE),'-',DAY(DATE)) as comment_date from you_tube_data group by CONCAT(YEAR(DATE), '-', MONTH(DATE),'-',DAY(DATE))" # select_comment_per_date_df = sql_context.sql(select_comment_per_date) # select_comment_per_date_df.show() # from_unixtime(Column ut, java.lang.String f) #2015-05-29T02:30:18.971000
rel['name'] = x[0] rel['age'] = x[1] return rel peopleRDD = sc.textFile( "file:///usr/local/my_soft/spark-2.1.0/examples/src/main/resources/people.txt" ) # 定义一个模式字符串 schemaString = "name age" # 根据模式字符串生成模式 fields = list( map(lambda fieldName: StructField(fieldName, StringType(), nullable=True), schemaString.split(" "))) schema = StructType(fields) # 从上面信息可以看出,schema描述了模式信息,模式中包含name和age两个字段 rowRDD = peopleRDD.map(lambda line: line.split(',')).map( lambda attributes: Row(attributes[0], attributes[1])) peopleDF = spark.createDataFrame(rowRDD, schema) # 必须注册为临时表才能供下面查询使用 peopleDF.createOrReplaceTempView("people") results = spark.sql("SELECT * FROM people") results.rdd.map(lambda attributes: "name: " + attributes[0] + "," + "age:" + attributes[1]).foreach(print)
# a) Load each file in the dataset into separate DataFrames. businessDf = businessDF.toDF("business_id", "name", "address", "city", "state", "postal_code", "latitude", "longitude", "stars", "review_count", "categories") reviewersDF = reviewersDF.toDF("review_id", "user_id", "business_id", "review_text", "review_date") friendshipDF = friendshipDF.toDF("src_user_id", "dst_user_id") businessDF.createOrReplaceTempView("businesses") reviewersDF.createOrReplaceTempView("reviewers") friendshipDF.createOrReplaceTempView("friendshipGraph") # Just to show that task 5 a) works: b = sqlContext.sql("select count(*) from businesses") r = sqlContext.sql("select count(*) from reviewers") f = sqlContext.sql("select count(*) from friendshipGraph") print("done [x]") # Task 6 print("-------- Task (6) --------") # a) Inner join review table and business table on business_id # b) The result is saved in a temporary variable, "innerJoin" innerJoin = businessDF.join(reviewersDF, "business_id") # innerJoin.show() # c) Number of reviews for each user in the review table for top 20 users with the most number of reviews reviews = reviewersDF.groupBy("user_id").count().sort(desc("count")).limit(20)
graphedges_rdd = textinput.map(lambda line: get_graphedges(line)).filter(lambda x: x is not None).flatMap(lambda x: x).coalesce(1) graphedges = graphedges_rdd.toDF(['source', 'destination']).cache() graphedges.registerTempTable('SourceDestTable') initial_node = source_node initial_row = KnownRow(initial_node, initial_node, 0) knownpaths = sqlContext.createDataFrame([initial_row], schema=schema) part_knownpaths = knownpaths for i in range(6): part_knownpaths.registerTempTable('PartKnownPathTable') newpaths = sqlContext.sql(""" SELECT destination AS node, t1.source AS source, (distance+1) AS distance FROM SourceDestTable t1 JOIN PartKnownPathTable t2 ON (t1.source = t2.node) """) newpaths.registerTempTable('NewPathTable') knownpaths.registerTempTable('KnowPathTable') duplicate_df = sqlContext.sql(""" SELECT t1.node AS node, t1.source as source, t1.distance as distance FROM NewPathTable t1 JOIN KnowPathTable t2 ON (t1.node = t2.node) """) if duplicate_df.count() != 0:
import sys from pyspark import SparkConf, SparkContext from pyspark.sql.context import SQLContext, HiveContext from pyspark.sql.functions import * inputs = sys.argv[1] conf = SparkConf().setAppName('Time-wise Top 3 Crimes') sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) hiveContext = HiveContext(sc) crime = sqlContext.read.parquet(inputs) crime.registerTempTable('Crime') Crime_Time = sqlContext.sql( '''select SUBSTR(Time,1,2) as hour,Category,count(Category) as cnt from Crime group by SUBSTR(Time,1,2),Category order by SUBSTR(Time,1,2) ''') Crime_Time.registerTempTable('Crime_Time') #loading only aggregated records to save in csv so that hive can Query only less amount of records . Crime_Time.coalesce(1).write.format('com.databricks.spark.csv').save( 'TimeCategory') # Using Hive and creating table from csv hiveContext.sql("DROP TABLE IF EXISTS TimeCategory") hiveContext.sql( "CREATE TABLE TimeCategory (Hour STRING, Category STRING, counts int) row format delimited fields terminated by ',' stored as textfile" ) #loading csv contents into csv hiveContext.sql( "LOAD DATA INPATH '/user/chandras/TimeCategory' INTO TABLE TimeCategory")
import sys from pyspark import SparkConf, SparkContext from pyspark.sql.context import SQLContext input = sys.argv[1] conf = SparkConf().setAppName('Crime Analysis') sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) # loading Crime data from parquet time = sqlContext.read.parquet(input).cache() time.registerTempTable('Time_table') # Time_table = sqlContext.sql( '''select DayOfWeek,SUBSTR(Time,1,2) as hour, count(Category) as cnt from Time_table group by DayOfWeek,SUBSTR(Time,1,2)''' ).cache() Time_table.registerTempTable('Time_cnt') Time_Max = sqlContext.sql( '''select DayOfWeek,max(cnt) as cnt from Time_cnt group by DayOfWeek ''') Time_Max.registerTempTable('Time_max') # Day wise - hour wise - Max crime calculation Time_week_Max = sqlContext.sql( '''select TM.DayOfWeek,TC.hour,TM.cnt from Time_cnt TC,Time_max TM where TC.DayOfWeek=TM.DayOfWeek and TC.cnt=TM.cnt ''') #coalescing the final output as it a safe operation to do on our final results-it doesnt affect parallel operation much Time_week_Max.coalesce(1).write.format('com.databricks.spark.csv').option(
#!/usr/bin/env python # -*- coding: utf-8 -*- import logging from pyspark import SparkContext, SparkConf from pyspark.sql.context import SQLContext from pyspark.sql.types import Row __author__ = 'guotengfei' __time__ = 2019 / 11 / 26 """ Module comment """ LOGGER = logging.getLogger(__name__) if __name__ == '__main__': conf = SparkConf().setAppName('vcom') sc = SparkContext(conf=conf) sql = SQLContext(sc) lines = sc.textFile('../data/users.txt') user = lines.map(lambda l: l.split(",")).map(lambda p: (p[0], p[1])) sql.createDataFrame(user, ['id', 'name']).registerTempTable('user') df = sql.sql("select id,name from user") df.write.save("../data/result", format='json')
# sub-Crime associated with particular crime (LARCENY-THEFT) import sys from pyspark import SparkConf, SparkContext from pyspark.sql.context import SQLContext input = sys.argv[1] conf = SparkConf().setAppName('Crime Analysis') sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) crime = sqlContext.read.parquet(input).cache() crime.registerTempTable('Crime') Crime = sqlContext.sql('''select IncidntNum,count(IncidntNum) as inc_num_cnt from Crime group by IncidntNum having count(IncidntNum)>1 ''').cache() Crime.registerTempTable('Crime_Frequency') Crime_Frequency = sqlContext.sql( '''select distinct CF.IncidntNum from Crime_Frequency CF,Crime C Where CF.IncidntNum=C.IncidntNum and C.Category='LARCENY-THEFT' ''' ) Crime_Frequency.registerTempTable('Crime_Vehicle_Theft') # Extracting crimes that are associated with LARCENY-THEFT Crime_Vehicle_Theft = sqlContext.sql( '''select CV.IncidntNum,C.Category,count(Category) as cnt from Crime_Vehicle_Theft CV,Crime C where CV.IncidntNum=C.IncidntNum and C.Category not in ('LARCENY-THEFT','OTHER OFFENSES') group by CV.IncidntNum,C.Category having count(Category)>1''' ) Crime_Vehicle_Theft.registerTempTable('Crime_VT_Frequency')
from pyspark.sql.context import SQLContext,HiveContext from pyspark.sql.functions import * inputs=sys.argv[1] conf = SparkConf().setAppName('District wise Analysis') sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) hiveContext = HiveContext(sc) crime=sqlContext.read.parquet(inputs) crime.registerTempTable('Crime') Crime_District=sqlContext.sql('''select PdDistrict,Category,count(Category) as cnt from Crime group by PdDistrict,Category order by PdDistrict ''') Crime_District.registerTempTable('Crime_District') Crime_count=sqlContext.sql('''select * from Crime_District where cnt<>1''') #loading only aggregated records to save in csv so that hive can Query only less amount of records . Crime_count.coalesce(1).write.format('com.databricks.spark.csv').save('District_top') # Using Hive and creating table from csv hiveContext.sql("DROP TABLE IF EXISTS district") hiveContext.sql("CREATE TABLE district (PdDistrict STRING, Category STRING, counts int) row format delimited fields terminated by ',' stored as textfile") #loading csv contents into csv hiveContext.sql("LOAD DATA INPATH '/user/chandras/District_top' INTO TABLE district") District=hiveContext.sql("SELECT PdDistrict,Category,counts FROM (SELECT PdDistrict,Category,counts,dense_rank() OVER (PARTITION BY PdDistrict ORDER BY counts DESC) as rank FROM district) tmp WHERE rank <= 3");
#Blue Collar vs White Collar Crime import sys from pyspark import SparkConf, SparkContext from pyspark.sql.context import SQLContext input = sys.argv[1] conf = SparkConf().setAppName('Crime Analysis') sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) #loading crime data from parquet files crime = sqlContext.read.parquet(input).cache() crime.registerTempTable('Crime') Crime_Total = sqlContext.sql( '''select Year,PdDistrict,count(Category) as cnt from Crime group by Year,PdDistrict''' ) Crime_Total.registerTempTable('Crime_Total') Blue_Crime = sqlContext.sql( '''select Year,PdDistrict,count(Category) as blue_count from Crime where Category in ('VANDALISM','VEHICLE THEFT','LARCENY-THEFT','ROBBERY','BURGLARY', 'STOLEN PROPERTY','DISORDERLY CONDUCT','ASSAULT','KIDNAPPING','RECOVERED VEHICLE') group by Year,PdDistrict order by Year,blue_count desc''' ).cache() Blue_Crime.registerTempTable('Blue_Crime') White_Crime = sqlContext.sql( '''select Year,PdDistrict ,count(Category) as white_count from Crime where Category in ('FRAUD','FORGERY-COUNTERFEITING','GAMBLING','EMBEZZLEMENT','SECONDARY CODES', 'EXTORTION','BRIBERY','BAD CHECKS','KIDNAPPING','RECOVERED VEHICLE') group by Year,PdDistrict order by Year,white_count desc'''
class MainApp(object): def __init__(self): pass def init(self): os.environ[ "SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2" # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY> # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY> conf = SparkConf() conf.setMaster("local") conf.setAppName("PySparkShell") conf.set("spark.executor.memory", "2g") # conf.set("spark.driver.memory", "1g") self.sc = SparkContext(conf=conf) self.sqlContext = SQLContext(self.sc) def loadData(self): self.df_review = self.sqlContext.read.json( "../yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json" ).cache() # self.df_review = self.sqlContext.read.json("s3n://ds-emr-spark/data/yelp_academic_dataset_review.json").cache() self.df_business = self.sqlContext.read.json( "../yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json" ).cache() # self.df_business = self.sqlContext.read.json("s3n://ds-emr-spark/data/yelp_academic_dataset_business.json").cache() self.df_review.registerTempTable("reviews") self.df_business.registerTempTable("business") def createCheckInDataPerUser(self): review_user = self.sqlContext.sql( "SELECT business_id, user_id FROM reviews") business_loc = self.sqlContext.sql( "SELECT business_id, latitude, longitude FROM business") review_user.registerTempTable("reviews_user") business_loc.registerTempTable("business_loc") self.df_join_reviewAndBusiness = self.sqlContext.sql( "SELECT r.user_id, b.latitude, b.longitude FROM reviews_user r JOIN business_loc b ON r.business_id = b.business_id" ).cache() self.df_join_reviewAndBusiness.registerTempTable("userBusiness") self.df_unique_users = self.sqlContext.sql( "SELECT DISTINCT user_id FROM userBusiness where user_id = \"SIfJLNMv7vBwo-fSipxNgg\"" ) self.df_unique_users.registerTempTable("users") pd = self.df_join_reviewAndBusiness.toPandas() global_db = self.sc.broadcast(pd) schema = StructType([ StructField("latitude", FloatType()), StructField("longitude", FloatType()) ]) partialFunc = partial(getLocationsOfUser, business_db=global_db.value) self.get_locations = udf(partialFunc, ArrayType(schema)) self.get_centers = udf(getCentersOfUser, ArrayType(schema)) self.df_unique_users = self.df_unique_users.withColumn( "user_locations", self.get_locations(self.df_unique_users["user_id"])) self.df_unique_users.registerTempTable("users") self.df_unique_users.repartition(1).write.save("user.json", "json", "overwrite") print( getCentersOfUser( self.df_unique_users.toPandas().iloc[0]["user_locations"])) self.df_unique_users = self.df_unique_users.withColumn( "user_centers", self.get_centers(self.df_unique_users["user_locations"])) self.df_unique_users.registerTempTable("users") self.df_unique_users.repartition(1).write.save("center.json", "json", "overwrite") self.df_unique_users.show() def distanceCalc(self): self.df_unique_users = self.sqlContext.read.json( "user.json/part-r-00000-23a1b514-f5fe-4f61-9a64-01ebbc88c146" ).cache() print( len( getCentersOfUser(self.df_unique_users.toPandas().iloc[0] ["user_locations"])))
from pyspark.sql.readwriter import DataFrameReader, DataFrameWriter from pyspark.sql.window import Window, WindowSpec conf = SparkConf().setMaster("local").setAppName("MyApp") sc = SparkContext(conf=conf) spark = SQLContext(sc) def stopSpark(): sc.stop() def f(x): rel = {} rel['name'] = x[0] rel['age'] = x[1] return rel people = sc.textFile("file:///usr/local/my_soft/spark-2.1.0/examples/src/main/resources/people.txt")\ .map(lambda line : line.split(','))\ .map(lambda p: Row(name=p[0], age=int(p[1]))) peopleDF = spark.createDataFrame(people) # peopleDF = sc.textFile("file:///usr/local/my_soft/spark-2.1.0/examples/src/main/resources/people.txt")\ # .map(lambda line : line.split(','))\ # .map(lambda x: Row(**f(x))).toDF() peopleDF.createOrReplaceTempView("people") # 必须注册为临时表才能供下面的查询使用 personsDF = spark.sql("select name,age from people where age>20") personsDF.rdd.map(lambda t: "Name:" + t[0] + "," + "Age:" + str(t[1])).foreach( print)
class MainApp(object): def __init__(self): pass def init(self): os.environ[ "SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2" # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY> # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY> conf = SparkConf() conf.setMaster("local") conf.setAppName("My application") conf.set("spark.executor.memory", "2g") self.sc = SparkContext(conf=conf) self.sqlContext = SQLContext(self.sc) self.df_user = self.sqlContext.read.json("dataset/user.json").cache() self.df_review = self.sqlContext.read.json( "dataset/review.json").cache() self.df_business = self.sqlContext.read.json( "dataset/business.json").cache() self.df_user.registerTempTable("user") def getS3File(self, s3FilePath, destinationPathOnLocal): r = requests.get(s3FilePath) fileOb = open(destinationPathOnLocal, 'w') fileOb.write(r.text) fileOb.close() def writeToS3File(self, s3FilePath, sourcePathOnLocal): fileOb = open(sourcePathOnLocal, 'r') payload = fileOb.read() fileOb.close() headers = {"x-amz-acl": "public-read-write"} return requests.put(s3FilePath, headers=headers, data=payload) def reads3spark(self, path): # path = "s3n://b-datasets/flight_data/*" x = self.sc.textFile(path) # we can just specify all the files. return x def writes3spark(self, x, path): x.saveAsTextFile(path) def createFeatures(self): userData = self.sqlContext.sql( "SELECT user_id, name, review_count, votes, fans, yelping_since, elite FROM user" ) userData = userData.map(mapUsers).coalesce(1) res = self.sqlContext.createDataFrame(userData) review_user = self.df_review.select(self.df_review.business_id, self.df_review.user_id) business_loc = self.df_business.select(self.df_business.business_id, self.df_business.city, self.df_business.state) df_join_reviewAndBusiness = review_user.join( business_loc, review_user.business_id == business_loc.business_id).select( "user_id", "city", "state") df_grouped = df_join_reviewAndBusiness.groupBy( ["user_id", "city", "state"]).count() df_panda = res.toPandas() for name, group in df_grouped: if (group['city'] > 10): user_id = df_grouped.get_group(name)[0]['user_id'] df_panda[user_id]['k'] = df_panda[user_id]['k'] + 1 res = self.sqlContext.createDataFrame(df_panda) res.toJSON().saveAsTextFile('user_features.json')
swimmers.sql( "select name, eyeColor from swimmers where eyeColor like 'b%'").show # reality data flightPerfFilePath = "C:\\Users\\sangjiaqi\\Desktop\\软件\\Scala\\learningPySpark-master\\Chapter03\\flight-data\\departuredelays.csv" airportsFilePath = "C:\\Users\\sangjiaqi\\Desktop\\软件\\Scala\\learningPySpark-master\\Chapter03\\flight-data\\airport-codes-na.txt" airports = sqlContext.read.csv(airportsFilePath, header='true', inferSchema='true', sep='\t') airports.createOrReplaceTempView("airports") flightPerf = sqlContext.read.csv(flightPerfFilePath, header='true') flightPerf.createOrReplaceTempView("FlightPerformance") flightPerf.cache sqlContext.sql(""" select a.City, f.origin, sum(f.delay) as Delays from FlightPerformace f join airports a on a.IATA = f.orifin where a.State = 'WA' group by a.City, f.origin order by sum(f.delay) desc """).show
# -*- coding: utf-8 -*- ''' Created on 2018年1月5日 @author: root ''' from com.bjsxt.python.test import SparkUtil from pyspark.sql.context import SQLContext conf = SparkUtil.initSparkConf(True, "DataFrameOpsFromFile") sc = SparkUtil.initSparkContext(conf) sqlContext = SQLContext(sc) df = sqlContext.read.json("../data/people.json") df.registerTempTable("people") sqlContext.sql("select * from people where age > 20").show() sc.stop()
import sys from pyspark import SparkConf, SparkContext from pyspark.sql.context import SQLContext input = sys.argv[1] conf = SparkConf().setAppName('Crime Analysis') sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) #loading crime data from parquet files crime = sqlContext.read.parquet(input).cache() crime.registerTempTable('Crime') # taking count of resolution against the crime excluding resolution in Pending/In-progress/No Resolution Crime_Resolution = sqlContext.sql( '''select PdDistrict,count(Resolution) as cnt from Crime Where Resolution<>'NONE' Group by PdDistrict order by PdDistrict''' ) Crime_Resolution.registerTempTable('Crime_Resolution') # ignoring bad data Crime_Res = sqlContext.sql( '''select * from Crime_Resolution where cnt<>1 ''').cache() Crime_Res.registerTempTable('Crime_Res') # taking count of resolution against the crime Including resolution in Pending/In-progress/No Resolution Crime_Resolution_ALL = sqlContext.sql( '''select PdDistrict,count(Resolution) as cnt from Crime Group by PdDistrict order by PdDistrict''' ) Crime_Resolution_ALL.registerTempTable('Crime_Resolution_ALL') Crime_Res_ALL = sqlContext.sql(