def clacForFile(features, file, otherFile):
    print('running on file {0}'.format(file))
    session = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    # SQLContext expects a SparkContext, not a SparkSession
    spark = SQLContext(session.sparkContext)
    data = spark.read.csv(file, header=True).coalesce(100)
    #otherData = spark.read.csv(otherFile, header=True).coalesce(100)
    data.createTempView('clicks')
    #otherData.createTempView('otherDataclicks')
    #spark.cacheTable('clicks')
    fieldsCsv = ','.join(ALL_COLUMNS)
    query = 'select case clicked when 1 then 1 else 0 end as clicked, ' \
            'case clicked when 1 then 0 else 1 end as nonclicked, rowIndex, ' \
            '{0} from clicks '.format(fieldsCsv)
    d1 = spark.sql(query)
    d1.createTempView('countClicks')
    spark.cacheTable('countClicks')
    numOfRows = spark.sql('select count(*) as numOfRows from countClicks').take(1)[0]['numOfRows']
    print('running on file {0} and numOfRows {1}'.format(file, numOfRows))
    for column in ALL_COLUMNS:
        features[column]['totalNumOfRows'] = features[column]['totalNumOfRows'] + numOfRows
        features[column]['numOfRowsPerFile'].append(numOfRows)
    print('running on file {0} and noNullColumns'.format(file))
    noNullColumns(spark, ALL_COLUMNS, features)
    print('running on file {0} and distinctValuesFilter'.format(file))
    distinctValuesFilter(spark, ALL_COLUMNS, features)
    # spark.cacheTable('countClicks')
    print('running on file {0} and adsAndClicksPerUserFilter'.format(file))
    adsAndClicksPerUserFilter(spark, numOfRows, ALL_COLUMNS, features)
    session.stop()
    print('done')
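# The `features` argument above is an accumulator dict keyed by column name.
# Only the two keys the function itself touches are visible in this excerpt;
# a minimal, hypothetical initializer (the filter helpers may expect more keys):
def init_features(columns):
    return {column: {'totalNumOfRows': 0, 'numOfRowsPerFile': []}
            for column in columns}

features = init_features(ALL_COLUMNS)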
def main(sc):
    sqlContext = SQLContext(sc)
    taxiFile = sc.textFile("taxizip/taxizipaa.csv")
    header = taxiFile.first()
    taxiHeader = taxiFile.filter(lambda l: "vendor_id" in l)
    taxiNoHeader = taxiFile.subtract(taxiHeader)
    taxi_temp = taxiNoHeader.map(lambda k: k.split(","))
    taxi_rdd = taxi_temp.map(lambda p: Row(
        vendor_id=p[0],
        pickup_datetime=datetime.strptime(p[1], "%Y-%m-%d %H:%M:%S"),
        dropoff_datetime=datetime.strptime(p[2], "%Y-%m-%d %H:%M:%S"),
        passenger_count=int(p[3] if p[3] != "" else 0),
        trip_distance=float(p[4] if p[4] != "" else 0),
        pickup_longitude=float(p[5] if p[5] != "" else 0),
        pickup_latitude=float(p[6] if p[6] != "" else 0),
        rate_code=p[7],
        store_and_fwd_flag=p[8],
        dropoff_longitude=float(p[9] if p[9] != "" else 0),
        dropoff_latitude=float(p[10] if p[10] != "" else 0),
        payment_type=p[11],
        fare_amount=float(p[12] if p[12] != "" else 0),
        surcharge=float(p[13] if p[13] != "" else 0),
        mta_tax=float(p[14] if p[14] != "" else 0),
        tip_amount=float(p[15] if p[15] != "" else 0),
        tolls_amount=float(p[16] if p[16] != "" else 0),
        total_amount=float(p[17] if p[17] != "" else 0),
        zipcode=p[18]))
    taxi_df = sqlContext.createDataFrame(taxi_rdd)
    taxi_df.registerTempTable("taxi")
    sqlContext.registerFunction("to_hour", lambda x: x.hour)
    sqlContext.registerFunction("str_date", lambda x: str(x.month) + "-" + str(x.day) + "-" + str(x.year))
    th = sqlContext.sql("SELECT to_hour(dropoff_datetime) as hour, dropoff_datetime as trip_date, "
                        "dropoff_longitude as lng, dropoff_latitude as lat, zipcode FROM taxi "
                        "where dropoff_longitude != 0 and dropoff_latitude != 0")
    th.registerTempTable("taxi_hr")
    sqlContext.cacheTable("taxi_hr")
    grouped_taxi = sqlContext.sql("select hour, zipcode, str_date(trip_date), count(*) as c from taxi_hr "
                                  "group by hour, zipcode, str_date(trip_date) order by c desc")
    grouped_taxi.show(100)
    # save this intermediate result to a file as csv
    grouped_csv = grouped_taxi.map(toCSV)
    grouped_csv.saveAsTextFile('results')
    grouped_taxi.registerTempTable("taxi_grouped")
    sqlContext.cacheTable("taxi_grouped")
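# The toCSV helper mapped over grouped_taxi is not shown in this excerpt.
# A minimal sketch, assuming it simply joins a Row's fields with commas:
def toCSV(row):
    return ",".join(str(field) for field in row)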
    return x.upper()


if __name__ == "__main__":
    # read json file
    inputFile = sys.argv[1]
    conf = SparkConf().setAppName("SparkSQLTwitter")
    sc = SparkContext(conf=conf)
    sqlCtx = SQLContext(sc)
    input = sqlCtx.read.json(inputFile)
    # print data schema
    # print input.printSchema()
    # get partitions number
    print input.rdd.getNumPartitions()
    input.createOrReplaceTempView("tb_data")
    sqlCtx.cacheTable("tb_data")
    data = sqlCtx.sql("select * from tb_data")
    print "----read json file and save as parquet file----"
    data.show()
    data.write.save("./parquetFile", mode="overwrite", format="parquet")
    # Read parquet file
    print "----read parquet file and show----"
    rows = sqlCtx.read.parquet("./parquetFile")
    print rows.show()
    # Use Row to create a DataFrame based on an RDD
    happyPeopleRDD = sc.parallelize(
        [Row(name="holden", favouriteBeverage="coffee")])
    happyPeopleSchemaRDD = sqlCtx.createDataFrame(happyPeopleRDD)
    happyPeopleSchemaRDD.createOrReplaceTempView("happy_people")
    print sqlCtx.sql("select * from happy_people").show()
    # UDF
def main():
    """ Main function to execute the HW assignments """
    print "Adv DB HW"
    conf = (SparkConf().setMaster("local").setAppName("Adv DB HW app").set(
        "spark.executor.memory", "1g"))
    sc = SparkContext(conf=conf)
    # Use a non-interactive backend in case no DISPLAY is defined
    matplotlib.use('Agg')
    # Import weather dataset
    weatherDataset = sc.textFile("/datasets/2016weather.csv")
    # Import user dataset
    userDataset = sc.textFile("/datasets/userdata.csv")
    # Parse datasets
    weatherParse = weatherDataset.map(lambda line: line.split(","))
    userParse = userDataset.map(lambda line: line.split(","))
    # Work on datasets
    weatherSnow = weatherParse.filter(lambda x: x[2] == "SNOW")
    # Just testing
    firstWeather = weatherParse.first()
    print "firstWeather is ", firstWeather
    # instantiate SQLContext object
    sqlContext = SQLContext(sc)
    # Convert each line of the weatherSnow RDD into a Row object
    snowRows = weatherSnow.map(
        lambda p: Row(station=p[0],
                      month=datetime.strptime(p[1], '%Y%m%d').month,
                      date=datetime.strptime(p[1], '%Y%m%d').day,
                      metric=p[2],
                      value=int(p[3])))
    # Apply Row schema
    snowSchema = sqlContext.createDataFrame(snowRows)
    # Register 'snow2016' table with 5 columns: station, month, date, metric, and value
    snowSchema.registerTempTable("snow2016")
    sqlContext.cacheTable("snow2016")
    snow_US10chey021 = sqlContext.sql(
        "SELECT month, COUNT(*) AS snowdays FROM snow2016 WHERE station='US10chey021' GROUP BY month ORDER BY month"
    ).collect()
    snow_US10chey021[0]
    # start from a list of 12 zeroes, one per month
    US10chey021_snowdays_y = [0] * 12
    for row in snow_US10chey021:
        US10chey021_snowdays_y[row.month - 1] = row.snowdays
    print(US10chey021_snowdays_y)
    snow_USW00094985 = sqlContext.sql(
        "SELECT month, COUNT(*) AS snowdays FROM snow2016 WHERE station='USW00094985' GROUP BY month ORDER BY month"
    ).collect()
    USW00094985_snowdays_y = [0] * 12
    for row in snow_USW00094985:
        USW00094985_snowdays_y[row.month - 1] = row.snowdays
    print(USW00094985_snowdays_y)
    """ Print figures """
    N = 12
    ind = np.arange(N)
    width = 0.35
    pUS10chey021 = plt.bar(ind, US10chey021_snowdays_y, width,
                           color='g', label='US10chey021')
    pUSW00094985 = plt.bar(ind + width, USW00094985_snowdays_y, width,
                           color='y', label='USW00094985')
    plt.ylabel('SNOW DAYS')
    plt.xlabel('MONTH')
    plt.title('Snow Days in 2016 at Stations US10chey021 vs. USW00094985')
    plt.xticks(ind + width, ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                             'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'))
    plt.legend()
    plt.savefig('/python-scripts/examples/weatherfig.png')
    plt.show()
    snowStations = sqlContext.sql(
        "SELECT station, COUNT(*) AS snowdays FROM snow2016 WHERE station LIKE 'US%' GROUP BY station ORDER BY station LIMIT 100"
    )
    snowStations.head(5)
    sqlContext.registerDataFrameAsTable(snowStations, "snowdays_2016")
    snowStations_new = sqlContext.sql(
        "SELECT station, snowdays FROM snowdays_2016 ORDER BY snowdays DESC LIMIT 5"
    ).collect()
    for row in snowStations_new:
        print(row)
sc = SparkContext("local[*]", "Simple App") #sc = SparkContext("spark://url:7077", "Simple App") sqlContext = SQLContext(sc) sqlContext.setConf("spark.sql.shuffle.partitions", "5") # issue movies query conf = {"es.resource" : "movies2/logs", "es.query" : "?q=name:picture"} movies = sc.newAPIHadoopRDD("org.elasticsearch.hadoop.mr.EsInputFormat",\ "org.apache.hadoop.io.NullWritable", "org.elasticsearch.hadoop.mr.LinkedMapWritable", conf=conf) # place results in table moviesRows = movies.map(lambda p: Row(id=int(p[1]['id']), name=p[1]['name'])) moviesRowsList = moviesRows.collect() schemaMovies = sqlContext.createDataFrame(moviesRowsList) schemaMovies.registerTempTable("movies") sqlContext.cacheTable("movies") # get ids in order to form acted_in query ids = [] for moviesRow in moviesRowsList: ids.append(moviesRow['id']) movieIdSnippets = [] for id in ids: movieIdSnippets.append("movie_id:" + str(id)) # partition acted_in query actedInRowsTotalList = [] movieIdSnippetsChunks = list(chunks(movieIdSnippets, 1000)) for chunk in movieIdSnippetsChunks: movieIdQuery = " OR ".join(chunk) conf = {"es.resource" : "acted_in2/logs", "es.query" : "?q=" + movieIdQuery, "es.size" : "10000"}
Sentiment = sqlContext.read.parquet("swift://notebooks.spark/sentiment.parquet")

# In[62]:

print Gender

# In[37]:

followers.registerTempTable("followers")

# In[38]:

sqlContext.cacheTable("followers")
plt1 = sqlContext.sql("SELECT * FROM followers where followers not in (2870776) order by userDisplayName desc")

# In[39]:

df1 = plt1.toPandas()
df1 = df1.set_index('userDisplayName')

# In[75]:

from pylab import rcParams
rcParams['figure.figsize'] = 20, 10
Line = df1.plot(kind='line', title='Count by followers of users', stacked=False)
Line.set_ylabel("No. of followers")
counter = 0
ff = kmeansDF.rdd.map(lambda x: Kmeans(x))
new = ff.groupByKey().map(computecentroid)
dd = new.collect()
for itt in dd:
    index = itt[0]
    val = itt[1]
    centroidlist[index - 1][0] = val[0]
    centroidlist[index - 1][1] = val[1]
classrow = ff.map(lambda x: Row(Zone=x[0], ids=x[1][2]))
finaldf = sqlContext.createDataFrame(classrow)
naivedf = kmeansDF.join(finaldf, (kmeansDF.id == finaldf.ids))

'''------------------------------ Naive Bayes Distributed ------------------------------'''
classification = []
classfinal = {}
naivedf.registerTempTable('bayesDataTable')
sqlContext.cacheTable("bayesDataTable")
timevocab = sqlContext.sql("SELECT count(distinct(Time_Zone)) from bayesDataTable").collect()[0][0]
weekdayvocab = sqlContext.sql("SELECT count(distinct(Weekday)) from bayesDataTable").collect()[0][0]
# zoneList = sqlContext.sql("SELECT distinct(Zone) from bayesDataTable").collect()
min = 0.0
timelist = sqlContext.sql("SELECT Time_Zone FROM bayesDataTable").collect()
weeklist = sqlContext.sql("SELECT Weekday FROM bayesDataTable").collect()
classpredicted = PredictNaiveBayes(giventimeZone, givenWeekday)
print "Area with highest demand given time and day is ", classpredicted
print "Top three highest demand areas are ", sorted(classfinal, key=classfinal.get, reverse=True)[:3]
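# Kmeans and computecentroid are not shown in this excerpt. A hypothetical
# sketch of computecentroid, assuming each group is (zone, iterable of
# (x, y, id) tuples) and the new centroid is the mean of the coordinates,
# which matches the (index, (x, y)) unpacking in the loop above:
def computecentroid(group):
    zone, points = group
    points = list(points)
    cx = sum(p[0] for p in points) / float(len(points))
    cy = sum(p[1] for p in points) / float(len(points))
    return (zone, (cx, cy))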
file_list = cols.split(',')
#print("file list: ", file_list)
file_id = 0
path = '/user/hm74/NYCColumns/'
labels = dict()
excluded_files = {
    '4d7f-74pe.Address.txt.gz',
    '64gx-bycn.EMPCITY.txt.gz',
    'pq5i-thsu.DVC_MAKE.txt.gz',
    'jz4z-kudi.Respondent_Address__Zip_Code_.txt.gz',
    'tukx-dsca.Address_1.txt.gz',
    'h9gi-nx95.VEHICLE_TYPE_CODE_1.txt.gz',
    'mdcw-n682.First_Name.txt.gz',
    'sxx4-xhzg.Park_Site_Name.txt.gz',
}
for file in file_list:
    if file_id < 135 and file not in excluded_files:
        inFile = path + file
        print("File Name: ", file)
        file_name = file.split('.')[1]
        if file in labels:
            labels[file].append(file_name)
        else:
            labels[file] = [file_name]
        spark = SparkSession(sc)
        col = sqlContext.read.format('csv').options(inferschema='true',
                                                    delimiter='\t').load(inFile)
        col.createOrReplaceTempView("col")
        sqlContext.cacheTable("col")
        spark.sql("select * from col").show(col.count(), False)
        file_id += 1
    else:
        file_id += 1
with open('135_label.json', 'w', encoding='utf-8') as f:
    json.dump(labels, f, ensure_ascii=False)
sc.stop()
    zonenum = eachh[0]
    newcen = eachh[1]
    # Write the new centroid into ListOfCentroids
    ListOfCentroids[zonenum - 1][0] = newcen[0]
    ListOfCentroids[zonenum - 1][1] = newcen[1]
classrow = TempRdd.map(lambda x: Row(Zone=x[0], idxx=x[1][2]))
finaldf = sqlContext.createDataFrame(classrow)
# Add a column called "Zone" to each record/row in the RDD
NBDF = KMDF.join(finaldf, (KMDF.id == finaldf.idxx))
Result = {}
NBDF.registerTempTable('NBtable')
# Cache the table to reduce execution time
sqlContext.cacheTable("NBtable")
# Total vocabulary of periods in the file
PeriodVocabulary = sqlContext.sql(
    "SELECT count(distinct(Period)) from NBtable").collect()[0][0]
# Total vocabulary of days in the file
DayVocabulary = sqlContext.sql(
    "SELECT count(distinct(Weekday)) from NBtable").collect()[0][0]
WeatherVocabulary = sqlContext.sql(
    "SELECT count(distinct(weather)) from NBtable").collect()[0][0]
totaltimes = sqlContext.sql("SELECT Period FROM NBtable").collect()
weeklist = sqlContext.sql("SELECT Weekday FROM NBtable").collect()
weatherlist = sqlContext.sql("SELECT weather FROM NBtable").collect()
# Pass the user-supplied period, day, and weather to NaiveBayes to predict the zone
PredictedZone = NaiveBayes(InputPeriod, InputDay, InputWeather)
jsonFile = sqlContext.read.json("swift://notebooks.spark/lobbying.json")

# In[3]:

print jsonFile

# In[4]:

jsonFile.registerTempTable("lobbyings")

# In[5]:

sqlContext.cacheTable("lobbyings")

# In[6]:

lobbyings = sqlContext.sql("SELECT * FROM lobbyings")

# In[7]:

lobbyings.cache()

# In[8]:

lobbyings.printSchema()
def main(argv):
    conf = SparkConf().setAppName("checkpoint-2")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    partition = int(argv[0])
    input_file = argv[1]

    # Print info to the console
    print '\nCreating dataframe:\n'
    print 'No. of partitions : ', partition
    print 'input_file : ', input_file

    ## Create rdd from input file and apply map operation to split strings
    rdd = sc.textFile(input_file, partition)
    mapped_rdd = rdd.map(lambda l: l.split(","))

    ## Convert fields into appropriate data types
    aadhaar = mapped_rdd.map(lambda l: \
        Row(date=(datetime.strptime(l[0].strip(), '%Y%m%d').date()), \
            registrar=(l[1].strip()), \
            private_agency=(l[2].strip()), \
            state=(l[3].strip()), \
            district=(l[4].strip()), \
            sub_district=(l[5].strip()), \
            pincode=(l[6].strip()), \
            gender=(l[7].strip()), \
            age=int(l[8].strip()), \
            aadhaar_generated=int(l[9].strip()), \
            rejected=int(l[10].strip()), \
            mobile_number=int(l[11].strip()), \
            email_id=int(l[12].strip())))

    # Apply schema to RDD and create Dataframe
    aadhaar_df = sqlContext.createDataFrame(aadhaar)
    aadhaar_df.persist()
    aadhaar_df.registerTempTable('aadhaar_table')
    sqlContext.cacheTable('aadhaar_table')

    # 7. Describe the schema
    print('7 - Schema : ')
    print('\n')
    aadhaar_df.printSchema()
    print('\n\n')

    # 8. Count and names of registrars in the table
    print('8 - the count and names of registrars in the table ')
    print('\n')
    count_registrars = aadhaar_df.select("registrar").distinct().count()
    print("8.1 - count of registrars : \n", count_registrars)
    print("names of registrars: \n")
    print('\n')
    if aadhaar_df.isLocal():
        aadhaar_df.select("registrar").distinct().show(count_registrars, False)
    else:
        aadhaar_df.select("registrar").distinct().show(truncate=False)
    print('\n\n')

    # 9. Number of states, districts in each state, and sub-districts in each district
    count_states = aadhaar_df.select("state").distinct().count()
    print(" 9 - number of states : ", count_states)
    print('\n')
    query = 'select state, count(distinct district) as district_count from aadhaar_table group by state order by state'
    district_per_state = sqlContext.sql(query)
    print('\n\n')
    print(" 9 - number of districts in each state : ")
    print('\n')
    district_per_state.show(district_per_state.count(), False)
    query = 'select state, district, count(distinct sub_district) as sub_district_count from aadhaar_table group by state, district order by state, district'
    subdist_per_dist = sqlContext.sql(query)
    print(" 9 - number of sub_districts in each district : ")
    print('\n')
    subdist_per_dist.show(subdist_per_dist.count(), False)
    print('\n\n')

    # 10. Number of males and females in each state, with a suitable plot
    query = 'select state, \
        sum(case when gender = \'M\' then 1 else 0 end) as male, \
        sum(case when gender = \'F\' then 1 else 0 end) as female \
        from aadhaar_table \
        group by state \
        order by state'
    mf_each_state = sqlContext.sql(query)
    print(" 10 - number of males and females in each state are: ")
    print('\n')
    mf_each_state.show(mf_each_state.count(), False)
    print('\n\n')

    # Plot the graph using Matplotlib
    df = mf_each_state.toPandas()
    df = df.set_index('state')
    gf = df[['male', 'female']]
    gf.plot.barh(stacked=True)
    plt.title('Number of males and females in each state')
    plt.xlabel('Number of males and females')
    plt.ylabel('States')
    plt.show()

    # 11. Names of private agencies for each state
    query = 'select distinct state, private_agency from aadhaar_table group by state, private_agency'
    prvt_agencies_each_state = sqlContext.sql(query)
    print(" 11 - names of private_agencies in each state are : ")
    print('\n')
    prvt_agencies_each_state.show(prvt_agencies_each_state.count(), False)
    print('\n\n')

    # 12. Number of private agencies for each state
    query = 'select state, count(distinct private_agency) as private_agencies from aadhaar_table group by state order by state'
    prvt_agencies_each_state = sqlContext.sql(query)
    print(" 12 - Number of private_agencies in each state as follows: ")
    print('\n')
    prvt_agencies_each_state.show(prvt_agencies_each_state.count(), False)
    print('\n\n')

    # Plot the graph using Matplotlib
    df = prvt_agencies_each_state.toPandas()
    df = df.set_index('state')
    gf = df['private_agencies']
    gf.plot.barh(stacked=True)
    plt.title('Number of private agencies for each state')
    plt.xlabel('Number of private agencies')
    plt.ylabel('States')
    plt.show()

    # Remove cached elements from memory
    sqlContext.uncacheTable('aadhaar_table')
    aadhaar_df.unpersist()
class SparkDriver:
    def __init__(self, config):
        self.config = config
        log.debug('SPARK_CONFIG: {0}'.format(config))

        # Collect the list of files to process
        path = self.config['data_path']
        file_list = []
        import os
        dir_list = os.listdir(path)
        for d in dir_list:
            d2 = os.path.join(path, d)
            if not os.path.isfile(d2):
                continue
            n, e = os.path.splitext(d)
            if e == '.csv':
                file_list.append(d2)
        file_list = sorted(file_list, reverse=True)
        if 'file_count' in self.config and self.config['file_count'] != -1 \
                and len(file_list) > self.config['file_count']:
            file_list = file_list[:self.config['file_count']]

        # Configure the Spark context
        spark_conf = SparkConf().setMaster(
            self.config['master']
        ).setAppName(
            datetime.now().strftime('%Y%m%d%H%M%S')
        ).set(
            'spark.executor.memory', self.config['executor_memory']
        )
        self.sc = SparkContext(conf=spark_conf)
        self.sqlContext = SQLContext(self.sc)

        # Read the file list
        parts_list = []
        for name in file_list:
            log.debug('+++++ADD FILE: ' + name)
            lines = self.sc.textFile(name)
            parts = lines.map(lambda l: l.split(','))
            parts_list.append(parts)
        log.debug('@@@@@DATA FILE COUNT({0})'.format(len(parts_list)))
        parts = self.sc.union(parts_list)
        reportInfo = parts.map(lambda p: Row(
            order_count=1,
            order_id=p[0],
            downstream=p[1],
            face_value=float(p[2]) * 10000,
            mobile=p[3],
            start_time=p[4] and p[4] != '' and p[4] != 'None' and int(p[4]) or 0,
            end_time=p[5] and p[5] != '' and p[5] != 'None' and int(p[5]) or 0,
            order_status=[p[6], '9'][p[6] not in ('0', '1')],
            downstream_price=p[7] and p[7] != '' and p[7] != 'None' and int(p[7]) or 0,
            carrier=p[8],
            area=p[9],
            product_type=p[10],
            upstream=p[11],
            upstream_price=p[12] and p[12] != '' and p[12] != 'None' and int(p[12]) or 0,
            profit=[0, p[12] and (p[7] and p[7] != '' and p[7] != 'None' and int(p[7]) or 0)
                       - (p[12] and p[12] != '' and p[12] != 'None' and int(p[12]) or 0)][p[6] == '1'],
        ))
        self.sales_temp_table = self.sqlContext.createDataFrame(reportInfo)
        self.sales_temp_table.registerTempTable('sales_temp_table')
        execSql = '''
            SELECT SUM(order_count) AS total_order_count, downstream,
                   SUM(face_value) AS total_face_value, start_time, order_status,
                   SUM(downstream_price) AS total_downstream_price, carrier, area, upstream,
                   SUM(upstream_price) AS total_upstream_price, SUM(profit) AS total_profit
            FROM sales_temp_table
            GROUP BY downstream, start_time, end_time, order_status, carrier, area, upstream
        '''
        log.info('\033[32m*********creating sales_cache_table sql=%s\033[0m' % execSql)
        sqlResult = self.sqlContext.sql(execSql)
        # Turn the raw table into the query table and keep it in memory; an action
        # (e.g. collect) must run before the cached table is actually materialized
        sqlResult.registerTempTable('sales_cache_table')
        # persist(MEMORY_AND_DISK) -> memory-plus-disk mode
        self.sqlContext.cacheTable('sales_cache_table')
        self.sqlContext.sql('SELECT * FROM sales_cache_table').collect()
        # The target query table is now loaded into memory

        # Test code
        if True:
            import os
            path = os.path.join('./', 'test.txt')
            with open(path, 'w') as testFile:
                testFile.write('100')
            self.sc.addFile(path)
            result = self.sc.parallelize([1, 2, 3, 4]).mapPartitions(test_func).collect()
            log.info('SPARK TEST RESULT: {0}'.format(result))

    # Detailed sales report
    def report_sales(self, argu_list):
        # Pagination parameters are not handled for now
        argu_list.pop('page_index')
        argu_list.pop('page_size')
        query_start_time = argu_list['query_start_time']
        query_end_time = argu_list['query_end_time']
        argu_list['query_start_time'] = datetime.strptime(argu_list['query_start_time'], '%Y/%m/%d')
        argu_list['query_end_time'] = datetime.strptime(argu_list['query_end_time'], '%Y/%m/%d') + timedelta(days=1)
        # Drop filters whose value is empty
        # argu_list = {k: argu_list[k] for k in argu_list if argu_list[k] and argu_list[k] != ''}
        if 'query_start_time' in argu_list:
            argu_list['query_start_time'] = int(argu_list['query_start_time'].timestamp())
        if 'query_end_time' in argu_list:
            argu_list['query_end_time'] = int(argu_list['query_end_time'].timestamp())
        log.info('SparkDriver report REQU: {0}'.format(argu_list))

        # Assemble the SQL statement
        # Columns to display
        select_sql = '''
            SELECT SUM(total_face_value) as total_face_value,
                   SUM(total_downstream_price) as total_downstream_price,
                   SUM(total_upstream_price) as total_upstream_price,
                   SUM(total_profit) as total_profit,
                   SUM(total_order_count) as total_order_count,'''
        where_sql = 'WHERE (start_time>={query_start_time}) AND (start_time<{query_end_time})'.format(
            query_start_time=argu_list['query_start_time'],
            query_end_time=argu_list['query_end_time'],
        )
        argu_list.pop('query_start_time')
        argu_list.pop('query_end_time')
        groupby_sql = 'GROUP BY'
        for key in argu_list:
            if argu_list[key] == '':  # "all"
                continue
            select_sql += ' {0},'.format(key)
            groupby_sql += ' {0},'.format(key)
            if argu_list[key] == '@':  # "select all"
                continue
            elif isinstance(argu_list[key], list):  # list of values
                value_list = ''
                for value in argu_list[key]:
                    value_list += '"{0}",'.format(value)
                if value_list != '':
                    value_list = value_list[:-1]
                    where_sql += ' AND {key} IN ({value_list})'.format(key=key, value_list=value_list)
            else:
                where_sql += ' AND {key}="{value}"'.format(key=key, value=argu_list[key])
        default_upstream = '未知'  # "unknown"
        if 'upstream' in argu_list and argu_list['upstream'] == '':
            default_upstream = '全部'  # "all"
        default_downstream = '未知'
        if 'downstream' in argu_list and argu_list['downstream'] == '':
            default_downstream = '全部'
        select_sql = select_sql[:-1]
        if groupby_sql != 'GROUP BY':
            groupby_sql = groupby_sql[:-1]
        else:
            groupby_sql = ''
        query_sql = '{select_sql} FROM sales_cache_table {where_sql} {groupby_sql}'.format(
            select_sql=select_sql, where_sql=where_sql, groupby_sql=groupby_sql)
        # query_sql = ''
        # with open('./test.sql', 'r') as f:
        #     query_sql = f.read()
        log.info('\033[32m*************query_sql=%s\033[0m ' % query_sql)
        query_result = self.sqlContext.sql(query_sql)
        query_output = query_result.collect()
        data_list = []
        for data in query_output:
            data = data.asDict()
            log.debug(data)
            total_face_value = data['total_face_value'] != None and data['total_face_value'] or 0
            total_downstream_price = data['total_downstream_price'] != None and data['total_downstream_price'] or 0
            total_upstream_price = data['total_upstream_price'] != None and data['total_upstream_price'] or 0
            total_order_count = data['total_order_count'] != None and data['total_order_count'] or 0
            total_profit = data['total_profit'] != None and data['total_profit'] or 0
            one_line = {
                'date_range': query_start_time + '-' + query_end_time,
                'order_status': 'order_status' in data and data['order_status'] or '',
                'upstream': 'upstream' in data and data['upstream'] or default_upstream,
                'downstream': 'downstream' in data and data['downstream'] or default_downstream,
                'carrier': 'carrier' in data and data['carrier'] or '',
                'area': 'area' in data and data['area'] or '',
                'total_face_value': total_face_value,
                'total_downstream_price': total_downstream_price,
                'total_upstream_price': total_upstream_price,
                'total_order_count': total_order_count,
                'total_profit': total_profit
            }
            data_list.append(one_line)
        return data_list

    # Summary sales report
    def report_sales_brief(self, argu_list):
        # Pagination parameters are not handled for now
        argu_list.pop('page_index')
        argu_list.pop('page_size')
        query_start_time = argu_list['query_start_time']
        query_end_time = argu_list['query_end_time']
        argu_list['query_start_time'] = datetime.strptime(argu_list['query_start_time'], '%Y/%m/%d')
        argu_list['query_end_time'] = datetime.strptime(argu_list['query_end_time'], '%Y/%m/%d') + timedelta(days=1)
        if 'query_start_time' in argu_list:
            argu_list['query_start_time'] = int(argu_list['query_start_time'].timestamp())
        if 'query_end_time' in argu_list:
            argu_list['query_end_time'] = int(argu_list['query_end_time'].timestamp())
        log.info('SparkDriver report REQU: {0}'.format(argu_list))

        # Assemble the SQL statement
        # Columns to display
        select_sql = '''
            SELECT SUM(total_downstream_price) as total_downstream_price,
                   SUM(total_order_count) as total_order_count,'''
        where_sql = 'WHERE (start_time>={query_start_time}) AND (start_time<{query_end_time})'.format(
            query_start_time=argu_list['query_start_time'],
            query_end_time=argu_list['query_end_time'],
        )
        argu_list.pop('query_start_time')
        argu_list.pop('query_end_time')
        groupby_sql = 'GROUP BY'
        for key in argu_list:
            if argu_list[key] == '':  # "all"
                continue
            select_sql += ' {0},'.format(key)
            groupby_sql += ' {0},'.format(key)
            if argu_list[key] == '@':  # "select all"
                continue
            elif isinstance(argu_list[key], list):  # list of values
                value_list = ''
                for value in argu_list[key]:
                    value_list += '"{0}",'.format(value)
                if value_list != '':
                    value_list = value_list[:-1]
                    where_sql += ' AND {key} IN ({value_list})'.format(key=key, value_list=value_list)
            else:
                where_sql += ' AND {key}="{value}"'.format(key=key, value=argu_list[key])
        default_downstream = '未知'  # "unknown"
        if 'downstream' in argu_list and argu_list['downstream'] == '':
            default_downstream = '全部'  # "all"
        select_sql = select_sql[:-1]
        if groupby_sql != 'GROUP BY':
            groupby_sql = groupby_sql[:-1]
        else:
            groupby_sql = ''
        query_sql = '{select_sql} FROM sales_cache_table {where_sql} {groupby_sql}'.format(
            select_sql=select_sql, where_sql=where_sql, groupby_sql=groupby_sql)
        log.info('\033[32m*************query_sql=%s\033[0m ' % query_sql)
        query_result = self.sqlContext.sql(query_sql)
        query_output = query_result.collect()
        # Assemble the return payload
        all_total_downstream_price = 0
        for data in query_output:
            all_total_downstream_price += data['total_downstream_price']
        data_list = []
        for data in query_output:
            data = data.asDict()
            log.debug(data)
            total_downstream_price = data['total_downstream_price'] != None and data['total_downstream_price'] or 0
            total_order_count = data['total_order_count'] != None and data['total_order_count'] or 0
            one_line = {
                'date_range': query_start_time + '-' + query_end_time,
                'downstream': 'downstream' in data and data['downstream'] or default_downstream,
                'carrier': 'carrier' in data and data['carrier'] or '',
                'total_downstream_price': total_downstream_price,
                'total_order_count': total_order_count,
                'percentage': ("%.2f%%" % (total_downstream_price * 100 / all_total_downstream_price))
            }
            data_list.append(one_line)
        return data_list
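# The test_func used with mapPartitions in the addFile smoke test above is
# not part of this excerpt. A minimal sketch, assuming it mirrors the
# standard SparkFiles example: read back the shipped test.txt on each
# partition and scale the partition's elements by its value.
from pyspark import SparkFiles

def test_func(iterator):
    # SparkFiles.get resolves the local path of a file shipped via sc.addFile
    with open(SparkFiles.get('test.txt')) as f:
        file_val = int(f.readline())
    return [x * file_val for x in iterator]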
for file in file_list:
    print("---" + str(file_id) + "th file---")
    fileName = file[0]
    print(fileName)
    file_id += 1
    temp_JSON = dict()
    temp_JSON['dataset_name'] = fileName
    path = inFile + "/" + fileName
    spark = SparkSession(sc)
    test = sqlContext.read.format('csv').options(header='true',
                                                 inferschema='true',
                                                 delimiter='\t').load(path)
    test.createOrReplaceTempView("test")
    sqlContext.cacheTable("test")
    #spark.sql("select * from test").show()
    #spark.sql("select count(*) from test").show()
    attrs = test.schema.names
    columns = []
    candidate_keys = []
    for attr in attrs:
        temp_column = dict()
        #print("column name:", attr)
        temp_column['column_name'] = attr
        temp = spark.sql("SELECT `" + attr + "` as attr FROM test")
        #temp.show()
        temp.createOrReplaceTempView("temp")
        temp_count = temp.count()
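        # --- hypothetical continuation, not in the original excerpt ---
        # A common next step is to flag attr as a candidate key when all of
        # its values are distinct (distinct count equals the row count):
        distinct_count = spark.sql(
            "SELECT COUNT(DISTINCT attr) AS c FROM temp").collect()[0]['c']
        if distinct_count == temp_count:
            candidate_keys.append(attr)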
    lambda x: tuple(x.split('\t')))
print('Cities loaded')

print('Creating SQL Context for cities')
cities_df = cities_file.map(lambda c: Row(name=c[0],
                                          lat=float(c[1]),
                                          lon=float(c[2]),
                                          country_code=c[3],
                                          country_name=c[4],
                                          type=c[5]))
cities_df = sqlContext.createDataFrame(cities_df)
cities_df.registerTempTable('cities')
sqlContext.cacheTable('cities')
print('SQL context for cities created')

# See timestamp function
print('Task 1.2 - Calculate local time for each checkin')
checkin_df = foursqr.map(lambda l: Row(checkin_id=int(l[0]),
                                       user_id=int(l[1]),
                                       session_id=l[2],
                                       time=timestamp(l[3], l[4]),
                                       lat=float(l[5]),
                                       lon=float(l[6]),
                                       category=l[7],
                                       subcategory=l[8]))
print('Local time for each checkin calculated')
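# The timestamp helper referenced above ("see timestamp function") is not
# included here. A minimal sketch, assuming l[3] is a UTC time string and
# l[4] a timezone offset in minutes (both format assumptions):
from datetime import datetime, timedelta

def timestamp(utc_time, offset_minutes):
    utc = datetime.strptime(utc_time, '%Y-%m-%d %H:%M:%S')
    return utc + timedelta(minutes=int(offset_minutes))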
calculate_forest_bound_udf = func.udf(calculate_forest_bound, ArrayType(FloatType()))
df2 = df2.withColumn('forest_bound', calculate_forest_bound_udf('polygon'))
join_condition_udf = func.udf(join_condition, BooleanType())

# join suburb dataframe with forest dataframe based on the condition of intersection
df1p1 = df1.repartition(1)
df2p1 = df2.repartition(10)
joined = df1p1.crossJoin(df2p1).where(
    join_condition_udf(df1p1.suburb_bound, df2p1.forest_bound))
joined = joined.drop('suburb_bound').drop('area').drop('forest_bound')

# group and aggregate joined table by suburb code (name)
joined.registerTempTable('joined_table')
sql_context.cacheTable("joined_table")
result = sql_context.sql(
    'SELECT FIRST(suburb_name) AS suburb_name, suburb_code, FIRST(multipolygon) AS multipolygon, '
    'FIRST(suburb_area) AS suburb_area, collect_list(polygon) AS forest_multipolygon '
    'FROM joined_table GROUP BY suburb_code')
result = result.withColumn(
    'percentage_%',
    func.udf(calculate_forest_rate, FloatType())('multipolygon', 'forest_multipolygon', 'suburb_area'))
result = result.drop('multipolygon').drop('suburb_area').drop('forest_multipolygon')
result.orderBy('percentage_%', ascending=False).show()
from pyspark.sql import SQLContext, Row

conf = SparkConf().setAppName('TriWordCount')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
trigrams = sc.textFile('hdfs:///users/rocks1/12307130174/spark_probabilities_smoothed01/*')
trigrams = trigrams.map(lambda line: eval(line)) \
    .map(lambda t: Row(word0=t[0][0], word1=t[0][1], word2=t[0][2], prob=t[1]))
schemaTrigram = sqlContext.createDataFrame(trigrams)
schemaTrigram.registerTempTable("trigram")
sqlContext.cacheTable("trigram")
#schemaTrigram.cache()

import socket
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind(("", 54899))
s.listen(5)
while True:
    #word0, word1 = raw_input(">").split()
    print "in loop"
    client, _ = s.accept()
    print "accepted"
    recved = client.recv(1024)
    print "received"
def timeslot(f):
    time24 = datetime.datetime.strptime(f, '%I:%M:%S %p').strftime('%X')
    timesl = int(time24[0:2]) / 3
    timesl = timesl + 1  # divide the day into 8 slots
    return timesl

schemaString = "day timeslot block crimetype latitude longitude"
reformattedCrime = crimeData.map(lambda line: [
    date2dayofweek(line[1].split(' ', 1)[0]),
    timeslot(line[1].split(' ', 1)[1]),
    line[2].split(' ', 1)[1], line[3], line[4], line[5]
])
schemaCrime = sqlContext.createDataFrame(
    reformattedCrime,
    ['day', 'timeslot', 'block', 'crimetype', 'latitude', 'longitude'])
schemaCrime.registerTempTable("chicagocrimedata")
sqlContext.cacheTable("chicagocrimedata")
timeMatrix = sqlContext.sql("SELECT crimetype, timeslot, count(*) AS countPerTime FROM chicagocrimedata group by crimetype, timeslot order by crimetype")

# Extract all classes; here, the distinct crime types
CrimeTypes = sqlContext.sql("SELECT distinct(crimetype) AS crimetypes FROM chicagocrimedata order by crimetypes").collect()
allCrimeTypes = list()
for index in range(len(CrimeTypes)):
    allCrimeTypes.append(CrimeTypes[index][0])

# Extract statistics for the top 10 crimes
crimeCounts = sqlContext.sql("SELECT crimetype, count(*) as crimeCount FROM chicagocrimedata GROUP BY crimetype order by crimeCount desc LIMIT 10").collect()
countByCrimeType = {}
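# date2dayofweek is not defined in this excerpt. A minimal sketch, assuming
# the dates arrive as 'MM/DD/YYYY' strings (a format assumption) and the
# label is the weekday name:
import datetime

def date2dayofweek(date_str):
    return datetime.datetime.strptime(date_str, '%m/%d/%Y').strftime('%A')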
        writer = csv.writer(f)
        writer.writerows([["BA", "TimeAndDate", "Demand"]])
        f.flush()
        os.fsync(f)
else:
    # create the union of our new rows and our old latest records, save it as a table and cache it.
    demand_table.unionAll(
        spark.read.csv("data/latestAndEarliest.csv",
                       header=True,
                       inferSchema=True,
                       timestampFormat="yyyy-MM-dd HH:mm:ss+00:00")
        .select("BA", "TimeAndDate", "Demand")
    ).write.partitionBy("BA").saveAsTable("demands")
    spark.cacheTable('demands')

    # read the old latest dates and get the mean and sd of each ba and cache it.
    latestDates = spark.read.csv("data/latestAndEarliest.csv",
                                 header=True,
                                 inferSchema=True,
                                 timestampFormat="yyyy-MM-dd HH:mm:ss+00:00")
    latestDates.write.saveAsTable("latestAndEarliest")
    spark.cacheTable('latestAndEarliest')

    # get the clean data and write it to a table
    cleanData = spark.read.csv("data/elec_demand_hourlyClean.csv",
                               header=True,
                               inferSchema=True,
                               timestampFormat="yyyy-MM-dd HH:mm:ss+00:00")
    cleanData.write.saveAsTable("demandsClean")
                          CAST(SQLDATE AS INTEGER),
                          CAST(MonthYear AS INTEGER),
                          CAST(Year AS INTEGER),
                          CASE WHEN Actor1Type1Code = '' AND Actor2Type1Code <> ''
                               THEN Actor2Type1Code
                               ELSE Actor1Type1Code END AS Actor1Type1Code,
                          CAST(NumArticles AS INTEGER),
                          CAST(GoldsteinScale AS INTEGER),
                          CAST(AvgTone AS INTEGER)
                   FROM temp
                   WHERE ActionGeo_CountryCode <> '' AND ActionGeo_CountryCode IS NOT NULL
                     AND Actor1Type1Code <> '' AND Actor1Type1Code IS NOT NULL
                     AND NumArticles <> '' AND NumArticles IS NOT NULL
                     AND GoldsteinScale <> '' AND GoldsteinScale IS NOT NULL
                     AND AvgTone <> '' AND AvgTone IS NOT NULL""")
sqlContext.dropTempTable('temp')
sqlContext.registerDataFrameAsTable(df2, 'temp3')
sqlContext.cacheTable('temp3')
dfdaily = sqlContext.sql("""SELECT ActionGeo_CountryCode, SQLDATE, Actor1Type1Code,
                                   SUM(NumArticles) AS NumArticles,
                                   ROUND(AVG(GoldsteinScale), 0) AS GoldsteinScale,
                                   ROUND(AVG(AvgTone), 0) AS AvgTone
                            FROM temp3
                            GROUP BY ActionGeo_CountryCode, SQLDATE, Actor1Type1Code""")
dfmonthly = sqlContext.sql("""SELECT ActionGeo_CountryCode, MonthYear, Actor1Type1Code,
sqlContext = SQLContext(sc)

# In[24]:

gender = sqlContext.read.parquet("gender.parquet")

# In[25]:

gender.registerTempTable("gender")

# In[33]:

sqlContext.cacheTable("gender")
plt1 = sqlContext.sql("SELECT * FROM gender order by 2")

# In[34]:

df1 = plt1.toPandas()
df1 = df1.set_index('USER_GENDER')

# In[63]:

from pylab import rcParams
rcParams['figure.figsize'] = 15, 5
barh = df1.plot(kind='barh', title='Count of gender speaking about Dance', stacked=False, color='b')
barh.set_ylabel("Gender")
schemaString = "day timeslot block crimetype latitude longitude" #fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()] #schema = StructType(fields) reformattedCrime = crimeData.map(lambda line: [ date2dayofweek(line[1].split(' ', 1)[0]), timeslot(line[1].split(' ', 1)[1]), line[2].split(' ', 1)[1], line[3], line[4], line[5] ]) #Creating table from the input file. schemaCrime = sqlContext.createDataFrame( reformattedCrime, ['day', 'timeslot', 'block', 'crimetype', 'latitude', 'longitude']) schemaCrime.registerTempTable("chicagocrimedata") sqlContext.cacheTable("chicagocrimedata") #Caching helps in the retrieving the data faster locationVocabulary = sqlContext.sql( "SELECT count(distinct(block)) from chicagocrimedata").collect()[0][0] timeVocabulary = sqlContext.sql( "SELECT count(distinct(timeslot)) from chicagocrimedata").collect()[0][0] dayVocabulary = sqlContext.sql( "SELECT count(distinct(day)) from chicagocrimedata").collect()[0][0] TOTALCRIMES = sqlContext.sql( "SELECT count(distinct(day)) from chicagocrimedata").collect()[0][0] #Creating matrix for NaiveBayes calculations. locationsMatrix = sqlContext.sql( "SELECT crimetype,block,count(*) AS countPerBlock FROM chicagocrimedata group by crimetype,block order by countPerBlock desc" )
    timesl = int(time24[0:2]) / 3
    timesl = timesl + 1  # divide the day into 8 slots
    return timesl

# Schema for creating the table.
schemaString = "day timeslot block crimetype latitude longitude"
#fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
#schema = StructType(fields)
reformattedCrime = crimeData.map(lambda line: [
    date2dayofweek(line[1].split(' ', 1)[0]),
    timeslot(line[1].split(' ', 1)[1]),
    line[2].split(' ', 1)[1], line[3], line[4], line[5]
])

# Creating table from the input file.
schemaCrime = sqlContext.createDataFrame(reformattedCrime, ['day', 'timeslot', 'block', 'crimetype', 'latitude', 'longitude'])
schemaCrime.registerTempTable("chicagocrimedata")
sqlContext.cacheTable("chicagocrimedata")  # Caching helps retrieve the data faster

locationVocabulary = sqlContext.sql("SELECT count(distinct(block)) from chicagocrimedata").collect()[0][0]
timeVocabulary = sqlContext.sql("SELECT count(distinct(timeslot)) from chicagocrimedata").collect()[0][0]
dayVocabulary = sqlContext.sql("SELECT count(distinct(day)) from chicagocrimedata").collect()[0][0]
TOTALCRIMES = sqlContext.sql("SELECT count(*) from chicagocrimedata").collect()[0][0]

# Creating matrices for NaiveBayes calculations.
locationsMatrix = sqlContext.sql("SELECT crimetype, block, count(*) AS countPerBlock FROM chicagocrimedata group by crimetype, block order by countPerBlock desc")
timeMatrix = sqlContext.sql("SELECT crimetype, timeslot, count(*) AS countPerTime FROM chicagocrimedata group by crimetype, timeslot order by crimetype")
dayMatrix = sqlContext.sql("SELECT crimetype, day, count(*) AS countPerDay FROM chicagocrimedata group by crimetype, day order by countPerDay desc")

# Extract all classes; here, the distinct crime types
CrimeTypes = sqlContext.sql("SELECT distinct(crimetype) AS crimetypes FROM chicagocrimedata order by crimetypes").collect()
allCrimeTypes = list()
def sql_context_api(spark):
    sc = spark.sparkContext
    sqlContext = SQLContext(sc)
    print("Start running SQL context API")

    # createDataFrame
    l = [('Alice', 1)]
    sqlContext.createDataFrame(l).collect()
    res = sqlContext.createDataFrame(l, ['name', 'age']).collect()
    print(res)
    rdd = sc.parallelize(l)
    sqlContext.createDataFrame(rdd).collect()
    df = sqlContext.createDataFrame(rdd, ['name', 'age'])
    res = df.collect()
    print(res)
    print("createDataFrame API finished")

    # table and cache
    df = spark.createDataFrame([('Alice', 5, 80), ('Alice', 5, 80),
                                ('Alice', 10, 80)], ["name", "age", "height"])
    sqlContext.registerDataFrameAsTable(df, "table1")
    sqlContext.cacheTable("table1")
    sqlContext.uncacheTable("table1")
    sqlContext.cacheTable("table1")
    sqlContext.clearCache()
    # sqlContext.createExternalTable("table1", schema=df2)
    sqlContext.dropTempTable("table1")
    # res = df2.collect()
    # print(res)
    print("External, TempTable and cache API finished")

    # getConf
    res = sqlContext.getConf("spark.sql.shuffle.partitions")
    print(res)
    res = sqlContext.getConf("spark.sql.shuffle.partitions", u"10")
    print(res)
    sqlContext.setConf("spark.sql.shuffle.partitions", u"50")
    res = sqlContext.getConf("spark.sql.shuffle.partitions", u"10")
    print(res)
    print("getConf API finished")

    # newSession
    newspark = sqlContext.newSession()
    print("newSession API finished")

    # range
    res = sqlContext.range(1, 7, 2).collect()
    print(res)
    res = sqlContext.range(3).collect()
    print(res)
    print("range API finished")

    # read and readStream
    res = sqlContext.read
    text_sdf = sqlContext.readStream.text(
        "/ppml/trusted-big-data-ml/work/examples/helloworld.py")
    res = text_sdf.isStreaming
    print(res)
    print("read and readStream API finished")

    # sql
    df = spark.createDataFrame([('Alice', 5, 80), ('Alice', 5, 80),
                                ('Alice', 10, 80)], ["name", "age", "height"])
    sqlContext.registerDataFrameAsTable(df, "table1")
    df2 = sqlContext.sql("SELECT name AS f1, age as f2 from table1")
    res = df2.collect()
    print(res)
    print("sql API finished")

    # table
    df = spark.createDataFrame([('Alice', 5, 80), ('Alice', 5, 80),
                                ('Alice', 10, 80)], ["name", "age", "height"])
    sqlContext.registerDataFrameAsTable(df, "table1")
    df2 = sqlContext.table("table1")
    res = (sorted(df.collect()) == sorted(df2.collect()))
    print(res)
    print("table API finished")

    # tableNames
    df = spark.createDataFrame([('Alice', 5, 80), ('Alice', 5, 80),
                                ('Alice', 10, 80)], ["name", "age", "height"])
    sqlContext.registerDataFrameAsTable(df, "table1")
    res = ("table1" in sqlContext.tableNames())
    print(res)
    res = ("table1" in sqlContext.tableNames("default"))
    print(res)
    print("tableNames API finished")

    # tables
    sqlContext.registerDataFrameAsTable(df, "table1")
    df2 = sqlContext.tables()
    res = df2.filter("tableName = 'table1'").first()
    print(res)
    print("tables API finished")

    # register
    # strlen = sqlContext.registerFunction("stringLengthString", lambda x: len(x))
    # res = spark.sql("SELECT stringLengthString('test')").collect()
    # print(res)
    spark.udf.registerJavaFunction("javaStringLength3",
                                   "org.apache.spark.sql.JavaStringLength",
                                   "integer")
    res = spark.sql("SELECT javaStringLength3('test')").collect()
    print(res)
    print("register API finished")

    print("Finish running SQL context API")
def save_by_spark_streaming():
    async_mode = True  # 'async' is a reserved word in Python 3.7+
    root_path = os.path.dirname(os.path.realpath(__file__))
    record_path = os.path.join(root_path, "offset.txt")
    from_offsets = {}
    if os.path.exists(record_path):
        datas = []
        with open(record_path, 'r') as f:
            for line in f:
                datas.append(line)
        for ii in datas:
            offset_data = json.loads(ii)
            topic_partion = TopicAndPartition(offset_data["topic"], offset_data["partition"])
            if topic_partion not in from_offsets:
                from_offsets[topic_partion] = int(offset_data["untilOffset"])
    sc = SparkContext(appName="Rec2champzee")
    global sqlContext
    sqlContext = SQLContext(sc)
    ssc = StreamingContext(sc, int(timer))
    kvs = KafkaUtils.createDirectStream(
        ssc=ssc,
        topics=[topic_name],
        fromOffsets=from_offsets,
        kafkaParams={"metadata.broker.list": broker_list})
    sql1 = '''
        SELECT sku.id AS skuid, sku.SPUID, sku.SKCID,
               tm1.AttributeStatus AS spu_status,
               tm2.ContentStatus AS skc_status,
               sku.SizeStatus AS sku_status,
               sku.BachName AS NAME,
               tm1.Cate1, tm1.cate2, tm1.Cate3, tm1.cate3name,
               tm1.Brand, tm1.Description, tm1.SerialNumber,
               tm2.pcolour AS pcolor, tm2.color AS color,
               GROUP_CONCAT(tm1.typename, '-', tm1.NAME SEPARATOR ' ') label,
               tm2.Material, tm1.Introduction, tm2.VendorColor,
               sku.PreSizeID,
               cast(sku.MsrPrice as signed) MsrPrice,
               cast(sku.PurchPrice as signed) PurchPrice,
               cast(sku.Price as signed) Price,
               sku.Type,
               sku.IsSpecial AS is_special,
               tm2.Version AS img_version,
               tm2.ImgType AS img_type,
               psp.Position,
               sku.Note, sku.LatestUPCTime, sku.CreatedDate, sku.UpdatedDate, sku.Quantity
        FROM product sku
        LEFT JOIN (
            SELECT spu.id AS id1,
                   spu.AttributeStatus AS AttributeStatus,
                   spu.Cate1 AS cate1, spu.cate2 AS cate2, spu.Cate3 AS cate3,
                   prc.name AS cate3name,
                   spu.Brand AS brand,
                   spu.Description AS Description,
                   spu.SerialNumber AS SerialNumber,
                   AT.TypeName AS typename,
                   pa.NAME AS NAME,
                   spu.Introduction AS Introduction
            FROM spu
            LEFT JOIN spuattribute sa ON spu.id = sa.SPUID
            LEFT JOIN productattribute pa ON sa.AttributeID = pa.id
            LEFT JOIN attributetype AT ON pa.Type = AT.typeID
            LEFT JOIN productcate prc ON spu.cate3 = prc.id
        ) tm1 ON sku.SPUID = tm1.id1
        LEFT JOIN (
            SELECT skc.id AS id2,
                   skc.ContentStatus AS ContentStatus,
                   pc.ParentID AS pcolour,
                   skc.Color AS color,
                   skc.Material AS Material,
                   skc.VendorColor AS VendorColor,
                   skc.Version AS Version,
                   skc.ImgType AS ImgType
            FROM productcolor pc
            LEFT JOIN skc ON skc.Color = pc.id
        ) tm2 ON sku.skcid = tm2.id2
        LEFT JOIN productskuposition psp ON sku.id = psp.ProdID
        GROUP BY sku.id
    '''
    sql2 = '''
        SELECT t1.id AS cate1, t1.NAME AS cate1name,
               t2.id AS cate2, t2.name AS cate2name,
               t3.id AS cate3, t3.name AS cate3name
        FROM (productcate AS t1
              INNER JOIN productcate AS t2 ON t1.id = t2.parentid)
             INNER JOIN productcate AS t3 ON t2.id = t3.parentid
        WHERE t1.DELETEd = 0 AND t2.deleted = 0 AND t3.deleted = 0
          AND floor(t3.id / 100) > 0
    '''
    prod_detail = mysql.get(sql1)
    cate_detail = mysql.get(sql2)
    sqlContext.createDataFrame(sc.parallelize(prod_detail)).createOrReplaceTempView('pd')
    sqlContext.createDataFrame(sc.parallelize(cate_detail)).createOrReplaceTempView('cd')
    sqlContext.cacheTable('pd')
    sqlContext.cacheTable('cd')
    kvs.transform(store_offset_ranges).flatMap(reco).foreachRDD(sync_data2es)
    ssc.start()
    ssc.awaitTermination()
    ssc.stop()
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row

sc = SparkContext("local", "barcos")
sq = SQLContext(sc)
df = sq.read.load("data/containers_tiny.parquet")
df.registerTempTable("container")
sq.cacheTable("container")
df.select("ship_imo", "ship_name", "country").filter(df['country'] == 'DK').show()
sq.sql("SELECT ship_imo, ship_name FROM container WHERE country = 'DK'").show()
sq.sql("SELECT ship_imo, count(container_id) number, sum(net_weight) total_weight "
       "FROM container GROUP BY ship_imo").show()
input("Press Enter to continue... http://localhost:4040/storage")
if demand_table.count() != 0:
    # check if the demand column contains a "None" entry
    if demand_table.dtypes[2][1] == "string":
        demand_table = demand_table.select("BA", "TimeAndDate", "Demand").filter("Demand != 'None'")
    else:
        demand_table = demand_table.select("BA", "TimeAndDate", "Demand")
    # check that there are still new rows.
    if demand_table.count() == 0:
        with open('data/newRows.csv', 'w', newline="") as f:
            pass
    else:
        # create a table of the demand data
        demand_table.write.saveAsTable("demands")
        # cache it
        spark.cacheTable('demands')
        # get the earliest and latest date within the new records
        dates = spark.sql("select min(TimeAndDate), max(TimeAndDate) from demands").collect()[0]
        # create a pandas dataframe for the calendar
        dataframe = spark.createDataFrame(
            pd.DataFrame(pd.date_range(dates[0], dates[1], freq="H", tz='UTC')),
            ["TimeAndDate"]).repartition(8).registerTempTable("dates")
        # cache the table of dates
        spark.cacheTable('dates')
        # work out mean and sd values
        getOutliersImproved2().write.saveAsTable("demandsOut3")
        # write it out to file
        spark.sql("select * from demandsOut3").coalesce(1).write.option(
def sql_context_api(spark):
    sc = spark.sparkContext
    sqlContext = SQLContext(sc)
    print("Start running SQL context API")

    # createDataFrame
    l = [('Alice', 1)]
    sqlContext.createDataFrame(l).collect()
    res = sqlContext.createDataFrame(l, ['name', 'age']).collect()
    print(res)
    rdd = sc.parallelize(l)
    sqlContext.createDataFrame(rdd).collect()
    df = sqlContext.createDataFrame(rdd, ['name', 'age'])
    res = df.collect()
    print(res)
    print("createDataFrame API finished")

    # table and cache
    df = spark.createDataFrame([('Alice', 5, 80), ('Alice', 5, 80),
                                ('Alice', 10, 80)], ["name", "age", "height"])
    sqlContext.registerDataFrameAsTable(df, "table1")
    sqlContext.cacheTable("table1")
    sqlContext.uncacheTable("table1")
    sqlContext.cacheTable("table1")
    sqlContext.clearCache()
    # sqlContext.createExternalTable("table1", schema=df2)
    sqlContext.dropTempTable("table1")
    # res = df2.collect()
    # print(res)
    print("External, TempTable and cache API finished")

    # getConf
    res = sqlContext.getConf("spark.sql.shuffle.partitions")
    print(res)
    res = sqlContext.getConf("spark.sql.shuffle.partitions", u"10")
    print(res)
    sqlContext.setConf("spark.sql.shuffle.partitions", u"50")
    res = sqlContext.getConf("spark.sql.shuffle.partitions", u"10")
    print(res)
    print("getConf API finished")

    # newSession
    newspark = sqlContext.newSession()
    print("newSession API finished")

    # range
    res = sqlContext.range(1, 7, 2).collect()
    print(res)
    res = sqlContext.range(3).collect()
    print(res)
    print("range API finished")

    # read and readStream
    res = sqlContext.read
    # text_sdf = sqlContext.readStream.text(tempfile.mkdtemp())
    # res = text_sdf.isStreaming
    # print(res)
    print("read and readStream API finished")

    # register

    # sql
    df = spark.createDataFrame([('Alice', 5, 80), ('Alice', 5, 80),
                                ('Alice', 10, 80)], ["name", "age", "height"])
    sqlContext.registerDataFrameAsTable(df, "table1")
    df2 = sqlContext.sql("SELECT name AS f1, age as f2 from table1")
    res = df2.collect()
    print(res)
    print("sql API finished")

    # table
    df = spark.createDataFrame([('Alice', 5, 80), ('Alice', 5, 80),
                                ('Alice', 10, 80)], ["name", "age", "height"])
    sqlContext.registerDataFrameAsTable(df, "table1")
    df2 = sqlContext.table("table1")
    res = (sorted(df.collect()) == sorted(df2.collect()))
    print(res)
    print("table API finished")

    # tableNames
    df = spark.createDataFrame([('Alice', 5, 80), ('Alice', 5, 80),
                                ('Alice', 10, 80)], ["name", "age", "height"])
    sqlContext.registerDataFrameAsTable(df, "table1")
    res = ("table1" in sqlContext.tableNames())
    print(res)
    res = ("table1" in sqlContext.tableNames("default"))
    print(res)
    print("tableNames API finished")

    # tables
    sqlContext.registerDataFrameAsTable(df, "table1")
    df2 = sqlContext.tables()
    res = df2.filter("tableName = 'table1'").first()
    print(res)
    print("tables API finished")

    print("Finish running SQL context API")
def main(argv):
    conf = SparkConf().setAppName("checkpoint-5")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    partition = int(argv[0])
    input_file = argv[1]

    # Print info to the console
    print '\nCreating dataframe:\n'
    print 'No. of partitions : ', partition
    print 'input_file : ', input_file

    ## Create rdd from input file and apply map operation to split strings
    rdd = sc.textFile(input_file, partition)
    mapped_rdd = rdd.map(lambda l: l.split(","))

    ## Convert fields into appropriate data types
    aadhaar = mapped_rdd.map(lambda l: \
        Row(date=(datetime.strptime(l[0].strip(), '%Y%m%d').date()), \
            registrar=(l[1].strip()), \
            private_agency=(l[2].strip()), \
            state=(l[3].strip()), \
            district=(l[4].strip()), \
            sub_district=(l[5].strip()), \
            pincode=(l[6].strip()), \
            gender=(l[7].strip()), \
            age=int(l[8].strip()), \
            aadhaar_generated=int(l[9].strip()), \
            rejected=int(l[10].strip()), \
            mobile_number=int(l[11].strip()), \
            email_id=int(l[12].strip())))

    # Apply schema to RDD and create Dataframe
    aadhaar_df = sqlContext.createDataFrame(aadhaar)
    aadhaar_df.registerTempTable('aadhaar_card_table')
    sqlContext.cacheTable('aadhaar_card_table')

    # male and female percentage in each state query
    aadhaar_male_female_query = 'select state, \
        sum(case when gender = \'F\' then aadhaar_generated ELSE 0 END) as aadhaar_generated_female, \
        sum(case when gender = \'M\' then aadhaar_generated ELSE 0 END) as aadhaar_generated_male, \
        sum(aadhaar_generated) as aadhaar_generated from aadhaar_card_table group by state'
    aadhaar_male_female_dataframe = sqlContext.sql(aadhaar_male_female_query)
    aadhaar_male_female_dataframe.registerTempTable('aadhaar_male_female_table')
    sqlContext.cacheTable('aadhaar_male_female_table')

    aadhaar_male_female_prctnge_query = 'select state, \
        (aadhaar_generated_male * 100.0)/aadhaar_generated as aadhaar_generated_male_percentage, \
        (aadhaar_generated_female * 100.0)/aadhaar_generated as aadhaar_generated_female_percentage \
        from aadhaar_male_female_table'
    aadhaar_male_female_prctnge_dataframe = sqlContext.sql(aadhaar_male_female_prctnge_query)
    aadhaar_male_female_prctnge_dataframe.registerTempTable('aadhaar_male_female_prctnge_table')
    sqlContext.cacheTable('aadhaar_male_female_prctnge_table')

    # 22. The top 3 states where the percentage of Aadhaar cards being generated for males is the highest.
    print('\n')
    print('The top 3 states where the percentage of Aadhaar cards being generated for males is the highest : ')
    query = 'select state from ( select * from aadhaar_male_female_prctnge_table \
        order by aadhaar_generated_male_percentage DESC limit 3)temp'
    top_3_males_dataframe = sqlContext.sql(query)
    top_3_males_dataframe.persist()
    top_3_males_dataframe.registerTempTable('top_3_males_table')
    top_3_males_dataframe.show(top_3_males_dataframe.count(), truncate=False)
    print('\n\n')

    # 23. The top 3 districts where the percentage of Aadhaar cards being rejected for females is the highest.
    print('\n')
    print('The top 3 districts where the percentage of Aadhaar cards being rejected for females is the highest')
    query = 'select top_3_males_table.state, act.district, act.gender, act.rejected \
        from top_3_males_table LEFT OUTER JOIN aadhaar_card_table act \
        ON(top_3_males_table.state = act.state)'
    top_3_males_info_dataframe = sqlContext.sql(query)
    top_3_males_info_dataframe.registerTempTable('top_male_states_info_table')
    query = 'select district from ( select district, \
        (case when rejected = 0 then 0 ELSE (aadhaar_rejected_female * 100.0) / rejected END) as aadhaar_rejected_female_percentage \
        from ( select district, \
            sum(case when gender = \'F\' then rejected ELSE 0 END) as aadhaar_rejected_female, \
            sum(rejected) as rejected \
            from top_male_states_info_table \
            group by district)temp1 \
        order by aadhaar_rejected_female_percentage DESC limit 3)temp2'
    top_3_female_rejected_aadhar_df = sqlContext.sql(query)
    top_3_female_rejected_aadhar_df.show(top_3_female_rejected_aadhar_df.count(), truncate=False)
    print('\n\n')

    # 24. The top 3 states where the percentage of Aadhaar cards being generated for females is the highest.
    print('\n')
    print('The top 3 states where the percentage of Aadhaar cards being generated for females is the highest.')
    query = 'select state from ( select * from aadhaar_male_female_prctnge_table order by aadhaar_generated_female_percentage DESC limit 3)A'
    top_3_females_dataframe = sqlContext.sql(query)
    top_3_females_dataframe.persist()
    top_3_females_dataframe.registerTempTable('top_3_females_table')
    top_3_females_dataframe.show(top_3_females_dataframe.count(), truncate=False)
    print('\n\n')

    # 25. The top 3 districts where the percentage of Aadhaar cards being rejected for males is the highest.
    print('\n')
    print('The top 3 districts where the percentage of Aadhaar cards being rejected for males is the highest')
    query = 'select top_3_females_table.state, act.district, act.gender, act.rejected \
        from top_3_females_table LEFT OUTER JOIN aadhaar_card_table act \
        ON(top_3_females_table.state = act.state)'
    top_female_states_info_dataframe = sqlContext.sql(query)
    top_female_states_info_dataframe.registerTempTable('top_female_states_info_table')
    query = 'select district from ( select district, \
        (case when rejected = 0 then 0 \
         ELSE (aadhaar_rejected_male * 100.0)/rejected END) as aadhaar_rejected_male_percentage \
        from ( select district, \
            sum(case when gender = \'M\' then rejected ELSE 0 END) as aadhaar_rejected_male, \
            sum(rejected) as rejected \
            from top_female_states_info_table \
            group by district)temp1 \
        order by aadhaar_rejected_male_percentage DESC limit 3)temp2'
    top_3_male_rejected_aadhar_df = sqlContext.sql(query)
    top_3_male_rejected_aadhar_df.show(top_3_male_rejected_aadhar_df.count(), truncate=False)
    print('\n\n')

    # 26. The summary of the acceptance percentage of all the Aadhaar card applications, bucketing the age group into 10 buckets.
age_df = aadhaar_df.select("age") upper_limit = age_df.rdd.max()[0] lower_limit = age_df.rdd.min()[0] diff_max_min = float(upper_limit - lower_limit) bucket_size = int(math.ceil(diff_max_min / 10)) query1 = 'select (case ' \ + 'when age <= ' + str(lower_limit + 1*bucket_size) + \ ' then \'' + str(lower_limit) + '_' + str(lower_limit + bucket_size) + '\'' \ + 'when age > ' + str(lower_limit + 1*bucket_size) + ' and age <= ' + str(lower_limit + 2*bucket_size) + \ ' then \'' + str(lower_limit + 1*bucket_size) + '_' + str(lower_limit + 2*bucket_size) + '\' ' \ + 'when age > ' + str(lower_limit + 2*bucket_size) + ' and age <= ' + str(lower_limit + 3*bucket_size) + \ ' then \'' + str(lower_limit + 2*bucket_size) + '_' + str(lower_limit + 3*bucket_size) + '\' ' \ + 'when age > ' + str(lower_limit + 3*bucket_size) + ' and age <= ' + str(lower_limit + 4*bucket_size) + \ ' then \'' + str(lower_limit + 3*bucket_size) + '_' + str(lower_limit + 4*bucket_size) + '\' ' \ + 'when age > ' + str(lower_limit + 4*bucket_size) + ' and age <= ' + str(lower_limit + 5*bucket_size) + \ ' then \'' + str(lower_limit + 4*bucket_size) + '_' + str(lower_limit + 5*bucket_size) + '\' ' \ + 'when age > ' + str(lower_limit + 5*bucket_size) + ' and age <= ' + str(lower_limit + 6*bucket_size) + \ ' then \'' + str(lower_limit + 5*bucket_size) + '_' + str(lower_limit + 6*bucket_size) + '\' ' \ + 'when age > ' + str(lower_limit + 6*bucket_size) + ' and age <= ' + str(lower_limit + 7*bucket_size) + \ ' then \'' + str(lower_limit + 6*bucket_size) + '_' + str(lower_limit + 7*bucket_size) + '\' ' \ + 'when age > ' + str(lower_limit + 7*bucket_size) + ' and age <= ' + str(lower_limit + 8*bucket_size) + \ ' then \'' + str(lower_limit + 7*bucket_size) + '_' + str(lower_limit + 8*bucket_size) + '\' ' \ + 'when age > ' + str(lower_limit + 8*bucket_size) + ' and age <= ' + str(lower_limit + 9*bucket_size) + \ ' then \'' + str(lower_limit + 8*bucket_size) + '_' + str(lower_limit + 9*bucket_size) + '\' ' \ + 'when age > ' + str(lower_limit + 9*bucket_size) + ' and age <= ' + str(lower_limit + 10*bucket_size) + \ ' then \'' + str(lower_limit +9*bucket_size) + '_' + str(lower_limit + 10*bucket_size) + '\' ' \ + 'ELSE \'more_than_' + str(upper_limit) + '\' end) as age_group, ' + 'aadhaar_generated as selected, ' \ + 'rejected ' + 'from aadhaar_card_table' query_final = 'select age_group, (100 * sum(selected))/(sum(selected) + sum(rejected)) as percentage_summary ' \ + 'from ('+ query1 +')A group by age_group order by length(age_group), age_group' summary_dataframe = sqlContext.sql(query_final) summary_dataframe.show(summary_dataframe.count()) ## unpersist elements from memory (optional) top_3_females_dataframe.unpersist() top_3_males_dataframe.unpersist() sqlContext.uncacheTable('aadhaar_card_table') sqlContext.uncacheTable('aadhaar_male_female_table') sqlContext.uncacheTable('aadhaar_male_female_prctnge_table')
# Importing dataframe from HDFS with dataset names
#datasets = sqlContext.read.format("csv").option("header", "false").option("delimiter", "\t").load(inputDirectory + "datasets.tsv")
# List of dataset file names
#dataList = [str(row._c0) for row in datasets.select('_c0').collect()]
# Iteration over dataframes begins by using dataframe file names
#for filename in dataList[0:4]:
df = sqlContext.read.format("csv").option("header", "true").option(
    "inferSchema", "true").option("delimiter", "\t").load("dummy_set.tsv")
#col_names = df.columns
#print(col_names)
df.createOrReplaceTempView("df")
sqlContext.cacheTable("df")
df_result = spark.sql("select * from df").show()

# Copy of the jsonSchema for current iteration
outJSON = jsonSchema.copy()
for col, dtype in df.dtypes:  # change to a map function later
    if dtype == 'integer':  # '==', not 'is': string identity comparison is unreliable
        intJSON = intSchema.copy()
        intJSON["column_name"] = col
        intJSON = fill_numeric_json(name, df, dtype, intJSON)
        outJSON["columns"].append(intJSON)
sqlContext = SQLContext(sc)

# In[4]:

FOLLCOUNT = sqlContext.read.parquet("FOLLOWERS.PARQUET")

# In[5]:

FOLLCOUNT.registerTempTable("FOLLCOUNT")

# In[20]:

sqlContext.cacheTable("FOLLCOUNT")
Q1 = sqlContext.sql("SELECT * FROM FOLLCOUNT WHERE F_COUNT > 1000000")

# In[21]:

F1 = Q1.toPandas()
F1 = F1.set_index('USER_DISPLAY_NAME')

# In[22]:

from pylab import rcParams
import pylab
rcParams['figure.figsize'] = 20, 10
Line = F1.plot(kind='line', title='Count by followers of users', stacked=False)
        td.timezone AS dest_timezone,
        to.timezone AS origin_timezone,
        TO_UTC_TIMESTAMP(DATE_TRUNC('hour', TO_TIMESTAMP(CONCAT(fl_date, ' ', LPAD(crs_dep_time, 4, '0')), 'yyyy-MM-dd HHmm')), to.timezone) AS truncated_crs_dep_time_utc,
        TO_UTC_TIMESTAMP(DATE_TRUNC('hour', TO_TIMESTAMP(CONCAT(fl_date, ' ', LPAD(crs_dep_time, 4, '0')), 'yyyy-MM-dd HHmm')), to.timezone) - INTERVAL 3 HOURS AS truncated_crs_dep_minus_three_utc,
        TO_UTC_TIMESTAMP(TO_TIMESTAMP(CONCAT(fl_date, ' ', LPAD(crs_dep_time, 4, '0')), 'yyyy-MM-dd HHmm'), to.timezone) AS crs_dep_time_utc,
        TO_UTC_TIMESTAMP(TO_TIMESTAMP(CONCAT(fl_date, ' ', LPAD(crs_dep_time, 4, '0')), 'yyyy-MM-dd HHmm'), to.timezone) - INTERVAL 2 HOURS 15 MINUTES AS crs_dep_minus_two_fifteen_utc,
        TO_UTC_TIMESTAMP(TO_TIMESTAMP(CONCAT(fl_date, ' ', LPAD(crs_arr_time, 4, '0')), 'yyyy-MM-dd HHmm'), to.timezone) AS crs_arr_time_utc
    FROM airlines_temp AS f
    LEFT JOIN city_state_timezone AS td ON f.short_dest_city_name = td.city_state
    LEFT JOIN city_state_timezone AS to ON f.short_orig_city_name = to.city_state
""")

# make sure view is cached for subsequent operations
sqlContext.cacheTable("airlines")
# reassign airlines df to derived view and cache it
airlines = sqlContext.sql("SELECT * FROM airlines").cache()

# COMMAND ----------

# MAGIC %md
# MAGIC #### Weather

# COMMAND ----------

# add airport code to weather data to facilitate join
# filter to only records with valid airport code
# get command-line arguments
inFile = sys.argv[1]
supp = sys.argv[2]
conf = sys.argv[3]
prot = sys.argv[4]
print("Executing HW2SQL with input from " + inFile + ", support=" + supp +
      ", confidence=" + conf + ", protection=" + prot)

pp_schema = StructType([
    StructField("uid", IntegerType(), True),
    StructField("attr", StringType(), True),
    StructField("val", IntegerType(), True)])
Pro_Publica = sqlContext.read.format('csv').options(header=False).schema(pp_schema).load(inFile)
Pro_Publica.createOrReplaceTempView("Pro_Publica")
sqlContext.cacheTable("Pro_Publica")

# compute frequent itemsets of size 1, store in F1(attr, val)
query = "select attr, val, count(*) as supp \
         from Pro_Publica \
         group by attr, val \
         having count(*) >= " + str(supp)
F1 = spark.sql(query)
F1.createOrReplaceTempView("F1")
#F1.show()

query = "select * from Pro_Publica where attr != 'vdecile' "
Pro_Publica_Attr = spark.sql(query)
Pro_Publica_Attr.createOrReplaceTempView("Pro_Publica_Attr")
query = "select * from Pro_Publica where attr = 'vdecile'"
        airline=p[1],
        GeoSummary=p[5],
        GeoRegion=p[6],
        PriceCategory=p[8],
        passengercount=int(p[11]),
        Year=int(p[14]),
        Month=p[15],
    ))

# Apply Row schema
euroSchema = sqlContext.createDataFrame(euroRows)
# Register 'euroRegion' table with 7 columns: airline, summary, region, price category, passenger count, year, month
euroSchema.registerTempTable("euroRegion")
sqlContext.cacheTable("euroRegion")

# average passenger count per airline in the month of June in the euro region
passengerCountbyEachMonth = sqlContext.sql(
    "SELECT airline, AVG(passengercount) FROM euroRegion WHERE Month = 'June' GROUP BY airline").collect()
# print the results
print passengerCountbyEachMonth

# average passenger count of the Virgin Atlantic airline in every year
avgPassengerCountVirginAirline = sqlContext.sql(
    "SELECT Year, AVG(passengercount) FROM euroRegion WHERE airline='Virgin Atlantic' GROUP BY Year").collect()
print avgPassengerCountVirginAirline

# average passenger count of the United Airlines airline in every year
avgPassengerCountUnitedAirline = sqlContext.sql(
    "SELECT Year, AVG(passengercount) FROM euroRegion WHERE airline='United Airlines' GROUP BY Year").collect()