from pyspark.sql import SparkSession, SQLContext
import numpy as np
import matplotlib.pyplot as plt


def run_spark():
    spark = SparkSession.builder.master("local").appName("Word Count").getOrCreate()
    df = spark.read.format("csv").option("header", "true").load("hdfs:///project/samples")
    sqlContext = SQLContext(spark.sparkContext)
    sqlContext.registerDataFrameAsTable(df, "table1")
    sentences = sqlContext.sql("""SELECT `reviews.rating`,`reviews.text` FROM table1""").rdd
    # review ratings, with missing values replaced by 0
    alist = sentences.map(lambda x: x[0] if x[0] is not None else 0).collect()
    # sentiment scores for the review text (get_score is defined elsewhere in the project)
    blist = sentences.map(lambda x: get_score(x[1]) if x[1] is not None else 0).collect()
    return alist, blist


def draw_plots(alist, blist):
    t_plt, = plt.plot(np.arange(1, len(alist) + 1), alist, 'r')
    v_plt, = plt.plot(np.arange(1, len(alist) + 1), blist)
    plt.title('NLP Emotion Analysis')
    # plt.xlabel('epoch')
    # plt.ylabel('score')
    # plt.legend((t_plt, v_plt), ('rating', 'score'))
    # plt.savefig("result.png")
    with open("x1.txt", "w+") as input_:
        input_.write(','.join([str(x) for x in alist]))
        input_.write('\n')
        input_.write(','.join([str(x) for x in blist]))


if __name__ == '__main__':
    ratings, scores = run_spark()
    draw_plots(ratings, scores)
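# Note: get_score is not defined in this snippet; it comes from elsewhere in the
# original project. A minimal stand-in, assuming a TextBlob-style polarity score
# is acceptable (hypothetical helper, not the original implementation):
from textblob import TextBlob

def get_score(text):
    # Polarity in [-1, 1]; negative values indicate negative sentiment.
    return TextBlob(text).sentiment.polarity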
def ALS_fit():
    usern = request.args.get('usern')
    users_df = pd.read_sql_query(
        '''SELECT DISTINCT mt3ratings.user, user_id
           FROM mt3ratings WHERE appdata = 1''', engine)
    if usern not in users_df['user'].values:
        return jsonify(result="can't find user")
    user_id = users_df.user_id[users_df.user == usern].values[0]
    # default to 'e' when no key is supplied
    key = request.args.get('key', 'e')
    if key == 'abcd':
        # start spark
        try:
            conf = SparkConf().setAppName("BeerSleuthALS").set(
                "spark.executor.memory", "4g")
            sc = SparkContext(conf=conf)
        except ValueError:
            # a SparkContext may already exist
            pass
        sqlContext = SQLContext(sc)
        ratings_sqldf = modeling.get_item_user_rev_from_pg(engine, sqlContext)
        sqlContext.registerDataFrameAsTable(ratings_sqldf, "ratings")
        print('fitting model')
        model = modeling.fit_final_model(ratings_sqldf)
        beer_ids = list(beer_dict.values())
        to_predict = list(zip([user_id] * len(beer_ids), beer_ids))
        to_predict_top20 = list(zip([user_id] * len(beer_id_filt), beer_id_filt))
        user_preds = model.predictAll(sc.parallelize(to_predict)).collect()
        user_preds_top20 = model.predictAll(
            sc.parallelize(to_predict_top20)).collect()
        print('got preds')
        preds = Counter({x[1]: x[2] for x in user_preds})
        preds_top20 = Counter({x[1]: x[2] for x in user_preds_top20})
        with open('%s%s_preds.pkl' % (pred_path, user_id), 'wb') as f:
            pickle.dump(preds, f)
        with open('%s%s_preds_top20.pkl' % (pred_path, user_id), 'wb') as f:
            pickle.dump(preds_top20, f)
        print('done')
        sc.stop()
        return jsonify(
            result="Model training complete, you may now get predictions")
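# Note: modeling.fit_final_model is project-specific and not shown here. Since the
# returned model supports predictAll(), it is presumably an MLlib matrix-factorization
# model. A minimal sketch of such a fit, with assumed column order and
# hyperparameters (not the original implementation):
from pyspark.mllib.recommendation import ALS, Rating

def fit_als(ratings_sqldf, rank=10, iterations=10, reg=0.1):
    # Convert the (user, item, rating) DataFrame rows into MLlib Rating objects.
    ratings_rdd = ratings_sqldf.rdd.map(
        lambda r: Rating(int(r[0]), int(r[1]), float(r[2])))
    return ALS.train(ratings_rdd, rank, iterations, lambda_=reg)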
import os
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StringType

# master, appName, normal_mac, logger and MysqlDao are defined elsewhere in the original script

#input = "/impala/parquet/back/back-portal-loginflowlog/dat=%s*" % ym
input = '/input/loginfowlog/*'
spark_home = '/opt/cloud/spark'
os.environ['SPARK_HOME'] = spark_home

conf = (SparkConf()
        .setMaster(master)
        .setAppName(appName)
        .set("spark.sql.parquet.binaryAsString", "true"))
sc = SparkContext(conf=conf)
sql_context = SQLContext(sc)
sql_context.registerFunction("to_mac", lambda x: normal_mac(x), StringType())

parquet_df = sql_context.read.parquet(input)
sql_context.registerDataFrameAsTable(parquet_df, "loginflowlog")

#_sql = "select to_mac(upper(usermac)),count(distinct dat) days from loginflowlog group by to_mac(upper(usermac))"
_sql = ("select to_mac(upper(usermac)), count(distinct logtime) days "
        "from loginflowlog group by to_mac(upper(usermac))")
rs_df = sql_context.sql(_sql)
rs = rs_df.collect()
logger.info("---->" + str(len(rs)))

lists = []
for r in rs:
    usermac = r[0]
    days = r[1]
    t = (usermac, days)
    lists.append(t)
    #logger.debug(t)

dao = MysqlDao()
from pyspark import SparkConf, SparkContext, SQLContext

conf = SparkConf().setMaster('local').setAppName('py03a')
sc = SparkContext(conf=conf)
sqc = SQLContext(sc)

# and show that you could exit your pyspark shell and come back in it
df = sqc.read.parquet('auction_parquet')
df.show(5)

sqc.registerDataFrameAsTable(df, 'auction_p')

print('Bid history for bidder pagep123')
sqc.sql('''
    SELECT * FROM auction_p
    WHERE bidder = 'pagep123'
    ORDER BY auctionid, bid
    ''').show()
if __name__ == '__main__':
    # set up environment
    conf = SparkConf() \
        .setAppName("BeerSleuthALS") \
        .set("spark.driver.memory", "8g")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # load data
    engine = create_engine(
        'postgresql://*****:*****@localhost:5432/beersleuth')
    ratings_sqldf = get_item_user_rev_from_pg(engine, sqlContext)
    beer_sqldf = get_beer_data(engine)
    sqlContext.registerDataFrameAsTable(ratings_sqldf, "ratings")

    # train, test = sqlContext.table('ratings').randomSplit([.8, .2])
    # train = train.cache()
    # test = test.cache()

    ## add_rating_to_db(user='******', beer=u'101 North Heroine IPA', taste=8, engine=engine)
    ## add_rating_to_db(user='******', beer=u'Boulder Creek Golden Promise', taste=6, engine=engine)
    ## model_param_sweep(train, test)

    # import timeit
    # start_time = timeit.default_timer()
    # model = fit_final_model(ratings_sqldf)
    # elapsed = timeit.default_timer() - start_time

    # similarity computation sketch (truncated):
    # sim_dict = {}
    # for i in beer_data.index:
    #     sim_dict[i] = Counter()
    #     for j in beer_data.index:
def Homepage():
    """Renders a sample page.

    Spark details: the data was cleaned using Spark and pandas. The CSV was
    converted to a DataFrame with an SQLContext, registered as a table, and
    then queried with SQL.
    """
    style.use('ggplot')

    sc = SparkContext(appName="DemoCount")
    sqlct = SQLContext(sc)

    #pandas_df = pd.read_csv('C:/Users/madhumita/Downloads/Test_final.csv')
    pandas_df = pd.read_csv('Demo_data.csv')
    s_df1 = sqlct.createDataFrame(pandas_df)
    sqlct.registerDataFrameAsTable(s_df1, "Demo_2")

    def aggregate_age_group(lower, upper=None):
        """Sum the population columns per county for one age-group range."""
        where = "YEAR=8 and AGEGRP >%d" % lower
        if upper is not None:
            where += " and AGEGRP <=%d" % upper
        grouped = sqlct.sql(
            "Select CTYNAME,TOT_POP,TOT_MALE,TOT_FEMALE from Demo_2 where " + where
        ).groupby('CTYNAME').sum()
        return grouped.select('CTYNAME', 'sum(TOT_POP)', 'sum(TOT_MALE)',
                              'sum(TOT_FEMALE)').collect()

    def upsert_rows(rows, agegrp):
        """Insert or update the per-county totals for one age group."""
        for row in rows:
            CityName, Tot_pop, Tot_Male, Tot_Female = [str(v) for v in row]
            print(CityName + "-TP-" + Tot_pop + "-TM-" + Tot_Male + "-TF-" + Tot_Female)
            cursor.execute("SELECT * FROM loc_tet where CTYNAME= ? and AGEGRP= ?",
                           [CityName, agegrp])
            if cursor.rowcount == 0:
                cursor.execute(
                    "INSERT INTO loc_tet "
                    "(CTYNAME, AGEGRP,TOT_POP,TOT_MALE,TOT_FEMALE) "
                    "VALUES (?,?,?,?,?)",
                    [CityName, agegrp, Tot_pop, Tot_Male, Tot_Female])
            else:
                cursor.execute(
                    "Update loc_tet Set TOT_POP=(?), TOT_MALE=(?), TOT_FEMALE=(?) "
                    "where CTYNAME=(?) and AGEGRP=(?)",
                    (Tot_pop, Tot_Male, Tot_Female, CityName, agegrp))
            connection.commit()

    a0_3 = aggregate_age_group(0, 3)
    upsert_rows(a0_3, '0:3')
    b4_9 = aggregate_age_group(4, 9)
    upsert_rows(b4_9, '4:9')
    c10_13 = aggregate_age_group(9, 13)
    upsert_rows(c10_13, '10:13')
    d_13 = aggregate_age_group(13)
    upsert_rows(d_13, '14:18')

    return render_template("main.html", Data0_3=a0_3, Data4_9=b4_9,
                           Data10_13=c10_13, Data13=d_13)
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# context setup (not in the original snippet)
sc = SparkContext(conf=SparkConf().setMaster('local').setAppName('df_demo'))
sqlContext = SQLContext(sc)

# 'l' is not defined in the original snippet; assumed to be (name, age) tuples:
l = [('bob', 25), ('alice', 30)]

df = sqlContext.createDataFrame(l, ['name', 'age'])
# print(df)

d = [{'name': 'paul', 'age': 10, 'gender': 'male'},
     {'name': 'alice', 'age': 30, 'gender': None}]
print(sqlContext.createDataFrame(d).collect())

rdd = sc.parallelize(l)
df = sqlContext.createDataFrame(rdd, ['name', 'age'])
# print(df.collect())
#df = sqlContext.createDataFrame(rdd)
# print(df.printSchema())
print(df.head(2))

sqlContext.registerDataFrameAsTable(df, "table1")
df2 = sqlContext.sql("SELECT name,age from table1 where name='bob'")
print(df2.collect())
print(sqlContext.tableNames())
sqlContext.dropTempTable("table1")
print(sqlContext.tableNames())

df = sqlContext.read.format('com.databricks.spark.csv').options(
    header='true', inferschema='true').load(
    "/home/harsh/mapping_minds_training/spark/train_u6lujuX_CVtuZ9i.csv")
print(df.groupBy('Gender').agg({'ApplicantIncome': 'mean'}).show())
print(df.head(3))
print(df.printSchema())
print(df.columns)
df.cache()
print('count-------------------------->', df.count())
tweets_sample = sqlContext.read.json(path_to_data)
print("The loaded dataset contains %d tweets" % tweets_sample.count())

# Examine the schema of the imported data
print("\nSchema of the loaded data:\n")
tweets_sample.printSchema()

print("\nPreview of the data:\n")
tweets_sample.show()

# *************** EXPLORATION WITH SQL ************************
sqlContext.sql('DROP TABLE IF EXISTS tweets_sample')
sqlContext.registerDataFrameAsTable(tweets_sample, "tweets_sample")

# Users with the most tweets, including additional information
users_agg = sqlContext.sql(
    "SELECT user.screen_name, MAX(user.friends_count) AS friends_count, "
    "MAX(user.followers_count) AS followers_count, user.lang, COUNT(text) AS tweets "
    "FROM tweets_sample WHERE user.lang = 'es' "
    "GROUP BY user.screen_name, user.lang ORDER BY tweets DESC"
)
users_agg.show()

# Load the aggregation into a table
sqlContext.sql('DROP TABLE IF EXISTS user_agg')
sqlContext.registerDataFrameAsTable(users_agg, "user_agg")

# Examine the users who have received the most retweets, showing additional
# information and computing the ratio of retweets per tweet (see the sketch below)
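# The retweet analysis described above is cut off in this snippet. A minimal
# sketch, assuming the standard top-level `retweet_count` field is present in
# the loaded tweets (hypothetical, not the original query):
retweets_agg = sqlContext.sql(
    "SELECT user.screen_name, COUNT(text) AS tweets, "
    "SUM(retweet_count) AS retweets, "
    "SUM(retweet_count) / COUNT(text) AS retweets_per_tweet "
    "FROM tweets_sample "
    "GROUP BY user.screen_name ORDER BY retweets DESC"
)
retweets_agg.show()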
day = pro_time.strftime("%Y%m%d")
master = "spark://hadoop:7077"
appName = "spark_pageflow_outflow"
input = "/impala/parquet/site/site-pageflowv1/dat=%s" % day
spark_home = '/opt/cloud/spark'
os.environ['SPARK_HOME'] = spark_home

sc = SparkContext(master, appName)
sql_context = SQLContext(sc)
sql_context.registerFunction("to_day", lambda x: mill_date_str(x), StringType())
sql_context.registerFunction("to_str", lambda x: bytearray_str(x), StringType())

parquet_df = sql_context.read.parquet(input)
sql_context.registerDataFrameAsTable(parquet_df, "site_pageflowv1")

_sql = "select to_str(url),to_day(createtime) day,count(1) pv,count(distinct to_str(guuid)) uv " \
       "from site_pageflowv1 where dat= %s and to_str(name)='outflow' " \
       "group by to_str(url),to_day(createtime)" % day
rs_df = sql_context.sql(_sql)
rs = rs_df.collect()
logger.info("---->" + str(len(rs)))

list = []
for r in rs:
    url = r[0]
    day = r[1]
    pv = r[2]
    uv = r[3]
import sys
import csv
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row


def oritentData(record):
    # ... (beginning of this function is not shown; 'indexes' is defined elsewhere)
    return [record[i].replace('"', '') for i in indexes]


def filterData(record):
    flag = True
    if (int(record[-4]) < 1) or (record[-2] not in ['1', '4']) or (record[-1] != ''):
        flag = False
    return flag


if __name__ == '__main__':
    sc = SparkContext(appName='CF_prod_in_transaction')
    sqlContext = SQLContext(sc)
    in_file = sc.textFile(sys.argv[1])
    data = in_file.map(oritentData).filter(filterData).map(
        lambda x: [int(i) for i in x[:-3]])
    Record = Row('customer_id', 'product_id', 'invoice_id', 'units')
    data = data.map(lambda x: Record(*x))
    data = sqlContext.createDataFrame(data)
    sqlContext.registerDataFrameAsTable(data, 'table1')
    df = sqlContext.sql('select customer_id, product_id, sum(units) as prod_in_transactions '
                        'from table1 group by customer_id, product_id')
    df.map(lambda x: ','.join([str(r) for r in x])).saveAsTextFile(sys.argv[2])
    sc.stop()

    # copy arqiva.csv into test.csv row by row
    with open('arqiva.csv') as read, open('test.csv', 'w') as write:
        wrtr = csv.writer(write)
        for line in csv.reader(read):
            wrtr.writerow(line)
    sim_df = sim_df.drop_duplicates()
    return sim_df


if __name__ == '__main__':
    # set up environment
    conf = SparkConf() \
        .setAppName("BeerSleuthALS") \
        .set("spark.driver.memory", "8g")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # load data
    engine = create_engine('postgresql://*****:*****@localhost:5432/beersleuth')
    ratings_sqldf = get_item_user_rev_from_pg(engine, sqlContext)
    beer_sqldf = get_beer_data(engine)
    sqlContext.registerDataFrameAsTable(ratings_sqldf, "ratings")

    # train, test = sqlContext.table('ratings').randomSplit([.8, .2])
    # train = train.cache()
    # test = test.cache()

    ## add_rating_to_db(user='******', beer=u'101 North Heroine IPA', taste=8, engine=engine)
    ## add_rating_to_db(user='******', beer=u'Boulder Creek Golden Promise', taste=6, engine=engine)
    ## model_param_sweep(train, test)

    # import timeit
    # start_time = timeit.default_timer()
    # model = fit_final_model(ratings_sqldf)
    # elapsed = timeit.default_timer() - start_time
import pandas as pd
import matplotlib.pyplot as plt
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType, StructField, StringType, IntegerType


class Analysiser:
    def __init__(self):
        conf = SparkConf().setAppName('Analysiser').set("spark.sql.crossJoin.enabled", True)
        self.sc = SparkContext(conf=conf)
        self.sqlctx = SQLContext(self.sc)
        self.pdf = pd.read_excel('data_o.xlsx', sheetname=0, header=0,
                                 parse_cols=[9, 10, 23, 32, 45, 60])
        schema = StructType([
            StructField('TI', StringType(), True),
            StructField('SO', StringType(), True),
            StructField('C1', StringType(), True),
            StructField('TC', StringType(), True),
            StructField('PY', StringType(), True),
            StructField('UT', StringType(), True)
        ])
        df = self.sqlctx.createDataFrame(self.pdf, schema)

        def m_clean(x):
            # keep records published 2006-2016 with a non-empty author field,
            # and extract the first author from the C1 column
            try:
                py = int(x['PY'])
                tc = int(x['TC'])  # validates that TC is numeric
                authors = x['C1']
                if 2006 <= py <= 2016 and authors != '':
                    first_author = authors[1:].split(']')[0].split('; ')[0]
                    return [(x['TI'], x['SO'], x['C1'], first_author, x['TC'], py, x['UT'])]
                else:
                    return []
            except Exception:
                return []

        schema2 = StructType([
            StructField('TI', StringType(), True),
            StructField('SO', StringType(), True),
            StructField('C1', StringType(), True),
            StructField('first_author', StringType(), True),
            StructField('TC', StringType(), True),
            StructField('PY', IntegerType(), True),
            StructField('UT', StringType(), True)
        ])
        self.df = self.sqlctx.createDataFrame(df.rdd.flatMap(m_clean), schema2)
        #self.df.show()

    # def parse(self):
    #     wb = load_workbook('data_min.xlsx')
    #     sheet = wb.get_sheet_by_name('all')
    #     new_wb = openpyxl.Workbook()
    #     new_sheet = new_wb.create_sheet('simple')
    #     new_sheet.append(['TI', 'SO', 'C1', 'TC', 'PY', 'UT'])
    #
    #     for row in list(sheet.rows)[2:100]:
    #         r = [c.value for c in row]
    #         r_min = [r[9], r[10], r[23], r[32], r[45], r[60]]
    #         print(r_min)
    #         new_sheet.append(r_min)
    #     new_wb.save('export.xlsx')

    def parse2(self):
        # export the cleaned records to an Excel file
        self.df.toPandas().to_excel('output.xls')

    def func1(self):
        df = self.df.toPandas()
        #print(df.head())
        plt.figure(figsize=(9, 6))
        # scatter of citation count against publication year
        # (s: marker size, alpha: transparency)
        plt.scatter(df['PY'], df['TC'], s=25, alpha=0.4, marker='o')
        plt.show()

    def func2(self):
        df = self.df
        first_author_df = df.select('first_author', 'PY').groupBy('first_author') \
            .max('PY').withColumnRenamed('max(PY)', 'maxPY')
        self.sqlctx.registerDataFrameAsTable(df.drop('first_author'), 'df')
        self.sqlctx.registerDataFrameAsTable(first_author_df, 'fa')
        sql = "select first_author,TC from (fa outer join df on C1 like CONCAT('%',first_author,'%'))"
        join = self.sqlctx.sql(sql)
        join_rdd = join.rdd.map(lambda x: (x['first_author'], x['TC'])) \
            .reduceByKey(lambda x, y: x + '-' + y)
        # for r in join_rdd.collect():
        #     print(r)

        def m_h(x):
            # h-index: the largest h such that the author has at least h
            # papers with at least h citations each
            cts = [int(c) for c in x[1].split('-')]
            cts.sort(reverse=True)
            h = 0
            for i in range(1, len(cts) + 1):
                if cts[i - 1] >= i:
                    h = i
                else:
                    break
            if h > 0:
                return [(x[0], h)]
            return []

        author_h_rdd = join_rdd.flatMap(m_h)
        author_h_df = self.sqlctx.createDataFrame(author_h_rdd, ['first_author', 'h'])
        final_df = author_h_df.join(first_author_df, 'first_author', 'left_outer') \
            .select('h', 'maxPY')
        pdf = final_df.toPandas()
        plt.figure(figsize=(9, 6))
        # scatter of h-index against the author's most recent publication year
        plt.scatter(pdf['maxPY'], pdf['h'], s=25, alpha=0.4, marker='o')
        plt.show()
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

## DataFrame Operate
sales_rdc.filter(col('dc_id') == '772').show()  # filter by a Column
sales_rdc.filter((col('dc_id') == '772') &
                 (col('item_first_cate_cd') == '1620')).show()  # filter by some Columns
sales_rdc.filter((col('dc_id') == '772') &
                 (col('item_first_cate_cd') == '1620') &
                 (col('total_sales') != 0)).show()

data = [[2, 3, 4], [1, 2, 3], [7, 6, 5]]
data_df = spark.createDataFrame(data, list('abc'))  # create a DF, with column names
data_df2 = spark.createDataFrame(data)  # create a DF (default column names _1, _2, _3)

sqlContext.registerDataFrameAsTable(data_df, "test_table")  # register a Tmp Table (named columns a, b, c)
test_data = spark.sql('select * from test_table')
# sqlContext.dropTempTable("test_table")

sqlContext.udf.register("stringLengthInt", lambda x: len(str(x)), IntegerType())  # register a Function for SQL
sqlContext.registerFunction("stringLengthInt", lambda x: len(str(x)), IntegerType())
sqlContext.sql("SELECT stringLengthInt('test') as len").show()
sqlContext.sql("SELECT stringLengthInt(a) as len from test_table").show()

df_as1 = data_df.alias("df_as1")  # alias
df_as2 = data_df.alias("df_as2")
joined_df = df_as1.join(df_as2, col("df_as1.a") == col("df_as2.a"), 'inner')  # all column names are kept
joined_df.select("df_as1.a", "df_as2.a", "df_as2.b", "df_as2.c").show()
# Defining the schema
ischema = StructType([
    StructField('station', StringType(), False),
    StructField('date', StringType(), False),
    StructField('observation', StringType(), False),
    StructField('value', IntegerType(), False),
    StructField('useless', StringType(), False),
    StructField('quality_flag', StringType(), False)
])

# Reading the csv file
df = sqlContext.read.format('com.databricks.spark.csv').load(
    inputs, schema=ischema).cache()

# Registering table all_weather_data
sqlContext.registerDataFrameAsTable(df, "all_weather_data")

# Filtering TMIN and TMAX values and renaming the columns as min_temp and max_temp
min_temp = sqlContext.sql("""
    SELECT date, station, value as min_temp
    FROM all_weather_data
    WHERE observation="TMIN" AND quality_flag=""
    """)
sqlContext.registerDataFrameAsTable(min_temp, "min_temp")

max_temp = sqlContext.sql("""
    SELECT date, station, value as max_temp
    FROM all_weather_data
    WHERE observation="TMAX" AND quality_flag=""
    """)
sqlContext.registerDataFrameAsTable(max_temp, "max_temp")
conf.set("spark.driver.maxResultSize", "10g")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# path to hillary/enron avro
enr = sqlContext.read.format(
    "com.databricks.spark.avro").load(
    "s3n://datasets-396316040607/enron_data/*.avro").repartition(16)
hil = sqlContext.read.format(
    "com.databricks.spark.avro").load(
    "s3n://datasets-396316040607/hillary/*.avro").repartition(16)

# register tables
sqlContext.registerDataFrameAsTable(hil, "hillary")
sqlContext.registerDataFrameAsTable(enr, "enron")

# register udf
sqlContext.registerFunction(
    "getCos",
    lambda x, y: get_cosine(text_to_vector(x), text_to_vector(y))
)

# do the cosine similarity on the text, get the top 1000 matches
out = sqlContext.sql("SELECT h.author h_auth, e.author e_auth, "
                     "e.contents e_mail, h.contents h_mail, "
                     "getCos(e.contents, h.contents) as cos_sim "
                     "from hillary as h join enron as e order by cos_sim "
                     "desc limit 1000")

# write back out to s3
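# The snippet ends before the write-out. A minimal sketch, assuming a
# hypothetical output prefix in the same bucket and the same Avro package
# used for reading:
out.write.format("com.databricks.spark.avro").save(
    "s3n://datasets-396316040607/cosine_matches/")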