def get_query7():
    """Bar chart of tweet counts per source device (Android/Iphone/Ipad/Web App).

    Side effects: writes Query7.csv and graph7.png.
    Returns: BytesIO containing the PNG, rewound to position 0.
    """
    plt.clf()
    # Display-label typo fixed ('Andriod' -> 'Android'); the LIKE filter is unchanged.
    q_android = spark.sql(
        "select 'Android' as DeviceName, count(user.name) as TweetsCount from tweets where source LIKE '%and%'"
    )
    q_iphone = spark.sql(
        "select 'Iphone' as DeviceName, count(user.name) as TweetsCount from tweets where source LIKE '%iphone%' "
    )
    q_ipad = spark.sql(
        "select 'Ipad' as DeviceName, count(user.name) as TweetsCount from tweets where source LIKE '%ipad%' "
    )
    q_webpage = spark.sql(
        "select 'Web App' as DeviceName, count(user.name) as TweetsCount from tweets where source LIKE '%Web App%' "
    )
    query_7 = q_android.union(q_iphone).union(q_ipad).union(q_webpage)
    # Renamed local from 'pd' so it no longer shadows the pandas alias.
    result = query_7.toPandas()
    result.to_csv("Query7.csv", index=False)
    plt.bar(x=result.DeviceName.tolist(), height=result.TweetsCount.tolist())
    plt.title("Devices Used for Tweets")
    plt.xlabel('Source')
    plt.ylabel('TweetsCount')
    plt.tight_layout()
    plt.savefig("graph7.png")
    graph7 = BytesIO()
    plt.savefig(graph7)
    graph7.seek(0)
    return graph7
def datasetTrain_ds(typologie_add_0):
    """Build the 'manuelDS' training set of (code, libelle) field pairs.

    Reads the latest load of the learn_ds Hive table, deduplicates on the
    (nom_champ1, nom_champ2) pair, keeps labelled rows, joins field
    typologies from `typologie_add_0`, keeps only code/identifiant ->
    libelle pairs, and adds the longest-common-substring length feature.

    Returns: a pandas DataFrame of training rows tagged source='manuelDS'.
    """
    # BUG FIX: the original built the SQL as "...from" + "name", which
    # produced the invalid table reference 'fromname'. The latest load date
    # must come from the table actually queried below.
    max_date = spark.sql(
        "select max(ref_chargement) from prd_tech.b_r_datasmart_ml_03_code_libelle_03_learn_ds"
    )
    ref_charg = max_date.collect()[0][0]
    ds0 = spark.sql(
        "select * from prd_tech.b_r_datasmart_ml_03_code_libelle_03_learn_ds where ref_chargement='{}'".format(ref_charg)
    )
    ds1 = ds0.toPandas()
    # Deduplicate on the field-name pair.
    ds1['temp'] = ds1['nom_champ1'] + ',' + ds1['nom_champ2']
    uniq = np.unique(ds1['temp'], return_index=True)
    ds2 = ds1.loc[uniq[1], :].reset_index(drop=True)
    ds2.drop(['temp'], axis=1, inplace=True)
    # Keep only rows that carry a label ('statut').
    ds3 = ds2[pd.isnull(ds2['statut']) == False]
    # Attach the typologie of each side of the pair.
    ds4 = ds3.merge(typologie_add_0.loc[:, ['nom_champ1', 'typologie1']], on=['nom_champ1'])
    ds5 = ds4.merge(typologie_add_0.loc[:, ['nom_champ2', 'typologie2']], on=['nom_champ2'])
    ds6 = ds5[ds5['typologie1'] == 'code/identifiant']
    # .copy() so the column assignments below do not hit SettingWithCopyWarning.
    ds7 = ds6[ds6['typologie2'] == 'libelle'].copy()
    ds7['source'] = 'manuelDS'
    ds7['temp'] = ds7['nom_champ1'] + ',' + ds7['nom_champ2']
    ds7['longuest'] = ds7['temp'].apply(longest_common_substring)
    ds7['len_longuest'] = ds7['longuest'].apply(len)
    ds7.drop(['temp', 'longuest'], axis=1, inplace=True)
    # (Intermediate frames are locals; they are freed on return — the
    # original's explicit `del` statements were unnecessary.)
    return ds7
def readTable(name):
    """Read the latest load of table `name` and the matching typologie table.

    BUG FIXES vs. the original:
      * "from" + "name" concatenated the literal string 'name'
        (-> 'fromname'); the `name` argument is now interpolated with
        proper spacing.
      * missing space before 'where' in the per-table query.
      * the function computed two DataFrames and returned nothing; it now
        returns (table_add_0, typologie_add_0) as pandas DataFrames.
    """
    max_date = spark.sql("select max(ref_chargement) from " + name)
    ref_charg = max_date.collect()[0][0]
    table_add_0 = spark.sql(
        "select * from " + name + " where ref_chargement='{}'".format(ref_charg)
    ).toPandas()
    typologie_add_0 = spark.sql(
        "select * from prd_tech.b_r_datasmart_ml_02_typologie_03_predict_primitive_all where ref_chargement='{}'".format(ref_charg)
    ).toPandas()
    return table_add_0, typologie_add_0
def get_query12():
    """Donut chart of geo_enabled vs. disabled accounts (user + user_retweeted).

    Side effects: writes Query12.csv and graph12.png.
    Returns: BytesIO containing the PNG, rewound to position 0.
    """
    plt.clf()
    query_12 = spark.sql(
        "select sum(geo_count) as geo_count,geo_enabled from (select count(*) as geo_count,geo_enabled from user where geo_enabled is not null group by geo_enabled union select count(*) as geo_count,user.geo_enabled as geo_enabled from user_retweeted where user.geo_enabled is not null group by user.geo_enabled) group by geo_enabled"
    )
    # Renamed local from 'pd' so it no longer shadows the pandas alias.
    result = query_12.toPandas()
    result.to_csv('Query12.csv', index=False)
    result.plot.pie(y="geo_count", labels=result.geo_enabled.tolist(), autopct='%1.2f%%', fontsize=10)
    # White inner circle turns the pie into a donut.
    my_circle = plt.Circle((0, 0), 0.5, color='white')
    plt.gcf().gca().add_artist(my_circle)
    plt.title("Geo Location Settings Enabled for Account")
    plt.axis('equal')
    plt.tight_layout()
    plt.savefig("graph12.png")
    graph12 = BytesIO()
    plt.savefig(graph12)
    graph12.seek(0)
    return graph12
def get_query9():
    """Donut chart of users with/without a profile background image.

    Side effects: writes Query9.csv and graph9.png.
    Returns: BytesIO containing the PNG, rewound to position 0.
    """
    plt.clf()
    query_9 = spark.sql(
        "select sum(img_count) as img_count,profile_use_background_image as background_img from (select count(*) as img_count,profile_use_background_image from user where profile_use_background_image is not null group by profile_use_background_image union select count(*) as img_count,user.profile_use_background_image as profile_use_background_image from user_retweeted where user.profile_use_background_image is not null group by user.profile_use_background_image) group by profile_use_background_image"
    )
    # Renamed local from 'pd' so it no longer shadows the pandas alias.
    result = query_9.toPandas()
    result.to_csv('Query9.csv', index=False)
    result.plot.pie(y="img_count", labels=result.background_img.tolist(), autopct='%1.2f%%', fontsize=10)
    # White inner circle turns the pie into a donut.
    my_circle = plt.Circle((0, 0), 0.5, color='white')
    plt.gcf().gca().add_artist(my_circle)
    plt.title("User's Profile Background")
    plt.axis('equal')
    plt.tight_layout()
    plt.savefig('graph9.png')
    graph9 = BytesIO()
    plt.savefig(graph9)
    graph9.seek(0)
    return graph9
def json_dataset(spark, path):
    """Load whole-file JSON documents under `path` and filter bad coordinates.

    Each input file is treated as a single JSON document (wholeTextFiles),
    so multi-line JSON parses correctly.

    Returns: (dataRDD, dataDF) — the full parsed DataFrame and the subset
    whose longitude/latitude are not flagged '%not%relevant%'.
    """
    sc = spark.sparkContext
    # wholeTextFiles yields (path, content) pairs; parse the contents as JSON.
    raw_documents = sc.wholeTextFiles(path).values()
    dataRDD = spark.read.json(raw_documents)
    print("RDD")
    dataRDD.show(10)
    # Inspect the schema Spark inferred from the documents.
    dataRDD.printSchema()
    # Register a temp view so the coordinate filter can be written in SQL.
    dataRDD.createOrReplaceTempView("table_data")
    dataDF = spark.sql(
        "SELECT * FROM table_data where longitude not like '%not%relevant%' and latitude not like '%not%relevant%'"
    )
    print('DF')
    return dataRDD, dataDF
def get_query11():
    """Line chart of the 20 most-mentioned Twitter accounts.

    Side effects: writes Query11.csv and graph11.png; widens the global
    matplotlib figure size to 15x5.
    Returns: BytesIO containing the PNG, rewound to position 0.
    """
    plt.clf()
    query_11 = spark.sql(
        "select usermention, count(*) as value from entity lateral view explode(entity.user_mentions.name) as usermention group by usermention order by value desc limit 20"
    )
    # Renamed local from 'pd' so it no longer shadows the pandas alias.
    result = query_11.toPandas()
    result.to_csv('Query11.csv', index=False)
    matplotlib.rcParams['figure.figsize'] = 15, 5
    x = result.usermention.tolist()
    y = result.value.tolist()
    plt.plot(x, y)
    plt.title("Most Mentioned Twitter Accounts")
    plt.xlabel('Tweeted User')
    plt.ylabel('User mentioned count')
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig('graph11.png')
    graph11 = BytesIO()
    plt.savefig(graph11)
    graph11.seek(0)
    return graph11
def get_query5():
    """TextBlob sentiment analysis of COVID-related tweets.

    Classifies each matching tweet as negative/neutral/positive by polarity
    and plots the three percentages as a bar chart.

    Side effects: writes graph5.png; registers the 'Tweet_Sentiment' view.
    Returns: BytesIO containing the PNG, rewound to position 0.
    """
    plt.clf()
    date = df.select("created_at")

    def dateMTest(dateval):
        # Twitter timestamp format, e.g. 'Wed Apr 01 12:00:00 +0000 2020'.
        return datetime.datetime.strptime(dateval, '%a %b %d %H:%M:%S +0000 %Y')

    d = udf(dateMTest, DateType())
    df1 = df.withColumn("created_date", d(date.created_at))
    df1.createOrReplaceTempView("Tweet_Sentiment")
    query_5 = spark.sql(
        "SELECT id,text,created_date FROM Tweet_Sentiment WHERE 1=1 AND (upper(text) LIKE '%COVID%' OR text LIKE '%coronavirus%' or text like '%covid%' or upper(text) like '%CORONAVIRUS%')"
    )
    total = 0
    positive = 0
    neutral = 0
    negative = 0
    for t in query_5.select("text").collect():
        total += 1
        polarity = TextBlob(str((t.text).encode('ascii', 'ignore'))).sentiment.polarity
        if polarity < 0:
            negative += 1
        elif polarity == 0.0:
            neutral += 1
        else:
            positive += 1
    # BUG FIX: guard against ZeroDivisionError when no tweet matches the filter.
    if total == 0:
        total = 1
    negative_percent = negative * 100 / total
    positive_percent = positive * 100 / total
    neutral_percent = neutral * 100 / total
    size_of_groups = [negative_percent, positive_percent, neutral_percent]
    names = 'negative_percent', 'positive_percent', 'neutral_percent'
    plt.bar(x=names, height=size_of_groups, color=['red', 'green', 'blue'])
    plt.title("Sentiment Analysis on Tweet")
    plt.tight_layout()
    plt.savefig('graph5.png')
    graph5 = BytesIO()
    plt.savefig(graph5)
    graph5.seek(0)
    return graph5
def datasetTrain_add(name):
    """Build the 'manuelADD' training set of (code, libelle) field pairs.

    Reads the latest load of the learn_add Hive table, keeps labelled rows,
    deduplicates on the (nom_champ1, nom_champ2) pair, and adds the
    longest-common-substring length feature.

    Returns: a pandas DataFrame of training rows tagged source='manuelADD'.

    NOTE(review): the `name` parameter was unused in the original (the SQL
    concatenated the literal string "name"); the signature is kept for
    backward compatibility.
    """
    # BUG FIX: "...from" + "name" produced the invalid reference 'fromname'.
    # The latest load date must come from the table actually queried below.
    max_date = spark.sql(
        "select max(ref_chargement) from prd_tech.b_r_datasmart_ml_03_code_libelle_03_learn_add"
    )
    ref_charg = max_date.collect()[0][0]
    add0 = spark.sql(
        "select * from prd_tech.b_r_datasmart_ml_03_code_libelle_03_learn_add where ref_chargement='{}'".format(ref_charg)
    ).toPandas()
    # Keep only rows that carry a label ('statut'); copy so the column
    # assignments below do not hit SettingWithCopyWarning.
    add1 = add0[pd.isnull(add0['statut']) == False].copy()
    add1['temp'] = add1['nom_champ1'] + ',' + add1['nom_champ2']
    add1.reset_index(drop=True, inplace=True)
    # Deduplicate on the field-name pair.
    uniq = np.unique(add1['temp'], return_index=True)
    add2 = add1.loc[uniq[1], :].reset_index(drop=True)
    add2.drop(['temp'], axis=1, inplace=True)
    add2['source'] = 'manuelADD'
    add2['temp'] = add2['nom_champ1'] + ',' + add2['nom_champ2']
    add2['longuest'] = add2['temp'].apply(longest_common_substring)
    add2['len_longuest'] = add2['longuest'].apply(len)
    add2.drop(['temp', 'longuest'], axis=1, inplace=True)
    # BUG FIX: the original had a `del` statement *after* the return, which
    # was unreachable dead code; locals are freed on return anyway.
    return add2
def get_query13():
    """Bar chart of tweet counts per COVID-related keyword.

    Side effects: writes Query13.csv and graph13.png.
    Returns: BytesIO containing the PNG, rewound to position 0.
    """
    plt.clf()
    qcovid = spark.sql(
        "select 'covid' as keyword, count(*) as Count from tweets where text like '%covid%'"
    )
    qpandemic = spark.sql(
        "select 'pandemic' as keyword, count(*) as Count from tweets where text like '%pandemic%'"
    )
    qmask = spark.sql(
        "select 'masks' as keyword , count(*) as Count from tweets where text like '%masks%'"
    )
    qlock = spark.sql(
        "select 'lockdown' as keyword, count(*) as Count from tweets where text like '%lockdown%'"
    )
    qstayhome = spark.sql(
        "select 'stayhome' as keyword, count(*) as Count from tweets where text like '%stayhome%'"
    )
    query_13 = qcovid.union(qpandemic).union(qmask).union(qlock).union(qstayhome)
    # Renamed local from 'pd' so it no longer shadows the pandas alias.
    result = query_13.toPandas()
    result.to_csv('Query13.csv', index=False)
    result.set_index("keyword", drop=True, inplace=True)
    result.plot.bar()
    plt.xticks(rotation=0, ha="right")
    plt.title("Inspect Based on Keywords")
    plt.ylabel('tweets count')
    plt.tight_layout()
    plt.savefig("graph13.png")
    graph13 = BytesIO()
    plt.savefig(graph13)
    graph13.seek(0)
    return graph13
def get_query17():
    """Bar chart of total tweets vs. total users analyzed.

    Side effects: writes Query17.csv and graph17.png.
    Returns: BytesIO containing the PNG, rewound to position 0.
    """
    plt.clf()
    query_17 = spark.sql(
        "select(select count(*) from tweets)as tweets_count,(select count(*) from user)as Users_count"
    )
    # Renamed local from 'pd' so it no longer shadows the pandas alias.
    result = query_17.toPandas()
    result.to_csv('Query17.csv', index=False)
    plt.bar(x=result.Users_count.tolist(), height=result.tweets_count.tolist())
    plt.xlabel('Users count')
    plt.ylabel('tweets count')
    plt.title("Users & Tweets Analyzed for Processing")
    plt.tight_layout()
    plt.savefig('graph17.png')
    graph17 = BytesIO()
    plt.savefig(graph17)
    graph17.seek(0)
    return graph17
def get_query4():
    """Line chart of the top 20 trending hashtags.

    Side effects: writes Query4.csv and graph4.png.
    Returns: BytesIO containing the PNG, rewound to position 0.
    """
    plt.clf()
    query4 = spark.sql(
        "SELECT LOWER(hashtags.text) As Hashtags, COUNT(*) AS total_count FROM tweets LATERAL VIEW EXPLODE(entities.hashtags) AS hashtags GROUP BY LOWER(hashtags.text) ORDER BY total_count DESC LIMIT 20"
    )
    # Renamed local from 'pd' so it no longer shadows the pandas alias.
    result = query4.toPandas()
    result.to_csv('Query4.csv', index=False)
    plt.plot(result.Hashtags.tolist(), result.total_count.tolist())
    plt.xticks(rotation=45, ha="right")
    plt.title("Trending Hashtags")
    plt.xlabel('Hashtags')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.savefig("graph4.png")
    graph4 = BytesIO()
    plt.savefig(graph4)
    graph4.seek(0)
    return graph4
def get_query1():
    """World map of geo-tagged corona tweets.

    Plots each tweet's (longitude, latitude) point over the naturalearth
    low-res world map.

    Side effects: writes Query1.csv and graph1.png.
    Returns: BytesIO containing the PNG, rewound to position 0.
    """
    plt.clf()
    query_1 = spark.sql(
        "SELECT user.name,user.screen_name,user.id,geo.coordinates[0] As latt,geo.coordinates[1] As long from tweets where geo is not null"
    )
    # Renamed local from 'pd' so it no longer shadows the pandas alias.
    result = query_1.toPandas()
    result.to_csv('Query1.csv', index=False)
    geometry = [Point(xy) for xy in zip(result['long'], result['latt'])]
    gdf = GeoDataFrame(result, geometry=geometry)
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    gdf.plot(ax=world.plot(), marker='o', color='red', markersize=15)
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.title("Geo Locations of Corona Tweets")
    plt.tight_layout()
    plt.savefig('graph1.png')
    graph1 = BytesIO()
    plt.savefig(graph1)
    graph1.seek(0)
    return graph1
def get_query14():
    """Stack plot of the 10 countries with the most tweets.

    Side effects: writes Query14.csv and graph14.png.
    Returns: BytesIO containing the PNG, rewound to position 0.
    """
    plt.clf()
    query_14 = spark.sql(
        "select place.country as country , count(*) as Count from tweets where place.country IS not null Group by place.country order by count DESC Limit 10"
    )
    # Renamed local from 'pd' so it no longer shadows the pandas alias.
    result = query_14.toPandas()
    result.to_csv('Query14.csv', index=False)
    x = result.country.tolist()
    y = result.Count.tolist()
    plt.stackplot(x, y)
    plt.title("Twitter Most Used Countries")
    plt.xlabel('Country')
    plt.ylabel('Count')
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig('graph14.png')
    graph14 = BytesIO()
    plt.savefig(graph14)
    graph14.seek(0)
    return graph14
def get_query6():
    """Pie chart of the 10 most common profile languages.

    Side effects: writes Query6.csv and graph6.png.
    Returns: BytesIO containing the PNG, rewound to position 0.
    """
    plt.clf()
    query_6 = spark.sql(
        "select lang, count(*) as count from tweets where lang is not null group by lang order by count desc limit 10"
    )
    # Renamed local from 'pd' so it no longer shadows the pandas alias.
    result = query_6.toPandas()
    result.to_csv('Query6.csv', index=False)
    result.plot.pie(y="count", labels=result.lang.tolist(), autopct='%1.2f%%', shadow=False, legend=False, fontsize=8)
    plt.title("Top Languages Used in Profile")
    plt.axis('equal')
    plt.tight_layout()
    plt.savefig("graph6.png")
    graph6 = BytesIO()
    plt.savefig(graph6)
    graph6.seek(0)
    return graph6
def get_query3():
    """Stack plot of the 20 most-retweeted users.

    Side effects: writes Query3.csv and graph3.png.
    Returns: BytesIO containing the PNG, rewound to position 0.
    """
    plt.clf()
    query_3 = spark.sql(
        "select t.Name,t.Retweeted_Count from(select user.screen_name as Name,SUM(retweet_count) as Retweeted_Count, count(*) from user_retweeted where user.screen_name is not null group by Name ) t order by Retweeted_Count DESC LIMIT 20"
    )
    # Renamed local from 'pd' so it no longer shadows the pandas alias.
    result = query_3.toPandas()
    result.to_csv('Query3.csv', index=False)
    x = result.Name.tolist()
    y = result.Retweeted_Count.tolist()
    plt.stackplot(x, y)
    plt.title("Most Influential Person Based on Retweet")
    plt.xlabel('User Name')
    plt.ylabel('Retweeted count')
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig('graph3.png')
    graph3 = BytesIO()
    plt.savefig(graph3)
    graph3.seek(0)
    return graph3
def get_query2():
    """Pie chart of follower counts for the top 20 verified users.

    Side effects: writes Query2.csv and graph2.png.
    Returns: BytesIO containing the PNG, rewound to position 0.
    """
    plt.clf()
    query_2 = spark.sql(
        "select name, followers_count from user where name is not null and verified=='true' order by followers_count DESC LIMIT 20"
    )
    # Renamed local from 'pd' so it no longer shadows the pandas alias.
    result = query_2.toPandas()
    result.to_csv('Query2.csv', index=False)
    result.plot.pie(y="followers_count", labels=result.name.tolist(), autopct='%1.2f%%', shadow=False, legend=False, fontsize=8)
    plt.title("Followers Count of verified Users")
    plt.axis('equal')
    plt.tight_layout()
    plt.savefig('graph2.png')
    graph2 = BytesIO()
    plt.savefig(graph2)
    graph2.seek(0)
    return graph2
def get_query16():
    """Pie chart of the 10 most-retweeted English-language tweets.

    Side effects: writes Query16.csv and graph16.png.
    Returns: BytesIO containing the PNG, rewound to position 0.
    """
    plt.clf()
    query_16 = spark.sql(
        "select text, count(*) as no_of_tweets from tweets where text is not null and retweeted_status.id is not null and lang='en' group by text order by no_of_tweets desc limit 10"
    )
    # Renamed local from 'pd' so it no longer shadows the pandas alias.
    result = query_16.toPandas()
    result.to_csv('Query16.csv', index=False)
    result.plot.pie(y="no_of_tweets", labels=result.text.tolist(), autopct='%1.2f%%', shadow=False, legend=False, fontsize=8)
    plt.title("Top Retweeted Tweets(Tweet in english language)")
    plt.axis('equal')
    plt.tight_layout()
    plt.savefig('graph16.png')
    graph16 = BytesIO()
    plt.savefig(graph16)
    graph16.seek(0)
    return graph16
def get_query15():
    """Pie chart of the 10 located accounts with the most followers.

    Side effects: writes Query15.csv and graph15.png.
    Returns: BytesIO containing the PNG, rewound to position 0.
    """
    plt.clf()
    query_15 = spark.sql(
        "select screen_name as Name, location as Location, followers_count as FollowersCount from user where location is not null ORDER BY followers_count DESC LIMIT 10"
    )
    # Renamed local from 'pd' so it no longer shadows the pandas alias.
    result = query_15.toPandas()
    result.to_csv("Query15.csv", index=False)
    result.plot.pie(y="FollowersCount", labels=result.Name.tolist(), autopct='%1.2f%%', shadow=False, legend=False, fontsize=8)
    plt.title("Accounts With More Followers Count Based on Location")
    plt.axis('equal')
    plt.tight_layout()
    plt.savefig("graph15.png")
    graph15 = BytesIO()
    plt.savefig(graph15)
    graph15.seek(0)
    return graph15
def get_query8():
    """Line chart of tweet counts per day (top 30 days).

    Side effects: writes Query8.csv and graph8.png.
    Returns: BytesIO containing the PNG, rewound to position 0.
    """
    plt.clf()
    query_8 = spark.sql(
        "select count(*) as TweetCount,SUBSTR(user.created_at,0,10) as TWEETDATE from tweets where created_at IS not null GROUP BY user.created_at ORDER BY TweetCount DESC LIMIT 30"
    )
    # Renamed local from 'pd' so it no longer shadows the pandas alias.
    result = query_8.toPandas()
    result.to_csv('Query8.csv', index=False)
    y = result.TweetCount.tolist()
    x = result.TWEETDATE.tolist()
    plt.plot(x, y, alpha=0.5)
    plt.title("Stats of Tweets Per Day")
    plt.xlabel('Tweet Date')
    plt.ylabel('Tweet Count')
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig('graph8.png')
    graph8 = BytesIO()
    plt.savefig(graph8)
    graph8.seek(0)
    return graph8
def get_query10():
    """Line chart of friend counts for the 20 users with the most friends.

    Side effects: writes Query10.csv and graph10.png.
    Returns: BytesIO containing the PNG, rewound to position 0.
    """
    plt.clf()
    query_10 = spark.sql(
        "select name,friends_count as friends from user order by friends desc limit 20"
    )
    # Renamed local from 'pd' so it no longer shadows the pandas alias.
    result = query_10.toPandas()
    result.to_csv('Query10.csv', index=False)
    x = result.name.tolist()
    y = result.friends.tolist()
    plt.plot(x, y, alpha=0.5)
    plt.title("Trends on Number of friends")
    plt.xlabel('User Name')
    plt.ylabel('Friends Count')
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig("graph10.png")
    graph10 = BytesIO()
    plt.savefig(graph10)
    graph10.seek(0)
    return graph10
'New Daily Clustering Bikes by location Model has been trained on the {}' .format(datetime.now())) if mode == "predicting": model = KMeansModel.load(modelpath) logger.info( 'Daily Clustering Bikes by location Started on the {} '.format( datetime.now())) transformed_tot = model.transform(dataDF).withColumnRenamed( "prediction", "cluster_id") #creating output dataset transformed_tot.createOrReplaceTempView("data_table") data_table = spark.sql( "SELECT data_table.id, data_table.name, data_table.address, data_table.latitude, data_table.longitude, data_table.position, data_table.coordinates, data_table.cluster_id FROM data_table" ) data_table.coalesce(1).write.format('json').save( datapath_output_file1 + currentdate + '.json') logger.info('Daily Output file {} generated on {}'.format( datapath_output_file1 + currentdate + '.json', datetime.now())) transformed_tot.cache() # creating the centroides grouping and output file for this gouping centerList = list() centers1 = model.clusterCenters() count = int() for center in centers1: temp_list = center.tolist() temp_list.append(count) centerList.append(temp_list)
import pandas as pd

# constants: observation window and label month (yyyymm fragments)
observ_daterange = '1910_1912'
label_daterange = '2001'

# HIVE SQL — customer journey events for 2019-10..2019-12, flattening the
# nested JSON 'attrs' column (action/channel/object) via json_tuple.
cj_1910_1912 = spark.sql("""
SELECT distinct actor_id, event, yyyymm AS behavior_yyyymm, action_sponsor, action_time, action_type, object_id,
       twd_amt, txn_currency_code, txn_type_desc, atm_location, target_acct_nbr, target_bank_code
FROM vp_bank.customer_journey_event a
lateral view json_tuple(a.attrs, "action") attr_a as attrs_action
lateral view json_tuple(a.attrs, "channel") attr_c as attrs_channel
lateral view json_tuple(a.attrs, "object") attr_o as attrs_object
lateral view json_tuple(attr_a.attrs_action, "twd_amt") n3 as twd_amt
lateral view json_tuple(attr_a.attrs_action, "txn_currency_code") n4 as txn_currency_code
lateral view json_tuple(attr_a.attrs_action, "txn_type_desc") n5 as txn_type_desc
lateral view json_tuple(attr_c.attrs_channel, "atm_location") n4 as atm_location
lateral view json_tuple(attr_o.attrs_object, "target_acct_nbr") n4 as target_acct_nbr
lateral view json_tuple(attr_o.attrs_object, "target_bank_code") n4 as target_bank_code
where event in ("atm_transaction", "myatm_transaction", "mybank_transaction")
and yyyymm between "201910" and "201912"
order by actor_id, action_time asc
""")

# BUG FIX: the original wrote `accounts_2001 = spark.sql = ("""...""")`,
# which *assigned* the SQL text to the spark.sql attribute (clobbering the
# method) instead of executing the query.
accounts_2001 = spark.sql("""
SELECT distinct customer_id, acct_nbr, customer_class_code, yyyymm, start_ym, label
FROM btmp_cmd.nt83716_cip_acct_base_ds
WHERE yyyymm = "202001"
""")
# Join the two RDDs on their first two fields (name prefix + second field),
# then expose the joined record as a queryable 'resume' view.
nd11 = d1.map(lambda x: (x[0].split("_")[0], x[1], x[2], x[3]))
nd1 = nd11.map(lambda x: (x[0:2], x[2:]))
nd2 = d2.map(lambda x: (x[0:2], x[2:]))
join = nd1.join(nd2)
spark = SparkSession(sc)
# BUG FIX: `lambda (k, v): ...` is Python-2-only tuple-parameter syntax and
# a SyntaxError in Python 3; unpack the (key, (left, right)) pair explicitly.
df = join.map(lambda kv: Row(kv[0], kv[1][0], kv[1][1])).toDF()
df.createTempView("temp")
temp = spark.sql(
    "select _1._1 as Name, _1._2 as Degree, _2._1 as Country, _2._2 as State, _3._1 as Hotel_1, _3._2 as Hotel_2, "
    "cast (_3._3 as int) as Salary_1, cast (_3._4 as int) as Salary_2 from temp "
)
temp.createTempView("resume")
res = spark.sql("select * from resume")
res.show()
# Average salary of Bachelor-degree holders.
res1 = spark.sql(
    "select Name, ((Salary_1+Salary_2)/2) as Average_Salary from resume where Degree like 'Bachelor%'"
)
res1.show()
# res1.write.option("header","true").csv("/home/lucky/Downloads/Output/Result1.csv")
import json
import spark as spark
import time as today


def convertToJson(json_list):
    """Serialize a batch of rows as a 'load_date|client|json' record."""
    load_date = today.strftime("%Y-%m-%d")
    client = 'zeus'
    return load_date + '|' + client + '|' + str(json.dumps(json_list))


jsonDF = spark.sql("""select * from json_ready_tbl""")

# n is the number of rows per batch
n = 1000

# BUG FIXES vs. the original loop:
#  * a Spark DataFrame is not directly iterable — stream rows with
#    toLocalIterator() instead of `for jdf in jsonDF`;
#  * Row objects are not JSON-serializable — convert via asDict();
#  * the original called `.map(output)` with a *string* (not a callable) and
#    re-saved the whole RDD to the same path on every batch, which would
#    fail on the second save;
#  * rows left in a final partial batch were silently dropped.
json_list = []
batch_outputs = []
for row in jsonDF.toLocalIterator():
    json_list.append(row.asDict())
    if len(json_list) >= n:
        batch_outputs.append(convertToJson(json_list))
        json_list = []
if json_list:
    batch_outputs.append(convertToJson(json_list))

# One record per batch, written once to the output directory.
spark.sparkContext.parallelize(batch_outputs).saveAsTextFile('/hdfs/output/dir')
import spark

# Print the storage location of each table: 'desc formatted' exposes the
# table metadata, and its 'Location' row carries the path in data_type.
table_list = ['table_1', 'table_2']
for t in table_list:
    meta = spark.sql("""desc formatted schema_name.{table_name}""".format(table_name=t))
    location_row = meta.filter("col_name=='Location'").collect()[0]
    print(location_row.data_type)

# for one table
# spark.sql("""desc formatted schema_name.table_name""").filter("col_name=='Location'").collect()[0].data_type