def ageCalculate(self, column, dateFormat, nameColAge):
    """
    This method computes the age of clients based on their birth dates.

    :param column      Name of the column that holds the birth dates.
    :param dateFormat  String date format of the provided column.
    :param nameColAge  Name of the new column that will hold the resulting ages.
    """
    # Check that the column argument is a string datatype:
    self.__assertTypeStr(column, "column")

    # Check that the dateFormat argument is a string datatype:
    self.__assertTypeStr(dateFormat, "dateFormat")

    # Assert that the column exists in the dataFrame:
    assert column in self.__df.columns, \
        "Error: Column assigned in column argument does not exist in dataFrame"

    # Output date format (a SimpleDateFormat string)
    Format = "yyyy-MM-dd"

    exprs = format_number(
        mag(
            months_between(date_format(
                unix_timestamp(column, dateFormat).cast("timestamp"), Format),
                current_date()) / 12), 4).alias(nameColAge)

    self.__df = self.__df.withColumn(nameColAge, exprs)

    self.__addTransformation()  # checkpoint in case

    return self
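# A minimal standalone sketch (not part of the original class) showing the same
# age expression on a toy DataFrame. The toy DataFrame, the "birth_date" column
# name and the "yyyy-MM-dd" input format are illustrative assumptions, and "mag"
# is assumed here to stand for pyspark's abs.
from pyspark.sql import SparkSession
from pyspark.sql.functions import (format_number, abs as mag, months_between,
                                   date_format, unix_timestamp, current_date)

spark = SparkSession.builder.appName("age_sketch").getOrCreate()
toy = spark.createDataFrame([("1990-05-01",), ("1985-12-31",)], ["birth_date"])

# months_between(birth_date, today) / 12 gives a negative age in years;
# abs() flips the sign and format_number keeps 4 decimal places.
age = format_number(
    mag(months_between(
        date_format(unix_timestamp("birth_date", "yyyy-MM-dd").cast("timestamp"),
                    "yyyy-MM-dd"),
        current_date()) / 12), 4).alias("age")

toy.select("birth_date", age).show()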
df1 = df1.drop(*drop_ColList)
df1 = df1.withColumnRenamed(df1.columns[-1], "Cases")

df2 = US_death
keep_ColList = ['FIPS', 'Combined_Key', 'Population', US_case.columns[-1]]
drop_ColList = list(set(US_death.columns) - set(keep_ColList))
df2 = df2.drop(*drop_ColList)
df2 = df2.withColumnRenamed(df2.columns[-1], "Deaths")

# Join the reduced dataframes into a new dataframe called US_stat
US_stat = df1.join(df2, on=['FIPS', 'Combined_Key'])

# Add InfectionRate and Fatality columns by computing the infection and death rates
US_stat = US_stat.withColumn('InfectionRate',
                             F.format_number(US_stat.Cases / US_stat.Population, 5).cast(DoubleType()))
US_stat = US_stat.withColumn('Fatality',
                             F.format_number(US_stat.Deaths / US_stat.Cases, 4).cast(DoubleType()))
US_stat = US_stat.withColumnRenamed('Combined_Key', 'County')
US_stat = US_stat.fillna(0)
US_stat = US_stat.withColumn("FIPS", US_stat["FIPS"].cast(IntegerType()))

# Reformat FIPS to the standard 5-digit, zero-padded format
US_stat = US_stat.withColumn("FIPS", F.format_string("%05d", "FIPS"))

# Although rows with deteriorated information have been removed previously,
# some rows still contain values that do not make sense (probably due to a bad data source),
# such as population = 0 or deaths > confirmed cases.
US_stat = US_stat.filter((US_stat.Population != 0) &
                         (US_stat.Cases >= US_stat.Deaths) &
                         (US_stat.Population >= US_stat.Cases))
US_stat.show()
df = df.withColumn("Age/NoOfFriends", df["NoOfFriends"] / df["Age"]) # COMMAND ---------- df.describe().show() # COMMAND ---------- from pyspark.sql.functions import format_number from pyspark.sql.types import DoubleType from pyspark.sql.types import IntegerType # COMMAND ---------- df2 = df.select( format_number("Age/NoOfFriends", 2).alias("AgeByFriends").cast(IntegerType())) df2.describe().show() #df2.orderBy(df2["AgeByString"].desc()).show() # COMMAND ---------- df2.printSchema() # COMMAND ---------- df2.select("AgeByFriends").show() # COMMAND ---------- #to convert a cloumn name from string to integer df2.select("AgeByFriends".cast(DoubleType())).show()
from pyspark.sql import SparkSession
from pyspark.sql.functions import format_string
import pyspark.sql.functions as F
import sys

spark = SparkSession.builder.appName("my_pp").getOrCreate()

# 59 total agents
joined_df = spark.read.format('csv').options(
    header='false', inferschema='true').load(sys.argv[1]).select(
        F.col('_c20').alias('agent_name'),
        F.col('_c5').alias('fare_amount'))

res = joined_df.groupBy('agent_name').agg(
    F.sum('fare_amount').alias('total_revenue')).sort('total_revenue',
                                                      ascending=False)
res = res.limit(10)
res = res.select(
    'agent_name',
    F.regexp_replace(F.format_number(F.round(F.col('total_revenue'), 2), 2),
                     ',', '').alias('total_revenue'))

res.select(format_string('%s,%s', res.agent_name,
                         res.total_revenue)).write.save('task4c-sql.out',
                                                        format="text")
spark.stop()
"""## Calculate standard deviation""" customerDF.select(stddev("salary")).show() customerDF.select(stddev("age")).show() """## format number""" from pyspark.sql.functions import format_number salary_std = customerDF.select(stddev("salary").alias('std')) salary_std.show() salary_std.select(format_number('std',2)).show() """## Order By Order By Ascending """ customerDF.orderBy("salary").show() """Order by descending""" customerDF.orderBy(customerDF["salary"].desc()).show() """## drop any row that contains null data""" customerDF.na.drop().show()
#%% Functions
from pyspark.sql.functions import countDistinct, avg, stddev

df.select(countDistinct("Sales")).show()
df.select(countDistinct("Sales").alias("Distinct sales")).show()
df.select(countDistinct("Sales").name("Distinct sales")).show()
df.select(avg("Sales")).show()
df.select(stddev("Sales")).show()

#%%
from pyspark.sql.functions import format_number

sales_stddev = df.select(stddev('Sales').alias('std'))
sales_stddev.show()
sales_stddev.select(format_number('std', 2)).show()

#%% Order By
df.orderBy("Sales").show()
df.orderBy(df["Sales"]).show()
df.orderBy(df["Sales"].desc()).show()

df["Sales"]          # a Column object
df["Sales"].desc()   # a Column sort expression

#%%
#%%
# COMMAND ----------

# DBTITLE 1,Minimum Price of whole data
pdf.select(min(pdf["price"])).show()

# COMMAND ----------

# DBTITLE 1,Average Price of whole data
pdf.select(mean(pdf["price"])).show()

# COMMAND ----------

# DBTITLE 1,Price Per Bedroom
ppbed = pdf.withColumn(
    "Price_Per_Bedroom",
    format_number(((pdf["Price_Sq_Ft"] * pdf["Size"]) / (pdf["Bedrooms"])), 2))
ppbed.show()

# COMMAND ----------

# DBTITLE 1,Price Per Bathroom
ppbath = ppbed.withColumn(
    "Price_Per_Bathroom",
    format_number(
        ((ppbed["Price_Sq_Ft"] * ppbed["Size"]) / (ppbed["Bathrooms"])), 2))
ppbath.show()

# COMMAND ----------
df.groupBy("Company").max().show() df.groupBy("Company").min().show() df.groupBy("Company").sum().show() df.agg({'Sales':'max'}).show() import pyspark.sql.functions as F df.select(F.countDistinct("Sales")).show() df.select(F.avg('Sales')).show() df.select(F.stddev("Sales")).show() sales_std = df.select(F.stddev("Sales").alias('std')) sales_std.select(F.format_number('std',2).alias('std')).show() # OrderBy df.orderBy("Sales").show() df.orderBy(df["Sales"].desc()).show() # In[16]: # Pandas Data Size Limit <2 gigabytes # PySpark Data Size Limit >100 gigabytes # There is no hard limit on the Data size you can process with PySpark, however when your RDD size # exceeds the size of your RAM then the data will be moved to Disk. Even after the data is moved to the Disk, # Spark will be equally capable of processing it. # https://www.kaggle.com/mkechinov/ecommerce-behavior-data-from-multi-category-store import pandas as pd dp = pd.read_csv('D:/#DevCourses-GWU/#5_IoT_BigData/2019-Novs.csv')
# this is a bit of a nicer way to gather the data
group_data = df.groupBy('Company')
group_data.agg({'Sales': 'max'}).show()

# import functions from the pyspark library
from pyspark.sql.functions import countDistinct, avg, stddev

# count the number of distinct sales
df.select(countDistinct('Sales')).show()

# show the average for all sales; you can then change the name with '.alias()' to whatever makes sense to you
df.select(avg('Sales').alias('Average Sales')).show()

# select the standard deviation
df.select(stddev('Sales')).show()

# import the format_number function to make the number a bit neater
from pyspark.sql.functions import format_number

sales_std = df.select(stddev('Sales').alias('std'))
sales_std.show()

# format the number so it only shows 2 decimal places, and also rename the column to "Formatted std"
sales_std.select(format_number('std', 2).alias('Formatted std')).show()

# to sort the df in ascending order
df.orderBy('Sales').show()

# to show the results in descending order
df.orderBy(df['Sales'].desc()).show()
    F.col('_c18').alias('tip_amount'),
    F.col('_c19').alias('tolls_amount'),
    F.col('_c20').alias('total_amount'))

left = joined_df.filter(F.col('pickup_longitude') == 0.0).filter(
    F.col('pickup_latitude') == 0.0).filter(
        F.col('dropoff_longitude') == 0.0).filter(
            F.col('dropoff_latitude') == 0.0).select('medallion').groupBy(
                'medallion').agg(F.count('*').alias('all_zero_coordinates'))

right = joined_df.groupBy('medallion').agg(
    F.count('*').alias('total_taxi_count')).withColumnRenamed(
        'medallion', 'medallion2')

res = left.join(right, left.medallion == right.medallion2).drop('medallion2')
res = res.select(
    'medallion',
    F.regexp_replace(
        F.format_number(
            F.round(
                100 * F.col('all_zero_coordinates') / F.col('total_taxi_count'),
                2), 2), ',', '').alias('percentage_of_trips')).sort('medallion')

res.select(format_string('%s,%s', res.medallion,
                         res.percentage_of_trips)).write.save('task3c-sql.out',
                                                              format="text")
spark.stop()
|avg(Year)|        avg(Close)|
+---------+------------------+
|   2015.0|120.03999980555547|
|   2013.0| 472.6348802857143|
|   2014.0| 295.4023416507935|
|   2012.0| 576.0497195640002|
|   2016.0|104.60400786904763|
|   2010.0| 259.8424600000002|
|   2011.0|364.00432532142867|
+---------+------------------+
'''

result = newdf.groupBy("Year").mean()[['avg(Year)', 'avg(Close)']]
result = result.withColumnRenamed("avg(Year)", "Year")
result.select('Year',
              format_number('avg(Close)', 2).alias("Mean Close")).show()
'''
+------+----------+
|  Year|Mean Close|
+------+----------+
|2015.0|    120.04|
|2013.0|    472.63|
|2014.0|    295.40|
|2012.0|    576.05|
|2016.0|    104.60|
|2010.0|    259.84|
|2011.0|    364.00|
+------+----------+
'''

print("Execution Completed")
df.select(dayofyear(df['Date']).alias('day of year')).show()
df.select(month(df['Date']).alias('month')).show()
df.select(hour(df['Date'])).show()
df.select(year(df['Date']).alias('year')).show()

#%%
df.withColumn("Year", year(df['Date'])).show()

newdf = df.withColumn("Year", year(df["Date"]))
newdf_mean = newdf.groupBy("Year").mean()
newdf_mean.show()

newdf_mean['avg(Close)']            # indexing with a string returns a Column object
# newdf_mean['avg(Close)'].show()   #! TypeError: 'Column' object is not callable
# BUT indexing with a list returns a DataFrame, which does have .show():
newdf_mean[['avg(Close)']].show()
newdf_mean[['Year', 'avg(Year)', 'avg(Close)', 'avg(Open)']].show()

#%% add aliases and format_number
meandf = newdf_mean[['Year', 'avg(Open)', 'avg(Close)', 'avg(Volume)']]
meandf = meandf.select('Year',
                       format_number('avg(Open)', 2).alias('mean open'),
                       format_number('avg(Close)', 2).alias('mean close'),
                       format_number('avg(Volume)', 2).alias('mean volume'))
meandf.show()

#%%
# COMMAND ----------

#dfStart = dfStart.filter(dfStart["Id"] % 2 == 0).withColumn("Total Amount", (dfStart["Friends"] * 10 * dfStart["Age"]) / 12)
#TotalAmountDF = dfStart.select(dfStart["Name"], dfStart["Age"], dfStart["Friends"].alias("Total Friends"), dfStart["Total Amount"])
#TotalAmountDF.show()

# COMMAND ----------

from pyspark.sql.functions import format_number, round, mean, max, min

# COMMAND ----------

dfStart.select(
    "*",
    format_number(dfStart['Friends'] / dfStart['Age'], 2).alias("friend_by_age")).show()

# COMMAND ----------

dfStart.select(min(dfStart["age"])).show()

# COMMAND ----------

dfStart.columns

# COMMAND ----------

dfStart.groupBy("age").max("friends").show()
# MAGIC If you get stuck on this, don't worry, just view the solutions.

# COMMAND ----------

# Uh oh, strings!
df.describe().printSchema()

# COMMAND ----------

from pyspark.sql.functions import format_number

# COMMAND ----------

result = df.describe()
result.select(result['summary'],
              format_number(result['Open'].cast('float'), 2).alias('Open'),
              format_number(result['High'].cast('float'), 2).alias('High'),
              format_number(result['Low'].cast('float'), 2).alias('Low'),
              format_number(result['Close'].cast('float'), 2).alias('Close'),
              result['Volume'].cast('int').alias('Volume')
              ).show()

# COMMAND ----------

# MAGIC %md #### Create a new dataframe with a column called HV Ratio that is the ratio of the High Price versus volume of stock traded for a day.

# COMMAND ----------

df2 = df.withColumn("HV Ratio", df["High"] / df["Volume"])
# df2.show()
df2.select('HV Ratio').show()
# 2) Print out the schema
df.printSchema()

# 3) Print out the first 5 rows
rows = df.head(5)
for row in rows:
    print(row)

# 4) Describe the dataframe
df_describe = df.select(['Open', 'High', 'Low', 'Close', 'Volume']).describe()
df_describe.show()

# 5) Format the describe output to two decimal places
df_describe.printSchema()
df_describe.select(
    format_number(df_describe['Open'].cast("float"), 2).alias('Open'),
    format_number(df_describe['High'].cast("float"), 2).alias('High'),
    format_number(df_describe['Low'].cast("float"), 2).alias('Low'),
    format_number(df_describe['Close'].cast("float"), 2).alias('Close'),
    format_number(df_describe['Volume'].cast("float"), 2).alias('Volume'),
).show()

# 6) Create a new column called "HV Ratio": High Price versus volume of stock traded for a day
df.withColumn('hv_ratio', df['High'] / df['Volume']).show()

# 7) What day had the Peak High in Price?
print(df.orderBy(df["High"].desc()).head(1)[0][0])

# 8) What is the mean of the Close column?
df.agg({"Close": "avg"}).show()
# The following lines are more realistic when manipulating data:
df_comp = df_2.groupby("Company")
df_comp.agg({'Sales': 'mean'}).show()

# The following option can be handier than using dictionaries
from pyspark.sql.functions import countDistinct, avg, stddev, format_number

df_2.select(countDistinct('Company').alias('N. of companies')).show()
df_2.select(avg('Sales')).show()
df_2.select(avg('Sales').alias('Average sales man!')).show()
df_2.select(stddev('Sales').alias('STD man!')).show()

print('testing different formats'.upper())
# First we refer to the column as "std"
df_2_STD = df_2.select(stddev('Sales').alias('std'))
# Then we change the format of std to show only two digits and rename the column to "formatted std"
df_2_STD.select(format_number('std', 2).alias('formatted std')).show()

# Ascending order by a given column
df_2.orderBy('Sales').show()
df_2.orderBy('Company').show()
df_2.orderBy('Person').show()

# Descending order by a given column
df_2.orderBy(df_2['Sales'].desc()).show()

# Missing data
print('Third exercise'.upper())
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('missData').getOrCreate()
df_3 = spark.read.csv('ContainsNull.csv', inferSchema=True, header=True)
df_3.show()
# In[39]:

df.select(avg('char_38').alias('Average_of_char38')).show()

# In[40]:

from pyspark.sql.functions import format_number

# In[41]:

stdd = df.select(stddev('char_38').alias('std'))

# In[42]:

stdd.select(format_number('std', 2).alias('std')).show()

# In[99]:

# missing data

# In[46]:

from pyspark.sql.functions import mean

mean_val = df.select(mean(df['char_38'])).collect()

# In[47]:

df.na.fill(df.select(mean(df['char_38'])).collect()[0][0], ['char_38']).show()

# In[ ]:
print(df.columns)
df.printSchema()
df.show(5)
print(df.head(5))

df.describe().show()

# describe() returns string columns, so cast to float and format to 2 decimals,
# accumulating the changes column by column
ddf = df.describe()
for col in ['Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']:
    ddf = ddf.withColumn(col, ddf[col].cast(FloatType()))
    ddf = ddf.withColumn(col, format_number(col, 2))
ddf.show()

df.withColumn('HR Ratio', df['High'] / df['Volume']).select('HR Ratio').show()

df.orderBy(df['High'].desc()).head(1)[0][0]

df.select(avg('Close')).show()
df.select([max('Volume'), min('Volume')]).show()

print(df.filter(df['Close'] < 60).count())
print(df.filter(df['High'] > 80).count() / df.count() * 100.0)
print(df.corr('High', 'Volume'))
def addUserFeatures(samplesWithMovieFeatures: DataFrame) -> DataFrame:
    """
    Added columns (all are historical-behaviour features computed **before** the timestamp of the current rating):
    1. userPositiveHistory: collect the user's positive ratings (rating > 3.5, i.e. movies the user liked) into a list,
       using a sliding window over the rows preceding the current rating so that no future information leaks in
    2. F.reverse reverses the list from step 1 so that the most recent rating comes first
    3. userRatedMovie[0~4]: the 5 movies the user rated most recently
    4. userRatingCount: the user's total number of ratings
    5. userRatedMovieAvgReleaseYear: average release year of the movies the user has rated
    6. userRatedMovieReleaseYearStddev: unbiased standard deviation of the release years of the movies the user has rated
    7. userAvgRating: the user's average rating
    8. userRatingStddev: unbiased standard deviation of the user's ratings
    9. userGenres: aggregated genres of the movies the user has watched
    10. userGenre[0~4]: genres of the 5 movies the user watched most recently

    Cleanup of columns that carry no useful information:
    1. drop:
       (1) genres: the raw movie-genre column, which has no meaning as a historical-behaviour feature
       (2) userGenres: the time-ordered list of recently watched genres; the first 5 entries have been extracted, so the raw column can be dropped
       (3) userPositiveHistory: the time-ordered rating list; the first 5 entries have been extracted, so the raw column can be dropped
    2. filter: drop each user's very first rating, because it has no preceding history (a cold-start row)

    :param samplesWithMovieFeatures:
    :return: samplesWithUserFeatures
    """
    samplesWithUserFeatures = samplesWithMovieFeatures \
        .withColumn('userPositiveHistory',
                    F.collect_list(F.when(F.col('label') == 1, F.col('movieId')).otherwise(F.lit(None))).over(
                        Window.partitionBy('userId').orderBy(F.col('timestamp')).rowsBetween(-100, -1))) \
        .withColumn('userPositiveHistory', F.reverse(F.col('userPositiveHistory'))) \
        .withColumn('userRatedMovie0', F.col('userPositiveHistory')[0]) \
        .withColumn('userRatedMovie1', F.col('userPositiveHistory')[1]) \
        .withColumn('userRatedMovie2', F.col('userPositiveHistory')[2]) \
        .withColumn('userRatedMovie3', F.col('userPositiveHistory')[3]) \
        .withColumn('userRatedMovie4', F.col('userPositiveHistory')[4]) \
        .withColumn('userRatingCount',
                    F.count(F.lit(1)).over(Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1))) \
        .withColumn('userRatedMovieAvgReleaseYear',
                    F.avg(F.col('releaseYear')).over(Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1))
                    .cast(IntegerType())) \
        .withColumn('userRatedMovieReleaseYearStddev',
                    F.format_number(
                        F.stddev(F.col('releaseYear')).over(Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1)),
                        NUMBER_PRECISION)) \
        .withColumn('userAvgRating',
                    F.format_number(
                        F.avg(F.col('rating')).over(Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1)),
                        NUMBER_PRECISION)) \
        .withColumn("userRatingStddev",
                    F.format_number(
                        F.stddev(F.col("rating")).over(Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1)),
                        NUMBER_PRECISION)) \
        .withColumn("userGenres",
                    F.udf(extractGenresUDF, ArrayType(StringType()))(
                        F.collect_list(F.when(F.col('label') == 1, F.col('genres')).otherwise(F.lit(None))).over(
                            Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1)))) \
        .withColumn("userGenre0", F.col("userGenres")[0]) \
        .withColumn("userGenre1", F.col("userGenres")[1]) \
        .withColumn("userGenre2", F.col("userGenres")[2]) \
        .withColumn("userGenre3", F.col("userGenres")[3]) \
        .withColumn("userGenre4", F.col("userGenres")[4]) \
        .drop("genres", "userGenres", "userPositiveHistory") \
        .filter(F.col("userRatingCount") > 1)

    return samplesWithUserFeatures
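# A minimal, self-contained sketch (toy data, illustrative column names) of the
# "past rows only" window used above: rowsBetween(-100, -1) aggregates over up
# to 100 rows strictly before the current row, so no future ratings leak into
# the features.
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

spark = SparkSession.builder.appName("window_sketch").getOrCreate()
toy = spark.createDataFrame(
    [(1, 10, 4.0), (1, 20, 2.0), (1, 30, 5.0)],
    ["userId", "timestamp", "rating"])

past_window = Window.partitionBy("userId").orderBy("timestamp").rowsBetween(-100, -1)

# The first row per user gets null, because its window frame is empty.
toy.withColumn("userAvgRatingSoFar",
               F.format_number(F.avg("rating").over(past_window), 2)).show()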
df["friends"].alias("TotalFriends"), "TotalAmount") # COMMAND ---------- dfAmountPerMonth.show() # COMMAND ---------- from pyspark.sql.functions import format_number, mean, min, max # COMMAND ---------- dfAmountPerMonth.select( "*", format_number(dfAmountPerMonth["TotalAmount"], 2).alias("upto 2 decimal place")).show() # COMMAND ---------- df.select(mean(df["age"])).show() # COMMAND ---------- (df.filter(df["age"] == 53)).show() # COMMAND ---------- df.groupBy("name").max("Age").show() # COMMAND ----------
df_year.head(1)[0]

# In[71]:

# group by year and select the Year and average Close columns
avg_close_per_year = df_year.groupBy('Year').mean().select(['Year', 'avg(Close)'])

# In[72]:

# rename the columns
avg_close_per_year = avg_close_per_year.withColumnRenamed("avg(Close)", "Average Closing Price")

# In[74]:

# format the numbers
avg_close_per_year.select(['Year',
                           format_number('Average Closing Price', 2)]).show()

# In[77]:

# use alias to rename the column
avg_close_per_year.select(['Year',
                           format_number('Average Closing Price', 2).alias("Avg Close")]).show()
from pyspark.sql import SparkSession
from pyspark.sql.functions import (dayofmonth, hour, dayofyear, month, year,
                                   weekofyear, format_number, date_format)

spark = SparkSession.builder.appName('dates').getOrCreate()
df = spark.read.csv('./data/appl_stock.csv', inferSchema=True, header=True)

df.show()
df.printSchema()
df.select(['Date']).show()
df.select(dayofmonth(df['Date'])).show()
df.select(hour(df['Date'])).show()
df.select(year(df['Date'])).show()

ndf = df.withColumn('Year', year(df['Date']))
ndf.groupBy('Year').mean().select(['Year', 'avg(Close)']).show()

result = ndf.groupBy('Year').mean().select(['Year', 'avg(Close)'])
result.withColumnRenamed('avg(Close)', 'Average Closing Price').select(
    ['Year', format_number('Average Closing Price', 2)]).show()
print('What is the schema?'.upper())
df.printSchema()

print('Print out the first 5 rows'.upper())
df.show(5)

print('OR'.upper())
for i in df.head(5):
    print(i)
    print('\n')

print('Reduce the decimals')
#df.describe()
from pyspark.sql.functions import format_number

df_new = df.select([
    'Date',
    format_number('Open', 2).cast('float').alias('Open'),
    format_number('High', 2).cast('float').alias('High'),
    format_number('Low', 2).cast('float').alias('Low'),
    format_number('Close', 2).cast('float').alias('Close'),
    'Volume',
    format_number('Adj Close', 2).cast('float').alias('Adj Close')
])

print('Create a new data frame with an extra column Volume/High')
df_new = df_new.withColumn('V/H ratio',
                           format_number(df_new['Volume'] / df_new['High'], 2))

print('What day presented the highest peak in High'.upper())
df_ord_H = df_new.orderBy(df_new['High'].desc())
df_ord_H.select('Date').show(1)

print('OR')
df_new.orderBy(df_new['High'].desc()).head(1)[0][0]
                                       dayofyear, month, year, weekofyear,
                                       format_number, date_format)

spark = SparkSession.builder.appName('ops').getOrCreate()
df = spark.read.csv('appl_stock.csv', inferSchema=True, header=True)
# df.select(['Date', 'Open']).show()
# df.take(20)

# Date Formatting
df.select(dayofmonth(df['Date']))
df.select(month(df['Date']))
df.select(year(df['Date']))
df.select(hour(df['Date']))

df_year = df.withColumn('Year', year(df['Date']))
df_year_avg = df_year.groupBy("Year").mean().select(["Year", "avg(Close)"])
result = df_year_avg.withColumnRenamed("avg(Close)", "Average Closing Price")
new = result.select(['Year',
                     format_number('Average Closing Price', 2).alias("Average Close")])
new.show()
df.columns

# What does the schema look like?
df.printSchema()

# Print out the first five rows
df.head(5)

# Use "describe" to learn about the DataFrame
df.describe().show()
summary_df = df.describe()

# Format the numbers to show two decimal places
formatted_summary_df = summary_df.select(
    col('summary'),
    format_number(col('Open').cast('float'), 2).alias('Open'),
    format_number(col('High').cast('float'), 2).alias('High'),
    format_number(col('Low').cast('float'), 2).alias('Low'),
    format_number(col('Close').cast('float'), 2).alias('Close'),
    format_number(col('Volume').cast('float'), 2).alias('Volume'),
    format_number(col('Adj Close').cast('float'), 2).alias('Adj Close'))
formatted_summary_df.show()

# Create a new dataframe with a column called HV Ratio
newdf = df.select(col('High') / col('Volume'))
newdf.show()

# What day had the Peak High in Price?
high_max = df.groupBy().max('High').collect()[0][0]
day = df.filter(col('High') == high_max).collect()[0][0]
print(day)
# Formatting columns
df.select(stddev('Sales')).show()

# In[58]:

from pyspark.sql.functions import format_number

# In[64]:

sales_std = df.select(stddev('Sales').alias('Sales_STD'))
# pass the column name and the number of decimals wanted;
# the generated column name will need an alias once again
sales_std.select(format_number('Sales_STD', 2)).show()

# #### OrderBy

# In[66]:

df.orderBy("Sales").show()

# In[67]:

df.orderBy('Company').show()
# In[92]:

from pyspark.sql.functions import format_number

# In[93]:

# store the description in a variable
walmart_desc = walmart_df.describe()

# In[94]:

# cast the result
walmart_desc.select(
    walmart_desc['summary'],
    format_number(walmart_desc['Open'].cast('float'), 2).alias('Open'),
    format_number(walmart_desc['High'].cast('float'), 2).alias('High'),
    format_number(walmart_desc['Low'].cast('float'), 2).alias('Low'),
    format_number(walmart_desc['Close'].cast('float'), 2).alias('Close'),
    walmart_desc['Volume'].cast('int').alias('Volume')).show()

# #### Create a new dataframe with a column called HV Ratio that is the ratio of the High Price versus volume of stock traded for a day.

# In[95]:

walmart_df_hv_ratio = walmart_df.withColumn(
    'HV Ratio', (walmart_df['High'] / walmart_df['Volume']))
walmart_df_hv_ratio.select('HV Ratio').show()

# #### What day had the Peak High in Price?
spark = SparkSession.builder.appName('aggs').getOrCreate()
df = spark.read.csv('./data/sales_info.csv', inferSchema=True, header=True)

df.printSchema()
df.show()

df.groupBy('Company').mean().show()
df.groupBy('Company').count().show()
df.agg({'Sales': 'sum'}).show()

group_data = df.groupBy('Company')
group_data.agg({'Sales': 'max'}).show()

df.select(countDistinct('Company')).show()
df.select(avg('Sales').alias('Average Sales')).show()
df.select(stddev('Sales').alias('Std Dev')).show()

sales_std = df.select(stddev('Sales').alias('Std Dev'))
sales_std.select(format_number('Std Dev', 2)).show()

df.orderBy('Sales').show()
df.orderBy(df['Sales'].desc()).show()
df_nycflights_max_min_dep_delay.show()

'''Finding group by with multiple columns'''
df_nycflights_grpby = df_nycflights. \
    groupby(['month', 'origin', 'dest']). \
    count(). \
    orderBy(['month', 'count'], ascending=[1, 0])

df_nycflights_grpby.show()

'''Group by, also renaming the columns from avg(column_name) to something proper,'''
'''then using withColumn to do the rounding as well'''
df_nycflights_agg_col_rename = df_nycflights. \
    groupby('month'). \
    agg({'dep_delay': 'avg', 'arr_delay': 'avg'}). \
    withColumnRenamed('avg(arr_delay)', 'mean_arr_delay'). \
    withColumnRenamed('avg(dep_delay)', 'mean_dep_delay'). \
    withColumn('mean_arr_delay', format_number('mean_arr_delay', 1))

df_nycflights_agg_col_rename.show()

'''Group By with pivot'''
df_pivot_carrier = df_nycflights. \
    groupby('month'). \
    pivot('carrier'). \
    count()

df_pivot_carrier.show()
    ('spark.driver.memory', '4g')])

spark = SparkSession.builder.appName("example-spark").config(
    conf=conf).getOrCreate()
sc = spark.sparkContext.getOrCreate()  # note that it is lowercase
sqlContext = SQLContext(sc)

df = spark.read.csv(
    "D:\\Arasan\\Misc\\GitHub\\Spark\\input\\walmart_stock.csv",
    inferSchema=True,
    header=True)

#print(df.head())
#print(df.columns)
#print(df.printSchema())

d_df = df.describe()
#print(d_df.select(d_df['Open']).collect())
d_df.select(
    format_number(d_df['Open'].cast(IntegerType()), 2).alias('Open'),
    format_number(d_df['High'].cast(IntegerType()), 2).alias('High'),
    format_number(d_df['Low'].cast(IntegerType()), 2).alias('Low'),
    format_number(d_df['Close'].cast(IntegerType()), 2).alias('Close'),
    format_number(d_df['Volume'].cast(IntegerType()), 0).alias('Volume'),
    format_number(d_df['Adj Close'].cast('int'), 2).alias('Adj Close'))
#).show()

newdf = df.withColumn('HV Ratio', df['High'] / df['Volume'])
#newdf.show()

#print(df.agg({'High':'max'}).collect()[0][0])
#x = df.agg({'High':'max'}).collect()[0][0]
#df.filter(df['High'] == df.agg({'High':'max'}).collect()[0][0]).select('Date').show()
#print(df.orderBy(df['High'].desc()).head(1)[0][0])
#df.agg({'Close':'mean'}).show()
df1.Volume.cast("Double"), df1["Adj Close"].cast("Double"), ) # COMMAND ---------- from pyspark.sql.functions import col, format_number, when # df1.filter((col("summary") == "mean") | (col("summary") == "stddev")).show() # df1.filter(col("summary").isin(["mean", "stddev"])).with.show() cols = ["Open", "High", "Low", "Close", "Volume", "Adj Close"] for col_val in cols: df2 = df2.withColumn( col_val, when( col("summary").isin("mean", "stddev"), format_number(col_val, 2).cast("Double")).otherwise(df2[col_val])) # COMMAND ---------- df2.show() # COMMAND ---------- # MAGIC %md #### Create a new dataframe with a column called HV Ratio that is the ratio of the High Price versus volume of stock traded for a day. # COMMAND ---------- df2 = df2.withColumn("HV RAtio", df2.High / df2.Volume) # COMMAND ----------