Example #1
    def ageCalculate(self, column, dateFormat, nameColAge):
        """
        This method compute the age of clients based on their born dates.
        :param  column      Name of the column born dates column.
        :param  dateFormat  String format date of the column provided.
        :param  nameColAge  Name of the new column, the new columns is the resulting column of ages.

        """
        # Check if the column argument is a string datatype:
        self.__assertTypeStr(column, "column")

        # Check if the dateFormat argument is a string datatype:
        self.__assertTypeStr(dateFormat, "dateFormat")

        # Assert that the column exists in the dataFrame:
        assert column in self.__df.columns, "Error: Column assigned in column argument does not exist in dataFrame"

        # Output format date
        Format = "yyyy-MM-dd"  # Some SimpleDateFormat string

        exprs = format_number(
            mag(
                months_between(date_format(
                    unix_timestamp(column, dateFormat).cast("timestamp"), Format), current_date()) / 12), 4).alias(
            nameColAge)

        self.__df = self.__df.withColumn(nameColAge, exprs)

        self.__addTransformation()  # checkpoint in case

        return self
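
# A minimal standalone sketch of the same age computation using plain PySpark functions.
# The DataFrame, column name and date format below are illustrative, not taken from the
# original class; months_between(current_date(), birth_date) / 12 gives the age in years.
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, months_between, current_date, format_number

spark = SparkSession.builder.appName("age-sketch").getOrCreate()
people = spark.createDataFrame([("1990-05-17",)], ["born"])

people.withColumn(
    "age",
    format_number(months_between(current_date(), to_date("born", "yyyy-MM-dd")) / 12, 4)
).show()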
df1 = df1.drop(*drop_ColList)
df1 = df1.withColumnRenamed(df1.columns[-1],"Cases")


df2 = US_death

keep_ColList = ['FIPS','Combined_Key','Population',US_case.columns[-1]]
drop_ColList = list(set(US_death.columns)-set(keep_ColList))
df2 = df2.drop(*drop_ColList)
df2 = df2.withColumnRenamed(df2.columns[-1],"Deaths")

#Join the reduced dataframes into a new dataframe called US_stat
US_stat = df1.join(df2,on = ['FIPS','Combined_Key'])
# Add InfectionRate and Fatality columns by computing the infection and death rates
US_stat = US_stat.withColumn('InfectionRate',
                             F.format_number(US_stat.Cases/US_stat.Population,5).cast(DoubleType()))
US_stat = US_stat.withColumn('Fatality',
                             F.format_number(US_stat.Deaths/US_stat.Cases,4).cast(DoubleType()))
US_stat = US_stat.withColumnRenamed('Combined_Key','County')
US_stat = US_stat.fillna(0)
US_stat = US_stat.withColumn("FIPS",US_stat["FIPS"].cast(IntegerType()))
#Reformat FIPS to be the standard format with 5 digits
US_stat = US_stat.withColumn("FIPS",F.format_string("%05d","FIPS"))

# Although rows with corrupted information were removed previously, some rows still
# contain values that do not make sense (e.g. population = 0, deaths > confirmed cases),
# probably due to a faulty data source.
US_stat = US_stat.filter((US_stat.Population != 0) & (US_stat.Cases >= US_stat.Deaths) & (US_stat.Population >= US_stat.Cases))

US_stat.show()
Example #3
df = df.withColumn("Age/NoOfFriends", df["NoOfFriends"] / df["Age"])

# COMMAND ----------

df.describe().show()

# COMMAND ----------

from pyspark.sql.functions import format_number
from pyspark.sql.types import DoubleType
from pyspark.sql.types import IntegerType

# COMMAND ----------

df2 = df.select(
    format_number("Age/NoOfFriends",
                  2).cast(IntegerType()).alias("AgeByFriends"))
df2.describe().show()
#df2.orderBy(df2["AgeByString"].desc()).show()

# COMMAND ----------

df2.printSchema()

# COMMAND ----------

df2.select("AgeByFriends").show()

# COMMAND ----------

# to cast a column from string to double
df2.select(df2["AgeByFriends"].cast(DoubleType())).show()
Example #4
from pyspark.sql import SparkSession
from pyspark.sql.functions import format_string
import pyspark.sql.functions as F
import sys

spark = SparkSession.builder.appName("my_pp").getOrCreate()

# 59 total agents

joined_df = spark.read.format('csv').options(
    header='false', inferschema='true').load(sys.argv[1]).select(
        F.col('_c20').alias('agent_name'),
        F.col('_c5').alias('fare_amount'))

res = joined_df.groupBy('agent_name').agg(
    F.sum('fare_amount').alias('total_revenue')).sort('total_revenue',
                                                      ascending=False)

res = res.limit(10)

res = res.select(
    'agent_name',
    F.regexp_replace(F.format_number(F.round(F.col('total_revenue'), 2), 2),
                     ',', '').alias('total_revenue'))

res.select(format_string('%s,%s', res.agent_name,
                         res.total_revenue)).write.save('task4c-sql.out',
                                                        format="text")

spark.stop()
Example #5
"""## Calculate standard deviation"""

customerDF.select(stddev("salary")).show()

customerDF.select(stddev("age")).show()

"""## format number"""

from pyspark.sql.functions import format_number

salary_std = customerDF.select(stddev("salary").alias('std'))

salary_std.show()

salary_std.select(format_number('std',2)).show()

"""## Order By

Order By Ascending
"""

customerDF.orderBy("salary").show()

"""Order by descending"""

customerDF.orderBy(customerDF["salary"].desc()).show()

"""## drop any row that contains null data"""

customerDF.na.drop().show()
Example #6
#%% Functions
from pyspark.sql.functions import countDistinct, avg, stddev

df.select(countDistinct("Sales")).show()
df.select(countDistinct("Sales").alias("Distinct sales")).show()
df.select(countDistinct("Sales").name("Distinct sales")).show()
df.select(avg("Sales")).show()
df.select(stddev("Sales")).show()

#%%
from pyspark.sql.functions import format_number

sales_stddev = df.select(stddev('Sales').alias('std'))
sales_stddev.show()

sales_stddev.select(format_number('std', 2)).show()

#%% Order By

df.orderBy("Sales").show()
df.orderBy(df["Sales"]).show()
df.orderBy(df["Sales"].desc()).show()
df["Sales"]
df["Sales"].desc()

#%%



#%%
# COMMAND ----------

# DBTITLE 1,Minimum Price of whole data
pdf.select(min(pdf["price"])).show()

# COMMAND ----------

# DBTITLE 1,Average Price of whole data
pdf.select(mean(pdf["price"])).show()

# COMMAND ----------

# DBTITLE 1,Price Per Bedroom
ppbed = pdf.withColumn(
    "Price_Per_Bedroom",
    format_number(((pdf["Price_Sq_Ft"] * pdf["Size"]) / (pdf["Bedrooms"])), 2))

ppbed.show()

# COMMAND ----------

# DBTITLE 1,Price Per Bathroom
ppbath = ppbed.withColumn(
    "Price_Per_Bathroom",
    format_number(
        ((ppbed["Price_Sq_Ft"] * ppbed["Size"]) / (ppbed["Bathrooms"])), 2))

ppbath.show()

# COMMAND ----------
Example #8
df.groupBy("Company").max().show()
df.groupBy("Company").min().show()
df.groupBy("Company").sum().show()
df.agg({'Sales':'max'}).show()


import pyspark.sql.functions as F
df.select(F.countDistinct("Sales")).show()

df.select(F.avg('Sales')).show()

df.select(F.stddev("Sales")).show()


sales_std = df.select(F.stddev("Sales").alias('std'))
sales_std.select(F.format_number('std',2).alias('std')).show()

# OrderBy
df.orderBy("Sales").show()
df.orderBy(df["Sales"].desc()).show()

# In[16]:
# Pandas data size limit: < 2 gigabytes (single machine, in memory)
# PySpark data size limit: > 100 gigabytes
# There is no hard limit on the data size you can process with PySpark; however, when your
# RDD size exceeds the size of your RAM, the data is moved to disk. Even after the data has
# moved to disk, Spark is equally capable of processing it.

# https://www.kaggle.com/mkechinov/ecommerce-behavior-data-from-multi-category-store
import pandas as pd
dp = pd.read_csv('D:/#DevCourses-GWU/#5_IoT_BigData/2019-Novs.csv')
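
# By contrast, a minimal sketch of loading the same CSV with Spark, so processing is not
# bounded by a single machine's RAM (the path is reused from the pandas read above):
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ecommerce").getOrCreate()
ds = spark.read.csv('D:/#DevCourses-GWU/#5_IoT_BigData/2019-Novs.csv',
                    inferSchema=True, header=True)
ds.printSchema()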
Example #9
# this is a bit of a nicer way to gather the data
group_data = df.groupBy('Company')
group_data.agg({'Sales': 'max'}).show()

# import functions from the pyspark library
from pyspark.sql.functions import countDistinct, avg, stddev

# count the number of distinct sales
df.select(countDistinct('Sales')).show()

# show the average of all sales; you can then rename it using '.alias()' to whatever makes sense to you
df.select(avg('Sales').alias('Average Sales')).show()

# select standard deviation
df.select(stddev('Sales')).show()

# import the format_number function to make the output a bit neater
from pyspark.sql.functions import format_number

sales_std = df.select(stddev('Sales').alias('std'))
sales_std.show()

# change the number so it only shows 2 decimal places, and then also change the name to formatted std
sales_std.select(format_number('std', 2).alias('Formatted std')).show()

# sort the df by Sales (ascending)
df.orderBy('Sales').show()

# show the results in descending order
df.orderBy(df['Sales'].desc()).show()
Example #10
    F.col('_c18').alias('tip_amount'),
    F.col('_c19').alias('tolls_amount'),
    F.col('_c20').alias('total_amount'))

left = joined_df.filter(F.col('pickup_longitude') == 0.0).filter(
    F.col('pickup_latitude') == 0.0).filter(
        F.col('dropoff_longitude') == 0.0).filter(
            F.col('dropoff_latitude') == 0.0).select('medallion').groupBy(
                'medallion').agg(F.count('*').alias('all_zero_coordinates'))

right = joined_df.groupBy('medallion').agg(
    F.count('*').alias('total_taxi_count')).withColumnRenamed(
        'medallion', 'medallion2')

res = left.join(right, left.medallion == right.medallion2).drop('medallion2')

res = res.select(
    'medallion',
    F.regexp_replace(
        F.format_number(
            F.round(
                100 * F.col('all_zero_coordinates') /
                F.col('total_taxi_count'), 2), 2), ',',
        '').alias('percentage_of_trips')).sort('medallion')

res.select(format_string('%s,%s', res.medallion,
                         res.percentage_of_trips)).write.save('task3c-sql.out',
                                                              format="text")

spark.stop()
Example #11
|avg(Year)|        avg(Close)|
+---------+------------------+
|   2015.0|120.03999980555547|
|   2013.0| 472.6348802857143|
|   2014.0| 295.4023416507935|
|   2012.0| 576.0497195640002|
|   2016.0|104.60400786904763|
|   2010.0| 259.8424600000002|
|   2011.0|364.00432532142867|
+---------+------------------+
'''

result = newdf.groupBy("Year").mean()[['avg(Year)', 'avg(Close)']]
result = result.withColumnRenamed("avg(Year)", "Year")
result.select('Year',
              format_number('avg(Close)', 2).alias("Mean Close")).show()
'''
+------+----------+
|  Year|Mean Close|
+------+----------+
|2015.0|    120.04|
|2013.0|    472.63|
|2014.0|    295.40|
|2012.0|    576.05|
|2016.0|    104.60|
|2010.0|    259.84|
|2011.0|    364.00|
+------+----------+
'''

print("Execution Completed")
Example #12
df.select(dayofyear(df['Date']).alias('day of year')).show()
df.select(month(df['Date']).alias('month')).show()

df.select(hour(df['Date'])).show()

df.select(year(df['Date']).alias('year')).show()

#%%
df.withColumn("Year", year(df['Date'])).show()

newdf = df.withColumn("Year", year(df["Date"]))
newdf_mean = newdf.groupBy("Year").mean()
newdf_mean.show()

newdf_mean['avg(Close)']
newdf_mean['avg(Close)'].show()   #! TypeError: 'Column' object is not callable
# BUT
newdf_mean[['avg(Close)']].show()
newdf_mean[['Year', 'avg(Year)','avg(Close)', 'avg(Open)']].show()

#%% add aliases and format_number

meandf = newdf_mean[['Year', 'avg(Open)', 'avg(Close)', 'avg(Volume)']]
meandf = meandf.select('Year', \
                       format_number('avg(Open)', 2).alias('mean open'), \
                       format_number('avg(Close)', 2).alias('mean close'), \
                       format_number('avg(Volume)', 2).alias('mean volume'), \
                       )
meandf.show()

#%%
# COMMAND ----------

#dfStart = dfStart.filter(dfStart["Id"]%2 == 0).withColumn( "Total Amount", ( dfStart["Friends"]*10*dfStart["Age"] )/12 )

#TotalAmountDF = dfStart.select(dfStart["Name"],dfStart["Age"],dfStart["Friends"].alias("Total Friends"),dfStart["Total Amount"] )

#TotalAmountDF.show()

# COMMAND ----------

from pyspark.sql.functions import format_number, round, mean, max, min

# COMMAND ----------

dfStart.select(
    "*",
    format_number(dfStart['Friends'] / dfStart['Age'],
                  2).alias("friend_by_age")).show()

# COMMAND ----------

dfStart.select(min(dfStart["age"])).show()

# COMMAND ----------

dfStart.columns

# COMMAND ----------

dfStart.groupBy("age").max("friends").show()
# MAGIC If you get stuck on this, don't worry, just view the solutions.

# COMMAND ----------

# Uh oh Strings! 
df.describe().printSchema()

# COMMAND ----------

from pyspark.sql.functions import format_number

# COMMAND ----------

result = df.describe()
result.select(result['summary'],
              format_number(result['Open'].cast('float'),2).alias('Open'),
              format_number(result['High'].cast('float'),2).alias('High'),
              format_number(result['Low'].cast('float'),2).alias('Low'),
              format_number(result['Close'].cast('float'),2).alias('Close'),
              result['Volume'].cast('int').alias('Volume')
             ).show()

# COMMAND ----------

# MAGIC %md #### Create a new dataframe with a column called HV Ratio that is the ratio of the High Price versus volume of stock traded for a day.

# COMMAND ----------

df2 = df.withColumn("HV Ratio",df["High"]/df["Volume"])#.show()
# df2.show()
df2.select('HV Ratio').show()
# 2) Print out the schema
df.printSchema()

# 3) Print out the first 5 rows
rows = df.head(5)
for row in rows:
    print(row)

# 4) Describe the dataframe
df_describe = df.select(['Open', 'High', 'Low', 'Close', 'Volume']).describe()
df_describe.show()

# 5) Format describe output to two decimal places
df_describe.printSchema()
df_describe.select(
    format_number(df_describe['Open'].cast("float"), 2).alias('Open'),
    format_number(df_describe['High'].cast("float"), 2).alias('High'),
    format_number(df_describe['Low'].cast("float"), 2).alias('Low'),
    format_number(df_describe['Close'].cast("float"), 2).alias('Close'),
    format_number(df_describe['Volume'].cast("float"), 2).alias('Volume'),
).show()

# 6) Create new column called "HV Ratio": High Price versus volume of stock traded for a day
df.withColumn('hv_ratio', df['High'] / df['Volume']).show()

# 7) What day had the Peak High in Price?
print(df.orderBy(df["High"].desc()).head(1)[0][0])

# 8) What is the mean of the Close column?
df.agg({"Close": "avg"}).show()
# The following lines are a more realistic way of manipulating data:
df_comp = df_2.groupby("Company")
df_comp.agg({'Sales':'mean'}).show()

# The following option can be handier than using dictionaries
from pyspark.sql.functions import countDistinct,avg,stddev,format_number
df_2.select(countDistinct('Company').alias('N. of companies')).show()

df_2.select(avg('Sales')).show()
df_2.select(avg('Sales').alias('Average sales man!')).show()
df_2.select(stddev('Sales').alias('STD man!')).show()
print('testing different formats'.upper())
# First we refer to the column as "std"
df_2_STD = df_2.select(stddev('Sales').alias('std'))
# Then, we change the format of std to show only two digits and rename the column to "formatted std"
df_2_STD.select(format_number('std',2).alias('formatted std')).show()

# Ascending order by a given column
df_2.orderBy('Sales').show()
df_2.orderBy('Company').show()
df_2.orderBy('Person').show()

# Descending order by a given column
df_2.orderBy(df_2['Sales'].desc()).show()

# Missing data
print('Third exercise'.upper())
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('missData').getOrCreate()
df_3 = spark.read.csv('ContainsNull.csv',inferSchema=True, header=True)
df_3.show()
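
# The rest of this exercise is not shown here; a sketch of the kind of missing-data
# handling it typically covers (df_3 is the DataFrame read above; the 'Name' column
# is an assumption about ContainsNull.csv):
df_3.na.drop().show()                            # drop rows containing any null
df_3.na.drop(thresh=2).show()                    # keep rows with at least 2 non-null values
df_3.na.fill(0).show()                           # fill numeric nulls with 0
df_3.na.fill('No Name', subset=['Name']).show()  # fill nulls in a specific column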
# In[39]:

df.select(avg('char_38').alias('Average_of_char38')).show()

# In[40]:

from pyspark.sql.functions import format_number

# In[41]:

stdd = df.select(stddev('char_38').alias('std'))

# In[42]:

stdd.select(format_number('std', 2).alias('std')).show()

# In[99]:

#missing data

# In[46]:

from pyspark.sql.functions import mean
mean_val = df.select(mean(df['char_38'])).collect()

# In[47]:

df.na.fill(mean_val[0][0], ['char_38']).show()

# In[ ]:
print(df.columns)

df.printSchema()

df.show(5)

print(df.head(5))

df.describe().show()

ddf = df.describe()

dff = ddf
for col in ['Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']:
    dff = dff.withColumn(col, dff[col].cast(FloatType()))
    dff = dff.withColumn(col, format_number(col, 2))
dff.show()

df.withColumn('HR Ratio', df['High'] / df['Volume']).select('HR Ratio').show()

df.orderBy(df['High'].desc()).head(1)[0][0]

df.select(avg('Close')).show()

df.select([max('Volume'), min('Volume')]).show()

print(df.filter(df['Close'] < 60).count())

print(df.filter(df['High'] > 80).count() / df.count() * 100.0)

print(df.corr('High', 'Volume'))
def addUserFeatures(samplesWithMovieFeatures: DataFrame) -> DataFrame:
    """
    Added columns explained:
    --- All of the following are historical-behaviour features computed **before the time of the current rating** ---
    1. userPositiveHistory: collect the user's positive ratings into a list; a positive rating is defined as a score > 3.5 (taken to mean the user likes the movie). A sliding window collects only history prior to the time of the current rating, so no future information is gathered.
    2. F.reverse reverses the sequence from step 1, so the most recent rating comes first.
    3. userRatedMovie[0~4]: the 5 movies the user rated most recently.
    4. userRatingCount:                  total number of ratings by the user.
    5. userRatedMovieAvgReleaseYear:     average release year of the movies the user has rated.
    6. userRatedMovieReleaseYearStddev:  unbiased standard deviation of the release years of the movies the user has rated.
    7. userAvgRating:                    the user's average rating.
    8. userRatingStddev:                 unbiased standard deviation of the user's ratings.
    9. userGenres:                       aggregated genres of the movies the user has watched.
    10. userGenre[0~4]:                  genres of the user's 5 most recently watched movies.
    --- The following clean up DataFrame columns that are no longer useful ---
    1. drop:
        ① genres:                        the raw movie genres carry no meaning as historical-behaviour features, so drop them.
        ② userGenres:                    the collected, time-ordered list of recently watched genres; the first 5 have been extracted, so the raw column can be dropped.
        ③ userPositiveHistory:           the collected, time-ordered rating sequence; the first 5 have been extracted, so the raw column can be dropped.
    2. filter:
        do not keep each user's very first rating, because there is no history before it (cold-start portion).
    :param samplesWithMovieFeatures
    :return: samplesWithUserFeatures
    """
    samplesWithUserFeatures = samplesWithMovieFeatures \
        .withColumn('userPositiveHistory',
                    F.collect_list(F.when(F.col('label') == 1, F.col('movieId')).otherwise(F.lit(None))).over(
                        Window.partitionBy('userId').orderBy(F.col('timestamp')).rowsBetween(-100, -1)
                    )) \
        .withColumn('userPositiveHistory', F.reverse(F.col('userPositiveHistory'))) \
        .withColumn('userRatedMovie0', F.col('userPositiveHistory')[0]) \
        .withColumn('userRatedMovie1', F.col('userPositiveHistory')[1]) \
        .withColumn('userRatedMovie2', F.col('userPositiveHistory')[2]) \
        .withColumn('userRatedMovie3', F.col('userPositiveHistory')[3]) \
        .withColumn('userRatedMovie4', F.col('userPositiveHistory')[4]) \
        .withColumn('userRatingCount',
                    F.count(F.lit(1)).over(Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1))
                    ) \
        .withColumn('userRatedMovieAvgReleaseYear',
                    F.avg(F.col('releaseYear')).over(Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1))
                    .cast(IntegerType())) \
        .withColumn('userRatedMovieReleaseYearStddev', F.format_number(
                    F.stddev(F.col('releaseYear')).over(Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1)),
                    NUMBER_PRECISION)) \
        .withColumn('userAvgRating', F.format_number(
                    F.avg(F.col('rating')).over(Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1)),
                    NUMBER_PRECISION)) \
        .withColumn("userRatingStddev", F.format_number(
                    F.stddev(F.col("rating")).over(Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1)),
                    NUMBER_PRECISION)) \
        .withColumn("userGenres", F.udf(extractGenresUDF, ArrayType(StringType()))(
                    F.collect_list(F.when(F.col('label') == 1, F.col('genres')).otherwise(F.lit(None))).over(
                        Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1))
                    )) \
        .withColumn("userGenre0", F.col("userGenres")[0]) \
        .withColumn("userGenre1", F.col("userGenres")[1]) \
        .withColumn("userGenre2", F.col("userGenres")[2]) \
        .withColumn("userGenre3", F.col("userGenres")[3]) \
        .withColumn("userGenre4", F.col("userGenres")[4]) \
        .drop("genres", "userGenres", "userPositiveHistory") \
        .filter(F.col("userRatingCount") > 1)

    return samplesWithUserFeatures
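
# A minimal, self-contained sketch (hypothetical data, not the author's pipeline) of the
# sliding-window pattern used above: each row aggregates only *earlier* rows for the same
# user (rowsBetween(-100, -1)), so no future information leaks into the features.
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

spark = SparkSession.builder.appName("window-sketch").getOrCreate()
ratings = spark.createDataFrame(
    [(1, 10, 4.0, 100), (1, 11, 2.0, 200), (1, 12, 5.0, 300)],
    ["userId", "movieId", "rating", "timestamp"])

past = Window.partitionBy("userId").orderBy("timestamp").rowsBetween(-100, -1)
ratings.withColumn("userAvgRatingSoFar", F.avg("rating").over(past)).show()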
                                     df["friends"].alias("TotalFriends"),
                                     "TotalAmount")

# COMMAND ----------

dfAmountPerMonth.show()

# COMMAND ----------

from pyspark.sql.functions import format_number, mean, min, max

# COMMAND ----------

dfAmountPerMonth.select(
    "*",
    format_number(dfAmountPerMonth["TotalAmount"],
                  2).alias("up to 2 decimal places")).show()

# COMMAND ----------

df.select(mean(df["age"])).show()

# COMMAND ----------

(df.filter(df["age"] == 53)).show()

# COMMAND ----------

df.groupBy("name").max("Age").show()

# COMMAND ----------
Example #21
df_year.head(1)[0]


# In[71]:


# group by and select two columns year and Close
avg_close_per_year = df_year.groupBy('Year').mean().select(['Year', 'avg(Close)'])


# In[72]:


# rename the columns
avg_close_per_year = avg_close_per_year.withColumnRenamed("avg(Close)", "Average Closing Price")


# In[74]:


# format the numbers
avg_close_per_year.select(['Year', format_number('Average Closing Price', 2)]).show()


# In[77]:


# use alias to rename the column
avg_close_per_year.select(['Year', format_number('Average Closing Price', 2).alias("Avg Close")]).show()

Example #22
from pyspark.sql import SparkSession
from pyspark.sql.functions import (dayofmonth, hour, dayofyear, month, year,
                                   weekofyear, format_number, date_format)

spark = SparkSession.builder.appName('dates').getOrCreate()

df = spark.read.csv('./data/appl_stock.csv', inferSchema=True, header=True)

df.show()

df.printSchema()

df.select(['Date']).show()

df.select(dayofmonth(df['Date'])).show()

df.select(hour(df['Date'])).show()

df.select(year(df['Date'])).show()

ndf = df.withColumn('Year', year(df['Date']))
ndf.groupBy('Year').mean().select(['Year', 'avg(Close)']).show()

result = ndf.groupBy('Year').mean().select(['Year', 'avg(Close)'])

result.withColumnRenamed('avg(Close)', 'Average Closing Price').select(
    ['Year', format_number('Average Closing Price', 2)]).show()
Example #23
print('What is the schema?'.upper())
df.printSchema()

print('Print out the first 5 rows'.upper())
df.show(5)
print('OR'.upper())
for i in df.head(5):
    print(i)
    print('\n')

print('Reduce the decimals')
#df.describe()
from pyspark.sql.functions import format_number
df_new = df.select([
    'Date',
    format_number('Open', 2).cast('float').alias('Open'),
    format_number('High', 2).cast('float').alias('High'),
    format_number('Low', 2).cast('float').alias('Low'),
    format_number('Close', 2).cast('float').alias('Close'), 'Volume',
    format_number('Adj Close', 2).cast('float').alias('Adj Close')
])

print('Create new data frame with an extra column Volume/High')
df_new = df_new.withColumn('V/H ratio',
                           format_number(df_new['Volume'] / df_new['High'], 2))

print('What day presented the highest peak in High'.upper())
df_ord_H = df_new.orderBy(df_new['High'].desc())
df_ord_H.select('Date').show(1)
print('OR')
df_new.orderBy(df_new['High'].desc()).head(1)[0][0]
                                   dayofyear, month,
                                   year, weekofyear,
                                   format_number, date_format)

spark = SparkSession.builder.appName('ops').getOrCreate()

df = spark.read.csv('appl_stock.csv', inferSchema=True, header=True)


#df.select(['Date', 'Open']).show()
#df.take(20)

# Date Formatting

df.select(dayofmonth(df['Date']))
df.select(month(df['Date']))
df.select(year(df['Date']))
df.select(hour(df['Date']))


df_year = df.withColumn('Year', year(df['Date']))
df_year_avg = df_year.groupBy("Year").mean().select(["Year", "avg(Close)"])
result = df_year_avg.withColumnRenamed("avg(Close)", "Average Closing Price")
new = result.select(['Year', format_number('Average Closing Price', 2).alias("Average Close")])
new.show()






df.columns

# What does the schema look like?
df.printSchema()

# Print out the first five rows
df.head(5)

# Use "describe" to learn about the DataFrame
df.describe().show()
summary_df = df.describe()

# Format the numbers to show two decimal places
formatted_summary_df = summary_df.select(
    col('summary'),
    format_number(col('Open').cast('float'), 2).alias('Open'),
    format_number(col('High').cast('float'), 2).alias('High'),
    format_number(col('Low').cast('float'), 2).alias('Low'),
    format_number(col('Close').cast('float'), 2).alias('Close'),
    format_number(col('Volume').cast('float'), 2).alias('Volume'),
    format_number(col('Adj Close').cast('float'), 2).alias('Adj Close'))
formatted_summary_df.show()

# Create a new dataframe with a column called HV Ratio
newdf = df.select(col('High') / col('Volume'))
newdf.show()

# What day had the Peak High in Price?
high_max = df.groupBy().max('High').collect()[0][0]
day = df.filter(col('High') == high_max).collect()[0][0]
print(day)
Example #26
# Formatting columns
df.select(stddev('Sales')).show()


# In[58]:


from pyspark.sql.functions import format_number


# In[64]:


sales_std = df.select(stddev('Sales').alias('Sales_STD'))
sales_std.select(format_number('Sales_STD', 2)).show()  # pass the column name and the number of decimals wanted
# will have to fix alias once again
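# e.g. (one possible fix, not in the original notebook): alias the formatted column as well
sales_std.select(format_number('Sales_STD', 2).alias('Sales_STD')).show()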


# #### OrderBy

# In[66]:


df.orderBy("Sales").show()


# In[67]:


df.orderBy('Company').show()
# In[92]:

from pyspark.sql.functions import format_number

# In[93]:

# store the description in a var
walmart_desc = walmart_df.describe()

# In[94]:

# cast the result
walmart_desc.select(
    walmart_desc['summary'],
    format_number(walmart_desc['Open'].cast('float'), 2).alias('Open'),
    format_number(walmart_desc['High'].cast('float'), 2).alias('High'),
    format_number(walmart_desc['Low'].cast('float'), 2).alias('Low'),
    format_number(walmart_desc['Close'].cast('float'), 2).alias('Close'),
    walmart_desc['Volume'].cast('int').alias('Volume')).show()

# #### Create a new dataframe with a column called HV Ratio that is the ratio of the High Price versus volume of stock traded for a day.

# In[95]:

walmart_df_hv_ratio = walmart_df.withColumn(
    'HV Ratio', (walmart_df['High'] / walmart_df['Volume']))
walmart_df_hv_ratio.select('HV Ratio').show()

# #### What day had the Peak High in Price?
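
# The answer cell is not shown above; a sketch of one way to get it,
# reusing walmart_df from the cells above:
walmart_df.orderBy(walmart_df['High'].desc()).select('Date').show(1)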
spark = SparkSession.builder.appName('aggs').getOrCreate()

df = spark.read.csv('./data/sales_info.csv', inferSchema=True, header=True)

df.printSchema()

df.show()

df.groupBy('Company').mean().show()

df.groupBy('Company').count().show()

df.agg({'Sales': 'sum'}).show()

group_data = df.groupBy('Company')
group_data.agg({'Sales': 'max'}).show()

df.select(countDistinct('Company')).show()

df.select(avg('Sales').alias('Average Sales')).show()

df.select(stddev('Sales').alias('Std Dev')).show()

sales_std = df.select(stddev('Sales').alias('Std Dev'))
sales_std.select(format_number('Std Dev', 2)).show()

df.orderBy('Sales').show()

df.orderBy(df['Sales'].desc()).show()
df_nycflights_max_min_dep_delay.show()

'''Finding groupby with multiple columns'''
df_nycflights_grpby = df_nycflights. \
                      groupby(['month','origin','dest']). \
                      count(). \
                      orderBy(['month','count'],
                              ascending= [1,0])

df_nycflights_grpby.show()

'''Group by, also renaming columns from avg(column_name) to something proper,'''
'''then using withColumn we format the numbers as well'''
df_nycflights_agg_col_rename = df_nycflights. \
                               groupby('month'). \
                               agg({'dep_delay': 'avg' , 'arr_delay': 'avg'}). \
                               withColumnRenamed('avg(arr_delay)','mean_arr_delay'). \
                               withColumnRenamed('avg(dep_delay)','mean_dep_delay'). \
                               withColumn('mean_arr_delay',format_number('mean_arr_delay',1))


df_nycflights_agg_col_rename.show()

'''Group By with pivot'''
df_pivot_carrier = df_nycflights. \
                   groupby('month'). \
                   pivot('carrier'). \
                   count()

df_pivot_carrier.show()                       
Example #30
                           ('spark.driver.memory', '4g')])
spark = SparkSession.builder.appName("example-spark").config(
    conf=conf).getOrCreate()
sc = spark.sparkContext.getOrCreate()  # note that sparkContext is lowercase
sqlContext = SQLContext(sc)
df = spark.read.csv(
    "D:\\Arasan\\Misc\\GitHub\\Spark\\input\\walmart_stock.csv",
    inferSchema=True,
    header=True)
#print(df.head())
#print(df.columns)
#print(df.printSchema())
d_df = df.describe()
#print(d_df.select(d_df['Open']).collect())
d_df.select(
    format_number(d_df['Open'].cast(IntegerType()), 2).alias('Open'),
    format_number(d_df['High'].cast(IntegerType()), 2).alias('High'),
    format_number(d_df['Low'].cast(IntegerType()), 2).alias('Low'),
    format_number(d_df['Close'].cast(IntegerType()), 2).alias('Close'),
    format_number(d_df['Volume'].cast(IntegerType()), 0).alias('Volume'),
    format_number(d_df['Adj Close'].cast('int'), 2).alias('Adj Close'))
#).show()

newdf = df.withColumn('HV Ratio', df['High'] / df['Volume'])
#newdf.show()

#print(df.agg({'High':'max'}).collect()[0][0])
#x = df.agg({'High':'max'}).collect()[0][0]
#df.filter(df['High'] == df.agg({'High':'max'}).collect()[0][0]).select('Date').show()
#print(df.orderBy(df['High'].desc()).head(1)[0][0])
#df.agg({'Close':'mean'}).show()
    df1.Volume.cast("Double"),
    df1["Adj Close"].cast("Double"),
)

# COMMAND ----------

from pyspark.sql.functions import col, format_number, when
# df1.filter((col("summary") == "mean") | (col("summary") == "stddev")).show()
# df1.filter(col("summary").isin(["mean", "stddev"])).with.show()
cols = ["Open", "High", "Low", "Close", "Volume", "Adj Close"]
for col_val in cols:
    df2 = df2.withColumn(
        col_val,
        when(
            col("summary").isin("mean", "stddev"),
            format_number(col_val, 2).cast("Double")).otherwise(df2[col_val]))

# COMMAND ----------

df2.show()

# COMMAND ----------

# MAGIC %md #### Create a new dataframe with a column called HV Ratio that is the ratio of the High Price versus volume of stock traded for a day.

# COMMAND ----------

df2 = df2.withColumn("HV RAtio", df2.High / df2.Volume)

# COMMAND ----------