from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType

spark = SparkSession.builder.appName("TotalSpentByCustomer").master(
    "local[*]").getOrCreate()

# Create a schema for reading customer-orders
customerOrderSchema = StructType([
    StructField("cust_id", IntegerType(), True),
    StructField("item_id", IntegerType(), True),
    StructField("amount_spent", FloatType(), True)
])

# Load the data into a Spark DataFrame
customersDF = spark.read.schema(customerOrderSchema).csv(
    "./data/customer-orders.csv")

totalByCustomer = customersDF \
    .groupBy("cust_id") \
    .agg(func.round(func.sum("amount_spent"), 2).alias("total_spent"))

totalByCustomerSorted = totalByCustomer.sort("total_spent")

totalByCustomerSorted.show(totalByCustomerSorted.count())

spark.stop()
from pyspark.sql import SparkSession from pyspark.sql import functions as func spark = SparkSession.builder.appName("SparkSQL").getOrCreate() people = (spark.read.option("header", "true").option( "inferSchema", "true").csv("./fakefriends-header.csv")) # now the schema is # root # |-- userID: integer (nullable = true) # |-- name: string (nullable = true) # |-- age: integer (nullable = true) # |-- friends: integer (nullable = true) results = people.select( "age", "friends").groupBy("age").avg("friends").sort("age").show() # use alias and 2 decimal places results = (people.select("age", "friends").groupBy("age").agg( func.round(func.avg("friends"), 2).alias("friends_avg")).sort("age").show()) spark.stop()
import sys

from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql import functions as func
from pyspark.sql.functions import format_string, from_unixtime, unix_timestamp
from pyspark.sql.types import StructType, StructField, StringType

sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)
spark = SparkSession.builder.getOrCreate()

all_trips = StructType([
    StructField("medallion", StringType(), True),
    StructField("hack_license", StringType(), True),
    StructField("vendor_id", StringType(), True),
    StructField("pickup_datetime", StringType(), True),
    StructField("rate_code", StringType(), True),
    StructField("store_and_fwd_flag", StringType(), True),
    StructField("drop_datetime", StringType(), True),
    StructField("passenger_count", StringType(), True),
    StructField("trip_time_in_secs", StringType(), True),
    StructField("trip_distance", StringType(), True),
    StructField("pickup_longitude", StringType(), True),
    StructField("pickup_latitude", StringType(), True),
    StructField("dropoff_longitude", StringType(), True),
    StructField("dropoff_latitude", StringType(), True),
    StructField("payment_type", StringType(), True),
    StructField("fare_amount", StringType(), True),
    StructField("surcharge", StringType(), True),
    StructField("mta_tax", StringType(), True),
    StructField("tip_amount", StringType(), True),
    StructField("tolls_amount", StringType(), True),
    StructField("total_amount", StringType(), True)])

allTrips = spark.read.format('csv').schema(all_trips).options(
    header='false', inferschema='true').load(sys.argv[1])
allTrips.createOrReplaceTempView("allTrips")

df = spark.sql(
    "select date(pickup_datetime) as date, "
    "round(sum(fare_amount + surcharge + tip_amount), 2) as total_revenue, "
    "round(sum(tolls_amount), 2) as total_tolls "
    "from allTrips group by date(pickup_datetime) order by date(pickup_datetime) asc")

df.select(format_string('%s,%s,%s',
                        from_unixtime(unix_timestamp(df.date, "yyyy-MM-dd"), 'yyyy-MM-dd'),
                        func.round(df.total_revenue, 2),
                        func.round(df.total_tolls, 2))) \
    .write.save("task2c-sql.out", format="text")
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, FloatType, IntegerType

spark = SparkSession.builder.appName("customerOrder").getOrCreate()

schema = StructType([StructField("ID", IntegerType(), True),
                     StructField("ITEM_ID", IntegerType(), True),
                     StructField("PRICE", FloatType(), True)])

df = spark.read.schema(schema).csv("file:///ApacheSparkCourse/customer-orders.csv")

customerBuy = df.select("ID", "PRICE")
customerBuy = customerBuy.groupBy("ID").agg(func.round(func.sum("PRICE"), 2).alias("TOTAL"))
customerBuy = customerBuy.sort("TOTAL")
customerBuy.show(customerBuy.count())
# ## Exercises

# (1) Extract the hour of day and day of week from `rides.date_time`.

from pyspark.sql.functions import hour, dayofweek

rides \
    .withColumn("hour_of_day", hour("date_time")) \
    .withColumn("day_of_week", dayofweek("date_time")) \
    .select("date_time", "hour_of_day", "day_of_week") \
    .show(5)

# (2) Convert `rides.duration` from seconds to minutes.

from pyspark.sql.functions import col, round

rides \
    .withColumn("duration_in_minutes", round(col("duration") / 60, 1)) \
    .select("duration", "duration_in_minutes") \
    .show(5)

# (3) Convert `rides.cancelled` to a Boolean column.

# Using the `cast` method:

rides \
    .withColumn("cancelled", col("cancelled").cast("boolean")) \
    .select("cancelled") \
    .show(5)

# Using a Boolean expression:

rides \
    .withColumn("cancelled", col("cancelled") == 1) \
    .select("cancelled") \
    .show(5)
# The keyword arguments below belong to a CrossValidator; the constructor call
# was cut off in the original snippet and is reconstructed here from context.
from pyspark.ml.tuning import CrossValidator
from pyspark.sql import functions as F

cv = CrossValidator(
    estimator=als,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    numFolds=5
)

model = cv.fit(train)
predictions = model.transform(test)
evaluator.evaluate(predictions)
display(predictions)

predictions = predictions.withColumn("prediction", F.abs(F.round(predictions["prediction"], 0)))
display(predictions)

userRecommendations = model.bestModel.recommendForAllUsers(10)
display(userRecommendations)

itemRecommendations = model.bestModel.recommendForAllItems(10)
display(itemRecommendations)

display(userpayment)
display(chefmozaccepts)

chefmozaccepts = chefmozaccepts.withColumnRenamed("Rpayment", "Upayment")
display(chefmozaccepts)
avgSalaryDF.show()

# COMMAND ----------

# MAGIC %md
# MAGIC Convert that value to an integer using the `round()` function. See
# MAGIC <a href="https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.functions$" class="text-info">the documentation for <tt>round()</tt></a>
# MAGIC for more details.

# COMMAND ----------

from pyspark.sql.functions import round

roundedAvgSalaryDF = avgSalaryDF.select(
    round("averageSalary").alias("roundedAverageSalary"))

roundedAvgSalaryDF.show()

# COMMAND ----------

# MAGIC %md
# MAGIC In addition to the average salary, what are the maximum and minimum salaries?

# COMMAND ----------

from pyspark.sql.functions import min, max

salaryDF = peopleDF.select(
    max("salary").alias("max"),
    min("salary").alias("min"),
# COMMAND ----------

days_back = 14
values_per_second = 337
nowTimestamp = time.time()

# COMMAND ----------

dfTimeSeries = sqlContext.range(0, days_back * 24 * 60 * 60 * values_per_second) \
    .withColumn("Timestamp", (nowTimestamp - (F.col("id") / values_per_second)).cast("Timestamp")) \
    .drop("id") \
    .withColumn("Sensor", F.concat_ws('-',
                                      1 + (F.rand() * 10).cast("Int"),
                                      1 + (F.rand() * 100).cast("Int"),
                                      1 + (F.rand() * 350).cast("Int"))) \
    .withColumn("Value", F.round(F.rand() * 100, 3)) \
    .withColumn("year", F.year("Timestamp")) \
    .withColumn("month", F.month("Timestamp")) \
    .withColumn("day", F.dayofmonth("Timestamp"))

display(dfTimeSeries)

# COMMAND ----------

spark.conf.set("fs.azure.account.key.<StorageAccountName>.blob.core.windows.net",
               "<StorageAccountKey>")

dfTimeSeries.write \
    .mode("overwrite") \
    .partitionBy("year", "month", "day") \
    .csv("wasbs://<StorageContainer>@<StorageAccountName>.blob.core.windows.net/timeseries")
from pyspark.sql import DataFrame
from pyspark.sql import functions as f


def calculate_percentage(df: DataFrame, col_name: str):
    return df.groupBy(col_name).agg(f.round(f.count(col_name) * 100 / df.count(), 1).alias('Percentage')) \
        .orderBy('Percentage', ascending=False)
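# Hedged usage sketch (not from the original source): "survey" and its
# "Hobbyist" column are made-up names, and an existing SparkSession named
# `spark` is assumed. calculate_percentage returns one row per distinct value
# with its share of all rows, rounded to one decimal place, sorted descending.
survey = spark.createDataFrame([("Yes",), ("Yes",), ("No",), ("Yes",)], ["Hobbyist"])
calculate_percentage(survey, "Hobbyist").show()   # Yes -> 75.0, No -> 25.0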
spark = SparkSession.builder \
    .appName('first_spark_application') \
    .getOrCreate()

cars = spark.read.csv('/Users/wel51x/Box Sync/MyBox/Code/DataCamp/data/cars.csv',
                      sep=',',
                      header=True,
                      inferSchema=True,
                      nullValue='NA')

cars = cars.dropna()

# Get number of records
print("The data contains %d records." % cars.count(), '\n')

cars = cars.withColumnRenamed("ncyl", "cyl")

cars = cars.withColumn('length_meters', round(cars.length * 0.0254, 3))
cars = cars.withColumn('weight_kg', round(cars.weight / 2.205, 0))
cars = cars.withColumn('avg_mpg', round((cars.city_mpg + cars.hwy_mpg) / 2, 1)) \
    .drop("city_mpg", "hwy_mpg")
cars = cars.withColumn('consumption', round((100 * 3.785411784) / (cars.avg_mpg * 1.609344), 2))

pd.set_option('display.max_columns', None)  # all cols
pd.set_option('display.width', 161)
pd.set_option('display.max_colwidth', 199)
#print(cars.toPandas().sample(8), '\n')

indexer = StringIndexer(inputCol='type', outputCol='type_idx')
def cal_indexs(self):
    portfolio_acret = self.portfolio_acret
    risk_free_rateline = self.risk_free_rateline

    basedata = portfolio_acret.join(risk_free_rateline, 'trading_date') \
        .drop(risk_free_rateline.trading_date)  # .dropDuplicates(['trading_date', 'com_id'])

    basedata = basedata.withColumn('order_nm_desc', F.row_number().over(self.__constants.w_h_desc)) \
        .withColumn('order_nm', F.row_number().over(self.__constants.w_h)) \
        .withColumn('date_diff', F.datediff(F.col('trading_date'), F.min(F.col('trading_date')).over(self.__constants.w_h))) \
        .cache()

    basedata = basedata.withColumn('init_arh', F.first(F.col('accum_ret_h')).over(self.__constants.w_h)) \
        .withColumn('annl_ret_h',
                    F.when(basedata.date_diff >= 30,
                           F.round(F.pow(F.col('accum_ret_h') / F.col('init_arh'), 242.0 / F.col('order_nm')) - 1.0, 6))
                    .otherwise(None)) \
        .withColumn('pre1y_arh', F.first(F.col('accum_ret_h')).over(self.__constants.w_y)) \
        .withColumn('pre1y_odn', F.first(F.col('order_nm')).over(self.__constants.w_y)) \
        .withColumn('annl_ret_1y',
                    F.when((basedata.date_diff >= 30) & (basedata.date_diff < 365),
                           F.round(F.pow(F.col('accum_ret_h') / F.col('pre1y_arh'), 242.0 / (F.col('order_nm') - F.col('pre1y_odn') + 1)) - 1.0, 6))
                    .when(basedata.date_diff >= 365,
                          F.round(F.pow(F.col('accum_ret_h') / F.col('pre1y_arh'), 242.0 / 242.0) - 1.0, 6))
                    .otherwise(None))

    basedata = basedata.withColumn('lograte',
                                   F.log(basedata.accum_ret_h / F.lag(basedata.accum_ret_h, 1, default=1.0)
                                         .over(self.__constants.w_unbnd))) \
        .withColumn('annl_std_1y',
                    F.when(basedata.date_diff >= 30,
                           F.round(F.stddev(F.col('lograte')).over(self.__constants.w_y) * np.sqrt(242.0), 6))
                    .otherwise(None)) \
        .withColumn('annl_std_h',
                    F.when(basedata.date_diff >= 30,
                           F.round(F.stddev(F.col('lograte')).over(self.__constants.w_h) * np.sqrt(242.0), 6))
                    .otherwise(None)) \
        .withColumn('drawdown', F.col('accum_ret_h') / F.max(F.col('accum_ret_h')).over(self.__constants.w_h) - 1.0) \
        .withColumn('max_drawdown_h', F.round(F.min(F.col('drawdown')).over(self.__constants.w_h), 6)) \
        .withColumn('sharp_ratio_1y', F.round((F.col('annl_ret_1y') - F.col('rfrate')) / F.col('annl_std_1y'), 6)) \
        .withColumn('sharp_ratio_h', F.round((F.col('annl_ret_h') - F.col('rfrate')) / F.col('annl_std_h'), 6)
                    ).filter(basedata.order_nm_desc <= 242)

    basedata = basedata.withColumn('drawdown_1y', F.col('accum_ret_h') / F.max(F.col('accum_ret_h')).over(self.__constants.w_h) - 1.0) \
        .withColumn('max_drawdown_1y', F.round(F.min(F.col('drawdown_1y')).over(self.__constants.w_h), 6))

    init_result = basedata.select('com_id', 'com_name', 'trading_date',
                                  'annl_ret_h', 'annl_std_h', 'sharp_ratio_h', 'max_drawdown_h',
                                  'annl_ret_1y', 'annl_std_1y', 'sharp_ratio_1y', 'max_drawdown_1y')
    return init_result
## Some random challenges for the agg function

df.groupby(df["FilamentType"]).agg(avg("LifeInHours")).show(truncate=False)

# withColumn will replace an existing column if one with the same name already exists
df = df.withColumn("LifeInHours", col("LifeInHours").cast(DoubleType()))

# NOTE: for sum and min/max after groupby() we don't need agg, as below
df.groupby("FilamentType").sum("LifeInHours").show(truncate=False)

df.groupby("FilamentType").agg(
    countDistinct("LifeInHours")).show(truncate=False)

# NOTE: for sum and max after groupby() we don't need agg, as below
df.groupby("FilamentType").max("LifeInHours").show(truncate=False)

df.groupby("FilamentType").agg(count("LifeInHours").alias("cnt")) \
    .sort(col("cnt"), ascending=False).show(truncate=False)

df.groupby(col("FilamentType")).agg(
    round(avg(col("LifeInHours"))).alias("avg_rounded")).show(truncate=False)

# Try creating an in-memory table on the Spark side and use SQL syntax to do the same.
df.createOrReplaceTempView("bulb_table")

sql_1 = spark.sql(
    "select FilamentType, count(distinct LifeInHours) as cnt from bulb_table group by FilamentType"
)
sql_1.show(truncate=False)

# Computing the average with the round function.
sql_2 = spark.sql(
    " select FilamentType , round(avg(LifeInHours)) as avg_life from bulb_table "
    "group by FilamentType limit 5")
sql_2.show(truncate=False)
def columns(df, columns, buckets=10):
    """
    Return statistical information about specific columns in json format.

    :param df: Dataframe to be processed
    :param columns: Columns that you want to profile
    :param buckets: Number of buckets used to compute histograms
    :return: json object with the profiling results
    """

    columns = parse_columns(df, columns)

    # Get just a sample to infer the column data type
    # sample_size_number = sample_size(rows_count, 95.0, 2.0)
    # fraction = sample_size_number / rows_count
    # sample = df.sample(False, fraction, seed=1)

    # Initialize objects
    column_info = {}
    column_info['columns'] = {}

    rows_count = df.count()
    column_info['rows_count'] = rows_count

    count_dtypes = Profiler.count_data_types(df, columns)
    column_info["count_types"] = count_dtypes["count_types"]
    column_info['size'] = human_readable_bytes(df.size())

    def na(col_name):
        return F.count(
            F.when(F.isnan(col_name) | F.col(col_name).isNull(), col_name))

    def zeros(col_name):
        return F.count(F.when(F.col(col_name) == 0, col_name))

    # Cast every column to a specific type to ensure the correct profiling.
    # For example, if we calculate the min or max of a string column with numeric values the result will be incorrect.
    for col_name in columns:
        dtype = count_dtypes["columns"][col_name]['dtype']
        # Do not force date type conversion; we can not trust that it is going to be representative
        if dtype in ["string", "float", "int", "bool"]:
            df = df.cols.cast(col_name, dtype)

    stats = df.cols._exprs([
        F.min, F.max, F.stddev, F.kurtosis, F.mean, F.skewness, F.sum,
        F.variance, F.approx_count_distinct, na, zeros
    ], columns)

    for col_name in columns:
        logging.info("Processing column '" + col_name + "'...")

        col_info = {}
        col_info["stats"] = {}
        column_info['columns'][col_name] = {}

        column_type = count_dtypes["columns"][col_name]['type']
        col_info['column_dtype'] = count_dtypes["columns"][col_name]['dtype']

        na = stats[col_name]["na"]
        max_value = stats[col_name]["max"]
        min_value = stats[col_name]["min"]

        col_info['name'] = col_name
        col_info['column_type'] = column_type

        # Numeric column
        if column_type == "numeric" or column_type == "date":
            # Merge
            col_info["stats"] = stats[col_name]

        # Missing
        col_info['stats']['missing_count'] = round(na, 2)
        col_info['stats']['p_missing'] = round(na / rows_count * 100, 2)
        col_info["dtypes_stats"] = count_dtypes["columns"][col_name]['details']

        if column_type == "categorical" or column_type == "numeric" or column_type == "date" or column_type == "bool":
            # Frequency
            col_info['frequency'] = (
                df.groupBy(col_name).count().rows.sort([
                    ("count", "desc"), (col_name, "desc")
                ]).limit(10).withColumn(
                    "percentage",
                    F.round((F.col("count") / rows_count) * 100, 3)).cols.rename(col_name, "value").to_json())

            # Uniques
            uniques = stats[col_name].pop("approx_count_distinct")
            col_info['stats']["uniques_count"] = uniques
            col_info['stats']["p_uniques"] = round(uniques / rows_count * 100, 3)

        if column_type == "numeric":
            # Additional stats
            # Percentile can not be computed as a normal sql.functions call; approxQuantile
            # in this case needs an extra pass over the data:
            # https://stackoverflow.com/questions/45287832/pyspark-approxquantile-function
            max_value = fast_float(max_value)
            min_value = fast_float(min_value)

            col_info['stats']['quantile'] = df.cols.percentile(
                col_name, [0.05, 0.25, 0.5, 0.75, 0.95])

            col_info['stats']['range'] = max_value - min_value
            col_info['stats']['median'] = col_info['stats']['quantile'][0.5]
            col_info['stats']['interquartile_range'] = col_info['stats']['quantile'][0.75] - \
                col_info['stats']['quantile'][0.25]

            col_info['stats']['coef_variation'] = round(
                (col_info['stats']['stddev'] / col_info['stats']['mean']), 5)
            col_info['stats']['mad'] = round(df.cols.mad(col_name), 5)

            col_info["hist"] = df.cols.hist(col_name, min_value, max_value, buckets)

        column_info['columns'][col_name] = col_info

    return column_info
FinalBtc = CleandfBtc.selectExpr("Cleaned_BTC_Time_New as Date_Time", "Price")
FinalBtc = FinalBtc.withColumn("Price", FinalBtc['Price'].cast(DoubleType()))
FinalBtc.show(5)  # In this cell: casting to timestamp, renaming columns and casting the price type to double

# ## Dataframes look like this...

FinalTw.printSchema()

# In[15]:

FinalBtc.printSchema()
FinalBtc.count()

# ## Truncating timestamps to hours and then grouping them by hour

dt_truncated = ((round(unix_timestamp(col('Date_Time')) / 3600) * 3600).cast('timestamp'))
FinalTw = FinalTw.withColumn('dt_truncated', dt_truncated)
FinalTw = FinalTw.selectExpr("dt_truncated as Date_Time", "Cleaned_Tweets", "p_neg", "p_neu", "p_pos", "p_comp")

UTC = ((unix_timestamp(col('Date_Time')) + 5*60*60).cast('timestamp'))
FinalTw = FinalTw.withColumn('UTC', UTC)
FinalTw = FinalTw.selectExpr("UTC as Date_Time", "Cleaned_Tweets", "p_neg", "p_neu", "p_pos", "p_comp")
FinalTw.show(5)

# In[17]:

FinalTw.registerTempTable("temp")
FinalTw_avg = sql.sql("SELECT Date_Time As DateTime, AVG(p_neg) as P_Neg, AVG(p_neu) as P_Neu, AVG(p_pos) as P_Pos, AVG(p_comp) as P_Comp FROM temp GROUP BY Date_Time")
#FinalTw_avg = FinalTw.select("Date_Time","polarity","subj","p_pos","p_neg").groupBy("Date_Time").agg(avg(col("polarity","subj","p_pos","p_neg")))
FinalTw_avg.show(5)

# This cell just collects all the corpus per hour (for future work)
cvModel = crossval2.fit(trainingDataSJ)   # RFR
cvModel2 = crossval2.fit(trainingDataIQ)  # RFR
cvModel3 = crossval.fit(trainingDataSJ)   # GLM
cvModel4 = crossval.fit(trainingDataIQ)   # GLM

# COMMAND ----------

cvModelF = cvModel.transform(trainingDataSJ)    # RFR
cvModel2F = cvModel2.transform(trainingDataIQ)  # RFR
cvModel3F = cvModel3.transform(trainingDataSJ)  # GLM
cvModel4F = cvModel4.transform(trainingDataIQ)  # GLM

# COMMAND ----------

predictionsAndLabels = cvModelF.select(col("city"), col("year"), col("weekofyear"),
                                       col("label").cast("double"), round(col("prediction")))
predictionsAndLabels2 = cvModel2F.select(col("city"), col("year"), col("weekofyear"),
                                         col("label").cast("double"), round(col("prediction")))
predictionsAndLabels3 = cvModel3F.select(col("city"), col("year"), col("weekofyear"),
                                         col("label").cast("double"), round(col("prediction")))
predictionsAndLabels4 = cvModel4F.select(col("city"), col("year"), col("weekofyear"),
                                         col("label").cast("double"), round(col("prediction")))

# COMMAND ----------

display(predictionsAndLabels)

# COMMAND ----------

display(predictionsAndLabels2)

# COMMAND ----------

display(predictionsAndLabels3)
from pyspark.sql import DataFrame
from pyspark.sql import functions as f


def get_contrib_open_source(df: DataFrame):
    df.groupBy('OpenSourcer').agg(f.round(f.count('OpenSourcer') * 100 / df.count(), 1).alias('Percentage')) \
        .orderBy('Percentage', ascending=False) \
        .show(20, False)
# Pickups/Dropoffs in Single Districts
taxi_dis_df = taxi_df.withColumnRenamed('Pickup_Count', 'Pickup_Count_Dis') \
    .withColumnRenamed('Dropoff_Count', 'Dropoff_Count_Dis').cache()
taxi_dis_1h_df = get_agg_taxi_df(taxi_dis_df, 1, index_columns, sum_aggregations('Dis', 1))
taxi_dis_4h_df = get_agg_taxi_df(taxi_dis_df, 4, index_columns, sum_aggregations('Dis', 4))

# Pickups/Dropoffs in Neighbor Districts
taxi_nb_df = sql_context.createDataFrame([], taxi_df.schema)
for i in range(-1, 2):
    for j in range(-1, 2):
        # Exclude current district
        if i == j == 0:
            continue
        tmp_df = taxi_df.withColumn('Lat', func.round(taxi_df.Lat + i * 0.01, 2))
        tmp_df = tmp_df.withColumn('Lon', func.round(taxi_df.Lon + j * 0.01, 2))
        taxi_nb_df = taxi_nb_df.unionAll(tmp_df)
taxi_nb_df = taxi_nb_df.groupby(index_columns).agg(*sum_aggregations('Nb')).cache()
taxi_nb_1h_df = get_agg_taxi_df(taxi_nb_df, 1, index_columns, sum_aggregations('Nb', 1))
taxi_nb_4h_df = get_agg_taxi_df(taxi_nb_df, 4, index_columns, sum_aggregations('Nb', 4))

# Pickups/Dropoffs in entire NYC
taxi_nyc_df = taxi_df.groupby(taxi_df.Time).agg(*sum_aggregations('Nyc')).cache()
taxi_nyc_1h_df = get_agg_taxi_df(taxi_nyc_df, 1, 'Time', sum_aggregations('Nyc', 1))
taxi_nyc_4h_df = get_agg_taxi_df(taxi_nyc_df, 4, 'Time', sum_aggregations('Nyc', 4))
# window = Window.partitionBy('LoanID','EffectiveDate_new').orderBy('LoanID', F.asc('EffectiveDate_new'))\
#     .rowsBetween(Window.unboundedPreceding, 0)
window = Window.partitionBy('LoanID').orderBy('LoanID', 'EffectiveDate_new')\
    .rowsBetween(Window.unboundedPreceding, 0)

# pull the first date associated with each multi-payer; some are multi-payers on the same day
dupe_check = (dec_trans.withColumn(
    'duplicate_count',
    (F.count('LoanID').over(window))).filter("duplicate_count == 1"))

# get multi-payers
multi_payer = dec_trans.groupBy("LoanID").count().filter("count > 1").select(
    "LoanID")

# get the sum of all payments and put the December df together
loan_pay_agg = dec_trans.groupby(['LoanID']).agg(
    F.round(F.sum('Amount'), 2).alias('PaymentReceived_sum'))
dec_agg_trans = dupe_check.join(loan_pay_agg, how='left',
                                on='LoanID').drop("duplicate_count", "Amount")

display(dupe_check.orderBy("LoanID"))

# COMMAND ----------

# COMMAND ----------

# DBTITLE 1,6008 multi-payers out of 92194 people
print(
    multi_payer.select("LoanID").distinct().count(),
    dec_trans.select("LoanID").distinct().count(),
    dec_trans.count(),
    dupe_check.count(),
    dec_agg_trans.count())
StructField("date", IntegerType(), True), \ StructField("measure_type", StringType(), True), \ StructField("temperature", FloatType(), True)]) # // Read the file as dataframe df = spark.read.schema(schema).csv("1800.csv") df.printSchema() # Filter out all but TMIN entries minTemps = df.filter(df.measure_type == "TMIN") # Select only stationID and temperature stationTemps = minTemps.select("stationID", "temperature") # Aggregate to find minimum temperature for every station minTempsByStation = stationTemps.groupBy("stationID").min("temperature") minTempsByStation.show() # Convert temperature to fahrenheit and sort the dataset minTempsByStationF = minTempsByStation.withColumn("temperature", func.round(func.col("min(temperature)") * 0.1 * (9.0 / 5.0) + 32.0, 2))\ .select("stationID", "temperature").sort("temperature") # Collect, format, and print the results results = minTempsByStationF.collect() for result in results: print(result[0] + "\t{:.2f}F".format(result[1])) spark.stop()
spark = SparkSession.builder.getOrCreate()

for file_name in list(glob.glob(str(parent_path / 'jsons' / '*.json'))):
    print(f'processing {file_name}')
    df = spark.read.json(file_name)
    print(f'{df.count()} records read from json')

    df = df.drop('data-group-quantity', 'data-energy-drink', 'data-amount',
                 'data-item-weight', 'data-energy-drink'). \
        withColumn('old_price', df['data-old-price'].cast(FloatType())). \
        withColumn('old_price_per_kg', df['data-old-price-per-kg'].cast(FloatType())). \
        withColumn('price', df['data-price'].cast(FloatType())). \
        withColumn('price_per_kg', df['data-price-per-kg'].cast(FloatType())). \
        withColumn('product_id', df['data-product-id'].cast(IntegerType())). \
        withColumn('type', df['data-type'].cast(StringType())). \
        withColumn('weight', df['data-weight'].cast(IntegerType()))

    calc_discount = round((1 - df.price / df.old_price) * 100, 2)
    df = df.withColumn('discount', when(df.old_price.isNull(), None).otherwise(calc_discount)). \
        select('shop', 'name', 'weight', 'price', 'old_price', 'price_per_kg',
               'old_price_per_kg', 'discount', 'cat', 'product_id', 'type', 'datetm', 'url')
    df.sort('name').show(5)

    try:
        store = spark.read.parquet(store_name)
    except Exception as e:
        if 'Path does not exist' in str(e):
            store = spark.createDataFrame([], df.schema)
        else:
            raise e

    store_count = store.count()
import pandas as pd
from pydataset import data
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

languages = pd.DataFrame({
    'language': ['ruby', 'python', 'java', 'scala', 'haskell', 'go', 'clojure', 'c++']
})
df = spark.createDataFrame(languages)
df.printSchema()  # shows schema
print((df.count(), len(df.columns)))
df.show(5)

mpg = data('mpg')
mpg = spark.createDataFrame(mpg)

mpg.select(
    F.concat(F.lit('The '), mpg.year, F.lit(' '), mpg.manufacturer, F.lit(' '),
             mpg.model, F.lit(' has a '), mpg.cyl,
             F.lit(' cylinder engine.'))).show(truncate=False)

mpg.select(F.when(mpg.trans.startswith('auto'), 'auto').otherwise('manual')).show()

tips = data('tips')
tips = spark.createDataFrame(tips)

tips.filter(tips.smoker == 'Yes').count() / tips.count()

# add the tip percentage as a real column rather than an ad-hoc DataFrame attribute
tips = tips.withColumn('percent', F.round(tips.tip / tips.total_bill, 2))
tips.groupBy(tips.sex, tips.smoker).agg(F.avg(tips.percent)).show()
df.createOrReplaceTempView('df')

test = df.select('Date received', 'Product', 'Company')

res1 = test.select(year('Date received').alias('year'), 'Product', 'Company')
res1 = res1.orderBy('Product', 'year')

res2 = res1.groupBy('year', 'Product', 'Company').agg(func.count('Product').alias('Count_comp'))

res3 = res2.groupBy('year', 'Product').agg(
    func.sum('Count_comp').alias('Total_Complaints'),
    func.countDistinct('Company').alias('Total_Companies'),
    func.max('Count_comp').alias('maximum'))
res3 = res3.filter(res3.Total_Complaints >= 1)

res4 = res3.withColumn(
    'Percentage',
    func.round(func.col('maximum') / func.col('Total_Complaints') * 100))
res4 = res4.drop(res4.maximum).sort('Product', 'year')
res4 = res4.withColumn("Product", func.lower(func.col("Product")))
res4 = res4.select('Product', 'year', 'Total_Complaints', 'Total_Companies', 'Percentage')

res4.write.csv(output_file)
spark = SparkSession.builder \
    .master('local[*]') \
    .appName('first_spark_application') \
    .getOrCreate()

cars = spark.read.csv(
    '/Users/wel51x/Box Sync/MyBox/Code/DataCamp/data/cars.csv',
    sep=',',
    header=True,
    inferSchema=True,
    nullValue='NA')

# Get number of records
print("The data contains %d records." % cars.count(), '\n')

cars = cars.withColumnRenamed("ncyl", "cyl")

cars = cars.withColumn('length_meters', round(cars.length * 0.0254, 3))
cars = cars.withColumn('weight_kg', round(cars.weight / 2.205, 0))

print("Cars with null cyl", cars.filter('cyl IS NULL').count(), '\n')

indexer = StringIndexer(inputCol='type', outputCol='type_idx')

# Assign index values to strings
indexer = indexer.fit(cars)

# Create column with index values
cars = indexer.transform(cars)

#print(cars.toPandas().sample(12))
print(indexer)

# View the first five records
def app_open(df):
    """
    App-launch report
    """
    # analyze the data
    df.createOrReplaceTempView("v_df")

    sql_0 = """select package_id,title,source,site,fsk_cid,grouping_id() id_1,count(custom_uuid) playNum,count(distinct custom_uuid) users,round(count(custom_uuid)/count(distinct custom_uuid),2) avgPlayNum from v_df where date="{date_0}" group by package_id,title,source,site,fsk_cid grouping sets((package_id,title,source,site,fsk_cid),()) """.format(
        date_0=str_dt_0)
    sql_1 = """select package_id,title,source,site,fsk_cid,grouping_id() id_1,count(custom_uuid) playNum,count(distinct custom_uuid) users,round(count(custom_uuid)/count(distinct custom_uuid),2) avgPlayNum from v_df where date="{date_1}" group by package_id,title,source,site,fsk_cid grouping sets((package_id,title,source,site,fsk_cid),()) """.format(
        date_1=str_dt_1)
    sql_7 = """select package_id,title,source,site,fsk_cid,grouping_id() id_1,count(custom_uuid) playNum,count(distinct custom_uuid) users,round(count(custom_uuid)/count(distinct custom_uuid),2) avgPlayNum from v_df where date="{date_7}" group by package_id,title,source,site,fsk_cid grouping sets((package_id,title,source,site,fsk_cid),()) """.format(
        date_7=str_dt_7)

    spark.sql("show databases")
    spark.sql("use sharp")
    df_cube_0 = spark.sql(sql_0)
    df_cube_1 = spark.sql(sql_1)
    df_cube_7 = spark.sql(sql_7)

    ## join conditions for day-over-day and week-over-week comparisons
    condition_0 = (F.coalesce(F.col("t_0.package_id"), F.lit("123")) == F.coalesce(
        F.col("t_1.package_id"), F.lit("123")))
    condition_1 = (F.coalesce(F.col("t_0.title"), F.lit("123")) == F.coalesce(
        F.col("t_1.title"), F.lit("123")))
    condition_2 = (F.coalesce(F.col("t_0.source"), F.lit("123")) == F.coalesce(
        F.col("t_1.source"), F.lit("123")))
    condition_3 = (F.coalesce(F.col("t_0.site"), F.lit("123")) == F.coalesce(
        F.col("t_1.site"), F.lit("123")))
    condition_4 = (F.coalesce(F.col("t_0.fsk_cid"), F.lit("123")) == F.coalesce(
        F.col("t_1.fsk_cid"), F.lit("123")))
    condition_5 = (F.col("t_0.id_1") == F.col("t_1.id_1"))
    condition_6 = (F.coalesce(F.col("t_0.package_id"), F.lit("123")) == F.coalesce(
        F.col("t_7.package_id"), F.lit("123")))
    condition_7 = (F.coalesce(F.col("t_0.title"), F.lit("123")) == F.coalesce(
        F.col("t_7.title"), F.lit("123")))
    condition_8 = (F.coalesce(F.col("t_0.source"), F.lit("123")) == F.coalesce(
        F.col("t_7.source"), F.lit("123")))
    condition_9 = (F.coalesce(F.col("t_0.site"), F.lit("123")) == F.coalesce(
        F.col("t_7.site"), F.lit("123")))
    condition_10 = (F.coalesce(F.col("t_0.fsk_cid"), F.lit("123")) == F.coalesce(
        F.col("t_7.fsk_cid"), F.lit("123")))
    condition_11 = (F.col("t_0.id_1") == F.col("t_7.id_1"))

    ## day-over-day join condition
    conditions_0_1 = condition_0 & condition_1 & condition_2 & condition_3 & condition_4 & condition_5
    ## week-over-week join condition
    conditions_0_7 = condition_6 & condition_7 & condition_8 & condition_9 & condition_10 & condition_11

    ## final report
    app_report = df_cube_0.alias("t_0").join(df_cube_1.alias("t_1"), conditions_0_1, "left_outer") \
        .join(df_cube_7.alias("t_7"), conditions_0_7, "left_outer") \
        .select(F.regexp_replace(F.lit(str_dt_0), "-", "").cast("int").alias("date"),
                F.col("t_0.package_id").alias("appId"),
                F.col("t_0.title").alias("appName"),
                F.col("t_0.source").alias("appSource"),
                F.col("t_0.site").alias("channelName"),
                F.col("t_0.fsk_cid").alias("typeName"),
                F.col("t_0.id_1").alias("id_1"),
                F.col("t_0.playNum").alias("totalPlayCount"),
                F.concat(F.round((F.col("t_0.playNum") / F.col("t_1.playNum") - 1) * 100, 2), F.lit("%")).alias("playCountCompareDay"),
                F.concat(F.round((F.col("t_0.playNum") / F.col("t_7.playNum") - 1) * 100, 2), F.lit("%")).alias("playCountCompareWeek"),
                F.col("t_0.users").alias("totalUserCount"),
                F.concat(F.round((F.col("t_0.users") / F.col("t_1.users") - 1) * 100, 2), F.lit("%")).alias("userCountCompareDay"),
                F.concat(F.round((F.col("t_0.users") / F.col("t_7.users") - 1) * 100, 2), F.lit("%")).alias("userCountCompareWeek"),
                F.col("t_0.avgPlayNum").alias("averagePlayCount"),
                F.concat(F.round((F.col("t_0.avgPlayNum") / F.col("t_1.avgPlayNum") - 1) * 100, 2), F.lit("%")).alias("avgPlayCountCompareDay"),
                F.concat(F.round((F.col("t_0.avgPlayNum") / F.col("t_7.avgPlayNum") - 1) * 100, 2), F.lit("%")).alias("avgPlayCountCompareWeek"))

    return app_report
# Column manipulation

# The Federal Aviation Administration (FAA) considers a flight to be "delayed"
# when it arrives 15 minutes or more after its scheduled time.

# The next step of preparing the flight data has two parts:
# - convert the units of distance, replacing the mile column with a km column; and
# - create a Boolean column indicating whether or not a flight was delayed.

# Instructions
# 100 XP
# Import a function which will allow you to round a number to a specific number of decimal places.
# Derive a new km column from the mile column, rounding to zero decimal places. One mile is 1.60934 km.
# Remove the mile column.
# Create a label column with a value of 1 indicating the delay was 15 minutes or more and 0 otherwise.

# Import the required function
from pyspark.sql.functions import round

# Convert 'mile' to 'km' and drop 'mile' column
flights_km = flights.withColumn('km', round(flights.mile * 1.60934, 0)) \
                    .drop('mile')

# Create 'label' column indicating whether flight delayed (1) or not (0)
flights_km = flights_km.withColumn('label', (flights.delay >= 15).cast('integer'))

# Check first five records
flights_km.show(5)
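# Hedged spot-check of the km conversion above with a made-up distance (not a
# value from the flights data): 232 miles * 1.60934 = 373.37 km, which the
# zero-decimal rounding turns into 373 km. pyspark's round was imported above,
# so the Python builtin is accessed explicitly here.
import builtins
print(builtins.round(232 * 1.60934, 0))   # 373.0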
data_apr = data_apr.select(data_apr.FL_DATE, data_apr.OP_UNIQUE_CARRIER, data_apr.OP_CARRIER_FL_NUM, data_apr.DEP_DELAY)
data_may = data_may.select(data_may.FL_DATE, data_may.OP_UNIQUE_CARRIER, data_may.OP_CARRIER_FL_NUM, data_may.DEP_DELAY)
data_june = data_june.select(data_june.FL_DATE, data_june.OP_UNIQUE_CARRIER, data_june.OP_CARRIER_FL_NUM, data_june.DEP_DELAY)
data_july = data_july.select(data_july.FL_DATE, data_july.OP_UNIQUE_CARRIER, data_july.OP_CARRIER_FL_NUM, data_july.DEP_DELAY)
data_aug = data_aug.select(data_aug.FL_DATE, data_aug.OP_UNIQUE_CARRIER, data_aug.OP_CARRIER_FL_NUM, data_aug.DEP_DELAY)
data_sep = data_sep.select(data_sep.FL_DATE, data_sep.OP_UNIQUE_CARRIER, data_sep.OP_CARRIER_FL_NUM, data_sep.DEP_DELAY)
data_oct = data_oct.select(data_oct.FL_DATE, data_oct.OP_UNIQUE_CARRIER, data_oct.OP_CARRIER_FL_NUM, data_oct.DEP_DELAY)
data_nov = data_nov.select(data_nov.FL_DATE, data_nov.OP_UNIQUE_CARRIER, data_nov.OP_CARRIER_FL_NUM, data_nov.DEP_DELAY)
data_dec = data_dec.select(data_dec.FL_DATE, data_dec.OP_UNIQUE_CARRIER, data_dec.OP_CARRIER_FL_NUM, data_dec.DEP_DELAY)


def unionAll(*dfs):
    return reduce(DataFrame.unionAll, dfs)


final_csv_file = unionAll(*[data_jan, data_feb, data_mar, data_apr, data_may, data_june,
                            data_july, data_aug, data_sep, data_oct, data_nov, data_dec])

overall_airlines_perf = final_csv_file.groupBy(
    [final_csv_file.OP_UNIQUE_CARRIER.alias('CARRIER')]).agg(
        F.round(F.avg('DEP_DELAY'), 0).alias('AVERAGE_DELAY'))

print("Overall flight operators performance, good performers being on top: \n")
overall_airlines_perf.sort('AVERAGE_DELAY', ascending=True).show()

flight_perf_per_airlines = final_csv_file.groupBy(
    [final_csv_file.OP_UNIQUE_CARRIER, final_csv_file.OP_CARRIER_FL_NUM]).agg(
        F.round(F.avg('DEP_DELAY'), 0).alias('Expected_delay'),
        F.min('DEP_DELAY'), F.max('DEP_DELAY'))

condition_on_join = [df_filter.actual_carrierCode == flight_perf_per_airlines.OP_UNIQUE_CARRIER,
                     df_filter.actual_flight_number == flight_perf_per_airlines.OP_CARRIER_FL_NUM]

final_df = df_filter.join(flight_perf_per_airlines, condition_on_join, 'inner') \
    .select(df_filter.actual_iatacode_origin.alias('ORIGIN'),
            df_filter.actual_at_origin.alias('DEP_TIME'),
            df_filter.actual_iatacode_destination.alias('DESTINATION'),
            df_filter.actual_at_destination.alias('ARR_TIME'),
            df_filter.actual_carrierCode.alias('CARRIER'),
            df_filter.actual_flight_number.alias('FL_NUM'),
            flight_perf_per_airlines.Expected_delay.alias('EXPECTED_DELAY'))

print("Below are the flights info from your chosen origin, expected delay in minutes: \n")
final_df.show()
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, isnan, round, lit

if __name__ == "__main__":

    dir0 = '/home/cloudera/2.kkbox_churn/data01/big_table_01/has-expired_auto-renew-0/'
    subdir = 'last-1-week_has-record/'
    infile = dir0 + subdir + '00.data'
    outfile = dir0 + subdir + '01.added_features'

    ##
    spark = SparkSession.builder.getOrCreate()

    df0 = spark.read.format('parquet').load(infile)

    df1 = df0.withColumn('last1WeekOnLine_D_last12WeekOnLine',
                         round(col('last1WeekOnLine') / col('last12WeekOnLine'), 2)) \
             .withColumn('w1_Mtotal_D_w12_Mtotal',
                         round(col('w1_Mtotal') / col('w12_Mtotal'), 2))

    ##
    df1.write.format('parquet').save(outfile)
totalcovictions

# In[132]:

monthlyGroupeddf.withColumn(
    'percent', (monthlyGroupeddf.total / totalcovictions) * 100).show()

# In[137]:

import pyspark.sql.functions as func

# In[149]:

updateddf = monthlyGroupeddf.withColumn(
    'percent', func.round((monthlyGroupeddf.total / totalcovictions) * 100, 2))
updateddf.printSchema()
updateddf.show()

# In[150]:

# other aggregations
## convictions based on major category in London
df.show()

# In[156]:

df.groupBy('major_category').agg({
    'value': 'sum'
}).withColumnRenamed('sum(value)', 'totalValue').orderBy('totalValue').show()
# aggregate by adding values and incrementing a count each time
rddAvgP = rddPenalty.map(lambda x: (x[0], x[2]))\
    .aggregateByKey((0.0, 0.0),
                    (lambda x, newVal: ((x[0] + float(newVal)), (x[1] + 1))),
                    (lambda rdd1, rdd2: (rdd1[0] + rdd2[0], rdd1[1] + rdd2[1])))

# we are penalizing a small amount based on the number of quality measures
rddAvgP = rddAvgP.mapValues(lambda x: round((x[0] / (x[1])), 5))

# break the columns apart after the joins
rddFinal = rddAvgQ.join(rddAvgP).join(rddStdDev).map(
    lambda x: (x[0], x[1][0][0], x[1][0][1], x[1][1]))

# build final dataframes
dfFinal = rddFinal.toDF(
    ["ProviderID", "QualityScore", "Penalty", "StandardDeviation"])\
    .withColumn("FinalScore", F.round(F.col("QualityScore") - F.col("Penalty"), 5))

dfShow = dfFinal.join(
    dfHospitals, dfHospitals.id == dfFinal.ProviderID)\
    .select("ProviderID", "name", "state", "rating", "QualityScore",
            "Penalty", "StandardDeviation", "FinalScore")\
    .sort("FinalScore", ascending=False)

# save this score table for question 4
dfShow.write.parquet("/user/w205/hospital_compare/hospitalQualParquet")

dfShowRank = dfShow.rdd.zipWithIndex().map(lambda x: (x[1] + 1, x[0][0], x[0][1], x[0][2], x[0][3], x[0][4], x[0][5], x[0][6], x[0][7]))\
    .toDF().select(F.col("_1").alias("Rank"), F.col("_2").alias("ProviderID"), F.col("_3").alias("name"), F.col("_4").alias("state"),
                   F.col("_5").cast("int").alias("rating"), F.col("_6").alias("QualityScore"), F.col("_7").alias("Penalty"),
                   F.col("_8").alias("StandardDeviation"), F.col("_9").alias("FinalScore"))\
    .show(10, False)
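# Minimal hedged sketch (made-up values, assumes an existing SparkContext `sc`)
# of the (sum, count) accumulator pattern used by aggregateByKey above: the
# first lambda folds one new value into a partition-local (sum, count) pair,
# the second merges pairs from different partitions, and mapValues turns each
# pair into an average.
pairs = sc.parallelize([("p1", 2.0), ("p1", 4.0), ("p2", 3.0)])
avgByKey = pairs.aggregateByKey((0.0, 0.0),
                                lambda acc, v: (acc[0] + v, acc[1] + 1),
                                lambda a, b: (a[0] + b[0], a[1] + b[1])) \
                .mapValues(lambda acc: round(acc[0] / acc[1], 5))
print(avgByKey.collect())   # [('p1', 3.0), ('p2', 3.0)] (ordering may vary)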
def _binary_clf_curve(labelAndVectorisedScores, rawPredictionCol, labelCol):

    # sort the dataframe by pred column in descending order
    localPosProbCol = "pos_probability"
    labelAndPositiveProb = labelAndVectorisedScores.select(
        labelCol, getitem(1)(rawPredictionCol).alias(localPosProbCol))

    # round the fractional prediction column
    labelAndPositiveProb = labelAndPositiveProb\
        .withColumn("_tmp_pred", F.round(localPosProbCol, 3))\
        .drop(localPosProbCol)\
        .withColumnRenamed("_tmp_pred", localPosProbCol)\
        .sort(F.desc(localPosProbCol))

    # adding index to the dataframe
    sortedScoresAndLabels = labelAndPositiveProb.rdd.zipWithIndex() \
        .toDF(['data', 'index']) \
        .select('data.' + labelCol, 'data.' + localPosProbCol, "index")

    groupSumLabelCol = "group_sum_labels"
    groupMaxIndexCol = "group_max_indices"

    sortedScoresAndLabels = sortedScoresAndLabels\
        .groupBy([localPosProbCol, labelCol])\
        .agg(F.sum(labelCol).alias(groupSumLabelCol),
             F.max("index").alias(groupMaxIndexCol))

    # sortedScoresAndLabels = labelAndPositiveProb.sort(F.desc(localPosProbCol))

    # creating rank for pred column
    lookup = (sortedScoresAndLabels.select(localPosProbCol).distinct().sort(
        F.desc(localPosProbCol)).rdd.zipWithIndex().map(
            lambda x: x[0] + (x[1], )).toDF([localPosProbCol, "rank"]))

    # join the dataframe with lookup to assign the ranks
    sortedScoresAndLabels = sortedScoresAndLabels.join(lookup, [localPosProbCol])

    # sorting in descending order based on the pred column
    sortedScoresAndLabels = sortedScoresAndLabels.sort(groupMaxIndexCol)

    # saving the dataframe to temporary table
    sortedScoresAndLabels.registerTempTable("processeddata")

    # TODO: script to avoid partition-by warning, and span data across cluster nodes

    # creating the cumulative sum for tps
    sortedScoresAndLabelsCumSum = labelAndVectorisedScores.sql_ctx \
        .sql(
            "SELECT " + labelCol + ", " + localPosProbCol + ", " + groupSumLabelCol +
            ", rank, " + groupMaxIndexCol + ", sum(" + groupSumLabelCol +
            ") OVER (ORDER BY " + groupMaxIndexCol + ") as tps FROM processeddata ")

    # repartitioning
    sortedScoresAndLabelsCumSum = sortedScoresAndLabelsCumSum.coalesce(
        partition_size)

    # cache after partitioning
    sortedScoresAndLabelsCumSum.cache()

    # retain only the group-wise (according to threshold) max tps
    df_max_tps_in_group = sortedScoresAndLabelsCumSum.groupBy(
        localPosProbCol).agg(F.max("tps").alias("max_tps"))

    dup_removed_scores_labels = \
        sortedScoresAndLabelsCumSum.join(
            df_max_tps_in_group,
            [sortedScoresAndLabelsCumSum[localPosProbCol] == df_max_tps_in_group[localPosProbCol],
             sortedScoresAndLabelsCumSum["tps"] == df_max_tps_in_group["max_tps"]],
            how="right_outer"
        )\
        .drop(df_max_tps_in_group[localPosProbCol])\
        .drop(df_max_tps_in_group["max_tps"])\
        .groupBy([localPosProbCol, "tps"])\
        .agg(F.max(groupMaxIndexCol).alias("max_index"))

    # creating the fps column based on rank and tps column
    df_with_fps = dup_removed_scores_labels \
        .withColumn("fps", 1 + F.col("max_index") - F.col("tps"))

    return df_with_fps
from pyspark.sql import SparkSession from pyspark.sql import functions as func spark = SparkSession.builder.appName("FriendsByAge").getOrCreate() spark.sparkContext.setLogLevel("ERROR") lines = spark.read.option("header", "true").option("inferSchema", "true")\ .csv("file:///opt/bitnami/spark/datasets/fakefriends-header.csv") # Select only age and numFriends columns friendsByAge = lines.select("age", "friends") # From friendsByAge we group by "age" and then compute average friendsByAge.groupBy("age").avg("friends").show() # Sorted friendsByAge.groupBy("age").avg("friends").sort("age").show() # Formatted more nicely friendsByAge.groupBy("age").agg(func.round(func.avg("friends"), 2)).sort("age").show() # With a custom column name friendsByAge.groupBy("age").agg( func.round(func.avg("friends"), 2).alias("friends_avg")).sort("age").show() spark.stop()
def process_dataset(df):
    '''
    Function for preparation of the dataset for machine learning

    INPUT:
    df - initial dataset loaded from json file

    OUTPUT:
    df_ft - new dataset prepared for machine learning, containing the following columns:
        1. userId - initial id of the user
        2. gender - user's gender
        3. avg_events - average number of events per day for the user
        4. avg_songs - average number of songs the user listens to per day
        5. thumbs_up - number of thumbs up events
        6. thumbs_down - number of thumbs down events
        7. active_days - days since user's first event
        8. last_location - location of the last event
        9. last_level - user's last level (paid or free)
        10. addfriends - number of add friends events
    '''
    # clean dataset using clean_data function
    df = clean_data(df)

    # define cancellation udf
    cancellation_event = udf(lambda x: 1 if x == "Cancellation Confirmation" else 0, IntegerType())

    # set churn = 1 for rows where page == 'Cancellation Confirmation'
    df = df.withColumn("churn", cancellation_event("page"))

    # get userId with churn == 1
    cancelled_users = df.select(['userId', 'churn']).where(df.churn == 1).groupby('userId').count().toPandas()['userId'].values

    # create udf, which sets churn of a row to 1 if userId is in the cancelled_users list
    def replace_data(userId, features):
        if (userId in cancelled_users):
            return 1
        else:
            return 0

    # set churn == 1 for all rows for users who cancelled their subscription
    fill_array_udf = udf(replace_data, IntegerType())
    df = df.withColumn("churn", fill_array_udf(col("userId"), col("churn")))

    # set columns with the first and the last event timestamp
    w = Window.partitionBy('userId')
    df = df.withColumn('last_ts', max('ts').over(w))
    df = df.withColumn('first_ts', min('ts').over(w))

    # convert timestamp to date (string)
    def get_date(ts):
        return str(datetime.utcfromtimestamp(ts / 1000).strftime('%Y-%m-%d'))

    get_date_from_ts_udf = udf(get_date, StringType())
    df = df.withColumn('last_date', get_date_from_ts_udf(col('last_ts')))
    df = df.withColumn('first_date', get_date_from_ts_udf(col('first_ts')))

    # add column date and convert timestamp to date
    df = df.withColumn('date', get_date_from_ts_udf(col('ts')))

    # set column last_level to level when timestamp is the last timestamp
    df = df.withColumn('last_level', when(df.last_ts == df.ts, df.level))

    # additional feature: Gender
    # flag_gender = udf(lambda x: 1 if x == 'M' else 0, IntegerType())
    # gender = df.select("userId", "gender").dropDuplicates()
    # gender = df.withColumn("gender", flag_gender("gender"))

    # create column avg_events to calculate average number of events per day
    w = Window.partitionBy('userId', 'date')
    events = df.select('userId', 'date', count('userId').over(w).alias('events')).distinct()
    w = Window.partitionBy('userId')
    events = events.withColumn('avg_events', avg('events').over(w))
    events = events.select(col("userId").alias("events_userId"), 'avg_events')
    events = events.withColumn("avg_events", round(events["avg_events"], 2))

    # create column avg_songs to calculate average number of songs per day
    w = Window.partitionBy('userId', 'date')
    songs = df.where(df.page == 'NextSong').select('userId', 'date', count('userId').over(w).alias('songs')).distinct()
    w = Window.partitionBy('userId')
    songs = songs.withColumn('avg_songs', avg('songs').over(w))
    songs = songs.select(col("userId").alias("songs_userId"), 'avg_songs')
    songs = songs.withColumn("avg_songs", round(songs["avg_songs"], 2))

    # calculate number of thumbs up for a user
    w = Window.partitionBy('userId')
    thumbsup = df.where(df.page == 'Thumbs Up').select('userId',
                                                       count('userId').over(w).alias('thumbs_up')).distinct()
    thumbsup = thumbsup.select(col("userId").alias("thumbsup_userId"), 'thumbs_up')

    # calculate number of thumbs down for a user
    w = Window.partitionBy('userId')
    thumbsdown = df.where(df.page == 'Thumbs Down').select('userId',
                                                           count('userId').over(w).alias('thumbs_down')).distinct()
    thumbsdown = thumbsdown.select(col("userId").alias("thumbsdown_userId"), 'thumbs_down')

    # calculate days since the date of the first event
    df = df.withColumn("days_active", datediff(to_date(lit(datetime.now().strftime("%Y-%m-%d %H:%M"))),
                                               to_date("first_date", "yyyy-MM-dd")))

    # add column with state of the event based on location column
    def get_state(location):
        location = location.split(',')[-1].strip()
        if (len(location) > 2):
            location = location.split('-')[-1].strip()
        return location

    state_udf = udf(get_state, StringType())
    df = df.withColumn('state', state_udf(col('location')))

    # add column with last location of the user
    df = df.withColumn('last_state', when(df.last_ts == df.ts, df.state))

    # calculate number of add friends for a user
    w = Window.partitionBy('userId')
    addfriend = df.where(df.page == 'Add Friend').select('userId',
                                                         count('userId').over(w).alias('addfriend')).distinct()
    addfriend = addfriend.select(col("userId").alias("addfriend_userId"), 'addfriend')

    # merge all results together
    df_ft = df.select('userId', 'gender', 'churn', 'last_level', 'days_active', 'last_state')\
        .dropna().drop_duplicates()
    df_ft = df_ft.join(songs, df_ft.userId == songs.songs_userId).distinct()
    df_ft = df_ft.join(events, df_ft.userId == events.events_userId).distinct()
    df_ft = df_ft.join(thumbsup, df_ft.userId == thumbsup.thumbsup_userId, how='left').distinct()
    df_ft = df_ft.fillna(0, subset=['thumbs_up'])
    df_ft = df_ft.join(thumbsdown, df_ft.userId == thumbsdown.thumbsdown_userId, how='left').distinct()
    df_ft = df_ft.fillna(0, subset=['thumbs_down'])
    df_ft = df_ft.join(addfriend, df_ft.userId == addfriend.addfriend_userId, how='left').distinct()
    df_ft = df_ft.fillna(0, subset=['addfriend'])
    df_ft = df_ft.drop('songs_userId', 'events_userId', 'thumbsup_userId', 'thumbsdown_userId', 'addfriend_userId')

    return df, df_ft
fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5 df.select(expr("CustomerId"), fabricatedQuantity.alias("realQuantity")).show(2) # COMMAND ---------- df.selectExpr( "CustomerId", "(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity").show(2) # COMMAND ---------- from pyspark.sql.functions import lit, round, bround df.select(round(lit("2.5")), bround(lit("2.5"))).show(2) # COMMAND ---------- from pyspark.sql.functions import corr df.stat.corr("Quantity", "UnitPrice") df.select(corr("Quantity", "UnitPrice")).show() # COMMAND ---------- df.describe().show() # COMMAND ----------
d1 = spark.read.option("header", "true") \ .option("sep", ",").option("inferSchema", True) \ .option("mode", "DROPMALFORMED") \ .csv("file:///Users/beginspark/Temp/data2.csv") d2 = d1.toDF("year", "month", "road", "avr_traffic_month", "avr_velo_month", "mon", "tue", "wed", "thu", "fri", "sat", "sun") # data 확인 d2.printSchema() # null 값 제거 d3 = d2.where("avr_velo_month is not null") # 도로별 평균 속도 d4 = d3.groupBy("road").agg(functions.round(functions.avg("avr_velo_month"), 1).alias("avr_velo_total")) d5 = d3.join(d4, ["road"]) # label 부여 d6 = d5.withColumn("label", label(d5.avr_velo_month, d5.avr_velo_total).cast("double")) d6.select("road", "avr_velo_month", "avr_velo_total", "label").show(5, False) d6.groupBy("label").count().show(truncate=False) dataArr = d6.randomSplit([0.7, 0.3]) train = dataArr[0] test = dataArr[1] indexer = StringIndexer(inputCol="road", outputCol="roadcode") assembler = VectorAssembler(inputCols=["roadcode", "mon", "tue", "wed", "thu", "fri", "sat", "sun"], outputCol="features")