def impute(self, input_df):
    import pyspark.sql.functions as f
    ori_column_name = self.timestamp_column_name + "_ori"
    df = input_df.withColumnRenamed(self.timestamp_column_name, ori_column_name)

    # Round the seconds to the nearest multiple of time_interval and shift the timestamp
    merged_df = df.withColumn(
        "add_seconds",
        (f.round(f.second(ori_column_name) / self.time_interval) * self.time_interval)
        - f.second(ori_column_name)) \
        .withColumn(
            self.timestamp_column_name,
            f.from_unixtime(
                f.unix_timestamp(ori_column_name) + f.col("add_seconds"))) \
        .drop("add_seconds")

    if self.mode == "max":
        merged_df = merged_df.groupby(self.timestamp_column_name).max()
    elif self.mode == "min":
        merged_df = merged_df.groupby(self.timestamp_column_name).min()
    elif self.mode == "mean":
        merged_df = merged_df.groupby(self.timestamp_column_name).mean()
    elif self.mode == "sum":
        merged_df = merged_df.groupby(self.timestamp_column_name).sum()
    elif self.mode == "":
        pass  # no aggregation, keep the rounded timestamps as-is
    else:
        raise Exception("Currently only support max/min/mean/sum mode")
    return merged_df
def __appendAggKey(tsdf, freq=None):
    """
    :param tsdf: TSDF object as input
    :param freq: frequency at which to upsample
    :return: return a TSDF with a new aggregate key (called agg_key)
    """
    df = tsdf.df
    checkAllowableFreq(freq)

    # compute timestamp columns
    sec_col = f.second(f.col(tsdf.ts_col))
    min_col = f.minute(f.col(tsdf.ts_col))
    hour_col = f.hour(f.col(tsdf.ts_col))

    if freq == SEC:
        agg_key = f.concat(
            f.col(tsdf.ts_col).cast("date"), f.lit(" "),
            f.lpad(hour_col, 2, '0'), f.lit(':'),
            f.lpad(min_col, 2, '0'), f.lit(':'),
            f.lpad(sec_col, 2, '0')).cast("timestamp")
    elif freq == MIN:
        agg_key = f.concat(
            f.col(tsdf.ts_col).cast("date"), f.lit(' '),
            f.lpad(hour_col, 2, '0'), f.lit(':'),
            f.lpad(min_col, 2, '0'), f.lit(':'),
            f.lit('00')).cast("timestamp")
    elif freq == HR:
        agg_key = f.concat(
            f.col(tsdf.ts_col).cast("date"), f.lit(' '),
            f.lpad(hour_col, 2, '0'), f.lit(':'),
            f.lit('00'), f.lit(':'),
            f.lit('00')).cast("timestamp")
    elif freq == DAY:
        agg_key = f.col(tsdf.ts_col).cast("date").cast("timestamp")

    df = df.withColumn("agg_key", agg_key)
    return tempo.TSDF(df, tsdf.ts_col, partition_cols=tsdf.partitionCols)
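# Editor's sketch (not part of the original tempo source): the same second/minute/
# hour/day bucketing can be expressed with F.date_trunc, which returns a truncated
# timestamp directly. Assumes Spark >= 2.3 and the same SEC/MIN/HR/DAY constants
# used above; the name of this helper is hypothetical.
def __appendAggKeyWithDateTrunc(tsdf, freq=None):
    trunc_unit = {SEC: "second", MIN: "minute", HR: "hour", DAY: "day"}[freq]
    # date_trunc zeroes out every field below the chosen unit
    df = tsdf.df.withColumn("agg_key", f.date_trunc(trunc_unit, f.col(tsdf.ts_col)))
    return tempo.TSDF(df, tsdf.ts_col, partition_cols=tsdf.partitionCols)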
def _transform(self, df):
    time_variable = self.getColumn()
    new_time_variable = time_variable + '_new'

    # Code from tawab: convert all times into the same format.
    df = df.withColumn(
        new_time_variable,
        self.udf_date_formatting()(funct.col(time_variable).cast("String")))
    df = df.withColumn(
        new_time_variable,
        funct.from_unixtime(
            funct.unix_timestamp(new_time_variable,
                                 self.time_format)).cast(TimestampType()))

    # Derive calendar/time features from the normalized timestamp
    df = df.withColumn(time_variable + '_year', funct.year(new_time_variable))
    df = df.withColumn(time_variable + '_month', funct.month(new_time_variable))
    df = df.withColumn(time_variable + '_day', funct.dayofmonth(new_time_variable))
    df = df.withColumn(time_variable + '_dayofweek', funct.dayofweek(new_time_variable))
    df = df.withColumn(time_variable + '_hour', funct.hour(new_time_variable))
    df = df.withColumn(time_variable + '_minutes', funct.minute(new_time_variable))
    df = df.withColumn(time_variable + '_seconds', funct.second(new_time_variable))
    df = df.drop(new_time_variable)
    df = df.drop(time_variable)
    return df
def _transform(self, df):
    input = self.getInputCol()
    df = df.withColumn("dt_day", F.dayofmonth(input))
    df = df.withColumn("dt_hour", F.hour(input))
    df = df.withColumn("dt_minute", F.minute(input))
    df = df.withColumn("dt_second", F.second(input))
    df = df.withColumn("dt_dayofyear", F.dayofyear(input))
    df = df.withColumn("dt_dayofweek", F.dayofweek(input))
    df = df.withColumn("dt_weekofyear", F.weekofyear(input))
    return df
def main(spark):
    df = createDataframe(spark)
    df.show(truncate=False)

    df = df.withColumn("date", F.date_format(F.col("time"), "yyyy-MM-dd HH:mm:ss.SSSS")) \
        .withColumn("h", F.hour(F.col("date"))) \
        .withColumn("m", F.minute(F.col("date"))) \
        .withColumn("s", F.second(F.col("date"))) \
        .withColumn("event", F.expr("h*3600 + m*60 + s")) \
        .drop("date", "h", "m", "s")
    df.show(truncate=False)

    inRange = F.udf(in_range, BooleanType())
    df = df.withColumn("between", inRange(F.col("range"), F.col("event")))
    df.show(truncate=False)
def lowest_avg_idle_user(self, df):
    '''
    Find the average idle hours for each user, then return the users whose
    average idle time is below the overall average.
    '''
    df_idle = df.drop('working_hour', 'start_time', 'end_time')  # keep only the idle-time column

    # Find the average idle time for each user
    df_avg = df_idle.groupBy('user_name').agg(
        sqlFun.from_unixtime(
            sqlFun.avg(sqlFun.unix_timestamp('idle_time')),
            'hh:mm:ss').alias('avg_time'))

    # Convert hh:mm:ss into a fractional number of hours
    df_avg_hours = df_avg.withColumn(
        'avg_hour',
        (hour(df_avg['avg_time']) * 3600
         + minute(df_avg['avg_time']) * 60
         + second(df_avg['avg_time'])) / 3600)

    # Calculate the overall average idle hours
    total_avg_idle_hour = df_avg_hours.select(avg('avg_hour')).collect()[0][0]

    lowest_idle_users = df_avg_hours.filter(
        df_avg_hours['avg_hour'] < total_avg_idle_hour).select('user_name')
    return lowest_idle_users
# Cast the string "time" column to a timestamp
timeFormatUDF = F.udf(lambda ts: timeFormat(ts))
dataset = dataset.withColumn(
    "time", timeFormatUDF(F.col("time")).cast(TimestampType()))

# Split the time column into its components
dataset = dataset.withColumn("year", F.year(F.col("time")))
dataset = dataset.withColumn("month", F.month(F.col("time")))
dataset = dataset.withColumn("day", F.dayofmonth(F.col("time")))
dataset = dataset.withColumn("hour", F.hour(F.col("time")))
dataset = dataset.withColumn("minute", F.minute(F.col("time")))
dataset = dataset.withColumn("second", F.second(F.col("time")))

# Split MCC, MNC and MSIN out of the IMSI column
dataset = dataset.withColumn('mcc', dataset.imsi.substr(1, 3))
dataset = dataset.withColumn('mnc', dataset.imsi.substr(4, 2))
dataset = dataset.withColumn('msin', dataset.imsi.substr(6, 10))

# Split TAC, SNR and CD out of the IMEI column
# IMEI format: TAC -- Serial_Number (14 digits)
dataset = dataset.withColumn('tac', dataset.imei.substr(1, 8))
dataset = dataset.withColumn('snr', dataset.imei.substr(9, 6))

# Scale the year column with MinMaxScaler to the [0, 1] range
def process_data(spark):
    """
    Read from S3 and process bike share data into dimensional tables.

    The bike share data (as CSVs) is read from a public S3 bucket into dataframes,
    the data is transformed using pyspark.sql functions, and finally the results
    are saved back to the same S3 bucket in parquet format.

    Parameters:
        spark: Spark session
    """
    # read from S3 to dataframes
    st_station_df = spark.read.csv('s3://omar-dend/station.csv', header=True)
    st_weather_df = spark.read.csv('s3://omar-dend/weather.csv', header=True)
    st_trip_df = spark.read.csv('s3://omar-dend/trip.csv', header=True)
    st_status_df = spark.read.csv('s3://omar-dend/status.csv', header=True)
    st_city_df = spark.read.csv('s3://omar-dend/city.csv', header=True)

    # save counts to ensure later that all rows are present
    station_count = st_station_df.count()
    weather_count = st_weather_df.count()

    # adding timestamps to all the dataframes to standardize datetime
    st_station_df = st_station_df.withColumn(
        'datetime', F.to_timestamp(st_station_df.installation_date, 'MM/dd/yyyy'))
    st_weather_df = st_weather_df.withColumn(
        'datetime', F.to_timestamp(st_weather_df.date, 'MM/dd/yyyy'))
    st_trip_df = st_trip_df.withColumn(
        'datetime_start', F.to_timestamp(st_trip_df.start_date, 'MM/dd/yyyy HH:mm'))
    st_trip_df = st_trip_df.withColumn(
        'datetime_end', F.to_timestamp(st_trip_df.end_date, 'MM/dd/yyyy HH:mm'))
    st_status_df = st_status_df.withColumn(
        'datetime', F.to_timestamp(st_status_df.time, 'yyyy/MM/dd HH:mm:ss'))

    # create dim_weather
    weather_df = st_weather_df.select('max_temperature_f', 'mean_temperature_f',
                                      'min_temperature_f', 'max_humidity',
                                      'mean_humidity', 'min_humidity',
                                      'max_wind_Speed_mph', 'mean_wind_speed_mph',
                                      'precipitation_inches', 'events',
                                      'zip_code', 'datetime')\
        .dropDuplicates()

    # create dim_station
    station_df = st_station_df.select(
        F.col('id').alias('station_id'),
        F.col('name').alias('station_name'),
        'lat', 'long', 'dock_count', 'city',
        F.col('datetime').alias('installation_datetime'))
    station_df = station_df.join(st_city_df, station_df.city == st_city_df.city, 'left')\
        .drop('city')\
        .dropDuplicates()

    # make sure none of the station or weather data was dropped by mistake
    station_dim_count = station_df.count()
    weather_dim_count = weather_df.count()
    if station_dim_count != station_count or weather_dim_count != weather_count:
        raise Exception('Some dimensional rows are missing')
    else:
        print('All is good')

    # load (save) dim_station to S3 in parquet format
    station_df.write.mode('overwrite')\
        .parquet('s3://omar-dend/dim_station')

    # load (save) dim_weather to S3 in parquet format partitioned by zip_code
    weather_df.write.mode('overwrite')\
        .partitionBy('zip_code')\
        .parquet('s3://omar-dend/dim_weather')

    # create dim_time from the timestamps of every source table
    # (union them first so one set of derived columns covers all sources)
    time_df = st_station_df.select('datetime')\
        .union(st_weather_df.select('datetime'))\
        .union(st_trip_df.select(F.col('datetime_start').alias('datetime')))\
        .union(st_trip_df.select(F.col('datetime_end').alias('datetime')))\
        .union(st_status_df.select('datetime'))\
        .withColumn('second', F.second('datetime'))\
        .withColumn('minute', F.minute('datetime'))\
        .withColumn('hour', F.hour('datetime'))\
        .withColumn('day', F.dayofmonth('datetime'))\
        .withColumn('week', F.weekofyear('datetime'))\
        .withColumn('month', F.month('datetime'))\
        .withColumn('year', F.year('datetime'))\
        .withColumn('weekday', F.dayofweek('datetime'))\
        .dropDuplicates()

    # load (save) dim_time to S3 in parquet format partitioned by year & month
    time_df.write.mode('overwrite')\
        .partitionBy('year', 'month')\
        .parquet('s3://omar-dend/dim_time')

    # create fact_trip
    trip_df = st_trip_df.select(F.col('id').alias('trip_id'), 'duration',
                                'bike_id', 'subscription_type',
                                'start_station_id', 'end_station_id',
                                'datetime_start', 'datetime_end')\
        .dropDuplicates()

    # load (save) fact_trip to S3 in parquet format
    trip_df.write.mode('overwrite')\
        .parquet('s3://omar-dend/fact_trip')

    # create fact_status
    status_df = st_status_df.select('station_id', 'bikes_available',
                                    'docks_available', 'datetime')\
        .dropDuplicates()

    # load (save) fact_status to S3 in parquet format partitioned by station_id
    status_df.write.mode('overwrite')\
        .partitionBy('station_id')\
        .parquet('s3://omar-dend/fact_status')
def process_log_data(spark, input_data_path):
    pl_start = time()
    print('Starting to process log data')

    # get filepath to log data file
    log_data = input_data_path

    # read log data file with an explicit schema
    log_schema = StructType([
        StructField("artist", StringType()),
        StructField("auth", StringType()),
        StructField("firstName", StringType()),
        StructField("gender", StringType()),
        StructField("itemInSession", LongType()),
        StructField("lastName", StringType()),
        StructField("length", DoubleType()),
        StructField("level", StringType()),
        StructField("location", StringType()),
        StructField("method", StringType()),
        StructField("page", StringType()),
        StructField("registration", DoubleType()),
        StructField("sessionId", LongType()),
        StructField("song", StringType()),
        StructField("status", StringType()),
        StructField("ts", StringType()),
        StructField("userAgent", StringType()),
        StructField("userId", StringType())
    ])
    log_df = spark.read.json(input_data_path, schema=log_schema)

    # filter by actions for song plays:
    # keep only rows where column page has the value "NextSong"
    log_df = log_df.filter(log_df.page == 'NextSong').collect()

    # Convert List to Spark
    log_df = spark.createDataFrame(log_df, schema=log_schema)

    # Convert ts from long (milliseconds) to datetime
    convert_ts = udf(
        lambda x: datetime.datetime.fromtimestamp(float(x) / 1000.0),
        TimestampType())
    log_df = log_df.withColumn("ts_converted", convert_ts(log_df.ts))

    # Convert registration from double to long
    log_df = log_df.withColumn("registration_converted",
                               log_df.registration.cast(LongType()))

    pl_et = time() - pl_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Process log files : Read & Transformation', round(pl_et, 2)))

    print('Creating users table')
    temp_start = time()

    # extract columns for users table
    # creating users table with columns user_id, first_name, last_name, gender, level
    users_table = log_df.select(['userId', 'firstName', 'lastName', 'gender', 'level'])\
        .withColumnRenamed('userId', 'user_id')\
        .withColumnRenamed('firstName', 'first_name')\
        .withColumnRenamed('lastName', 'last_name').dropDuplicates()

    pl_et = time() - temp_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Creating users table', round(pl_et, 2)))

    # extract columns to create time table
    # creating time table with columns start_time, hour, day, week, month, year, weekday
    print('Creating time table')
    temp_start = time()
    time_table = log_df.select(['ts_converted'])\
        .withColumnRenamed('ts_converted', 'start_time')
    time_table = time_table.withColumn('day', F.dayofmonth('start_time')) \
        .withColumn('month', F.month('start_time')) \
        .withColumn('year', F.year('start_time')) \
        .withColumn('hour', F.hour('start_time')) \
        .withColumn('minute', F.minute('start_time')) \
        .withColumn('second', F.second('start_time')) \
        .withColumn('week', F.weekofyear('start_time')) \
        .withColumn('weekday', F.dayofweek('start_time')).dropDuplicates()

    pl_et = time() - temp_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Creating time table', round(pl_et, 2)))

    pl_et = time() - pl_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Process log files : Total', round(pl_et, 2)))

    return log_df, users_table, time_table
def get_time_to_purchase(timeframe, partner, premium, purchase):  # todo
    """
    Returns the distribution of the time it takes a user to reach a purchase.
    Input: a period (string), a partner (string), 2 spark dataframes
           (the first and last ones in the workflow).
    Output: a dictionary mapping each bucket to its associated value.
    """
    keys = [0, 20, 40, 60, 120, 180, 240, 300, 600]
    result = collections.OrderedDict((key, 0) for key in keys)
    timeframe_is = get_date(timeframe)

    purchase_renam = (purchase.filter(purchase.keen.timestamp >= timeframe_is)
                      .filter(purchase.search_info.partner_id == partner)
                      .withColumnRenamed('keen', 'keen_purchase')
                      .withColumnRenamed('flight', 'flight_purchase'))
    premium_renam = (premium.filter(premium.keen.timestamp >= timeframe_is)
                     .filter(premium.search_info.partner_id == partner)
                     .withColumnRenamed('keen', 'keen_premium')
                     .withColumnRenamed('flight', 'flight_premium'))

    joined_df = purchase_renam.join(
        premium_renam,
        purchase_renam.search_info.search_id == premium_renam.search_info.search_id,
        'inner')

    # time to purchase in seconds, from the minute/second parts of the two timestamps
    joined_df = joined_df.withColumn(
        "time_to_purchase",
        (minute(joined_df.keen_purchase.timestamp)
         - minute(joined_df.keen_premium.timestamp)) * 60
        + second(joined_df.keen_purchase.timestamp)
        - second(joined_df.keen_premium.timestamp))

    times = joined_df.groupBy("time_to_purchase").sum("purchase.quantity").collect()

    # bucket the observed times into the predefined ranges
    for row in times:
        for i in range(len(keys) - 1):
            if row[0] > keys[i] and row[0] <= keys[i + 1]:
                result[keys[i + 1]] += row[1]
    result.pop(0)
    return result
def second(self) -> "ks.Series": """ The seconds of the datetime. """ return _column_op(lambda c: F.second(c).cast(LongType()))( self._data).alias(self._data.name)
converttimeudf = UserDefinedFunction(lambda x: convertstamp(x), TimestampType())
convertdateudf = UserDefinedFunction(lambda x: convertdate(x), TimestampType())

# withColumn introduces a new column while keeping the others in place.
df3 = df.withColumn("TIME_SCHEDULED", converttimeudf(df['TIME_SCHEDULED']))\
    .withColumn("TRIP_START_TIME", converttimeudf(df['TRIP_START_TIME']))\
    .withColumn("TIME_ACTUAL_ARRIVE", converttimeudf(df['TIME_ACTUAL_ARRIVE']))\
    .withColumn("TIME_ACTUAL_DEPART", converttimeudf(df['TIME_ACTUAL_DEPART']))\
    .withColumn("SURVEY_DATE", convertdateudf(df["SURVEY_DATE"]).cast(DateType()))

df4 = df3.withColumn("MONTH", month(df3["SURVEY_DATE"]))\
    .withColumn("YEAR", year(df3["SURVEY_DATE"]))\
    .withColumn("TIME_SCHEDULED_HOUR", hour(df3["TIME_SCHEDULED"]))\
    .withColumn("TIME_SCHEDULED_MIN", minute(df3["TIME_SCHEDULED"]))\
    .withColumn("TIME_SCHEDULED_SEC", second(df3["TIME_SCHEDULED"]))\
    .withColumn("TRIP_START_TIME_HOUR", hour(df3["TRIP_START_TIME"]))\
    .withColumn("TRIP_START_TIME_MIN", minute(df3["TRIP_START_TIME"]))\
    .withColumn("TRIP_START_TIME_SEC", second(df3["TRIP_START_TIME"]))\
    .withColumn("TIME_ACTUAL_ARRIVE_HOUR", hour(df3["TIME_ACTUAL_ARRIVE"]))\
    .withColumn("TIME_ACTUAL_ARRIVE_MIN", minute(df3["TIME_ACTUAL_ARRIVE"]))\
    .withColumn("TIME_ACTUAL_ARRIVE_SEC", second(df3["TIME_ACTUAL_ARRIVE"]))\
    .withColumn("TIME_ACTUAL_DEPART_HOUR", hour(df3["TIME_ACTUAL_DEPART"]))\
    .withColumn("TIME_ACTUAL_DEPART_MIN", minute(df3["TIME_ACTUAL_DEPART"]))\
    .withColumn("TIME_ACTUAL_DEPART_SEC", second(df3["TIME_ACTUAL_DEPART"]))

# Normalize DIRECTION_NAME values (fix the "OUTYBOUND" typo and map the 0/1 codes)
df5 = df4.withColumn('DIRECTION_NAME', when(df4.DIRECTION_NAME == "OUTYBOUND", "OUTBOUND")
                     .when(df4.DIRECTION_NAME == "0", "OUTBOUND")
                     .when(df4.DIRECTION_NAME == "1", "INBOUND")
                     .otherwise(df4.DIRECTION_NAME))
the_variable = df_time_ma.select('variable').distinct().take(1)[0].variable
the_day = (sorted([
    x.dayofyear for x in (df_time_ma.select(
        dayofyear('timestamp').alias('dayofyear')).distinct().take(5))
]))[0]  # min doesn't work on javalist
the_hours = 12
the_title = ("Data for asset {}, variable {} for day {}, {} hours".format(
    the_asset, the_variable, the_day, the_hours))

# currently exports to pandas for visualization and export in CSV format,
# later on the pyspark dataframe is exported in CSV
test_df = df_time_ma.filter(df_time_ma.asset == the_asset).filter(
    df_time_ma.variable == the_variable).filter(
        dayofyear('timestamp') == the_day).filter(
            hour('timestamp') <= the_hours).cache()

test_df_1s = test_df.toPandas()
test_df_60s = test_df.filter(second(df_time_ma.timestamp) == 0).toPandas()
test_df_10m = test_df.filter(minute(df_time_ma.timestamp) % 10 == 0).filter(
    second(df_time_ma.timestamp) == 0).toPandas()

plt.figure(figsize=(12, 4))
plt.plot(test_df_1s.timestamp, test_df_1s.ma, 'b')
plt.plot(test_df_60s.timestamp, test_df_60s.ma, 'r')
plt.plot(test_df_10m.timestamp, test_df_10m.ma, 'g')
plt.grid()
plt.title(the_title)
plt.legend(['1s', '60s', '10m'])
display(plt.gcf())

# COMMAND ----------

from itertools import chain
def process_log_dataset(spark, log_dataset, output_data, df_songs, df_artists,
                        parquet, include_the_dimensions):
    # read log data file
    df_staging_events = spark.read.json(log_dataset)

    # create the get timestamp user defined function
    get_timestamp_udf = F.udf(lambda x: datetime.fromtimestamp((x / 1000.0)),
                              T.TimestampType())

    # filter events i.e. page = NextSong
    df_staging_events = df_staging_events.where(col('page').isin({'NextSong'}))

    # convert the ts column from epoch to timestamp
    df_staging_events = df_staging_events.withColumn(
        "start_time", get_timestamp_udf(df_staging_events.ts))
    df_staging_events = df_staging_events.withColumn(
        "year", year(df_staging_events.start_time))
    df_staging_events = df_staging_events.withColumn(
        "month", month(df_staging_events.start_time))

    # create time dataframe and drop duplicate ts rows
    df_time = df_staging_events.drop_duplicates(subset=['ts'])

    # convert the ts column from epoch to timestamp
    df_time = df_time.withColumn("start_time", get_timestamp_udf(df_time.ts))

    # create time dimension dataframe
    df_time = df_time.select(
        df_time.start_time.alias('start_time'),
        year(df_time.start_time).alias('year'),
        month(df_time.start_time).alias('month'),
        dayofmonth(df_time.start_time).alias('dayofmonth'),
        hour(df_time.start_time).alias('hour'),
        minute(df_time.start_time).alias('minute'),
        second(df_time.start_time).alias('second'),
        dayofweek(df_time.start_time).alias('dayofweek'),
        dayofyear(df_time.start_time).alias('dayofyear'),
        weekofyear(df_time.start_time).alias('weekofyear'))

    # create users dimension
    df_users = df_staging_events.select(
        df_staging_events.userId.alias('user_id'),
        df_staging_events.firstName.alias('first_name'),
        df_staging_events.lastName.alias('last_name'),
        df_staging_events.gender.alias('gender'))

    # drop duplicate user rows
    df_users = df_users.drop_duplicates(subset=['user_id'])

    # create temporary views
    df_staging_events.createOrReplaceTempView("staging_events")
    df_artists.createOrReplaceTempView("artists")
    df_songs.createOrReplaceTempView("songs")

    # extract columns from joined song and log datasets to create songplays table
    df_songplays = spark.sql(
        """select null as songplay_id
                 ,se.start_time
                 ,se.year
                 ,se.month
                 ,se.userId as user_id
                 ,se.level
                 ,s.song_id
                 ,a.artist_id
                 ,se.sessionId as session_id
                 ,se.location
                 ,se.userAgent as user_agent
             from staging_events se
             join artists a on se.artist = a.name
             join songs s on se.song = s.title and s.artist_id = a.artist_id
            where 1 = 1"""
    )

    # populate songplays surrogate key
    df_songplays = df_songplays.withColumn("songplay_id",
                                           monotonically_increasing_id())

    if parquet:
        # write songplays table to parquet files partitioned by year and month
        df_songplays.write.partitionBy('year', 'month').parquet(
            output_data + "songplays", mode="overwrite")

    if include_the_dimensions:
        df_songplays.createOrReplaceTempView("songplays")

        df_artists = spark.sql(
            """select distinct a.*
                 from artists a
                 join songplays sp on a.artist_id = sp.artist_id
                where 1 = 1"""
        )
        df_songs = spark.sql(
            """select distinct s.*
                 from songs s
                 join songplays sp on s.song_id = sp.song_id
                where 1 = 1"""
        )

        df_time.createOrReplaceTempView("time")
        df_time = spark.sql(
            """select distinct t.*
                 from time t
                 join songplays sp on t.start_time = sp.start_time
                where 1 = 1"""
        )

        df_users.createOrReplaceTempView("users")
        df_users = spark.sql(
            """select distinct u.*
                 from users u
                 join songplays sp on u.user_id = sp.user_id
                where 1 = 1"""
        )

    if parquet:
        if include_the_dimensions:
            # write users table to parquet files partitioned by none
            df_users.write.parquet(output_data + "users", mode="overwrite")

            # write songs table to parquet files partitioned by year and artist
            df_songs.write.partitionBy('year', 'artist_id').parquet(
                output_data + "songs", mode="overwrite")

            # write artists table to parquet files partitioned by none
            df_artists.write.parquet(output_data + "artists", mode="overwrite")

            # write time table to parquet files partitioned by year and month
            df_time.write.partitionBy('year', 'month').parquet(
                output_data + "time", mode="overwrite")

    # print table info
    print('songplays info...')
    print('count(s)')
    df_songplays.groupby(df_songplays.year, df_songplays.month).count().orderBy(
        df_songplays.year, df_songplays.month).show()
    df_songplays.printSchema()
    df_songplays.show(5)

    if include_the_dimensions:
        print('users info...')
        print('count(s)')
        print(df_users.count())
        df_users.printSchema()
        df_users.show(5)

        print('songs info...')
        print('count(s)')
        df_songs.groupby(df_songs.year).count().orderBy(df_songs.year).show()
        df_songs.printSchema()
        df_songs.show(5)

        print('artists info...')
        print('count(s)')
        print(df_artists.count())
        df_artists.printSchema()
        df_artists.show(5)

        print('time info...')
        print('count(s)')
        df_time.groupby(df_time.year, df_time.month).count().orderBy(
            df_time.year, df_time.month).show()
        df_time.printSchema()
        df_time.show(5)
df.printSchema()

from pyspark.sql.functions import to_timestamp, date_format
from pyspark.sql.functions import year, month, dayofmonth, dayofweek, hour, minute, second

# Convert processing time from int to timestamp
df = df.withColumn("processing-time", to_timestamp(col="processing-time"))

# Add columns for extracted features
print("After Conversion \n")
df.printSchema()

# Extract the date/time features (day of week, day of month, week of month, etc.)
df = df.withColumn("dayOfWeek", dayofweek(col="processing-time"))\
    .withColumn("dayOfMonth", dayofmonth(col="processing-time"))\
    .withColumn("weekOfMonth", date_format(date="processing-time", format="W"))\
    .withColumn("year", year(col="processing-time"))\
    .withColumn("month", month(col="processing-time"))\
    .withColumn("hour", hour(col="processing-time"))\
    .withColumn("minute", minute(col="processing-time"))\
    .withColumn("second", second(col="processing-time"))\
    .drop("processing-time")
df.show()

startTime = time.time()
df.toPandas().to_csv(path_or_buf="../../resources/newDatasets/dataset-1.csv")
endTime = time.time()
print("Time taken to convert df to pandas to csv: ", endTime - startTime)
def _bin_time_stamp(self, origin: pd.Timestamp, ts_scol: Column) -> Column:
    sql_utils = SparkContext._active_spark_context._jvm.PythonSQLUtils
    origin_scol = F.lit(origin)
    (rule_code, n) = (self._offset.rule_code, self._offset.n)  # type: ignore[attr-defined]
    left_closed, right_closed = (self._closed == "left", self._closed == "right")
    left_labeled, right_labeled = (self._label == "left", self._label == "right")

    if rule_code == "A-DEC":
        assert (origin.month == 12 and origin.day == 31 and origin.hour == 0
                and origin.minute == 0 and origin.second == 0)

        diff = F.year(ts_scol) - F.year(origin_scol)
        mod = F.lit(0) if n == 1 else (diff % n)
        edge_cond = (mod == 0) & (F.month(ts_scol) == 12) & (F.dayofmonth(ts_scol) == 31)

        edge_label = F.year(ts_scol)
        if left_closed and right_labeled:
            edge_label += n
        elif right_closed and left_labeled:
            edge_label -= n

        if left_labeled:
            non_edge_label = F.when(mod == 0,
                                    F.year(ts_scol) - n).otherwise(F.year(ts_scol) - mod)
        else:
            non_edge_label = F.when(
                mod == 0, F.year(ts_scol)).otherwise(F.year(ts_scol) - (mod - n))

        return F.to_timestamp(
            F.make_date(
                F.when(edge_cond, edge_label).otherwise(non_edge_label),
                F.lit(12), F.lit(31)))

    elif rule_code == "M":
        assert (origin.is_month_end and origin.hour == 0
                and origin.minute == 0 and origin.second == 0)

        diff = ((F.year(ts_scol) - F.year(origin_scol)) * 12
                + F.month(ts_scol) - F.month(origin_scol))
        mod = F.lit(0) if n == 1 else (diff % n)
        edge_cond = (mod == 0) & (F.dayofmonth(ts_scol) == F.dayofmonth(F.last_day(ts_scol)))

        truncated_ts_scol = F.date_trunc("MONTH", ts_scol)
        edge_label = truncated_ts_scol
        if left_closed and right_labeled:
            edge_label += sql_utils.makeInterval("MONTH", F.lit(n)._jc)
        elif right_closed and left_labeled:
            edge_label -= sql_utils.makeInterval("MONTH", F.lit(n)._jc)

        if left_labeled:
            non_edge_label = F.when(
                mod == 0,
                truncated_ts_scol - sql_utils.makeInterval("MONTH", F.lit(n)._jc),
            ).otherwise(truncated_ts_scol - sql_utils.makeInterval("MONTH", mod._jc))
        else:
            non_edge_label = F.when(mod == 0, truncated_ts_scol).otherwise(
                truncated_ts_scol - sql_utils.makeInterval("MONTH", (mod - n)._jc))

        return F.to_timestamp(
            F.last_day(F.when(edge_cond, edge_label).otherwise(non_edge_label)))

    elif rule_code == "D":
        assert origin.hour == 0 and origin.minute == 0 and origin.second == 0

        if n == 1:
            # NOTE: the logic to process '1D' is different from the cases with n > 1,
            # since hour/minute/second parts are taken into account to determine edges!
            edge_cond = ((F.hour(ts_scol) == 0) & (F.minute(ts_scol) == 0)
                         & (F.second(ts_scol) == 0))

            if left_closed and left_labeled:
                return F.date_trunc("DAY", ts_scol)
            elif left_closed and right_labeled:
                return F.date_trunc("DAY", F.date_add(ts_scol, 1))
            elif right_closed and left_labeled:
                return F.when(edge_cond,
                              F.date_trunc("DAY", F.date_sub(ts_scol, 1))).otherwise(
                                  F.date_trunc("DAY", ts_scol))
            else:
                return F.when(edge_cond, F.date_trunc("DAY", ts_scol)).otherwise(
                    F.date_trunc("DAY", F.date_add(ts_scol, 1)))
        else:
            diff = F.datediff(end=ts_scol, start=origin_scol)
            mod = diff % n
            edge_cond = mod == 0

            truncated_ts_scol = F.date_trunc("DAY", ts_scol)
            edge_label = truncated_ts_scol
            if left_closed and right_labeled:
                edge_label = F.date_add(truncated_ts_scol, n)
            elif right_closed and left_labeled:
                edge_label = F.date_sub(truncated_ts_scol, n)

            if left_labeled:
                non_edge_label = F.date_sub(truncated_ts_scol, mod)
            else:
                non_edge_label = F.date_sub(truncated_ts_scol, mod - n)

            return F.when(edge_cond, edge_label).otherwise(non_edge_label)

    elif rule_code in ["H", "T", "S"]:
        unit_mapping = {"H": "HOUR", "T": "MINUTE", "S": "SECOND"}
        unit_str = unit_mapping[rule_code]

        truncated_ts_scol = F.date_trunc(unit_str, ts_scol)
        diff = sql_utils.timestampDiff(unit_str, origin_scol._jc, truncated_ts_scol._jc)
        mod = F.lit(0) if n == 1 else (diff % F.lit(n))

        if rule_code == "H":
            assert origin.minute == 0 and origin.second == 0
            edge_cond = (mod == 0) & (F.minute(ts_scol) == 0) & (F.second(ts_scol) == 0)
        elif rule_code == "T":
            assert origin.second == 0
            edge_cond = (mod == 0) & (F.second(ts_scol) == 0)
        else:
            edge_cond = mod == 0

        edge_label = truncated_ts_scol
        if left_closed and right_labeled:
            edge_label += sql_utils.makeInterval(unit_str, F.lit(n)._jc)
        elif right_closed and left_labeled:
            edge_label -= sql_utils.makeInterval(unit_str, F.lit(n)._jc)

        if left_labeled:
            non_edge_label = F.when(mod == 0, truncated_ts_scol).otherwise(
                truncated_ts_scol - sql_utils.makeInterval(unit_str, mod._jc))
        else:
            non_edge_label = F.when(
                mod == 0,
                truncated_ts_scol + sql_utils.makeInterval(unit_str, F.lit(n)._jc),
            ).otherwise(truncated_ts_scol - sql_utils.makeInterval(unit_str, (mod - n)._jc))

        return F.when(edge_cond, edge_label).otherwise(non_edge_label)

    else:
        raise ValueError("Got the unexpected unit {}".format(rule_code))
# Similar methods: month, dayofweek, minute, second
# Expected:
# +-----------------+--------------------+-------------+---------------+----------+----+-----+---------+------+------+
# |               id|                  ts|  date string|    time string|  date_new|year|month|dayofweek|minute|second|
# +-----------------+--------------------+-------------+---------------+----------+----+-----+---------+------+------+
# |UA000000107379500|2020-07-04 16:09:...|July 04, 2020|16:09:06.592107|04-09-2020|2020|    7|        7|     9|     6|
# |UA000000107359357|2020-07-04 15:36:...|July 04, 2020|15:36:51.756535|04-36-2020|2020|    7|        7|    36|    51|
# |UA000000107375547|2020-07-04 16:06:...|July 04, 2020|16:06:55.459100|04-06-2020|2020|    7|        7|     6|    55|
# +-----------------+--------------------+-------------+---------------+----------+----+-----+---------+------+------+

# Answer
df = (df.withColumn("year", F.year(F.col("ts")))
        .withColumn("month", F.month(F.col("ts")))
        .withColumn("dayofweek", F.dayofweek(F.col("ts")))
        .withColumn("minute", F.minute(F.col("ts")))
        .withColumn("second", F.second(F.col("ts"))))
df.show()

# COMMAND ----------

# Converts the column into DateType with name "date" by casting rules to DateType
# (use function to_date). Then create a column plus_two_days that adds 2 days to
# the date. Select "date" and "plus_two_days".
# Expected:
# +----------+-------------+
# |      date|plus_two_days|
# +----------+-------------+
# |2020-07-04|   2020-07-06|
# |2020-07-04|   2020-07-06|
# |2020-07-04|   2020-07-06|
# +----------+-------------+
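# A possible answer sketch for the exercise above (the notebook's own answer cell is
# not included in this excerpt); it assumes the conversion is applied to the "ts"
# column, as in the earlier cells.
df_dates = (df.withColumn("date", F.to_date(F.col("ts")))
              .withColumn("plus_two_days", F.date_add(F.col("date"), 2))
              .select("date", "plus_two_days"))
df_dates.show()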
train_data = spark.read.csv("E:/taikingdata/train.csv", header=True, schema=schematype1)
test_data = spark.read.csv("E:/taikingdata/test.csv", header=True, schema=schematype2)
train_df = train_data

# Split click_time into its time components
train_df = train_df.withColumn('hour', fn.hour('click_time').cast(IntegerType()))
train_df = train_df.withColumn('day', fn.dayofmonth('click_time').cast(IntegerType()))
train_df = train_df.withColumn('minute', fn.minute('click_time').cast(IntegerType()))
train_df = train_df.withColumn('second', fn.second('click_time').cast(IntegerType()))

# Derived features: click counts per group (derived from click_time)
gp = train_df.groupby(['ip', 'app', 'device', 'os']).agg(
    fn.count('click_time').cast(IntegerType()).alias('ct1_count'))
train_df = train_df.join(gp, on=['ip', 'app', 'device', 'os'], how='left')

gp = train_df.groupby(['ip', 'device', 'os', 'channel']).agg(
    fn.count('click_time').cast(IntegerType()).alias('ct2_count'))
train_df = train_df.join(gp, on=['ip', 'device', 'os', 'channel'], how='left')

gp = train_df.groupby(['ip', 'app', 'device', 'os', 'channel']).agg(
    fn.count('click_time').cast(IntegerType()).alias('ct3_count'))
train_df = train_df.join(gp, on=['ip', 'app', 'device', 'os', 'channel'], how='left')

# Group by ip only; compute count and unique aggregations for the other features
gp = train_df.groupby(['ip', 'app']).agg(
def second(self) -> "ps.Series": """ The seconds of the datetime. """ return self._data.spark.transform(lambda c: F.second(c).cast(LongType()))
def process_log_data(spark, input_data_path):
    """
    Summary line. Process log data.

    Parameters:
        arg1 (spark object)
        arg2 (Read input from this path which can be local or S3)

    Returns:
        log_df, users_table, time_table, user_listen
    """
    pl_start = time()
    print('Starting to process log data')

    # get filepath to log data file
    log_data = input_data_path

    # read log data file
    log_schema = StructType([
        StructField("artist", StringType()),
        StructField("auth", StringType()),
        StructField("firstName", StringType()),
        StructField("gender", StringType()),
        StructField("itemInSession", LongType()),
        StructField("lastName", StringType()),
        StructField("length", DoubleType()),
        StructField("level", StringType()),
        StructField("location", StringType()),
        StructField("method", StringType()),
        StructField("page", StringType()),
        StructField("registration", DoubleType()),
        StructField("sessionId", LongType()),
        StructField("song", StringType()),
        StructField("status", StringType()),
        StructField("ts", StringType()),
        StructField("userAgent", StringType()),
        StructField("userId", StringType())
    ])
    log_df = spark.read.json(input_data_path, schema=log_schema)

    # Number of songs users listened to during each level
    paid_users = log_df.select(['userId', 'level']).filter(log_df['level'] == 'paid')
    paid_users = paid_users.groupby(['userId']).count()
    free_users = log_df.select(['userId', 'level']).filter(log_df['level'] == 'free')
    free_users = free_users.groupby(['userId']).count()
    paid_users.createOrReplaceTempView('paid_users')
    free_users.createOrReplaceTempView('free_users')
    user_listen = spark.sql("""
        select a.userId, a.count puCount, b.count fuCount
          from paid_users a
          join free_users b on a.userId = b.userId
         where a.userId != ''
    """)

    # Filter only column page with value "NextSong"
    log_df = log_df.filter(log_df.page == 'NextSong').collect()

    # Convert List to Spark
    log_df = spark.createDataFrame(log_df, schema=log_schema)

    # Convert ts from long to datetime
    convert_ts = udf(
        lambda x: datetime.datetime.fromtimestamp(float(x) / 1000.0),
        TimestampType())
    log_df = log_df.withColumn("ts_converted", convert_ts(log_df.ts))

    # Convert registration from double to long
    log_df = log_df.withColumn("registration_converted",
                               log_df.registration.cast(LongType()))

    pl_et = time() - pl_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Process log files : Read & Transformation', round(pl_et, 2)))

    print('Creating users table')
    temp_start = time()

    # extract columns for users table
    # creating users table with columns user_id, first_name, last_name, gender, level
    users_table = log_df.select(['userId', 'firstName', 'lastName', 'gender', 'level'])\
        .withColumnRenamed('userId', 'user_id')\
        .withColumnRenamed('firstName', 'first_name')\
        .withColumnRenamed('lastName', 'last_name').dropDuplicates()

    pl_et = time() - temp_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Creating users table', round(pl_et, 2)))

    print('Creating user_listen table')
    temp_start = time()
    user_listen.createOrReplaceTempView('user_listen')
    users_table.createOrReplaceTempView('users')
    user_listen = spark.sql("""
        select distinct b.first_name, a.puCount, a.fuCount
          from user_listen a
          join users b on a.userId = b.user_id
    """)
    pl_et = time() - temp_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Creating user_listen table', round(pl_et, 2)))

    # extract columns to create time table
    # Creating time table with columns start_time, hour, day, week, month, year, weekday
    print('Creating time table')
    temp_start = time()
    time_table = log_df.select(['ts_converted'])\
        .withColumnRenamed('ts_converted', 'start_time')
    time_table = time_table.withColumn('day', F.dayofmonth('start_time')) \
        .withColumn('month', F.month('start_time')) \
        .withColumn('year', F.year('start_time')) \
        .withColumn('hour', F.hour('start_time')) \
        .withColumn('minute', F.minute('start_time')) \
        .withColumn('second', F.second('start_time')) \
        .withColumn('week', F.weekofyear('start_time')) \
        .withColumn('weekday', F.dayofweek('start_time')).dropDuplicates()

    pl_et = time() - temp_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Creating time table', round(pl_et, 2)))

    pl_et = time() - pl_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Process log files : Total', round(pl_et, 2)))

    return log_df, users_table, time_table, user_listen
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = os.path.join(input_data, "log_data/*/*/*.json")

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # extract columns for users table
    users_table = df.select("userId", "firstName", "lastName", "gender",
                            "level").dropDuplicates()

    # write users table to parquet files
    usersParquetPath = os.path.join(output_data, "users.parquet")
    users_table.write.parquet(usersParquetPath)
    usersParquetFile = spark.read.parquet(usersParquetPath)
    usersParquetFile.createOrReplaceTempView("usersParquetFile")

    # create datetime column from the original epoch-millisecond timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000).strftime(
        '%Y-%m-%d %H:%M:%S'))
    df = df.withColumn("datetime", get_timestamp(df.ts))
    df = df.withColumn("year", F.year("datetime"))\
        .withColumn("month", F.month("datetime"))\
        .withColumn("day", F.dayofmonth("datetime"))\
        .withColumn("hour", F.hour("datetime"))\
        .withColumn("minute", F.minute("datetime"))\
        .withColumn("second", F.second("datetime"))\
        .withColumn("weekday", F.dayofweek("datetime"))

    # extract columns to create time table
    time_table = df.select("ts", "year", "month", "day", "hour", "minute",
                           "second", "weekday").dropDuplicates()

    # write time table to parquet files partitioned by year and month
    timeParquetPath = os.path.join(output_data, "time.parquet")
    time_table.write.partitionBy("year", "month").parquet(timeParquetPath)
    timeParquetFile = spark.read.parquet(timeParquetPath)
    timeParquetFile.createOrReplaceTempView("timeParquetFile")

    # read in song data to use for songplays table
    song_data = os.path.join(input_data, "song_data/*/*/*/*.json")
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    df1 = df.alias('df1')
    df2 = song_df.alias('df2')
    songplays_table = df1.join(df2, df1.artist == df2.artist_name)\
        .select("ts", "userId", "level", "song_id", "artist_id", "sessionId",
                "location", "userAgent", "df1.year", "month")\
        .dropDuplicates()

    # write songplays table to parquet files partitioned by year and month
    songPlaysParquetPath = os.path.join(output_data, "songplay.parquet")
    songplays_table.write.partitionBy("year", "month").parquet(songPlaysParquetPath)
    songplayParquetFile = spark.read.parquet(songPlaysParquetPath)
    songplayParquetFile.createOrReplaceTempView("songplayParquetFile")
# Generate dim_Time feed
combined_timestamp = source_glasses.select("timestamp") \
    .union(source_report.select("timestamp")) \
    .union(source_smartphone.select("timestamp")) \
    .union(source_smartwatch.select("timestamp"))

time_df = combined_timestamp.select("timestamp") \
    .where(col("Timestamp").isNotNull()) \
    .distinct() \
    .orderBy("timestamp")

time_df = time_df.withColumn("Year", year(time_df["timestamp"])) \
    .withColumn("Month", month(time_df["timestamp"])) \
    .withColumn("Day", dayofmonth(time_df["timestamp"])) \
    .withColumn("Hour", hour(time_df["timestamp"])) \
    .withColumn("Minute", minute(time_df["timestamp"])) \
    .withColumn("Second", second(time_df["timestamp"]))

# prepare glasses activities
glasses_activities_acc_x = time_df.join(glasses_df, "timestamp", how="inner") \
    .select([F.lit(1).alias("PersonId"), F.lit(3).alias("SourceId"),
             F.lit("ACC_X").alias("Collectible"), "timestamp", "ACC_X"])
glasses_activities_acc_y = time_df.join(glasses_df, "timestamp", how="inner") \
    .select([F.lit(1).alias("PersonId"), F.lit(3).alias("SourceId"),
             F.lit("ACC_Y").alias("Collectible"), "timestamp", "ACC_Y"])
glasses_activities_acc_z = time_df.join(glasses_df, "timestamp", how="inner") \
    .select([F.lit(1).alias("PersonId"), F.lit(3).alias("SourceId"),
             F.lit("ACC_Z").alias("Collectible"), "timestamp", "ACC_Z"])
glasses_activities_gyro_x = time_df.join(glasses_df, "timestamp", how="inner") \
    .select([F.lit(1).alias("PersonId"), F.lit(3).alias("SourceId"),
             F.lit("GYRO_X").alias("Collectible"), "timestamp", "GYRO_X"])
glasses_activities_gyro_y = time_df.join(glasses_df, "timestamp", how="inner") \