Example #1
    def impute(self, input_df):
        import pyspark.sql.functions as f

        ori_column_name = self.timestamp_column_name + "_ori"
        df = input_df.withColumnRenamed(
            self.timestamp_column_name, ori_column_name)
        merged_df = df.withColumn(
            "add_seconds",
            (f.round(f.second(ori_column_name) / self.time_interval) *
             self.time_interval) - f.second(ori_column_name)
        ).withColumn(
            self.timestamp_column_name,
            f.from_unixtime(
                f.unix_timestamp(ori_column_name) + f.col("add_seconds"))
        ).drop("add_seconds")
        if self.mode == "max":
            merged_df = merged_df.groupby(self.timestamp_column_name).max()
        elif self.mode == "min":
            merged_df = merged_df.groupby(self.timestamp_column_name).min()
        elif self.mode == "mean":
            merged_df = merged_df.groupby(self.timestamp_column_name).mean()
        elif self.mode == "sum":
            merged_df = merged_df.groupby(self.timestamp_column_name).sum()
        elif self.mode == "":
            merged_df
        else:
            raise Exception("Currently only max/min/mean/sum modes are supported")

        return merged_df
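
A minimal standalone sketch of the rounding step used in impute above, assuming a 60-second time_interval and a toy DataFrame with a single ts column (all names here are illustrative):

import pyspark.sql.functions as f
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("2021-01-01 10:00:37",)], ["ts"]) \
    .withColumn("ts", f.to_timestamp("ts"))

interval = 60  # assumed time_interval in seconds
df = df.withColumn(
    "add_seconds",
    f.round(f.second("ts") / interval) * interval - f.second("ts")
).withColumn(
    "ts_rounded",
    f.from_unixtime(f.unix_timestamp("ts") + f.col("add_seconds"))
).drop("add_seconds")
df.show(truncate=False)  # 10:00:37 -> 10:01:00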
Example #2
def __appendAggKey(tsdf, freq = None):
    """
    :param tsdf: TSDF object as input
    :param freq: frequency at which to upsample
    :return: return a TSDF with a new aggregate key (called agg_key)
    """
    df = tsdf.df
    checkAllowableFreq(freq)

    # compute timestamp columns
    sec_col = f.second(f.col(tsdf.ts_col))
    min_col = f.minute(f.col(tsdf.ts_col))
    hour_col = f.hour(f.col(tsdf.ts_col))

    if (freq == SEC):
        agg_key = f.concat(
            f.col(tsdf.ts_col).cast("date"), f.lit(" "),
            f.lpad(hour_col, 2, '0'), f.lit(':'),
            f.lpad(min_col, 2, '0'), f.lit(':'),
            f.lpad(sec_col, 2, '0')).cast("timestamp")
    elif (freq == MIN):
        agg_key = f.concat(
            f.col(tsdf.ts_col).cast("date"), f.lit(' '),
            f.lpad(hour_col, 2, '0'), f.lit(':'),
            f.lpad(min_col, 2, '0'), f.lit(':'),
            f.lit('00')).cast("timestamp")
    elif (freq == HR):
        agg_key = f.concat(
            f.col(tsdf.ts_col).cast("date"), f.lit(' '),
            f.lpad(hour_col, 2, '0'), f.lit(':'),
            f.lit('00'), f.lit(':'), f.lit('00')).cast("timestamp")
    elif (freq == DAY):
        agg_key = f.col(tsdf.ts_col).cast("date").cast("timestamp")

    df = df.withColumn("agg_key", agg_key)
    return tempo.TSDF(df, tsdf.ts_col, partition_cols = tsdf.partitionCols)
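
For comparison only, a hedged sketch (not part of the tempo library) showing that the MIN branch above amounts to minute-level truncation with date_trunc; the DataFrame and column names are illustrative:

import pyspark.sql.functions as f
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("2021-03-05 12:34:56",)], ["event_ts"]) \
    .withColumn("event_ts", f.to_timestamp("event_ts"))

# date_trunc("minute", ...) zeroes out the seconds, matching the MIN branch
# above (date, hour and minute kept, seconds forced to '00')
df = df.withColumn("agg_key", f.date_trunc("minute", f.col("event_ts")))
df.show(truncate=False)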
Example #3
    def _transform(self, df):

        time_variable = self.getColumn()
        new_time_variable = time_variable + '_new'

        # Code from tawab: convert all times into the same format.

        df = df.withColumn(
            new_time_variable,
            self.udf_date_formatting()(
                funct.col(time_variable).cast("String")))
        df = df.withColumn(
            new_time_variable,
            funct.from_unixtime(
                funct.unix_timestamp(new_time_variable,
                                     self.time_format)).cast(TimestampType()))

        df = df.withColumn(time_variable + '_year',
                           funct.year(new_time_variable))
        df = df.withColumn(time_variable + '_month',
                           funct.month(new_time_variable))
        df = df.withColumn(time_variable + '_day',
                           funct.dayofmonth(new_time_variable))
        df = df.withColumn(time_variable + '_dayofweek',
                           funct.dayofweek(new_time_variable))
        df = df.withColumn(time_variable + '_hour',
                           funct.hour(new_time_variable))
        df = df.withColumn(time_variable + '_minutes',
                           funct.minute(new_time_variable))
        df = df.withColumn(time_variable + '_seconds',
                           funct.second(new_time_variable))

        df = df.drop(new_time_variable)
        df = df.drop(time_variable)
        return df
Example #4
  def _transform(self, df):
    input = self.getInputCol()

    df = df.withColumn("dt_day", F.dayofmonth(input))
    df = df.withColumn("dt_hour", F.hour(input))
    df = df.withColumn("dt_minute", F.minute(input))
    df = df.withColumn("dt_second", F.second(input))

    df = df.withColumn("dt_dayofyear", F.dayofyear(input))
    df = df.withColumn("dt_dayofweek", F.dayofweek(input))
    df = df.withColumn("dt_weekofyear", F.weekofyear(input))

    return df
Example #5
def main(spark):
    df = createDataframe(spark)
    df.show(truncate=False)

    df = df.withColumn("date", F.date_format(F.col("time"), "yyyy-MM-dd HH:mm:ss.SSSS")) \
        .withColumn("h", F.hour(F.col("date"))) \
        .withColumn("m", F.minute(F.col("date"))) \
        .withColumn("s", F.second(F.col("date"))) \
        .withColumn("event", F.expr("h*3600 + m*60 +s")) \
        .drop("date","h","m","s")

    df.show(truncate=False)

    inRange = F.udf(in_range, BooleanType())

    df = df.withColumn("between", inRange(F.col("range"), F.col("event")))

    df.show(truncate=False)
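
The intermediate h/m/s columns above exist only to build the seconds-since-midnight value; a condensed sketch of the same arithmetic on a toy DataFrame (names are illustrative):

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("2020-03-04 10:11:12",)], ["time"]) \
    .withColumn("time", F.to_timestamp("time"))

# seconds elapsed since midnight, computed in a single expression
df = df.withColumn(
    "event",
    F.hour("time") * 3600 + F.minute("time") * 60 + F.second("time"))
df.show(truncate=False)  # 10*3600 + 11*60 + 12 = 36672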
Example #6
 def lowest_avg_idle_user(self, df):
     '''
     Find the average idle hours for each user, then find the users whose idle time is below the overall average.
     '''
     df_idle = df.drop('working_hour', 'start_time',
                       'end_time')  # keep only the columns needed for idle time
     # Find average idle hour for each user
     df_avg = df_idle.groupBy('user_name').agg(
         sqlFun.from_unixtime(
             sqlFun.avg(sqlFun.unix_timestamp('idle_time')),
             'hh:mm:ss').alias('avg_time'))
     # Convert all into hour
     df_avg_hours = df_avg.withColumn(
         'avg_hour',
         (hour(df_avg['avg_time']) * 3600 +
          minute(df_avg['avg_time']) * 60 + second(df_avg['avg_time'])) /
         3600)
     # calculate the overall average idle hours across users
     total_avg_idle_hour = df_avg_hours.select(
         avg('avg_hour')).collect()[0][0]
     lowest_idle_users = df_avg_hours.filter(
         df_avg_hours['avg_hour'] < total_avg_idle_hour).select('user_name')
     return lowest_idle_users
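
A minimal sketch of the time-of-day to fractional-hours conversion used for avg_hour above, assuming the averaged value is available as a proper timestamp; the data is illustrative:

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("alice", "1970-01-01 02:30:00")],
                           ["user_name", "avg_time"]) \
    .withColumn("avg_time", F.to_timestamp("avg_time"))

# time of day -> fractional hours (02:30:00 -> 2.5)
df = df.withColumn(
    "avg_hour",
    (F.hour("avg_time") * 3600 +
     F.minute("avg_time") * 60 +
     F.second("avg_time")) / 3600)
df.show()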
Example #7
# Cast the string column time to a timestamp

timeFormatUDF = F.udf(lambda ts: timeFormat(ts))

dataset = dataset.withColumn(
    "time",
    timeFormatUDF(F.col("time")).cast(TimestampType()))

# Split the time column into components

dataset = dataset.withColumn("year", F.year(F.col("time")))
dataset = dataset.withColumn("month", F.month(F.col("time")))
dataset = dataset.withColumn("day", F.dayofmonth(F.col("time")))
dataset = dataset.withColumn("hour", F.hour(F.col("time")))
dataset = dataset.withColumn("minute", F.minute(F.col("time")))
dataset = dataset.withColumn("second", F.second(F.col("time")))

# Split MCC, MNC, and MSIN from the IMSI column

dataset = dataset.withColumn('mcc', dataset.imsi.substr(1, 3))
dataset = dataset.withColumn('mnc', dataset.imsi.substr(4, 2))
dataset = dataset.withColumn('msin', dataset.imsi.substr(6, 10))

# Split TAC, SNR, and CD from the IMEI column
# IMEI format: TAC -- serial number (14 digits)

dataset = dataset.withColumn('tac', dataset.imei.substr(1, 8))
dataset = dataset.withColumn('snr', dataset.imei.substr(9, 6))

# Scale the year column with MinMaxScaler to the range [0, 1]
Example #8
def process_data(spark):
    """
    Read from S3 and process bike share data into dimensional tables.
    
    The bike share data (as CSVs) is read from a public S3 bucket into dataframes,
    transformed using pyspark.sql functions,
    and finally saved back to the same S3 bucket in parquet format.
    
    Parameters:
        spark: Spark session    
    """

    # read from S3 to dataframes
    st_station_df = spark.read.csv('s3://omar-dend/station.csv', header=True)
    st_weather_df = spark.read.csv('s3://omar-dend/weather.csv', header=True)
    st_trip_df = spark.read.csv('s3://omar-dend/trip.csv', header=True)
    st_status_df = spark.read.csv('s3://omar-dend/status.csv', header=True)
    st_city_df = spark.read.csv('s3://omar-dend/city.csv', header=True)

    # save counts to ensure later that all rows are present
    station_count = st_station_df.count()
    weather_count = st_weather_df.count()

    # adding timestamp to all the dataframes to standardize datetime
    st_station_df = st_station_df.withColumn(
        'datetime',
        F.to_timestamp(st_station_df.installation_date, 'MM/dd/yyyy'))

    st_weather_df = st_weather_df.withColumn(
        'datetime', F.to_timestamp(st_weather_df.date, 'MM/dd/yyyy'))

    st_trip_df = st_trip_df.withColumn(
        'datetime_start',
        F.to_timestamp(st_trip_df.start_date, 'MM/dd/yyyy HH:mm'))
    st_trip_df = st_trip_df.withColumn(
        'datetime_end', F.to_timestamp(st_trip_df.end_date,
                                       'MM/dd/yyyy HH:mm'))

    st_status_df = st_status_df.withColumn(
        'datetime', F.to_timestamp(st_status_df.time, 'yyyy/MM/dd HH:mm:ss'))

    # create dim_weather
    weather_df = st_weather_df.select('max_temperature_f', 'mean_temperature_f', 'min_temperature_f',
                                      'max_humidity', 'mean_humidity', 'min_humidity',
                                      'max_wind_Speed_mph', 'mean_wind_speed_mph',
                                      'precipitation_inches',
                                      'events', 'zip_code', 'datetime')\
                                    .dropDuplicates()

    # create dim_station
    station_df = st_station_df.select(
        F.col('id').alias('station_id'),
        F.col('name').alias('station_name'), 'lat', 'long', 'dock_count',
        'city',
        F.col('datetime').alias('installation_datetime'))

    station_df = station_df.join(st_city_df, station_df.city == st_city_df.city, 'left')\
                                .drop('city')\
                                .dropDuplicates()

    # make sure none of the station or weather data was dropped by mistake
    station_dim_count = station_df.count()
    weather_dim_count = weather_df.count()

    if station_dim_count != station_count or weather_dim_count != weather_count:
        raise Exception('Some dimensional rows are missing')
    else:
        print('All is good')

    # load (save) dim_station to S3 in parquet format
    station_df.write.mode('overwrite')\
        .parquet('s3://omar-dend/dim_station')

    # load (save) dim_weather to S3 in parquet format partitioned by zip_code
    weather_df.write.mode('overwrite')\
        .partitionBy('zip_code')\
        .parquet('s3://omar-dend/dim_weather')

    # create dim_time by unioning the datetime columns of all source dataframes
    time_df = st_station_df.select('datetime')\
        .union(st_weather_df.select('datetime'))\
        .union(st_trip_df.select(F.col('datetime_start').alias('datetime')))\
        .union(st_trip_df.select(F.col('datetime_end').alias('datetime')))\
        .union(st_status_df.select('datetime'))\
        .withColumn('second', F.second('datetime'))\
        .withColumn('minute', F.minute('datetime'))\
        .withColumn('hour', F.hour('datetime'))\
        .withColumn('day', F.dayofmonth('datetime'))\
        .withColumn('week', F.weekofyear('datetime'))\
        .withColumn('month', F.month('datetime'))\
        .withColumn('year', F.year('datetime'))\
        .withColumn('weekday', F.dayofweek('datetime'))\
        .dropDuplicates()

    # load (save) dim_time to S3 in parquet format partitioned by year & month
    time_df.write.mode('overwrite')\
        .partitionBy('year', 'month')\
        .parquet('s3://omar-dend/dim_time')

    # create fact_trip
    trip_df = st_trip_df.select(F.col('id').alias('trip_id'), 'duration', 'bike_id',
                            'subscription_type',
                            'start_station_id', 'end_station_id',
                            'datetime_start', 'datetime_end')\
                            .dropDuplicates()

    # load (save) fact_trip to S3 in parquet format
    trip_df.write.mode('overwrite')\
        .parquet('s3://omar-dend/fact_trip')

    # create fact_status
    status_df = st_status_df.select('station_id', 'bikes_available',
                                'docks_available', 'datetime')\
                                .dropDuplicates()

    # load (save) fact_status to S3 in parquet format partitioned by station_id
    status_df.write.mode('overwrite')\
        .partitionBy('station_id')\
        .parquet('s3://omar-dend/fact_status')
Example #9
def process_log_data(spark, input_data_path):
    pl_start = time()
    print('Starting to process log data')
    # get filepath to log data file
    log_data = input_data_path

    # read log data file
    log_schema = StructType([
        StructField("artist", StringType()),
        StructField("auth", StringType()),
        StructField("firstName", StringType()),
        StructField("gender", StringType()),
        StructField("itemInSession", LongType()),
        StructField("lastName", StringType()),
        StructField("length", DoubleType()),
        StructField("level", StringType()),
        StructField("location", StringType()),
        StructField("method", StringType()),
        StructField("page", StringType()),
        StructField("registration", DoubleType()),
        StructField("sessionId", LongType()),
        StructField("song", StringType()),
        StructField("status", StringType()),
        StructField("ts", StringType()),
        StructField("userAgent", StringType()),
        StructField("userId", StringType())
    ])

    log_df = spark.read.json(input_data_path, schema=log_schema)

    # filter by actions for song plays
    # Filter only column page with value "NextSong"
    log_df = log_df.filter(log_df.page == 'NextSong')

    # Convert ts from long to datetime
    convert_ts = udf(
        lambda x: datetime.datetime.fromtimestamp(float(x) / 1000.0),
        TimestampType())
    log_df = log_df.withColumn("ts_converted", convert_ts(log_df.ts))

    # Convert registration from double to long
    log_df = log_df.withColumn("registration_converted",
                               log_df.registration.cast(LongType()))

    pl_et = time() - pl_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Process log files : Read & Transformation', round(pl_et, 2)))

    print('Creating users table')
    temp_start = time()
    # extract columns for users table
    # creating users table with columns user_id, first_name, last_name, gender, level
    users_table = log_df.select(['userId', 'firstName', 'lastName', 'gender', 'level'])\
            .withColumnRenamed('userId', 'user_id')\
            .withColumnRenamed('firstName', 'first_name')\
            .withColumnRenamed('lastName', 'last_name').dropDuplicates()

    pl_et = time() - temp_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Creating users table', round(pl_et, 2)))

    # extract columns to create time table
    # Creating time table with columns start_time, hour, day, week, month, year, weekday
    print('Creating time table')
    temp_start = time()
    time_table = log_df.select(['ts_converted'])\
                        .withColumnRenamed('ts_converted','start_time')

    time_table = time_table.withColumn('day', F.dayofmonth('start_time')) \
                          .withColumn('month', F.month('start_time')) \
                          .withColumn('year', F.year('start_time')) \
                          .withColumn('hour', F.hour('start_time')) \
                          .withColumn('minute', F.minute('start_time')) \
                          .withColumn('second', F.second('start_time')) \
                          .withColumn('week', F.weekofyear('start_time')) \
                          .withColumn('weekday', F.dayofweek('start_time')).dropDuplicates()
    pl_et = time() - temp_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Creating time table', round(pl_et, 2)))

    pl_et = time() - pl_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Process log files : Total', round(pl_et, 2)))
    return log_df, users_table, time_table
Example #10
def get_time_to_purchase(timeframe, partner, premium, purchase): # todo
    """
    returns the distribution of time it takes a user to achieve a purchase
    input: a period (string), a partner (string), 2 spark dataframes (first and last one in the workflow)
    output: a dictionary with the bucket and the values associated
    """
    keys = [0, 20, 40, 60, 120, 180, 240, 300, 600]
    result = collections.OrderedDict()
    result = {key: 0 for key in keys}
    timeframe_is = get_date(timeframe)
    purchase_renam = (purchase.filter(purchase.keen.timestamp >= timeframe_is)
                      .filter(purchase.search_info.partner_id == partner)
                      .withColumnRenamed('keen', 'keen_purchase')
                      .withColumnRenamed('flight', 'flight_purchase'))
    premium_renam = (premium.filter(premium.keen.timestamp >= timeframe_is)
                     .filter(premium.search_info.partner_id == partner)
                     .withColumnRenamed('keen', 'keen_premium')
                     .withColumnRenamed('flight', 'flight_premium'))
    joined_df = purchase_renam.join(premium_renam, purchase_renam.search_info.search_id == premium_renam.search_info.search_id, 'inner')
    joined_df = joined_df.withColumn("time_to_purchase", (minute(joined_df.keen_purchase.timestamp) - minute(joined_df.keen_premium.timestamp)) * 60 + second(joined_df.keen_purchase.timestamp) - second(joined_df.keen_premium.timestamp))
    times = joined_df.groupBy("time_to_purchase").sum("purchase.quantity").collect()
    for row in times:
        for i in range(len(keys) - 1):
            if row[0] > keys[i] and row[0] <= keys[i+1]:
                result[keys[i+1]] += row[1]
    result.pop(0)
    return result
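
Note that the minute/second arithmetic above ignores hour boundaries; a hedged sketch (column names are illustrative) of an elapsed-seconds computation based on unix_timestamp, which does not wrap:

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("2021-05-01 09:59:50", "2021-05-01 10:01:10")],
    ["premium_ts", "purchase_ts"]) \
    .withColumn("premium_ts", F.to_timestamp("premium_ts")) \
    .withColumn("purchase_ts", F.to_timestamp("purchase_ts"))

# full elapsed seconds between the two events (80 here)
df = df.withColumn(
    "time_to_purchase",
    F.unix_timestamp("purchase_ts") - F.unix_timestamp("premium_ts"))
df.show()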
Example #11
 def second(self) -> "ks.Series":
     """
     The seconds of the datetime.
     """
     return _column_op(lambda c: F.second(c).cast(LongType()))(
         self._data).alias(self._data.name)
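
A hypothetical usage sketch of this accessor, assuming the databricks.koalas package is installed:

import pandas as pd
import databricks.koalas as ks

# the .dt.second accessor defined above returns the seconds as a LongType-backed series
pser = pd.Series(pd.to_datetime(["2021-01-01 12:34:56", "2021-01-01 12:35:07"]))
kser = ks.from_pandas(pser)
print(kser.dt.second)  # 56 and 7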
Example #12
converttimeudf = UserDefinedFunction(lambda x: convertstamp(x), TimestampType())
convertdateudf = UserDefinedFunction(lambda x: convertdate(x), TimestampType())


# withColumn adds a new column while keeping the existing ones in place.
df3=df.withColumn("TIME_SCHEDULED", converttimeudf(df['TIME_SCHEDULED']))\
    .withColumn("TRIP_START_TIME", converttimeudf(df['TRIP_START_TIME']))\
    .withColumn("TIME_ACTUAL_ARRIVE", converttimeudf(df['TIME_ACTUAL_ARRIVE']))\
    .withColumn("TIME_ACTUAL_DEPART", converttimeudf(df['TIME_ACTUAL_DEPART']))\
    .withColumn("SURVEY_DATE",convertdateudf(df["SURVEY_DATE"]).cast(DateType()))

df4=df3.withColumn("MONTH",month(df3["SURVEY_DATE"]))\
    .withColumn("YEAR",year(df3["SURVEY_DATE"]))\
    .withColumn("TIME_SCHEDULED_HOUR",hour(df3["TIME_SCHEDULED"]))\
    .withColumn("TIME_SCHEDULED_MIN",minute(df3["TIME_SCHEDULED"]))\
    .withColumn("TIME_SCHEDULED_SEC",second(df3["TIME_SCHEDULED"]))\
    .withColumn("TRIP_START_TIME_HOUR",hour(df3["TRIP_START_TIME"]))\
    .withColumn("TRIP_START_TIME_MIN",minute(df3["TRIP_START_TIME"]))\
    .withColumn("TRIP_START_TIME_SEC",second(df3["TRIP_START_TIME"]))\
    .withColumn("TIME_ACTUAL_ARRIVE_HOUR",hour(df3["TIME_ACTUAL_ARRIVE"]))\
    .withColumn("TIME_ACTUAL_ARRIVE_MIN",minute(df3["TIME_ACTUAL_ARRIVE"]))\
    .withColumn("TIME_ACTUAL_ARRIVE_SEC",second(df3["TIME_ACTUAL_ARRIVE"]))\
    .withColumn("TIME_ACTUAL_DEPART_HOUR",hour(df3["TIME_ACTUAL_DEPART"]))\
    .withColumn("TIME_ACTUAL_DEPART_MIN",minute(df3["TIME_ACTUAL_DEPART"]))\
    .withColumn("TIME_ACTUAL_DEPART_SEC",second(df3["TIME_ACTUAL_DEPART"]))

df5=df4.withColumn('DIRECTION_NAME',when(df4.DIRECTION_NAME=="OUTYBOUND" ,"OUTBOUND")\
    .when(df4.DIRECTION_NAME=="0" ,"OUTBOUND")\
        .when(df4.DIRECTION_NAME=="1" ,"INBOUND")\
            .otherwise(df4.DIRECTION_NAME))
Example #13
the_variable = df_time_ma.select('variable').distinct().take(1)[0].variable
the_day = (sorted([
    x.dayofyear for x in (df_time_ma.select(
        dayofyear('timestamp').alias('dayofyear')).distinct().take(5))
]))[0]  # min doesn't work on javalist
the_hours = 12
the_title = ("Data for asset {}, variable {} for day {}, {} hours".format(
    the_asset, the_variable, the_day, the_hours))

# currently exported to pandas for visualization and CSV export; later on the pyspark dataframe itself is exported as CSV
test_df = df_time_ma.filter(df_time_ma.asset == the_asset).filter(
    df_time_ma.variable == the_variable).filter(
        dayofyear('timestamp') == the_day).filter(
            hour('timestamp') <= the_hours).cache()
test_df_1s = test_df.toPandas()
test_df_60s = test_df.filter(second(df_time_ma.timestamp) == 0).toPandas()
test_df_10m = test_df.filter(minute(df_time_ma.timestamp) % 10 == 0).filter(
    second(df_time_ma.timestamp) == 0).toPandas()

plt.figure(figsize=(12, 4))
plt.plot(test_df_1s.timestamp, test_df_1s.ma, 'b')
plt.plot(test_df_60s.timestamp, test_df_60s.ma, 'r')
plt.plot(test_df_10m.timestamp, test_df_10m.ma, 'g')
plt.grid()
plt.title(the_title)
plt.legend(['1s', '60s', '10m'])
display(plt.gcf())

# COMMAND ----------

from itertools import chain
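
A self-contained sketch of the same downsampling idea (filtering on second and minute) applied to a toy per-second series; all names are illustrative:

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# one row per second over ten minutes
df = spark.range(0, 600).withColumn(
    "timestamp",
    (F.unix_timestamp(F.lit("2021-01-01 00:00:00")) + F.col("id"))
    .cast("timestamp"))

# keep whole minutes only (second == 0), then every tenth minute
df_60s = df.filter(F.second("timestamp") == 0)
df_10m = df_60s.filter(F.minute("timestamp") % 10 == 0)
print(df.count(), df_60s.count(), df_10m.count())  # 600, 10, 1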
Example #14
def process_log_dataset(spark, log_dataset, output_data, df_songs, df_artists,
                        parquet, include_the_dimensions):
    # read log data file
    df_staging_events = spark.read.json(log_dataset)

    # create the get timestamp user defined function
    get_timestamp_udf = F.udf(lambda x: datetime.fromtimestamp((x / 1000.0)),
                              T.TimestampType())

    # filter events i.e. page = NextSong
    df_staging_events = df_staging_events.where(col('page').isin({'NextSong'}))

    # convert the ts column from epoch to timestamp
    df_staging_events = df_staging_events.withColumn(
        "start_time", get_timestamp_udf(df_staging_events.ts))
    df_staging_events = df_staging_events.withColumn(
        "year", year(df_staging_events.start_time))
    df_staging_events = df_staging_events.withColumn(
        "month", month(df_staging_events.start_time))

    # create time dataframe and drop duplicate ts rows
    df_time = df_staging_events.drop_duplicates(subset=['ts'])

    # convert the ts column from epoch to timestamp
    df_time = df_time.withColumn("start_time", get_timestamp_udf(df_time.ts))

    # create time dimension dataframe
    df_time = df_time.select(
        df_time.start_time.alias('start_time'),
        year(df_time.start_time).alias('year'),
        month(df_time.start_time).alias('month'),
        dayofmonth(df_time.start_time).alias('dayofmonth'),
        hour(df_time.start_time).alias('hour'),
        minute(df_time.start_time).alias('minute'),
        second(df_time.start_time).alias('second'),
        dayofweek(df_time.start_time).alias('dayofweek'),
        dayofyear(df_time.start_time).alias('dayofyear'),
        weekofyear(df_time.start_time).alias('weekofyear'))

    # create users dimension
    df_users = df_staging_events.select(
        df_staging_events.userId.alias('user_id'),
        df_staging_events.firstName.alias('first_name'),
        df_staging_events.lastName.alias('last_name'),
        df_staging_events.gender.alias('gender'))

    # drop duplicate user rows
    df_users = df_users.drop_duplicates(subset=['user_id'])

    # create temporary views
    df_staging_events.createOrReplaceTempView("staging_events")
    df_artists.createOrReplaceTempView("artists")
    df_songs.createOrReplaceTempView("songs")

    # extract columns from joined song and log datasets to create songplays table
    df_songplays = spark.sql("""
        select null as songplay_id, se.start_time, se.year, se.month,
               se.userId as user_id, se.level, s.song_id, a.artist_id,
               se.sessionId as session_id, se.location,
               se.userAgent as user_agent
        from staging_events se
        join artists a on se.artist = a.name
        join songs s on se.song = s.title and s.artist_id = a.artist_id
        where 1 = 1""")
    # populate songplays surrogate key
    df_songplays = df_songplays.withColumn("songplay_id",
                                           monotonically_increasing_id())

    if parquet:
        # write songplays table to parquet files partitioned by year and month
        df_songplays.write.partitionBy('year', 'month').parquet(
            output_data + "songplays", mode="overwrite")

    if include_the_dimensions:
        df_songplays.createOrReplaceTempView("songplays")
        df_artists = spark.sql(
            """select distinct a.* from artists a join songplays sp on a.artist_id = sp.artist_id where 1 = 1"""
        )
        df_songs = spark.sql(
            """select distinct s.* from songs s join songplays sp on s.song_id = sp.song_id where 1 = 1"""
        )
        df_time.createOrReplaceTempView("time")
        df_time = spark.sql(
            """select distinct t.* from time t join songplays sp on t.start_time = sp.start_time where 1 = 1"""
        )
        df_users.createOrReplaceTempView("users")
        df_users = spark.sql(
            """select distinct u.* from users u join songplays sp on u.user_id = sp.user_id where 1 = 1"""
        )

    if parquet:
        if include_the_dimensions:
            # write users table to parquet files partitioned by none
            df_users.write.parquet(output_data + "users", mode="overwrite")

            # write songs table to parquet files partitioned by year and artist
            df_songs.write.partitionBy('year', 'artist_id').parquet(
                output_data + "songs", mode="overwrite")

            # write artists table to parquet files partitioned by none
            df_artists.write.parquet(output_data + "artists", mode="overwrite")

            # write time table to parquet files partitioned by year and month
            df_time.write.partitionBy('year',
                                      'month').parquet(output_data + "time",
                                                       mode="overwrite")

    # print table info
    print('songplays info...')
    print('count(s)')
    df_songplays.groupby(df_songplays.year,
                         df_songplays.month).count().orderBy(
                             df_songplays.year, df_songplays.month).show()
    df_songplays.printSchema()
    df_songplays.show(5)

    if include_the_dimensions:
        print('users info...')
        print('count(s)')
        print(df_users.count())
        df_users.printSchema()
        df_users.show(5)

        print('songs info...')
        print('count(s)')
        df_songs.groupby(df_songs.year).count().orderBy(df_songs.year).show()
        df_songs.printSchema()
        df_songs.show(5)

        print('artists info...')
        print('count(s)')
        print(df_artists.count())
        df_artists.printSchema()
        df_artists.show(5)

        print('time info...')
        print('count(s)')
        df_time.groupby(df_time.year,
                        df_time.month).count().orderBy(df_time.year,
                                                       df_time.month).show()
        df_time.printSchema()
        df_time.show(5)
Example #15
df.printSchema()
from pyspark.sql.functions import to_timestamp, date_format
from pyspark.sql.functions import year, month, dayofmonth, dayofweek, hour, minute, second

# Convert processing time from int to timestamp
df = df.withColumn("processing-time", to_timestamp(col="processing-time"))

# Add columns for extracted features

print("After Conversion \n")
df.printSchema()

# Extract day of week, day of month, week of month, and the other date/time components

df = df.withColumn("dayOfWeek",dayofweek(col="processing-time"))\
    .withColumn("dayOfMonth",dayofmonth(col="processing-time"))\
    .withColumn("weekOfMonth",date_format(date ="processing-time",format= "W"))\
    .withColumn("year",year(col="processing-time"))\
    .withColumn("month",month(col="processing-time"))\
    .withColumn("hour",hour(col="processing-time"))\
    .withColumn("minute",minute(col="processing-time"))\
    .withColumn("second",second(col="processing-time"))\
    .drop("processing-time")

df.show()

startTime = time.time()
df.toPandas().to_csv(path_or_buf="../../resources/newDatasets/dataset-1.csv")
endTime = time.time()
print("Time taken to convert df to pandas to csv: ", endTime - startTime)
Example #16
    def _bin_time_stamp(self, origin: pd.Timestamp, ts_scol: Column) -> Column:
        sql_utils = SparkContext._active_spark_context._jvm.PythonSQLUtils
        origin_scol = F.lit(origin)
        (rule_code, n) = (self._offset.rule_code, self._offset.n
                          )  # type: ignore[attr-defined]
        left_closed, right_closed = (self._closed == "left",
                                     self._closed == "right")
        left_labeled, right_labeled = (self._label == "left",
                                       self._label == "right")

        if rule_code == "A-DEC":
            assert (origin.month == 12 and origin.day == 31
                    and origin.hour == 0 and origin.minute == 0
                    and origin.second == 0)

            diff = F.year(ts_scol) - F.year(origin_scol)
            mod = F.lit(0) if n == 1 else (diff % n)
            edge_cond = (mod == 0) & (F.month(ts_scol)
                                      == 12) & (F.dayofmonth(ts_scol) == 31)

            edge_label = F.year(ts_scol)
            if left_closed and right_labeled:
                edge_label += n
            elif right_closed and left_labeled:
                edge_label -= n

            if left_labeled:
                non_edge_label = F.when(mod == 0,
                                        F.year(ts_scol) -
                                        n).otherwise(F.year(ts_scol) - mod)
            else:
                non_edge_label = F.when(
                    mod == 0,
                    F.year(ts_scol)).otherwise(F.year(ts_scol) - (mod - n))

            return F.to_timestamp(
                F.make_date(
                    F.when(edge_cond, edge_label).otherwise(non_edge_label),
                    F.lit(12), F.lit(31)))

        elif rule_code == "M":
            assert (origin.is_month_end and origin.hour == 0
                    and origin.minute == 0 and origin.second == 0)

            diff = ((F.year(ts_scol) - F.year(origin_scol)) * 12 +
                    F.month(ts_scol) - F.month(origin_scol))
            mod = F.lit(0) if n == 1 else (diff % n)
            edge_cond = (mod == 0) & (F.dayofmonth(ts_scol) == F.dayofmonth(
                F.last_day(ts_scol)))

            truncated_ts_scol = F.date_trunc("MONTH", ts_scol)
            edge_label = truncated_ts_scol
            if left_closed and right_labeled:
                edge_label += sql_utils.makeInterval("MONTH", F.lit(n)._jc)
            elif right_closed and left_labeled:
                edge_label -= sql_utils.makeInterval("MONTH", F.lit(n)._jc)

            if left_labeled:
                non_edge_label = F.when(
                    mod == 0,
                    truncated_ts_scol -
                    sql_utils.makeInterval("MONTH",
                                           F.lit(n)._jc),
                ).otherwise(truncated_ts_scol -
                            sql_utils.makeInterval("MONTH", mod._jc))
            else:
                non_edge_label = F.when(mod == 0, truncated_ts_scol).otherwise(
                    truncated_ts_scol -
                    sql_utils.makeInterval("MONTH", (mod - n)._jc))

            return F.to_timestamp(
                F.last_day(
                    F.when(edge_cond, edge_label).otherwise(non_edge_label)))

        elif rule_code == "D":
            assert origin.hour == 0 and origin.minute == 0 and origin.second == 0

            if n == 1:
                # NOTE: the logic to process '1D' is different from the cases with n>1,
                # since hour/minute/second parts are taken into account to determine edges!
                edge_cond = ((F.hour(ts_scol) == 0) & (F.minute(ts_scol) == 0)
                             & (F.second(ts_scol) == 0))

                if left_closed and left_labeled:
                    return F.date_trunc("DAY", ts_scol)
                elif left_closed and right_labeled:
                    return F.date_trunc("DAY", F.date_add(ts_scol, 1))
                elif right_closed and left_labeled:
                    return F.when(edge_cond,
                                  F.date_trunc("DAY", F.date_sub(
                                      ts_scol, 1))).otherwise(
                                          F.date_trunc("DAY", ts_scol))
                else:
                    return F.when(edge_cond,
                                  F.date_trunc("DAY", ts_scol)).otherwise(
                                      F.date_trunc("DAY",
                                                   F.date_add(ts_scol, 1)))

            else:
                diff = F.datediff(end=ts_scol, start=origin_scol)
                mod = diff % n

                edge_cond = mod == 0

                truncated_ts_scol = F.date_trunc("DAY", ts_scol)
                edge_label = truncated_ts_scol
                if left_closed and right_labeled:
                    edge_label = F.date_add(truncated_ts_scol, n)
                elif right_closed and left_labeled:
                    edge_label = F.date_sub(truncated_ts_scol, n)

                if left_labeled:
                    non_edge_label = F.date_sub(truncated_ts_scol, mod)
                else:
                    non_edge_label = F.date_sub(truncated_ts_scol, mod - n)

                return F.when(edge_cond, edge_label).otherwise(non_edge_label)

        elif rule_code in ["H", "T", "S"]:
            unit_mapping = {"H": "HOUR", "T": "MINUTE", "S": "SECOND"}
            unit_str = unit_mapping[rule_code]

            truncated_ts_scol = F.date_trunc(unit_str, ts_scol)
            diff = sql_utils.timestampDiff(unit_str, origin_scol._jc,
                                           truncated_ts_scol._jc)
            mod = F.lit(0) if n == 1 else (diff % F.lit(n))

            if rule_code == "H":
                assert origin.minute == 0 and origin.second == 0
                edge_cond = (mod == 0) & (F.minute(ts_scol)
                                          == 0) & (F.second(ts_scol) == 0)
            elif rule_code == "T":
                assert origin.second == 0
                edge_cond = (mod == 0) & (F.second(ts_scol) == 0)
            else:
                edge_cond = mod == 0

            edge_label = truncated_ts_scol
            if left_closed and right_labeled:
                edge_label += sql_utils.makeInterval(unit_str, F.lit(n)._jc)
            elif right_closed and left_labeled:
                edge_label -= sql_utils.makeInterval(unit_str, F.lit(n)._jc)

            if left_labeled:
                non_edge_label = F.when(mod == 0, truncated_ts_scol).otherwise(
                    truncated_ts_scol -
                    sql_utils.makeInterval(unit_str, mod._jc))
            else:
                non_edge_label = F.when(
                    mod == 0,
                    truncated_ts_scol +
                    sql_utils.makeInterval(unit_str,
                                           F.lit(n)._jc),
                ).otherwise(truncated_ts_scol -
                            sql_utils.makeInterval(unit_str, (mod - n)._jc))

            return F.when(edge_cond, edge_label).otherwise(non_edge_label)

        else:
            raise ValueError("Got the unexpected unit {}".format(rule_code))
Example #17
# Similar methods: month, dayofweek, minute, second
# Expected:
# +-----------------+--------------------+-------------+---------------+----------+----+-----+---------+------+------+
# |               id|                  ts|  date string|    time string|  date_new|year|month|dayofweek|minute|second|
# +-----------------+--------------------+-------------+---------------+----------+----+-----+---------+------+------+
# |UA000000107379500|2020-07-04 16:09:...|July 04, 2020|16:09:06.592107|04-09-2020|2020|    7|        7|     9|     6|
# |UA000000107359357|2020-07-04 15:36:...|July 04, 2020|15:36:51.756535|04-36-2020|2020|    7|        7|    36|    51|
# |UA000000107375547|2020-07-04 16:06:...|July 04, 2020|16:06:55.459100|04-06-2020|2020|    7|        7|     6|    55|
# +-----------------+--------------------+-------------+---------------+----------+----+-----+---------+------+------+

# Answer
df = (df.withColumn("year", F.year(F.col("ts"))).withColumn(
    "month",
    F.month(F.col("ts"))).withColumn("dayofweek", F.dayofweek(
        F.col("ts"))).withColumn("minute", F.minute(F.col("ts"))).withColumn(
            "second", F.second(F.col("ts"))))
df.show()

# COMMAND ----------

# Convert the column into DateType with name "date" using the casting rules of to_date.
# Then create a column plus_two_days that adds 2 days to the date. Select "date" and "plus_two_days".
# Expected:
# +----------+-------------+
# |      date|plus_two_days|
# +----------+-------------+
# |2020-07-04|   2020-07-06|
# |2020-07-04|   2020-07-06|
# |2020-07-04|   2020-07-06|
# +----------+-------------+
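
A possible answer sketch for this second exercise (not from the original notebook), assuming the conversion is applied to the ts column shown above:

# Answer (sketch)
df = (df.withColumn("date", F.to_date(F.col("ts")))
        .withColumn("plus_two_days", F.date_add(F.col("date"), 2))
        .select("date", "plus_two_days"))
df.show()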
Example #18
train_data = spark.read.csv("E:/taikingdata/train.csv",
                            header=True,
                            schema=schematype1)
test_data = spark.read.csv("E:/taikingdata/test.csv",
                           header=True,
                           schema=schematype2)
train_df = train_data
#split click_time into hour/day/minute/second components
train_df = train_df.withColumn('hour',
                               fn.hour('click_time').cast(IntegerType()))
train_df = train_df.withColumn('day',
                               fn.dayofmonth('click_time').cast(IntegerType()))
train_df = train_df.withColumn('minute',
                               fn.minute('click_time').cast(IntegerType()))
train_df = train_df.withColumn('second',
                               fn.second('click_time').cast(IntegerType()))
#derived features: per-group click counts (derived from click_time)
gp = train_df.groupby(['ip', 'app', 'device', 'os']).agg(
    fn.count('click_time').cast(IntegerType()).alias('ct1_count'))
train_df = train_df.join(gp, on=['ip', 'app', 'device', 'os'], how='left')
gp = train_df.groupby(['ip', 'device', 'os', 'channel']).agg(
    fn.count('click_time').cast(IntegerType()).alias('ct2_count'))
train_df = train_df.join(gp, on=['ip', 'device', 'os', 'channel'], how='left')
gp = train_df.groupby(['ip', 'app', 'device', 'os', 'channel']).agg(
    fn.count('click_time').cast(IntegerType()).alias('ct3_count'))
train_df = train_df.join(gp,
                         on=['ip', 'app', 'device', 'os', 'channel'],
                         how='left')
#group by ip only and compute count and unique statistics for the other features

gp = train_df.groupby(['ip', 'app']).agg(
 def second(self) -> "ps.Series":
     """
     The seconds of the datetime.
     """
     return self._data.spark.transform(lambda c: F.second(c).cast(LongType()))
Example #20
def process_log_data(spark, input_data_path):
    """
    Process log data.

    Parameters:
        spark: Spark session object
        input_data_path: input path, which can be local or S3

    Returns:
        log_df, users_table, time_table, user_listen
    """

    pl_start = time()
    print('Starting to process log data')
    # get filepath to log data file
    log_data = input_data_path

    # read log data file
    log_schema = StructType([
        StructField("artist", StringType()),
        StructField("auth", StringType()),
        StructField("firstName", StringType()),
        StructField("gender", StringType()),
        StructField("itemInSession", LongType()),
        StructField("lastName", StringType()),
        StructField("length", DoubleType()),
        StructField("level", StringType()),
        StructField("location", StringType()),
        StructField("method", StringType()),
        StructField("page", StringType()),
        StructField("registration", DoubleType()),
        StructField("sessionId", LongType()),
        StructField("song", StringType()),
        StructField("status", StringType()),
        StructField("ts", StringType()),
        StructField("userAgent", StringType()),
        StructField("userId", StringType())
    ])

    log_df = spark.read.json(input_data_path, schema=log_schema)

    # Number of songs users listened to during each level
    paid_users = log_df.select(['userId',
                                'level']).filter(log_df['level'] == 'paid')
    paid_users = paid_users.groupby(['userId']).count()
    free_users = log_df.select(['userId',
                                'level']).filter(log_df['level'] == 'free')
    free_users = free_users.groupby(['userId']).count()
    paid_users.createOrReplaceTempView('paid_users')
    free_users.createOrReplaceTempView('free_users')
    user_listen = spark.sql("""
        select a.userId, a.count puCount, b.count fuCount
        from paid_users a join free_users b
        on a.userId = b.userId
        where a.userId != ''
    """)

    # Filter only column page with value "NextSong"
    log_df = log_df.filter(log_df.page == 'NextSong')

    # Convert ts from long to datetime
    convert_ts = udf(
        lambda x: datetime.datetime.fromtimestamp(float(x) / 1000.0),
        TimestampType())
    log_df = log_df.withColumn("ts_converted", convert_ts(log_df.ts))

    # Convert registration from double to long
    log_df = log_df.withColumn("registration_converted",
                               log_df.registration.cast(LongType()))

    pl_et = time() - pl_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Process log files : Read & Transformation', round(pl_et, 2)))

    print('Creating users table')
    temp_start = time()
    # extract columns for users table
    # creating users table with columns user_id, first_name, last_name, gender, level
    users_table = log_df.select(['userId', 'firstName', 'lastName', 'gender', 'level'])\
            .withColumnRenamed('userId', 'user_id')\
            .withColumnRenamed('firstName', 'first_name')\
            .withColumnRenamed('lastName', 'last_name').dropDuplicates()

    pl_et = time() - temp_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Creating users table', round(pl_et, 2)))

    print('Creating user_listen table')
    temp_start = time()
    user_listen.createOrReplaceTempView('user_listen')
    users_table.createOrReplaceTempView('users')
    user_listen = spark.sql("""
    select distinct b.first_name, a.puCount, a.fuCount
    from user_listen a join users b
    on a.userId = b.user_id
    """)
    pl_et = time() - temp_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Creating user_listen table', round(pl_et, 2)))

    # extract columns to create time table
    # Creating time table with columns start_time, hour, day, week, month, year, weekday
    print('Creating time table')
    temp_start = time()
    time_table = log_df.select(['ts_converted'])\
                        .withColumnRenamed('ts_converted','start_time')

    time_table = time_table.withColumn('day', F.dayofmonth('start_time')) \
                          .withColumn('month', F.month('start_time')) \
                          .withColumn('year', F.year('start_time')) \
                          .withColumn('hour', F.hour('start_time')) \
                          .withColumn('minute', F.minute('start_time')) \
                          .withColumn('second', F.second('start_time')) \
                          .withColumn('week', F.weekofyear('start_time')) \
                          .withColumn('weekday', F.dayofweek('start_time')).dropDuplicates()
    pl_et = time() - temp_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Creating time table', round(pl_et, 2)))

    pl_et = time() - pl_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Process log files : Total', round(pl_et, 2)))
    return log_df, users_table, time_table, user_listen
Example #21
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = os.path.join(input_data, "log_data/*/*/*.json")

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # extract columns for users table
    users_table = df.select("userId", "firstName", "lastName", "gender",
                            "level").dropDuplicates()

    # write users table to parquet files
    usersParquetPath = os.path.join(output_data, "users.parquet")
    users_table.write.parquet(usersParquetPath)
    usersParquetFile = spark.read.parquet(usersParquetPath)
    usersParquetFile.createOrReplaceTempView("usersParquetFile")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000).strftime(
        '%Y-%m-%d %H:%M:%S'))
    df = df.withColumn("datetime", get_timestamp(df.ts))
    df = df.withColumn("year", F.year("datetime"))\
       .withColumn("month", F.month("datetime"))\
       .withColumn("day", F.dayofweek("datetime"))\
       .withColumn("hour", F.hour("datetime"))\
       .withColumn("minute", F.minute("datetime"))\
       .withColumn("second", F.second("datetime"))\
       .withColumn("weekday", F.dayofweek("datetime"))

    # create datetime column from original timestamp column
    # get_datetime = udf()
    # df =

    # extract columns to create time table
    time_table = df.select("ts", "year", "month", "day", "hour", "minute",
                           "second", "weekday").dropDuplicates()

    # write time table to parquet files partitioned by year and month

    timeParquetPath = os.path.join(output_data, "time.parquet")
    time_table.write.partitionBy("year", "month").parquet(timeParquetPath)
    timeParquetFile = spark.read.parquet(timeParquetPath)
    timeParquetFile.createOrReplaceTempView("timeParquetFile")

    # read in song data to use for songplays table
    song_data = os.path.join(input_data, "song_data/*/*/*/*.json")
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    df1 = df.alias('df1')
    df2 = song_df.alias('df2')
    songplays_table = df1.join(df2, df1.artist == df2.artist_name)\
                         .select("ts", "userId", "level", "song_id", "artist_id", "sessionId", "location", "userAgent", "df1.year", "month")\
                         .dropDuplicates()

    # write songplays table to parquet files partitioned by year and month

    songPlaysParquetPath = os.path.join(output_data, "songplay.parquet")
    songplays_table.write.partitionBy("year",
                                      "month").parquet(songPlaysParquetPath)
    songplayParquetFile = spark.read.parquet(songPlaysParquetPath)
    songplayParquetFile.createOrReplaceTempView("songplayParquetFile")
# Generate dim_Time feed
combined_timestamp = source_glasses.select("timestamp") \
    .union(source_report.select("timestamp")) \
    .union(source_smartphone.select("timestamp")) \
    .union(source_smartwatch.select("timestamp"))
time_df = combined_timestamp.select("timestamp") \
    .where(col("Timestamp").isNotNull()) \
    .distinct() \
    .orderBy("timestamp")
time_df = time_df.withColumn("Year", year(time_df["timestamp"])) \
    .withColumn("Month", month(time_df["timestamp"])) \
    .withColumn("Day", dayofmonth(time_df["timestamp"])) \
    .withColumn("Hour", hour(time_df["timestamp"])) \
    .withColumn("Minute", minute(time_df["timestamp"])) \
    .withColumn("Second", second(time_df["timestamp"]))

# prepare glasses activities
glasses_activities_acc_x = time_df.join(glasses_df, "timestamp", how="inner") \
    .select(
    [F.lit(1).alias("PersonId"), F.lit(3).alias("SourceId"), F.lit("ACC_X").alias("Collectible"), "timestamp", "ACC_X"])
glasses_activities_acc_y = time_df.join(glasses_df, "timestamp", how="inner") \
    .select(
    [F.lit(1).alias("PersonId"), F.lit(3).alias("SourceId"), F.lit("ACC_Y").alias("Collectible"), "timestamp", "ACC_Y"])
glasses_activities_acc_z = time_df.join(glasses_df, "timestamp", how="inner") \
    .select(
    [F.lit(1).alias("PersonId"), F.lit(3).alias("SourceId"), F.lit("ACC_Z").alias("Collectible"), "timestamp", "ACC_Z"])
glasses_activities_gyro_x = time_df.join(glasses_df, "timestamp", how="inner") \
    .select([F.lit(1).alias("PersonId"), F.lit(3).alias("SourceId"), F.lit("GYRO_X").alias("Collectible"), "timestamp",
             "GYRO_X"])
glasses_activities_gyro_y = time_df.join(glasses_df, "timestamp", how="inner") \