Example #1
def process_fact_table(spark):
    '''
    Join and enrich the data

    Parameters:
    spark : Spark Session
    '''
    files_list = [
        'equipment_sensors', 'equipment_failure_sensors', 'equipment'
    ]

    # read each silver table into a dict of dataframes keyed by name
    df_dict = {
        f: read_silver(spark, 'datalake/silver/{0}/'.format(f))
        for f in files_list
    }

    fact = df_dict.get('equipment_sensors').join(df_dict.get('equipment'),
                                                 ['equipment_id'],
                                                 how='inner')

    fact = fact.join(df_dict.get('equipment_failure_sensors'), ['sensor_id'],
                     how='inner')

    fact = fact.select('*',
                       dayofmonth('date').alias('day'),
                       weekofyear('date').alias('week'),
                       month('date').alias('month'),
                       year('date').alias('year'),
                       dayofweek('date').alias('weekday'))

    print('2-->Enrich - OK ')
    write_gold(fact, 'datalake/gold/fact/')
Example #2
def create_immigration_calendar_dimension(df, output_data):
    """This function creates an immigration calendar based on arrival date
    :param df: spark dataframe of immigration events
    :param output_data: path to write dimension dataframe to
    :return: spark dataframe representing calendar dimension
    """
    # create a udf to convert arrival date in SAS format to datetime object
    get_datetime = udf(lambda x: (dt.datetime(1960, 1, 1).date() + dt.timedelta(x)).isoformat() if x else None)

    # create initial calendar df from arrdate column
    calendar = df.select(['arrdate']).withColumn("arrdate", get_datetime(df.arrdate)).distinct()

    # expand df by adding other calendar columns
    calendar = calendar.withColumn('arrival_day', dayofmonth('arrdate'))
    calendar = calendar.withColumn('arrival_week', weekofyear('arrdate'))
    calendar = calendar.withColumn('arrival_month', month('arrdate'))
    calendar = calendar.withColumn('arrival_year', year('arrdate'))
    calendar = calendar.withColumn('arrival_weekday', dayofweek('arrdate'))

    # create an id field in calendar df
    calendar = calendar.withColumn('id', monotonically_increasing_id())

    # write the calendar dimension to parquet file
    partition_columns = ['arrival_year', 'arrival_month', 'arrival_week']
    calendar.write.parquet(output_data + "immigration_calendar", partitionBy=partition_columns, mode="overwrite")

    return calendar
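The SAS arrival dates used in this and several later examples are stored as integer day offsets from 1960-01-01. A minimal sketch of the same conversion without a Python UDF, using only the built-in date_add expression (as a later example in this listing also does), could look like this:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(0,), (366,), (None,)], ['arrdate'])  # sample SAS day offsets

# date_add() on the SAS epoch avoids the Python UDF round trip;
# 0 -> 1960-01-01, 366 -> 1961-01-01 (1960 was a leap year), None stays null
calendar = df.select(
    F.expr("date_add(to_date('1960-01-01'), cast(arrdate as int))").alias('arrival_date')
).distinct()
calendar.show()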
Example #3
def process_time(spark, log_df, output_loc):
    """Extract time data from input DataFrame and write to output location as
    collection of parquet files.
    Args:
        spark (obj): SparkSession
        log_df (obj): DataFrame with input data
        output_loc (str): output location
    """
    get_datetime = udf(lambda x: datetime.fromtimestamp(x / 1000.0).strftime(
        '%Y-%m-%d %H:%M:%S'))

    # extract columns to create time table
    time_table = log_df.select(
        get_datetime('ts').cast('timestamp').alias('start_time'))
    time_table = time_table \
        .select('start_time',
                hour('start_time').alias('hour'),
                dayofmonth('start_time').alias('day'),
                weekofyear('start_time').alias('week'),
                month('start_time').alias('month'),
                year('start_time').alias('year'),
                dayofweek('start_time').alias('weekday')) \
        .dropDuplicates()

    # write time table to parquet file
    time_table.write.partitionBy(
        'year', 'month').mode('append').parquet(f'{output_loc}time')

    print(f'Processed {time_table.count()} timestamps')
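The millisecond-epoch conversion above goes through a Python UDF; where the exact string format is not needed, the same start_time column can be derived with built-in functions only (the pattern used in a later example in this listing). A minimal sketch:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
log_df = spark.createDataFrame([(1541903636796,)], ['ts'])  # epoch milliseconds

# from_unixtime() expects seconds, so divide the millisecond timestamp by 1000
time_table = log_df.select(
    F.from_unixtime(F.col('ts') / 1000).cast('timestamp').alias('start_time'))
time_table.show(truncate=False)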
Example #4
    def _extract_dim_date(self, imgDF: DataFrame, output_path: str):
        print('Extracting dim_date from immigration data...')

        dateCols = [col for col in imgDF.columns if col.endswith('_date')]
        dates: DataFrame = None
        for dateCol in dateCols:
            dt = imgDF \
                .where(F.col(dateCol).isNotNull()) \
                .select(
                    F.col(dateCol).alias('date'),
                    F.year(F.col(dateCol)).cast('smallint').alias('year'),
                    F.quarter(F.col(dateCol)).cast('smallint').alias('quarter'),
                    F.month(F.col(dateCol)).cast('smallint').alias('month'),
                    F.dayofweek(F.col(dateCol)).cast('smallint').alias('day_of_week'),
                    F.dayofmonth(F.col(dateCol)).cast('smallint').alias('day_of_month'),
                    F.dayofyear(F.col(dateCol)).cast('smallint').alias('day_of_year'),
                    F.weekofyear(F.col(dateCol)).cast('smallint').alias('week_of_year')
                )
            if dates:
                dates = dates.union(dt).distinct()
            else:
                dates = dt

        print('Saving dim_date')
        self._write_data(dates, output_path, 'date')
        print('Finished saving dim_date')

        print('Finished extracting dim_date from immigration data.')
Example #5
    def transform_i94_data(self):
        #Read the Immigration dataset in SAS format 
        i94_df = self.spark.read.format('com.github.saurfang.sas.spark').load(self.load_path + config['LOAD']['I94_DATA'] )
        
        #drop the columns that are not required for analysis
        i94_drop_cols =  ['count','visapost','occup','matflag','biryear','insnum']
        i94_df = i94_df.drop(*i94_drop_cols)
        
        #convert the numeric columns to Integer datatype
        int_cols = ['cicid', 'i94yr', 'i94mon', 'i94cit', 'i94res', 'arrdate', 'i94mode','depdate', 'i94bir', 'i94visa']
        i94_df = fn.cast_type(i94_df,dict(zip(int_cols,len(int_cols)*[IntegerType()])))
        
        #convert the SAS date format to String format
        date_cols = ['arrdate','depdate']
        for c in date_cols:
            i94_df = i94_df.withColumn(c,fn.sas_to_date_udf(i94_df[c]))

        #Create dates dataframe to be stored as a separate file
        arrdate  = i94_df.select('arrdate').distinct()
        depdate  = i94_df.select('depdate').distinct()
        date_df = arrdate.union(depdate)
        date_df = date_df.withColumnRenamed('arrdate','date')
        date_df = date_df.withColumn('year',F.year(date_df.date))   \
                         .withColumn('month',F.month(date_df.date)) \
                         .withColumn('day',F.dayofmonth(date_df.date))  \
                         .withColumn('dayofweek',F.dayofweek(date_df.date))  \
                         .withColumn('dayofyear',F.dayofyear(date_df.date))
        
        #Write the dates dataframe to a file in S3 in parquet format
        date_df.write.mode('overwrite').parquet(self.save_path + date_save_path)

        #Write the I94 dataframe to a file in S3 in parquet format partitioned by year and month
        i94_df.write.partitionBy('i94yr','i94mon').mode('overwrite').parquet(self.save_path + immigration_save_path)
Example #6
    def create_time_table(log_raw, output_bucket_name: str):
        """
        Create time pyspark dataframe

        Arguments:
            log_raw {DerivativeDF} -- Log helper class for pyspark dataframes
            output_bucket_name {str} -- Output in S3 location

        Returns:
            {DerivativeDF} -- time derivative dataframe
        """
        time = DerivativeDF(log_raw.df
                            .withColumn("hour", hour("start_time"))
                            .withColumn("day", dayofmonth("start_time"))
                            .withColumn("week", weekofyear("start_time"))
                            .withColumn("month", month("start_time"))
                            .withColumn("year", year("start_time"))
                            .withColumn("weekday", dayofweek("start_time"))
                            .select("start_time", "hour", "day", "week", "month", "year", "weekday")
                            .distinct()
                            )

        time._write_to_parquet(
            s3_output_path=f"s3://{output_bucket_name}/time",
            partitions=["year", "month"]
        )

        return time
Example #7
def create_dim_time(spark, log_data_df):
    """Creates time dimension dateframe

    Args:
        spark: Spark client
        log_data_df: Sparkify event log data

    Returns:
        Time dataframe
    """
    time_df = (
        log_data_df
        # It's possible for timestamps to appear multiple times if two events happened at the same time
        .drop_duplicates(["ts"])
        # Filter by NextSong since we only care about "song plays"
        .filter(log_data_df.page == "NextSong")
        # Divide the ts column by 1000 since the timestamp is in milliseconds
        .select(F.from_unixtime(log_data_df.ts / 1000).alias("start_time"))
    )
    return time_df.select(
        time_df.start_time,
        F.hour(time_df.start_time).alias("hour"),
        F.dayofmonth(time_df.start_time).alias("day"),
        F.weekofyear(time_df.start_time).alias("week"),
        F.month(time_df.start_time).alias("month"),
        F.year(time_df.start_time).alias("year"),
        F.dayofweek(time_df.start_time).alias("weekday"),
    )
Example #8
    def get_time_analysis(data, time_name):
        time_types = {'month' : month('date'), 'year' : year('date'), 'dayofweek' : dayofweek('date'), 'dayofmonth' : dayofmonth('date')}
        group_function = time_types[time_name]
        
        # Group by the time type, aggregate useful columns, and sort by time
        data_bytime = data.groupby(group_function) \
            .agg(f.avg('star_rating'), f.count('review_id'), f.stddev('star_rating')) \
            .withColumnRenamed('avg(star_rating)', 'rating') \
            .withColumnRenamed('stddev_samp(star_rating)', 'std_rating') \
            .withColumnRenamed(time_name + '(date)', time_name)
        data_bytime_sorted = data_bytime.sort(time_name)
        
        by_time_pd = data_bytime_sorted.toPandas()
        
        
        #fig, (ax1, ax2) = plt.subplots(1,2, figsize=(15,6))
        
        # First plot the average rating with its confidence interval
        plt.figure(figsize=(10,6))
        plt.plot(by_time_pd[time_name], by_time_pd['rating'])
        plt.fill_between(by_time_pd[time_name], by_time_pd['rating'] - 1.96 * by_time_pd['std_rating'] / np.sqrt(by_time_pd['count(review_id)']), \
                                             by_time_pd['rating'] + 1.96 * by_time_pd['std_rating'] / np.sqrt(by_time_pd['count(review_id)']), alpha=0.5)
        plt.xlabel(time_name)
        plt.ylabel('Average Rating')
        plt.title('Average Rating vs ' + time_name + ' for ' + name)
        plt.savefig(IMG_PATH + "rating_by_" + time_name + "_evolution_"+ name + ".png")  
        plt.clf()

        plt.figure(figsize=(10,6))
        plt.plot(by_time_pd[time_name], by_time_pd['count(review_id)'])
        plt.xlabel(time_name)
        plt.ylabel('Number of reviews')
        plt.title('Number of reviews vs ' + time_name + ' for ' + name)
        plt.savefig(IMG_PATH + "number_by_" + time_name + "_evolution_"+ name + ".png")  
        plt.clf()
Example #9
    def test_dayofweek(self):
        from pyspark.sql.functions import dayofweek

        dt = datetime.datetime(2017, 11, 6)
        df = self.spark.createDataFrame([Row(date=dt)])
        row = df.select(dayofweek(df.date)).first()
        self.assertEqual(row[0], 2)
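Spark's dayofweek numbers days 1 = Sunday through 7 = Saturday, which is why the test above expects 2 for 2017-11-06, a Monday. A small standalone check:

import datetime
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import dayofweek

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([Row(date=datetime.datetime(2017, 11, 5)),    # Sunday
                            Row(date=datetime.datetime(2017, 11, 11))])  # Saturday
df.select(dayofweek('date')).show()  # 1 and 7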
Example #10
def create_time_dimension(input_df, output_data):
    """
        Gather time data, create dataframe and write data into parquet files.
        
        :param input_df: dataframe of input data.
        :param output_data: path to write data to.
        :return: dataframe representing time dimension
    """
    
    from datetime import datetime, timedelta
    from pyspark.sql import types as T
    
    def convert_datetime(x):
        try:
            start = datetime(1960, 1, 1)
            return start + timedelta(days=int(x))
        except (TypeError, ValueError):
            return None
    
    udf_datetime_from_sas = udf(lambda x: convert_datetime(x), T.DateType())

    df = input_df.select(["arrdate"])\
                .withColumn("arrival_date", udf_datetime_from_sas("arrdate")) \
                .withColumn('day', F.dayofmonth('arrival_date')) \
                .withColumn('month', F.month('arrival_date')) \
                .withColumn('year', F.year('arrival_date')) \
                .withColumn('week', F.weekofyear('arrival_date')) \
                .withColumn('weekday', F.dayofweek('arrival_date'))\
                .select(["arrdate", "arrival_date", "day", "month", "year", "week", "weekday"])\
                .dropDuplicates(["arrdate"])
    
    tools.write_to_parquet(df, output_data, "time")
    
    return df
Example #11
def process_log_data(spark, input_data, output_data):
    """
    Aim:    To read the json files from S3 and write the related columns to the users, time and songplays tables
    Input:  spark, input_data (source files url), output_data (target files url)
    Output: users, time and songplays tables will be written to the given target url.
    """
    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*"

    # read log data file
    log_df = spark.read.json(log_data)

    # filter by actions for song plays
    log_df = log_df[log_df['page'] == 'NextSong']

    # extract columns for users table
    ufcols = ["userId", "firstName", "lastName", "gender", "level"]
    users_table = log_df[ufcols].dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, "users"), "overwrite")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000.0).strftime(
        '%Y-%m-%d %H:%M:%S'))
    time_table = log_df[["ts"]]
    time_table = time_table.withColumn("start_time", to_timestamp(get_timestamp(time_table.ts)))\
                      .withColumn("hour", hour(to_timestamp(get_timestamp(time_table.ts))))\
                      .withColumn("day", dayofmonth(to_timestamp(get_timestamp(time_table.ts))))\
                      .withColumn("week", weekofyear(to_timestamp(get_timestamp(time_table.ts))))\
                      .withColumn("month", month(to_timestamp(get_timestamp(time_table.ts))))\
                      .withColumn("year", year(to_timestamp(get_timestamp(time_table.ts))))\
                      .withColumn("weekday", dayofweek(to_timestamp(get_timestamp(time_table.ts))))
    time_table = time_table[[
        "start_time", "hour", "day", "week", "month", "year", "weekday"
    ]].dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(
        os.path.join(output_data, "time"), "overwrite")

    # read in song data to use for songplays table (assumed path, following the layout used elsewhere in these examples)
    song_df = spark.read.json(input_data + "song_data/*/*/*/*.json")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = song_df.join(log_df, (song_df.title == log_df.song) &
                                   (song_df.duration == log_df.length) &
                                   (song_df.artist_name == log_df.artist))
    songplays_table = songplays_table.withColumn("songplay_id",
                                                 monotonically_increasing_id())
    songplays_table = songplays_table[[
        'songplay_id',
        from_unixtime(
            (songplays_table.ts.cast('bigint') /
             1000)).cast('timestamp').alias('start_time'), 'userId', 'level',
        'song_id', 'artist_id', 'sessionId', 'artist_location', 'userAgent'
    ]]
    songplays_table = songplays_table.withColumn(
        "year", year("start_time")).withColumn("month", month("start_time"))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(
        os.path.join(output_data, "songplays"), "overwrite")
Example #12
def process_time_table(df, output_data):
    '''
    This function selects the columns needed to build the time table, then writes the data into parquet files.
    
    Input:
        - a dataframe of log data.
        - an S3 path where parquet files will be stored.
        
    Following steps will be performed:
        - From log dataframe, select columns and rename them. Result will be kept in a 'time_table' dataframe.
        - Write 'time_table' data into parquet files.

    '''

    # extract columns to create time table
    time_table = df.select(df.datetime.alias('start_time'),
                           F.hour(df.datetime).alias('hour'),
                           F.dayofmonth(df.datetime).alias('day'),
                           F.weekofyear(df.datetime).alias('week'),
                           F.month(df.datetime).alias('month'),
                           F.year(df.datetime).alias('year'),
                           F.dayofweek(df.datetime).alias('weekday'))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year',
                                 'month').parquet(output_data + "time.parquet")
Example #13
def write_time_table(
    df: DataFrame,
    output_path,
    filename: str = "time.csv",
) -> DataFrame:
    """
    Extract a time dimension table from the staging data and save it in the csv format.
    Args:
        df: Staging dataframe containing source data.
        output_path: Path to where the resulting csv files are saved.
        filename: Optional filename of the resulting csv directory in the output path.


    Returns: A time dimension dataframe.
    """
    time_table = df.drop_duplicates(["date"]).where(f.col("date").isNotNull())
    time_table = (time_table.select(["date"])
                  .withColumn("year", f.year(time_table["date"]))
                  .withColumn("month", f.month(time_table["date"]))
                  .withColumn("day", f.dayofmonth(time_table["date"]))
                  .withColumn("week", f.weekofyear(time_table["date"]))
                  .withColumn("weekday", f.dayofweek(time_table["date"])))
    if output_path:
        time_table.write.csv(f"{output_path}/{filename}")
        logger.info("Saved time table")
    return time_table
Example #14
def create_date_table(df, output_path="./output_data/date"):
    """ Creates a date dimension table based off of the arrival dates from the 
    immigration events.

    Args:
        df (Spark dataframe): input Spark dataframe with immigration data
        output_path (str): path to write the output dataframe to
    Return:
        DataFrame corresponding to date dimension table
    """

    # udf to convert arrival date to datetime object
    get_datetime = udf(lambda x: (dt.datetime(1960, 1, 1).date() +
                                  dt.timedelta(x)).isoformat() if x else None)

    # use arrival date to create dataframe
    df_date = df.select(['arrdate']) \
        .withColumn("arrdate", get_datetime(df.arrdate)) \
        .distinct()
    df_date = df_date.withColumn('date_id', monotonically_increasing_id())
    df_date = df_date.withColumn('arrival_day', dayofmonth('arrdate'))
    df_date = df_date.withColumn('arrival_week', weekofyear('arrdate'))
    df_date = df_date.withColumn('arrival_month', month('arrdate'))
    df_date = df_date.withColumn('arrival_year', year('arrdate'))
    df_date = df_date.withColumn('arrival_weekday', dayofweek('arrdate'))

    # write the calendar dimension to parquet file
    partition_columns = ['arrival_year', 'arrival_month', 'arrival_week']
    df_date.write.parquet(output_path,
                          partitionBy=partition_columns,
                          mode="overwrite")

    return df_date
Example #15
def createDimDate(df):
    '''
    Creates date dimensional table from DateTime field in upstream dataframe
    :param df:
    :return: Date Dimensional Dataframe
    '''
    df = df.withColumn("rawKey", F.col('rawKey'))
    df = df.withColumn("year", F.year(F.col('DateTime')))
    df = df.withColumn("month", F.month(F.col('DateTime')))
    df = df.withColumn("dayofmonth", F.dayofmonth(F.col('DateTime')))
    df = df.withColumn("dayofweek", F.dayofweek(F.col('DateTime')))
    df = df.withColumn("dayofyear", F.dayofyear(F.col('DateTime')))
    df = df.withColumn("hour", F.hour(F.col('DateTime')))
    df = df.withColumn("minute", F.minute(F.col('DateTime')))
    df = df.withColumn("dateMinute", F.date_format(F.col("DateTime"), "yyyyMMddHHmm"))
    df = df.withColumn("quarter", F.quarter(F.col('DateTime')))
    df = df.withColumn("date", F.to_date(F.col('DateTime')))
    df.createOrReplaceTempView('tempDimDateTable')
    dimDateDF = spark.sql("""
        SELECT * FROM (
            SELECT rawKey, dateMinute, dateTime, date, year, month, dayofmonth,
                   dayofweek, dayofyear, hour, minute, quarter
            FROM tempDimDateTable
            GROUP BY rawKey, dateMinute, dateTime, date, year, month, dayofmonth,
                     dayofweek, dayofyear, hour, minute, quarter
            ORDER BY dateMinute ASC
        )
    """)

    # Generating dateKey field
    dimDateDF = dimDateDF.withColumn('dateKey', F.monotonically_increasing_id() + 1)
    # Creating dataframe including date field which will help to generate Fact table
    factHelperDateDF = dimDateDF.select(F.col('rawKey'), F.col('dateKey'), F.col('dateMinute'))
    # Dropping unnecessary rawKey field
    dimDateDF = dimDateDF.drop(F.col('rawKey'))
    return dimDateDF, factHelperDateDF
Example #16
def generate_full_time_table(spark, output_folder):
    """
    This creates a full time table up to the year 2060. This should reduce processing
    time a bit, since the dates don't have to be processed each time new facts are added.

    Parameters
    ----------
    spark: SparkSession
    output_folder : str
        Folder path of processed parquet file.
    """
    logger.info('Creating dates table')
    days_till_2060 = range(int(100 * 365.25))
    all_dates = [(t, ) for t in days_till_2060]

    t_schema = T.StructType([T.StructField('i_date', T.IntegerType())])
    timeframe = spark.createDataFrame(all_dates, t_schema)
    timeframe = timeframe.withColumn(
        "dt_date", F.expr("date_add(to_date('1960-01-01'), i_date)"))
    timeframe = timeframe.select('i_date', 'dt_date',
                                 F.year('dt_date').alias('year'),
                                 F.month('dt_date').alias('month'),
                                 F.dayofmonth('dt_date').alias('day'),
                                 F.dayofweek('dt_date').alias('weekday'))

    timeframe.write\
        .parquet(os.path.join(output_folder, 'dates.parquet'), 'overwrite')
    logger.info('dates.parquet written')
Example #17
def clean_data(input_df, threshold=15000):
    """Clean data"""
    # Set counts that correspond to time_diff >= 8h to NaNs
    # Take absolute values of entries_count and exits_count
    tmp_df = (
        input_df
        .withColumn("entries_count", F.when(F.col("time_diff") >= 8, None).otherwise(
            F.abs(F.col("entries_count"))))
        .withColumn("exits_count", F.when(F.col("time_diff") >= 8, None).otherwise(
            F.abs(F.col("exits_count"))))
    )

    # Replace absolute values of entries_count and exits_count > 15,000 with NaNs
    tmp_df = (
        tmp_df
        .withColumn("entries_count", F.when(F.col("entries_count") > threshold, None).otherwise(
            F.col("entries_count")))
        .withColumn("exits_count", F.when(F.col("exits_count") > threshold, None).otherwise(
            F.col("exits_count")))
    )

    # Impute NaNs with average counts of the same turnstile, hour & day of week
    tmp_df = (
        tmp_df
        .withColumn("hour", F.hour("time_rounded"))
        .withColumn("wkdy", F.dayofweek("time_rounded"))
    )
    tmp_df = impute_nans(tmp_df, "entries_count")
    tmp_df = impute_nans(tmp_df, "exits_count")

    # Compute traffic
    output_df = tmp_df.withColumn("traffic", F.col("entries_count") + F.col("exits_count"))
    return output_df
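The impute_nans helper used above is not shown in this snippet. A minimal sketch of one way it might work, assuming it fills nulls with the average of the same column over the turnstile/hour/weekday combination described in the comment (the 'turnstile_id' key column is hypothetical):

from pyspark.sql import DataFrame, Window
from pyspark.sql import functions as F

def impute_nans(df: DataFrame, count_col: str) -> DataFrame:
    # average per (turnstile, hour, weekday); avg() ignores nulls,
    # coalesce() keeps the original value where it is present.
    # 'turnstile_id' is a hypothetical identifier column.
    w = Window.partitionBy('turnstile_id', 'hour', 'wkdy')
    return df.withColumn(count_col,
                         F.coalesce(F.col(count_col), F.avg(count_col).over(w)))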
Example #18
def create_time_table(log_data: DataFrame) -> DataFrame:
    """
    Process the log data and create table "time". Processing:
    - timestamp cannot be empty
    - select distinct timestamps
    - convert unix timestamp into timestamp
    - extract date parts

    Parameters
    ----------
    log_data : pyspark.sql.dataframe.DataFrame
        Log data.

    Returns
    -------
    time : pyspark.sql.dataframe.DataFrame
    """
    return (log_data.where(
        F.col('ts').isNotNull()).select('ts').distinct().withColumn(
            'start_time',
            F.from_unixtime(F.col('ts') / 1000).cast('timestamp')).select(
                'start_time',
                F.hour('start_time').alias('hour'),
                F.dayofmonth('start_time').alias('day'),
                F.weekofyear('start_time').alias('week'),
                F.month('start_time').alias('month'),
                F.year('start_time').alias('year'),
                F.dayofweek('start_time').alias('weekday')))
Example #19
    def _transform(self, df):

        time_variable = self.getColumn()
        new_time_variable = time_variable + '_new'

        # code from tawab: convert all times to the same format

        df = df.withColumn(
            new_time_variable,
            self.udf_date_formatting()(
                funct.col(time_variable).cast("String")))
        df = df.withColumn(
            new_time_variable,
            funct.from_unixtime(
                funct.unix_timestamp(new_time_variable,
                                     self.time_format)).cast(TimestampType()))

        df = df.withColumn(time_variable + '_year',
                           funct.year(new_time_variable))
        df = df.withColumn(time_variable + '_month',
                           funct.month(new_time_variable))
        df = df.withColumn(time_variable + '_day',
                           funct.dayofmonth(new_time_variable))
        df = df.withColumn(time_variable + '_dayofweek',
                           funct.dayofweek(new_time_variable))
        df = df.withColumn(time_variable + '_hour',
                           funct.hour(new_time_variable))
        df = df.withColumn(time_variable + '_minutes',
                           funct.minute(new_time_variable))
        df = df.withColumn(time_variable + '_seconds',
                           funct.second(new_time_variable))

        df = df.drop(new_time_variable)
        df = df.drop(time_variable)
        return df
Example #20
def process_log_data(spark, input_data, output_data):
    """
    Process the log data file from S3, create the users, time and songplays tables and load them back to S3
    
    """
    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    user_table = df.select(col("userId"), col("firstName"), col("lastName"),
                           col("gender"), col("level")).distinct()

    # write users table to parquet files
    user_table.write.parquet("{}users/users_table.parquet".format(output_data))

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000),
                        TimestampType())
    df = df.withColumn("timestamp", get_timestamp(col("ts")))

    # create start_time column from original timestamp column
    df = df.withColumn("start_time", get_timestamp(col("ts")))

    # extract columns to create time table
    df = df.withColumn("hour", hour("timestamp"))
    df = df.withColumn("day", dayofmonth("timestamp"))
    df = df.withColumn("month", month("timestamp"))
    df = df.withColumn("year", year("timestamp"))
    df = df.withColumn("week", weekofyear("timestamp"))
    df = df.withColumn("weekday", dayofweek("timestamp"))

    time_table = df.select(col("start_time"), col("hour"), col("day"), col("week"), \
                           col("month"), col("year"), col("weekday")).distinct()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(
        "{}time/time_table.parquet".format(output_data))

    # read in song data to use for songplays table
    song_df = spark.sql(
        "SELECT DISTINCT song_id, artist_id, artist_name FROM song_data_table")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, song_df.artist_name == df.artist, "inner") \
        .distinct() \
        .select(col("start_time"), col("userId"), col("level"), col("sessionId"), \
                col("location"), col("userAgent"), col("song_id"), col("artist_id")) \
        .withColumn("songplay_id", monotonically_increasing_id())

    # add year and month columns, then write songplays table to parquet files partitioned by year and month
    songplays_table = songplays_table.withColumn("year", year("start_time")) \
                                     .withColumn("month", month("start_time"))
    songplays_table.write.partitionBy("year", "month").parquet(output_data +
                                                               'songplays/')
Example #21
def process_log_data(spark):
    """
    Extract and load log data from S3, then transform it into users, times, and songplays parquet files in S3.
    Times and songplays parquet files are partitioned by year and month
    
    Arguments:
        Spark session
    
    Returns:
        None
    """
    
    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"

    # read log data file
    df = spark.read.json(log_data)
    
    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # extract columns for users table    
    users_table = df.select("userId", "firstName", "lastName", "gender", "level")
    
    # write users table to parquet files
    users_table.write.mode('overwrite').parquet(output_data + "users/users.parquet")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x:datetime.fromtimestamp(x/1000), TimestampType())
    df = df.withColumn("timestamp", get_timestamp(df.ts))
    
    # create datetime column from original timestamp column
    get_datetime = udf(lambda x:datetime.fromtimestamp(x/1000), DateType())
    df = df.withColumn("datetime", get_datetime(df.ts))
    
    # extract columns to create time table
    time_table = df.select(col("timestamp").alias("start_time"), 
                           hour(df.timestamp).alias("hour"), 
                           dayofmonth(df.timestamp).alias("day"), 
                           weekofyear(df.timestamp).alias("week"),
                           month(df.timestamp).alias("month"),
                           year(df.timestamp).alias("year"),
                           dayofweek(df.timestamp).alias("weekday")).drop_duplicates()
    
    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").mode('overwrite').parquet(output_data + "times/times.parquet")

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + "songs/songs.parquet")

    # read in artist data to use for songplays table
    artist_df = spark.read.parquet(output_data + "artists/artists.parquet")
    
    # extract columns from joined song, artist, and log datasets to create songplays table
    joinedSongArtist_df = song_df.join(artist_df, ['artist_id'])
    songplays_table = df.join(joinedSongArtist_df,
                              (df.song == joinedSongArtist_df.title) &
                              (df.artist == joinedSongArtist_df.artist_name) &
                              (df.length == joinedSongArtist_df.duration),
                              "left") \
                        .select(col("timestamp").alias("start_time"),
                                year("timestamp").alias("year"),
                                month("timestamp").alias("month"),
                                col("userId").alias("user_id"),
                                df.level,
                                joinedSongArtist_df.song_id,
                                col("artist_id"),
                                col("sessionId").alias("session_id"),
                                df.location,
                                col("userAgent").alias("user_agent")) \
                        .drop_duplicates()
    
    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").mode('overwrite').parquet(output_data + "songplays/songplays.parquet")
Example #22
def process_log_data(spark, logs_df, songs_df, output_location):
    # filter by actions for song plays
    logs_df = logs_df.filter("page = 'NextSong'")

    # extract user data
    users_table = logs_df.select([
        logs_df.userId.alias("user_id"),
        logs_df.firstName.alias("first_name"),
        logs_df.lastName.alias("last_name"), logs_df.gender, logs_df.level
    ])

    users_table = users_table.distinct()

    # write users table to parquet files
    users_table.write.mode("overwrite").parquet(output_location +
                                                '/users.parquet')
    users_table.show(2)

    # build time fact table
    time_df = logs_df.select(getDateTime(logs_df.ts).alias("timestamp"))
    time_table = time_df.select([
        time_df.timestamp.alias("start_time"),
        F.hour(time_df.timestamp).alias("hour"),
        F.year(time_df.timestamp).alias("year"),
        F.dayofmonth(time_df.timestamp).alias("day"),
        F.month(time_df.timestamp).alias("month"),
        F.dayofweek(time_df.timestamp).alias("weekday")
    ])

    time_table = time_table.distinct()
    time_table.show(2)

    # write time table to parquet files partitioned by year and month
    time_table.write.mode("overwrite").partitionBy(
        "year", "month").parquet(output_location + '/time.parquet')

    # extract columns from joined song and log datasets to create songplays table
    songs_df.createOrReplaceTempView("songs")
    logs_df.createOrReplaceTempView("logs")
    spark.udf.register("getDateTime", getDateTime)

    songplays_table = spark.sql("""select
        getDateTime(l.ts) as start_time, l.userId as user_id,l.level,
        s.song_id, s.artist_id, l.location, l.userAgent as user_agent
        from logs as l  left outer join songs as s
        on s.artist_name = l.artist
        and s.title = l.song
        and s.duration = l.length
    """)

    songplays_table = songplays_table.withColumn(
        "year", F.year(songplays_table.start_time))

    songplays_table = songplays_table.withColumn(
        "month", F.month(songplays_table.start_time))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.mode("overwrite").partitionBy(
        "year", "month").parquet(output_location + '/song_plays.parquet')
Example #23
def process_log_data(spark, input_data, output_data):
    """
    Description:
                Function loads data from logs in the Udacity S3 bucket, processes it and, after extracting the users, time and songplays tables,
                loads them into my S3 bucket. The log df is filtered on page == 'NextSong'.
    Parameters:
                spark       : Earlier initiated spark session
                input_data  : Udacity location of log data in json format for events
                output_data : my S3 bucket where the dimensional tables in parquet format will be written
                return      : None
    """

    log_data = input_data + 'log_data/*/*/*.json'

    df = spark.read.json(log_data)

    df = df.filter(df.page == 'NextSong')

    users_fields = [
        "userId as user_id", "firstName as first_name",
        "lastName as last_name", "gender", "level"
    ]
    users_table = df.selectExpr(users_fields).dropDuplicates()

    users_table.write.mode('overwrite').parquet(output_data + 'users/')

    get_timestamp = udf(lambda x: datetime.utcfromtimestamp(int(x) / 1000),
                        TimestampType())
    df = df.withColumn("start_time", get_timestamp('ts'))

    time_table = df.withColumn("hour",hour("start_time"))\
                    .withColumn("day",dayofmonth("start_time"))\
                    .withColumn("week",weekofyear("start_time"))\
                    .withColumn("month",month("start_time"))\
                    .withColumn("year",year("start_time"))\
                    .withColumn("weekday",dayofweek("start_time"))\
                    .select("ts","start_time","hour", "day", "week", "month", "year", "weekday").drop_duplicates()

    time_table.write.mode('overwrite').partitionBy(
        "year", "month").parquet(output_data + 'time/')

    song_df = spark.read\
                .format("parquet")\
                .option("basePath", os.path.join(output_data, "songs/"))\
                .load(os.path.join(output_data, "songs/*/*/"))

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, df.song == song_df.title, how='inner')\
    .select(monotonically_increasing_id().alias("songplay_id"),col("start_time"),col("userId").alias("user_id"),"level","song_id","artist_id", col("sessionId").alias("session_id"), "location", col("userAgent")\
            .alias("user_agent"))

    songplays_table = songplays_table.join(time_table, songplays_table.start_time == time_table.start_time, how="inner")\
                        .select("songplay_id", songplays_table.start_time, "user_id", "level", "song_id", "artist_id", "session_id", "location", "user_agent", "year", "month")

    # write songplays table to parquet files partitioned by year and month
    songplays_table.drop_duplicates().write.parquet(
        os.path.join(output_data, "songplays/"),
        mode="overwrite",
        partitionBy=["year", "month"])
Example #24
def add_timedelta_columns(df):
  df = df.withColumn("DEP_LOCAL_TIMESTAMP_PLUS_2", df.DEP_LOCAL_TIMESTAMP + expr('INTERVAL 2 HOURS'))
  df = df.withColumn("DEP_LOCAL_HOUR_PLUS_2", hour(df.DEP_LOCAL_TIMESTAMP_PLUS_2))
  df = df.withColumn("MONTH_PLUS_2", month(df.DEP_LOCAL_TIMESTAMP_PLUS_2))
  df = df.withColumn("DAY_OF_WEEK_PLUS_2", dayofweek(df.DEP_LOCAL_TIMESTAMP_PLUS_2))
  df = df.withColumn("FL_DATETIMEHOUR", (unix_timestamp("FL_DATETIME") + col("DEP_LOCAL_HOUR")*60*60).cast('timestamp'))  
  df = df.withColumn("FL_DATETIMEHOUR_PLUS_2", df.FL_DATETIMEHOUR + expr('INTERVAL 2 HOURS') )
  return df
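The expr('INTERVAL 2 HOURS') additions above use Spark's built-in interval arithmetic to shift a timestamp column without a UDF. A minimal standalone sketch (column names are illustrative):

from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, hour

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(datetime(2019, 1, 1, 23, 30),)], ['dep_ts'])

# 23:30 plus two hours rolls over to 01:30 on the next day
df = df.withColumn('dep_ts_plus_2', df.dep_ts + expr('INTERVAL 2 HOURS'))
df.select(hour('dep_ts_plus_2')).show()  # 1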
Example #25
def process_log_data(spark, input_data, output_data):
    """Insert records into time, users, and songplays tables using data from log and song files."""

    # get filepath to log data file
    log_data = os.path.join(input_data, "log_data/*/*/*.json")

    # read log data file
    df = spark.read.json(log_data)

    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    users_table = df.selectExpr([
        "userId as user_id", "firstName as first_name",
        "lastName as last_name", "gender", "level"
    ]).dropDuplicates()

    users_table.write.parquet(os.path.join(output_data, 'users/'), 'overwrite')

    # create start_time column from ts column and convert it to timestamp
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000).strftime(
        '%Y-%m-%d %H:%M:%S'))
    df = df.withColumn("start_time", get_timestamp(df.ts))

    # extract columns to create time table
    time_table = df.select("start_time").dropDuplicates().withColumn(
        "hour", hour(col("start_time"))).withColumn(
            "day", dayofmonth(col("start_time"))).withColumn(
                "week", weekofyear(col("start_time"))).withColumn(
                    "month", month(col("start_time"))).withColumn(
                        "year", year(col("start_time"))).withColumn(
                            "weekday", dayofweek(col("start_time")))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(
        os.path.join(output_data, 'time/'), 'overwrite')

    # read in song data to use for songplays table
    song_df = spark.read.json(
        os.path.join(input_data, "song_data/*/*/*/*.json"))

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(
        song_df, (df.song == song_df.title) &
        (df.artist == song_df.artist_name) & (df.length == song_df.duration),
        'left_outer').select(df.start_time, df.userId.alias("user_id"),
                             df.level, song_df.song_id, song_df.artist_id,
                             df.sessionId.alias("session_id"), df.location,
                             df.userAgent.alias("user_agent")).withColumn(
                                 "songplay_id",
                                 monotonically_increasing_id()).withColumn(
                                     "year",
                                     year(col("start_time"))).withColumn(
                                         "month", month(col("start_time")))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'songplays'), 'overwrite')
Example #26
def process_log_data(spark, schema, input, output):
    """
    Processes log_data and creating users, time, and songplays tables.

    Params:
        spark(spark.SparkSession): A SparkSession object
        schema(spark.sql.types.StructType): A StructType object that represents the schema of the table
        input(str): Input path
        output(str): Output path
    """
    logging.info("PROCESSING log_data...")
    # Read log_data
    df = (spark.read.json(input, schema = schema, mode = "PERMISSIVE")) \
                .where(F.col("page") == "NextSong")

    # Create users table and write as parquet
    logging.info("CREATING users...")
    users_table = df.selectExpr(["userId as user_id", "firstName as first_name",
                                 "lastName as last_name", "gender", "level"]) \
                    .dropDuplicates()
    users_table.write \
               .partitionBy("last_name") \
               .parquet(os.path.join(output, "users"), mode = "overwrite")

    # Create time table and write as parquet
    logging.info("CREATING time...")
    df = df.withColumn(
        "start_time",
        F.to_timestamp(F.from_unixtime(df["ts"] / 1000),
                       format="yyyy-MM-dd HH:mm:ss"))
    time_table = df.select(["start_time", F.hour("start_time").alias("hour"),
                            F.dayofmonth("start_time").alias("day"), F.weekofyear("start_time").alias("week"),
                            F.month("start_time").alias("month"), F.year("start_time").alias("year"),
                            F.dayofweek("start_time").alias("weekday")]) \
                   .dropDuplicates()
    time_table.write \
              .partitionBy(["year", "month"]) \
              .parquet(os.path.join(output, "time"), mode = "overwrite")

    # Read songs table
    df_songs = spark.read.parquet(os.path.join(output, "songs"))
    df = df.join(df_songs, df.song == df_songs.title, how="inner")
    df = df.withColumn("songplay_id", F.monotonically_increasing_id() + 1)

    # Create songplays table and write as parquet
    logging.info("CREATING songplays...")
    songplays_table = df.select([
        "songplay_id", "start_time",
        F.col("userId").alias("user_id"), "level",
        df_songs.song_id.alias("song_id"), "artist_id",
        F.col("sessionId").alias("session_id"), "location",
        F.col("userAgent").alias("user_agent")
    ])

    songplays_table.write \
                   .parquet(os.path.join(output, "songplays"), mode = "overwrite")

    return logging.info("PROCESSED log_data.")
Example #27
def process_log_data(spark, input_data, output_data):
    """
    Load data from log_data dataset and extract columns for user and time tables. 
    Data is written to parquet files and stored on S3
    """
    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"

    # read log data file
    df = spark.read.json(log_data)
    
    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # extract columns for users table    
    users_table = df.select("userId", "firstName", "lastName", "gender", "level").dropDuplicates()
    
    # write users table to parquet files
    users_table.write.mode("overwrite").parquet(output_data + "users_table/users.parquet")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000), TimestampType())
    df = df.withColumn("start_time", get_timestamp(df.ts))
    
    # extract columns to create time table
    time_table = df.select("start_time") \
                        .withColumn("hour", hour("start_time")) \
                        .withColumn("day", dayofmonth("start_time")) \
                        .withColumn("week", weekofyear("start_time")) \
                        .withColumn("month", month("start_time")) \
                        .withColumn("year", year("start_time")) \
                        .withColumn("weekday", dayofweek("start_time")) \
                        .dropDuplicates()
    
    
    # write time table to parquet files partitioned by year and month
    time_table.write.mode("overwrite").partitionBy("year", "month").parquet(output_data+"time_table/time.parquet")

    # read in song data to use for songplays table
    song_df = spark.read.json(input_data + "song_data/*/*/*/*.json")

    # extract columns from joined song and log datasets to create songplays table 
    joined_df = df.join(song_df, song_df.artist_name == df.artist, "inner")
    songplays_table = joined_df.select(
        col("start_time"),
        col("userId").alias("user_id"),
        col("level"),
        col("song_id"),
        col("artist_id"),
        col("sessionId").alias("session_id"),
        col("location"),
        year("start_time").alias("year"),
        month("start_time").alias("month"),
        col("userAgent").alias("user_agent")) \
        .withColumn("songplay_id", monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.mode("overwrite").partitionBy("year", "month").parquet(output_data+"songplays/songplays.parquet")
Example #28
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = os.path.join(input_data, "log_data", "*", "*", "*.json")

    # read log data file
    df = spark.read.json(log_data)
    
    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table  
    user_columns = ['userId as user_id', 'firstName as first_name', 'lastName as last_name', 'gender', 'level']
    users_table = df.selectExpr(user_columns).dropDuplicates()
    
    # write users table to parquet files
    users_table.write.parquet(path = os.path.join(output_data, "users"), mode = "overwrite")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x/1000), TimestampType())
    df = df.withColumn('timestamp', get_timestamp('ts'))
    
    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x / 1000).date(), DateType())
    df = df.withColumn('datetime', get_datetime('ts'))
    
    # extract columns to create time table
    time_table = df.selectExpr(["timestamp as start_time"])
    time_table = time_table.withColumn('hour', hour('start_time'))
    time_table = time_table.withColumn('day', dayofmonth('start_time'))
    time_table = time_table.withColumn('week', weekofyear('start_time'))
    time_table = time_table.withColumn('month', month('start_time'))
    time_table = time_table.withColumn('year', year('start_time'))
    time_table = time_table.withColumn('weekday', dayofweek('start_time'))
    
    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(path = os.path.join(output_data, "time"), mode = "overwrite")

    # read in song data to use for songplays table
    song_df = spark.read.parquet(os.path.join(output_data, 'songs'))

    # extract columns from joined song and log datasets to create songplays table 
    songplays_table = df \
        .join(song_df, song_df.title == df.song, 'inner') \
        .orderBy('ts') \
        .selectExpr(["timestamp as start_time",
                    "userId as user_id", "level",
                    "song_id",
                    "artist_id", 
                    "sessionId as session_id", 
                    "location", 
                    "userAgent as user_agent", 
                    "year(timestamp) as year", 
                    "month(timestamp) as month"]) \
        .withColumn('songplay_id', monotonically_increasing_id())
        

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(path = os.path.join(output_data, "songplays"), mode = "overwrite")
Example #29
def process_log_data(spark, input_data_log, output_data):
    """ 
    1. takes a spark session, input data path, output data path
    2. reads log data from input_data and songs table that is created in process_song_data
    3. processes the data, creates users, time, and songplays tables
    4. loads the tables to output_data
    """

    # read log data file
    df = spark.read.json(input_data_log)

    # cast user id to int
    df = df.withColumn('user_id', df['userId'].cast('int'))

    # filter by actions for song plays
    df = df.where(df.page == "NextSong")

    # create tmp view
    df.createOrReplaceTempView('tmp')

    # extract columns for users table
    users_table = spark.sql(users_table_query)
    users_table = users_table.dropDuplicates(['user_id'])

    # write users table to parquet files
    users_table.write.parquet(output_data + 'users')

    # create timestamp column from original timestamp column
    df = df.withColumn('start_time', from_unixtime(df.ts / 1000.0))
    df = df.withColumn('hour', hour(df.start_time))
    df = df.withColumn('day', dayofmonth(df.start_time))
    df = df.withColumn('week', weekofyear(df.start_time))
    df = df.withColumn('weekday', dayofweek(df.start_time))
    df = df.withColumn('month', month(df.start_time))
    df = df.withColumn('year', year(df.start_time))

    # create a temp view
    df.createOrReplaceTempView('tmp')

    # extract columns to create time table
    times_table = spark.sql(times_table_query)
    times_table = times_table.dropDuplicates(['start_time'])

    # write time table to parquet files partitioned by year and month
    times_table.write.partitionBy(['year',
                                   'month']).parquet(output_data + 'times')

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + 'songs')

    # extract columns from joined song and log datasets to create songplays table
    song_df.createOrReplaceTempView('tmp2')
    songplays_table = spark.sql(songplays_table_query)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.withColumn('year', year('start_time'))\
    .withColumn('month', month('start_time'))\
    .write.partitionBy(['year', 'month']).parquet(output_data + 'songplays')
Example #30
def i94_dates(spark, df):
    """
    Build the i94_dates DataFrame which is another dimension; this is
    built from the arrival_date field in the immigrations dataset
    :param spark - A Pyspark object
    :param df - A Pyspark cleaned DataFrame
    Returns - A cleaned dimensional DataFrame
    """
    i94_dates = (df.select(
        F.col('i94_year'), F.col('i94_month'), F.col('arrival_date'),
        F.year('arrival_date').alias('year'),
        F.month('arrival_date').alias('month'),
        F.dayofmonth('arrival_date').alias('day'),
        F.dayofweek('arrival_date').alias('dayofweek'),
        F.when((F.dayofweek('arrival_date') == 1) |
               (F.dayofweek('arrival_date') == 7), 'weekend').otherwise(
                   'weekday').alias('is_weekend')).dropDuplicates())
    return i94_dates
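Since dayofweek returns 1 for Sunday and 7 for Saturday, the F.when expression above flags exactly the weekend days. The same test can be written with isin(); a minimal sketch (2017-11-05 is a Sunday):

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([('2017-11-05',), ('2017-11-06',)], ['arrival_date']) \
          .withColumn('arrival_date', F.to_date('arrival_date'))

# isin(1, 7) is equivalent to (dayofweek == 1) | (dayofweek == 7)
df.select('arrival_date',
          F.dayofweek('arrival_date').isin(1, 7).alias('is_weekend')).show()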