Example #1
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = input_data + "log_data"

    # read log data file
    #df = spark.read.json(log_data + "/2018/11/2018-11-*.json")
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    users_table = df.select(
        ['userId', 'firstName', 'lastName', 'gender', 'level'])

    # write users table to parquet files
    users_table.write.mode('overwrite').parquet(output_data + 'users')

    # create timestamp column from original timestamp column
    get_timestamp = F.udf(lambda x: datetime.fromtimestamp((x / 1000.0)),
                          T.TimestampType())
    df = df.withColumn("timestamp", get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x / 1000.0).strftime(
        "%Y-%m-%d %H:%M:%S"))
    df = df.withColumn("start_time", get_datetime(df.ts))

    df = df.withColumn("hour", hour(col("start_time")))\
           .withColumn("day", dayofmonth(col("start_time")))\
           .withColumn("week", weekofyear(col("start_time")))\
           .withColumn("month", month(col("start_time")))\
           .withColumn("year", year(col("start_time")))\
           .withColumn("weekday", dayofweek(col("start_time")))

    # extract columns to create time table
    time_table = df.select([
        'timestamp', 'start_time', 'hour', 'day', 'week', 'month', 'year',
        'weekday'
    ])

    # write time table to parquet files partitioned by year and month
    time_table.write.mode('overwrite').partitionBy(
        'year', 'month').parquet(output_data + 'time')

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + '/songs/')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = song_df.select(['song_id', 'title', 'artist_id', 'artist_name', 'duration']) \
        .join(df, (song_df.artist_name == df.artist) & (song_df.title == df.song)) \
        .select(['ts', 'userId', 'level', 'song_id', 'artist_id', 'sessionId',
                 'location', 'userAgent', 'year', 'month'])

    songplays_table = songplays_table.withColumn(
        "songplay_id",
        row_number().over(Window.orderBy(monotonically_increasing_id())) - 1)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.mode('overwrite').partitionBy(
        'year', 'month').parquet(output_data + 'songplays')
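For context, a minimal driver that wires this function up might look like the sketch below; the SparkSession setup and the bucket paths are assumptions rather than part of the original example.

from pyspark.sql import SparkSession

def main():
    # Minimal driver sketch; the bucket paths below are placeholders.
    spark = SparkSession.builder.appName("sparkify-etl").getOrCreate()
    input_data = "s3a://udacity-dend/"        # assumed input bucket
    output_data = "s3a://my-sparkify-lake/"   # assumed output bucket
    process_log_data(spark, input_data, output_data)
    spark.stop()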
Example #2
def process_log_data(spark, input_data, output_data):
    """
    Loads data from S3, processes it to event tables, which are saved back to S3
    
    Input:
    spark:          A SparkSession instance
    input_data:     location of the json files for processing
    output_data:    S3 bucket for outputting dimensional data in parquet format
    """
    # get filepath to log data file
    # Documentation example from Udacity: log_data/2018/11/2018-11-12-events.json
    log_data = input_data + "log_data/*/*/*.json"

    # read log data file
    df = spark.read.json(log_data, 
                         columnNameOfCorruptRecord='corrupt_record').drop_duplicates()
    
    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # extract columns for users table    
    users_table = df.select(df.userId.alias("user_id"),
                            df.firstName.alias("first_name"),
                            df.lastName.alias("last_name"),
                            df.gender,
                            df.level).drop_duplicates()
    
    # write users table to parquet files
    users_table.write.parquet(output_data + "users/", 
                              mode="overwrite")

    # create timestamp column from original timestamp column
    get_timestamp = F.udf(lambda x : datetime.utcfromtimestamp(int(x)/1000), T.TimestampType())
    df = df.withColumn("start_time", get_timestamp('ts'))
    
    # create datetime column from original timestamp column
    # (not needed here: start_time above already holds the converted timestamp)
    
    # extract columns to create time table
    time_table = df.withColumn("start_time", F.col("start_time")) \
                   .withColumn("hour", F.hour(F.col("start_time"))) \
                   .withColumn("day", F.dayofmonth(F.col("start_time"))) \
                   .withColumn("week", F.weekofyear(F.col("start_time"))) \
                   .withColumn("month", F.month(F.col("start_time"))) \
                   .withColumn("year", F.year(F.col("start_time"))) \
                   .withColumn("weekday", F.dayofweek(F.col("start_time"))) \
                   .select("ts","start_time","hour", "day", "week", "month", "year", "weekday").drop_duplicates()
    
    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(output_data + "time/", 
                            mode="overwrite",
                            partitionBy=["year","month"])

    # read in song data to use for songplays table
    # (read the songs table root so the year/artist_id partition columns are recovered)
    songs_parquet = output_data + 'songs/'
    song_df = spark.read.parquet(songs_parquet)

    # extract columns from joined song and log datasets to create songplays table 
    songplays_table = df.join(song_df, [df.song == song_df.title], how='inner') \
                        .join(time_table, df.start_time == time_table.start_time, how="inner") \
                        .select(F.monotonically_increasing_id().alias("songplay_id"),
                                df.start_time,
                                df.userId.alias("user_id"),
                                df.level,
                                song_df.song_id,
                                song_df.artist_id,
                                df.sessionId.alias("session_id"), 
                                df.location, 
                                df.userAgent.alias("user_agent"),
                                time_table.year,
                                time_table.month) \
                        .repartition("year", "month") \
                        .drop_duplicates()

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(output_data + "songplays/", 
                                  mode="overwrite",
                                  partitionBy=["year","month"])
Example #3
def process_log_data(spark, input_data, output_data):
    """Reads in JSON log data and then writes users, time and songplays tables to parquet on S3.

    Args:
        spark: The current SparkSession.
        input_data: The S3 bucket to read in the data.
        output_data: The S3 bucket to write to.
    """
    # get filepath to log data file
    # For working in the workspace: log_data = os.path.join(input_data, "log-data/*.json")
    log_data = os.path.join(input_data, "log-data/*/*/*.json")

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # rename the columns in df
    df = (df.withColumnRenamed('userId', 'user_id').withColumnRenamed(
        'firstName', 'first_name').withColumnRenamed(
            'lastName', 'last_name').withColumnRenamed(
                'itemInSession', 'item_in_session').withColumnRenamed(
                    'sessionId',
                    'session_id').withColumnRenamed('userAgent', 'user_agent'))

    # extract columns for users table
    users_table = df.select('user_id', 'first_name', 'last_name', 'gender',
                            'level').distinct()

    # write users table to parquet files
    users_table.write.parquet(output_data + 'users', mode='overwrite')

    # create timestamp column from original timestamp column
    # default type is string for UDFs, so we need to switch that by specifying the correct type
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000.0),
                        T.TimestampType())
    df = df.withColumn('start_time', get_timestamp(df.ts))

    # extract columns to create time table
    time_table = df.select(
        'start_time',
        hour(col('start_time')).alias('hour'),
        dayofmonth(col('start_time')).alias('day'),
        weekofyear(col('start_time')).alias('week'),
        month(col('start_time')).alias('month'),
        year(col('start_time')).alias('year'),
        date_format(col('start_time'), 'EEEE').alias('weekday'))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(output_data + 'time',
                                                          mode='overwrite')

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data +
                                 'songs/year=*/artist_id=*/*.parquet')
    artist_df = spark.read.parquet(output_data + 'artists/*.parquet')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(
        song_df,
        (df.song == song_df.title) & (df.length == song_df.duration)).join(
            artist_df,
            df.artist == artist_df.artist_name).join(time_table,
                                                     ['start_time'])

    # create the songplay_id column
    songplays_table = songplays_table.withColumn('songplay_id',
                                                 monotonically_increasing_id())

    # select the columns of interest
    songplays_table = songplays_table.select('songplay_id', 'start_time',
                                             'user_id', 'level', 'song_id',
                                             'artist_id', 'session_id',
                                             'location', 'user_agent', 'year',
                                             'month')

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        output_data + 'songplays', mode='overwrite')
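monotonically_increasing_id() guarantees uniqueness but not consecutive values; if gap-free ids are wanted, a row_number over a window is one alternative sketch (the global ordering collapses to a single partition, so it only suits modest table sizes).

from pyspark.sql import Window
from pyspark.sql.functions import monotonically_increasing_id, row_number

# Gap-free ids; ordering over the whole table forces a single partition.
w = Window.orderBy(monotonically_increasing_id())
songplays_table = songplays_table.withColumn('songplay_id', row_number().over(w))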
Example #4
    def _transform(self, df, auxiliar_train):

        if not self.train_file:
            auxiliar_train = auxiliar_train.drop('WinningBid')
            auxiliar_train = auxiliar_train.withColumn('test', lit(0))
            df = df.withColumn('test', lit(1))
            df = auxiliar_train.union(df)
            del auxiliar_train

        # We create the time as Index
        split_col = split(df['ApproximateDate'], ' ')
        df = df.withColumn('time', split_col.getItem(1))  # time

        # Hour Index
        func_index = udf(lambda x: auxiliar_func.time_to_num(x, index='hms'),
                         IntegerType())
        df = df.withColumn('hms_index', func_index(df['time']))

        # We order by UserId-Date
        df = df.orderBy(['UserID', 'hms_index'])

        # We check for NaN values
        df.select([count_(when(isnan(c), c)).alias(c)
                   for c in df.columns]).show()

        # We create a rank of users by how many times in the past saw an ad
        w = (Window().partitionBy(df.UserID).orderBy('time').rowsBetween(
            Window.unboundedPreceding, 0))
        df = df.withColumn('user_id_acumulative', count_(df['UserId']).over(w))

        # Number of Ads/User/Second
        df = df.withColumn('key_id',
                           concat(df['UserID'], lit(' '), df['hms_index']))
        w = (Window().partitionBy(df.key_id).orderBy('hms_index').rowsBetween(
            -sys.maxsize, sys.maxsize))
        df = df.withColumn('number_ads_user_second', count_(df.key_id).over(w))

        # Number of Ads/User
        df_group = df.groupby(['key_id']).agg(count_('key_id').alias('count_ads'))
        split_col = split(df_group['key_id'], ' ')
        df_group = df_group.withColumn('UserID', split_col.getItem(0))  # UserID
        w = (Window().partitionBy(
            df_group.UserID).orderBy('key_id').rowsBetween(
                Window.unboundedPreceding, 0))
        df_group = df_group.withColumn('number_ads_user',
                                       sum_(df_group.count_ads).over(w))
        df_group = df_group.select(['key_id', 'number_ads_user'])
        df = df.join(df_group, how='left', on='key_id')
        del df_group

        # Number of Users/Second
        w = (Window().partitionBy(df.ApproximateDate).rowsBetween(
            -sys.maxsize, sys.maxsize))
        df = df.withColumn('number_user_second',
                           approx_count_distinct(df.UserID).over(w))

        # Number of Ads/Second
        df = df.withColumn('number_ads_second',
                           count_(df.ApproximateDate).over(w))

        # Browser Dummy Transformation
        types = df.select('Browser').distinct().collect()
        types = [val['Browser'] for val in types]
        new_cols = [
            when(df['Browser'] == ty, 1).otherwise(0).alias('d_browser_' + ty)
            for ty in types
        ]
        df = df.select(df.columns + new_cols)

        # Decompose Date Variables
        df = df.withColumn('date', to_date(df['ApproximateDate']))  # date
        df = df.withColumn('month', month(df['ApproximateDate']))  # month
        df = df.withColumn('day', dayofmonth(df['ApproximateDate']))  # day
        df = df.withColumn('weekday', dayofweek(
            df['ApproximateDate']))  # weekday 1=Sunday ... 7=Saturday

        df = df.withColumn('hour', hour(df['time']))  # hour
        df = df.withColumn('minute', minute(df['time']))  # minute

        # Peak Hour
        df = df.withColumn('peak6am8am',
                           when(df['hour'].between(6, 8), 1).otherwise(0))
        df = df.withColumn('peak14pm16pm',
                           when(df['hour'].between(14, 16), 1).otherwise(0))

        # Minute Index
        func_index = udf(lambda x: auxiliar_func.time_to_num(x, index='hm'),
                         IntegerType())
        df = df.withColumn('hm_index', func_index(df['time']))

        # Convert to time-series by Minute
        # We reduce to minutes
        df_time_serie_ads = df.select([
            'hms_index', 'hm_index', 'number_user_second', 'number_ads_second'
        ]).drop_duplicates()
        df_time_serie_user = df.select(['UserID',
                                        'hm_index']).drop_duplicates()

        # Group-by the values
        df_time_serie_user = df_time_serie_user.groupBy('hm_index').agg(
            approx_count_distinct('UserID'))
        df_time_serie_ads = df_time_serie_ads.groupBy('hm_index').agg({
            'number_ads_second':
            'sum'
        }).drop_duplicates(subset=['hm_index'])

        # Join ads-users per minute
        df_time_serie = df_time_serie_ads.join(df_time_serie_user,
                                               how='left',
                                               on='hm_index')
        del df_time_serie_ads, df_time_serie_user

        # Rename columns
        df_time_serie = df_time_serie.withColumnRenamed(
            'sum(number_ads_second)', 'number_ads_minute').withColumnRenamed(
                'approx_count_distinct(UserID)', 'number_user_minute')

        # Resample Range of Minutes
        resample_range = list(
            range(
                df_time_serie.select(min_(
                    col('hm_index'))).limit(1).collect()[0][0],
                df_time_serie.select(max_(
                    col('hm_index'))).limit(1).collect()[0][0] + 1, 1))

        resample_range = self._spark.createDataFrame(resample_range,
                                                     IntegerType())

        # Join the original df
        df_time_serie = resample_range.join(
            df_time_serie,
            how='left',
            on=resample_range.value == df_time_serie.hm_index).drop(
                *['hm_index']).fillna(0)

        # Create Lags By Minutes
        w = Window().partitionBy().orderBy(col('value'))
        if self.ar_min_lag > 0:
            df_time_serie = df_time_serie.select(
                '*',
                lag('number_user_minute').over(w).alias(
                    'ar1_number_user_minute'))
            df_time_serie = df_time_serie.select(
                '*',
                lag('number_ads_minute').over(w).alias(
                    'ar1_number_ads_minute'))

            if self.ar_min_lag > 1:
                for l in range(2, self.ar_min_lag + 1, 1):
                    df_time_serie = df_time_serie.select(
                        '*',
                        lag('ar' + str(l - 1) + '_number_user_minute').over(
                            w).alias('ar' + str(l) + '_number_user_minute'))
                    df_time_serie = df_time_serie.select(
                        '*',
                        lag('ar' + str(l - 1) + '_number_ads_minute').over(
                            w).alias('ar' + str(l) + '_number_ads_minute'))

        # Remove the lagged Null Values
        df_time_serie = df_time_serie.dropna()

        # join and remove lag Null values of the first minute
        df = df.orderBy(['UserID', 'hms_index'])
        df = df.join(df_time_serie.orderBy(['hm_index']),
                     how='left',
                     on=df.hm_index == df_time_serie.value).drop('value')

        # Convert to time-series and resample by Seconds
        df_time_serie = df.select(
            ['hms_index', 'number_user_second',
             'number_ads_second']).drop_duplicates()
        resample_range = list(
            range(
                df_time_serie.select(min_(
                    col('hms_index'))).limit(1).collect()[0][0],
                df_time_serie.select(max_(
                    col('hms_index'))).limit(1).collect()[0][0] + 1, 1))
        resample_range = self._spark.createDataFrame(resample_range,
                                                     IntegerType())

        # Join the original df
        df_time_serie = resample_range.join(
            df_time_serie,
            how='left',
            on=resample_range.value == df_time_serie.hms_index).drop(
                *['hms_index']).fillna(0)

        # Create lags
        w = Window().partitionBy().orderBy(col('value'))
        if self.ar_lags > 0:
            df_time_serie = df_time_serie.select(
                '*',
                lag('number_user_second').over(w).alias(
                    'ar1_number_user_second'))
            df_time_serie = df_time_serie.select(
                '*',
                lag('number_ads_second').over(w).alias(
                    'ar1_number_ads_second'))

            if self.ar_lags > 1:
                for l in range(2, self.ar_lags + 1, 1):
                    df_time_serie = df_time_serie.select(
                        '*',
                        lag('ar' + str(l - 1) + '_number_user_second').over(
                            w).alias('ar' + str(l) + '_number_user_second'))
                    df_time_serie = df_time_serie.select(
                        '*',
                        lag('ar' + str(l - 1) + '_number_ads_second').over(
                            w).alias('ar' + str(l) + '_number_ads_second'))

        # Create Moving Average
        if self.ma_ss_lag is not None:

            # Get hour from index
            func_index = udf(lambda x: auxiliar_func.num_to_time(x),
                             StringType())
            df_time_serie = df_time_serie.withColumn(
                'time', func_index(df_time_serie['value']))

            # minute MA terms (Average per second last xx seconds)
            if self.ma_ss_lag is not None:
                for lag_val in self.ma_ss_lag:
                    # range to take into account
                    w = (Window.orderBy(df_time_serie['value']).rangeBetween(
                        -lag_val, 0))
                    # MA variables
                    df_time_serie = df_time_serie.withColumn(
                        'ma_seconds_' + str(lag_val) + '_number_user_second',
                        avg('number_user_second').over(w))
                    df_time_serie = df_time_serie.withColumn(
                        'ma_seconds_' + str(lag_val) + '_number_ads_second',
                        avg('number_ads_second').over(w))

                    # Increasing ID
                    df_time_serie = df_time_serie.withColumn(
                        'rn', monotonically_increasing_id())

                    # Replace first values by Null
                    df_time_serie = df_time_serie.withColumn(
                        'ma_seconds_' + str(lag_val) + '_number_user_second',
                        when(df_time_serie['rn'] < lag_val, None).otherwise(
                            df_time_serie['ma_seconds_' + str(lag_val) +
                                          '_number_user_second']))

                    df_time_serie = df_time_serie.withColumn(
                        'ma_seconds_' + str(lag_val) + '_number_ads_second',
                        when(df_time_serie['rn'] < lag_val, None).otherwise(
                            df_time_serie['ma_seconds_' + str(lag_val) +
                                          '_number_ads_second']))

                    # Get the average by Minute
                    df_time_serie = df_time_serie.withColumn(
                        'ma_minute_' + str(lag_val) + '_number_user_second',
                        df_time_serie['ma_seconds_' + str(lag_val) +
                                      '_number_user_second'] * 60)
                    df_time_serie = df_time_serie.withColumn(
                        'ma_minute_' + str(lag_val) + '_number_ads_second',
                        df_time_serie['ma_seconds_' + str(lag_val) +
                                      '_number_ads_second'] * 60)
                df_time_serie = df_time_serie.drop(*['rn'])

        # Remove the lagged Null Values
        df_time_serie = df_time_serie.drop(
            *['time', 'number_user_second', 'number_ads_second']).dropna()
        # join and remove lag Null values of the first minute
        df = df.join(
            df_time_serie.orderBy(['value']),
            how='left',
            on=df.hms_index == df_time_serie.value).drop('value').dropna()

        if self.train_file and not self.variable_analysis:
            df = df.select([
                'key_id', 'hms_index', 'number_ads_user', 'number_user_second',
                'number_ads_second', 'number_ads_user_second', 'peak6am8am',
                'peak14pm16pm', 'user_id_acumulative'
            ] + [x for x in df.columns if x.startswith('d_browser')] +
                           [x for x in df.columns if x.startswith('ar')] +
                           [x for x in df.columns if x.startswith('ma_')] +
                           ['WinningBid'])

        if not self.train_file:
            df = df.filter(df['test'] == 1)
            df = df.select([
                'UserID', 'key_id', 'number_ads_user', 'hms_index',
                'number_user_second', 'number_ads_second',
                'number_ads_user_second', 'peak6am8am', 'peak14pm16pm',
                'user_id_acumulative'
            ] + [x for x in df.columns if x.startswith('d_browser')] +
                           [x for x in df.columns if x.startswith('ar')] +
                           [x for x in df.columns if x.startswith('ma_')])

        df = df.orderBy(['hms_index', 'UserID'])
        df.show()
        return df
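auxiliar_func.time_to_num is an external helper that is not shown here; a hypothetical version consistent with how it is called above (an 'HH:MM:SS' string mapped to an integer index at second or minute resolution) might look like this sketch.

def time_to_num(time_str, index='hms'):
    # Hypothetical helper: 'HH:MM:SS' -> integer index.
    # index='hms' counts seconds since midnight, index='hm' counts minutes.
    h, m, s = (int(part) for part in time_str.split(':'))
    if index == 'hms':
        return h * 3600 + m * 60 + s
    return h * 60 + m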
Example #5
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"
    song_data = input_data + "song_data/*/*/*/*.json"
    #song_data = input_data + "song_data/A/A/A/*.json"

    DEBUG and print("Reading log data files from", log_data)

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong') \
            .where(df.ts.isNotNull()) \
            .withColumn("userId", df["userId"].cast(IntegerType())) \
            .withColumn("sessionId", df["sessionId"].cast(IntegerType()))

    DEBUG and print("Preparing users table")

    # extract columns for users table
    users_table = df.select(
        "userId", "firstName", "lastName", "gender",
        "level").where(col("userId").isNotNull()).dropDuplicates(['userId'])

    DEBUG and print("Creating and persisting users table")

    # write users table to parquet files
    users_table.write.parquet(output_data + "users/", mode='overwrite')

    DEBUG and print("Creating and persisting time table")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda ts: datetime.fromtimestamp(ts / 1000),
                        TimestampType())
    df = df.withColumn("start_time", get_timestamp(df.ts))

    # extract columns to create time table
    time_table = df.withColumn("hour", hour(df.start_time)) \
                    .withColumn("day", dayofmonth(df.start_time)) \
                    .withColumn("week", weekofyear(df.start_time)) \
                    .withColumn("month", month(df.start_time)) \
                    .withColumn("year", year(df.start_time)) \
                    .withColumn("weekday", dayofweek(df.start_time)) \
                    .select("start_time", "hour", "day", "week", "month", "year", "weekday") \
                    .dropDuplicates(["start_time"])

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy(["year",
                                  "month"]).parquet(output_data + "times/",
                                                    mode='overwrite')

    DEBUG and print("Creating and persisting songplays table")

    # read in song data to use for songplays table
    song_df = spark.read.json(song_data).select("song_id", "title",
                                                "artist_id", "artist_name")
    action_df = df.select("start_time", "userId", "level", "sessionId",
                          "location", "userAgent", "artist", "song")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = action_df.join(song_df, (action_df.artist == song_df.artist_name) & (action_df.song == song_df.title)) \
                                .select(monotonically_increasing_id().alias("songplay_id"), "start_time", "userId", "level", "song_id", "artist_id", "sessionId", "location", "userAgent") \
                                .withColumn("month", month(df.start_time)) \
                                .withColumn("year", year(df.start_time))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy(["year", "month"
                                       ]).parquet(output_data + "songplays/",
                                                  mode='overwrite')
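dropDuplicates(['userId']) above keeps an arbitrary row per user; if the intent is to keep each user's most recent level, one possible variation (a sketch reusing the same column names) is a window ordered by ts.

from pyspark.sql import Window
from pyspark.sql.functions import col, row_number

# Keep the latest record per user so 'level' reflects the most recent value.
w = Window.partitionBy("userId").orderBy(col("ts").desc())
users_table = (df.withColumn("rn", row_number().over(w))
                 .filter(col("rn") == 1)
                 .select("userId", "firstName", "lastName", "gender", "level"))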
Example #6
def start_stream(args):
    validate_params(args)
    _, brokers, topic = args

    spark = create_spark_session()

    json = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", brokers) \
        .option("subscribe", topic) \
        .load()

    json.printSchema()

    # Explicitly set schema
    schema = StructType([
        StructField("symbol", StringType(), False),
        StructField("timestamp", TimestampType(), False),
        StructField("price", DoubleType(), False)
    ])

    json_options = {"timestampFormat": "yyyy-MM-dd'T'HH:mm'Z'"}
    stocks_json = json \
        .select(from_json(F.col("value").cast("string"), schema, json_options).alias("content"))

    stocks_json.printSchema()

    stocks = stocks_json.select("content.*")

    ####################################
    # Stream to Parquet
    ####################################
    query = stocks \
        .withColumn('year', year(F.col('timestamp'))) \
        .withColumn('month', month(F.col('timestamp'))) \
        .withColumn('day', dayofmonth(F.col('timestamp'))) \
        .withColumn('hour', hour(F.col('timestamp'))) \
        .withColumn('minute', minute(F.col('timestamp'))) \
        .writeStream \
        .format('parquet') \
        .partitionBy('year', 'month', 'day', 'hour', 'minute') \
        .option('checkpointLocation', '/dataset/checkpoint') \
        .option('path', '/dataset/streaming.parquet') \
        .trigger(processingTime='30 seconds') \
        .start()

    query.awaitTermination()

    # avg_pricing = stocks \
    #     .groupBy(F.col("symbol")) \
    #     .agg(F.avg(F.col("price")).alias("avg_price"))

    ####################################
    # Console Output
    ####################################
    # query2 = avg_pricing.writeStream \
    #     .outputMode('complete') \
    #     .format("console") \
    #     .trigger(processingTime="10 seconds") \
    #     .start()

    # query2.awaitTermination()

    ####################################
    # Table in Memory
    ####################################
    # query3 = avg_pricing \
    #     .writeStream \
    #     .queryName("avgPricing") \
    #     .outputMode("complete") \
    #     .format("memory") \
    #     .trigger(processingTime="10 seconds") \
    #     .start()
    #
    # while True:
    #     print('\n' + '_' * 30)
    #     # interactively query in-memory table
    #     spark.sql('SELECT * FROM avgPricing').show()
    #     print(query3.lastProgress)
    #     sleep(10)

    # query3.awaitTermination()

    ####################################
    # Writing to Postgres
    ####################################

    # Simple insert
    # query = stream_to_postgres(stocks)
    # query.awaitTermination()

    # Average Price Aggregation
    # query = stream_aggregation_to_postgres(stocks)
    # query.awaitTermination()

    # Final Average Price Aggregation with Timestamp columns
    # query = stream_aggregation_to_postgres_final(stocks)
    # query.awaitTermination()

    pass
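stream_to_postgres and the aggregation variants are referenced in the commented-out sections but not shown; a hypothetical sketch of the simple insert path using foreachBatch and Spark's JDBC writer (the URL, credentials, table name and checkpoint path are placeholders) could look like this.

def stream_to_postgres(stocks, table="stock_prices"):
    # Hypothetical sink: write each micro-batch to Postgres via JDBC.
    jdbc_url = "jdbc:postgresql://localhost:5432/stocks"    # placeholder
    props = {"user": "postgres", "password": "postgres",     # placeholders
             "driver": "org.postgresql.Driver"}

    def write_batch(batch_df, batch_id):
        batch_df.write.jdbc(jdbc_url, table, mode="append", properties=props)

    return stocks.writeStream \
        .foreachBatch(write_batch) \
        .option("checkpointLocation", "/dataset/checkpoint_postgres") \
        .trigger(processingTime="30 seconds") \
        .start()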
Example #7
def process_log_data(spark, input_data, output_data):
    '''
    Get the files from log folders and compose a DataFrame.
    Create the users, time and songplays tables with
    the desired columns and format.
    
    Parameters:
        spark (object): Previous created spark object.
        input_data(string): Key for AWS S3 objects to read.
        output_data(string): Key for AWS S3 objects to save.
        
    Returns:
        None
    '''
    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # read log data file
    # smaller dataset to test: input_data + 'log_data/2018/11/2018-11-12*.json'
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df['page'] == 'NextSong')

    # extract columns for users table
    users_columns = ['userId', 'firstName', 'lastName', 'gender', 'level']
    users_table = df.select(*users_columns).dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + '/users', mode='overwrite')

    # create datetime column from original timestamp column
    df = df.withColumn('datetime', from_unixtime(col('ts') / 1000))

    # extract columns to create time table
    df_time = df.select('datetime').dropDuplicates()
    time_table = df_time.withColumnRenamed('datetime', 'start_time')\
                         .orderBy('start_time', ascending=True)\
                         .withColumn('hour', hour(col('start_time')))\
                         .withColumn('day', dayofmonth(col('start_time')))\
                         .withColumn('week', weekofyear(col('start_time')))\
                         .withColumn('month', month(col('start_time')))\
                         .withColumn('year', year(col('start_time')))\
                         .withColumn('weekday', dayofweek(col('start_time')))

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(output_data + '/time',
                             mode='overwrite',
                             partitionBy=['year', 'month'])

    # read in song data to use for songplays table
    basePath = output_data + '/songs/'
    song_df = spark.read.option("basePath",
                                basePath).parquet(output_data + '/songs/*')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, df.song == song_df.title, how='left')
    songplays_table = songplays_table.select(
        col('datetime').alias('start_time'),
        col('userId').alias('user_id'),
        col('level'),
        col('sessionId').alias('session_id'),
        col('location'),
        col('userAgent').alias('user_agent'),
        col('song_id'),
        col('artist_id'))
    songplays_table = songplays_table.withColumn('month', month(col('start_time')))\
                                     .withColumn('year', year(col('start_time')))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(output_data + '/songplays',
                                  mode='overwrite',
                                  partitionBy=['year', 'month'])
Example #8
def process_log_data(spark, input_data, output_data):
    """
    Loads the Log files, extracts the data for users table, time table and songplays table then saves it to parquet files
    
    Parameters:
    
    spark: spark session
    input_data: input files path
    output_data: output files path
    """

    # get filepath to log data file
    log_data = input_data

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # extract columns for users table
    users_table = df.select("userId", "firstName", "lastName", "gender",
                            "level").drop_duplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + "users/", mode="overwrite")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda ts: datetime.utcfromtimestamp(int(ts) / 1000),
                        TimestampType())
    df = df.withColumn("timestamp", get_timestamp(col("ts")))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda ts: datetime.utcfromtimestamp(int(ts) / 1000),
                       TimestampType())
    df = df.withColumn("start_time", get_datetime(col("ts")))

    # extract columns to create time table
    df = df.withColumn("hour",hour("start_time"))\
        .withColumn("day",dayofmonth("start_time"))\
        .withColumn("week",weekofyear("start_time"))\
        .withColumn("month",month("start_time"))\
        .withColumn("year",year("start_time"))\
        .withColumn("weekday",dayofweek("start_time"))

    time_table = df.select("start_time", "hour", "day", "week", "month",
                           "year", "weekday").distinct()

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(output_data + "time_table/",
                             mode='overwrite',
                             partitionBy=["year", "month"])

    # read in song data to use for songplays table
    song_df = spark.sql(
        "SELECT DISTINCT song_id, artist_id, artist_name FROM df_songs_table")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, song_df.artist_name == df.artist, "inner") \
        .distinct() \
        .select("start_time", "userId", "level", "sessionId", "location", "userAgent","song_id","artist_id", "month", "year") \
        .withColumn("songplay_id", monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(output_data + "songplays_table/",
                                  mode="overwrite",
                                  partitionBy=["year", "month"])
Example #9
    users_table = df.select(user_fields).dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + 'users/')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: x/1000, Dbl())
    df = df.withColumn('ts2', get_timestamp('ts'))

    # create datetime column from original timestamp column
    df = df.withColumn('start_time', from_unixtime('ts2').cast(dataType=TimestampType()))

    # extract columns to create time table
    time_table = df.select('start_time')\
                        .dropDuplicates()\
                        .withColumn('hour', hour(col('start_time')))\
                        .withColumn('day', dayofmonth(col('start_time')))\
                        .withColumn('week', weekofyear(col('start_time')))\
                        .withColumn('month', month(col('start_time')))\
                        .withColumn('year', year(col('start_time')))\
                        .withColumn('weekday', date_format(col('start_time'), 'E'))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(output_data + 'time/')

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + 'songs/*/*/*.parquet')

    songs_logs = df.join(song_df, (df.song == song_df.title))

    songplays = songs_logs.join(time_table,
Example #10
# TEST Five dates for 404 requests (4g)

Test.assertEquals([(r[0], r[1]) for r in top_err_date_df.take(5)], [(7, 532), (8, 381), (6, 372), (4, 346), (15, 326)], 'incorrect top_err_date_df')

# COMMAND ----------

# MAGIC %md
# MAGIC ### (5h) Exercise: Hourly 404 Errors
# MAGIC
# MAGIC Using the DataFrame `not_found_df` you cached in the part (5a) and sorting by hour of the day in increasing order, create a DataFrame containing the number of requests that had a 404 return code for each hour of the day (midnight starts at 0). Cache the resulting DataFrame `hour_records_sorted_df` and print that as a list.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import hour
hour_records_sorted_df = not_found_df.select(hour('time').alias('hour')).groupBy('hour').count().sort('hour', ascending=True).cache()

print('Top hours for 404 requests:\n')
hour_records_sorted_df.show(24)

# COMMAND ----------

# TEST Hourly 404 response codes (5h)

errs_by_hour = [(row[0], row[1]) for row in hour_records_sorted_df.collect()]

expected = [
  (0, 175),
  (1, 171),
  (2, 422),
  (3, 272),
Example #11
def process_log_data(spark, input_data, output_data):
    """
        Description: 
        - Extract log data from JSON files stored in S3 bucket
        - Transforms log data into three separate DataFrames; users_table, time_table and songplays_table
        - Loads them back into s3 as parquet files stored in a separate s3-bucket for analytical purposes

        Arguments:
        - Parameter spark: the instantiated SparkSession
        - Parameter input_data: input path
        - Parameter output_data: output path

        Returns:
        - None
    """
    # get filepath to log data file
    log_data = input_data + "log_data/*.json"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(df["page"] == "NextSong")

    # extract columns for users table
    users_table = df.selectExpr(["userId as user_id"       ,\
                                 "firstName as first_name" ,\
                                 "lastName as last_name"   ,\
                                 "gender"                  ,\
                                 "level"]).dropDuplicates()

    # write users table to parquet files
    users_table.write.mode("overwrite").parquet(output_data + "users.parquet")

    # create timestamp column from original timestamp column
    get_timestamp = udf(
        lambda epoch_time: datetime.fromtimestamp(epoch_time / 1000),
        TimestampType())
    df = df.withColumn('timestamp', get_timestamp(col('ts')))

    # create datetime column from original timestamp column
    get_datetime = udf(
        lambda epoch_time: datetime.fromtimestamp(epoch_time / 1000),
        DateType())
    df = df.withColumn('datetime', get_datetime(col('ts')))

    # extract columns to create time table
    time_table = df.select([hour("timestamp").alias("hour")       ,\
                            dayofmonth("timestamp").alias("day")  ,\
                            weekofyear("timestamp").alias("week") ,\
                            month("timestamp").alias("month")     ,\
                            year("timestamp").alias("year")       ,\
                            date_format("timestamp", 'E').alias("weekday")]).dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.mode("overwrite").partitionBy(
        "year", "month").parquet(output_data + "time.parquet")

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + "songs.parquet")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = song_df.join(df, df.song == song_df.title)\
    .selectExpr(["timestamp as start_time"  ,\
                 "userid as user_id"        ,\
                 "level"                    ,\
                 "song_id"                  ,\
                 "artist_id"                ,\
                 "sessionid as session_id"  ,\
                 "location"                 ,\
                 "useragent as user_agent"]) \
    .withColumn("year", year("start_time"))  \
    .withColumn("month", month("start_time")).dropDuplicates()

    # write songplays table to parquet files partitioned by year and month
    songplays_table.createOrReplaceTempView("songplays")
    spark.sql("""
            SELECT row_number() over (order by start_time asc) as songplay_id,
                   start_time,
                   user_id,
                   level,
                   song_id,
                   artist_id,
                   session_id,
                   location,
                   user_agent,
                   year,
                   month
            FROM songplays
    """).write.mode("overwrite").partitionBy(
        "year", "month").parquet(output_data + "songplays.parquet")
Example #12
plt.ylabel('404 Errors')
plt.axhline(linewidth=3, color='#999999')
plt.axvline(linewidth=2, color='#999999')
display(fig)

display(errors_by_date_sorted_df)

#Top Five Days for 404 Errors
top_err_date_df = errors_by_date_sorted_df.sort('count',ascending=False)

print('Top Five Dates for 404 Requests:\n')
top_err_date_df.show(5)

#Hourly 404 Errors
from pyspark.sql.functions import hour
hour_records_sorted_df = not_found_df.groupBy(hour('time').alias('hour')).count().sort('hour', ascending=True)

print('Top hours for 404 requests:\n')
hour_records_sorted_df.show(24)
hour_records_sorted_df.cache()

#Visualizing the 404 Response Codes by Hour
hours_with_not_found = [row[0] for row in hour_records_sorted_df.select('hour').collect()]
not_found_counts_per_hour = [row[0] for row in hour_records_sorted_df.select('count').collect()]

print(hours_with_not_found)
print(not_found_counts_per_hour)

fig, ax = prepareSubplot(np.arange(0, 25, 5), np.arange(0, 500, 50))
colorMap = 'seismic'
cmap = cm.get_cmap(colorMap)
Example #13
cmap = cm.get_cmap(colorMap)
plt.plot(days_with_errors_404, errors_404_by_day, color=cmap(0), linewidth=3)
plt.axis([0, max(days_with_errors_404), 0, max(errors_404_by_day)])
plt.xlabel('Day')
plt.ylabel('404 Errors')
plt.axhline(linewidth=3, color='#999999')
plt.axvline(linewidth=2, color='#999999')
display(fig)
# Top Five Days for 404 Errors
top_err_date_df = errors_by_date_sorted_df.sort("count",ascending=False)

print('Top Five Dates for 404 Requests:\n')
top_err_date_df.show(5)
# Hourly 404 Errors
from pyspark.sql.functions import hour
hour_records_sorted_df = not_found_df.select(not_found_df['status'],hour(not_found_df['time']).alias('hour')).groupBy('hour').count().sort('hour',ascending=True).cache()

print('Top hours for 404 requests:\n')
hour_records_sorted_df.show(24)
# Visualizing the 404 Response Codes by Hour
hours_with_not_found = []
not_found_counts_per_hour = []
for x,y in hour_records_sorted_df.select('hour','count').collect():
  hours_with_not_found.append(x)
  not_found_counts_per_hour.append(y)

print(hours_with_not_found)
print(not_found_counts_per_hour)
fig, ax = prepareSubplot(np.arange(0, 25, 5), np.arange(0, 500, 50))
colorMap = 'seismic'
cmap = cm.get_cmap(colorMap)
Example #14

# Pickups/Dropoffs in entire NYC
taxi_nyc_df = taxi_df.groupby(taxi_df.Time).agg(*sum_aggregations('Nyc')).cache()

taxi_nyc_1h_df = get_agg_taxi_df(taxi_nyc_df, 1, 'Time', sum_aggregations('Nyc', 1))
taxi_nyc_4h_df = get_agg_taxi_df(taxi_nyc_df, 4, 'Time', sum_aggregations('Nyc', 4))


# Time features
date_df = taxi_df.select(taxi_df.Time).distinct()

weekday_udf = udf(lambda date_time: date_time.weekday(), IntegerType())
is_holiday_udf = udf(lambda date_time: date_time.date() in holidays.UnitedStates(), BooleanType())

date_df = date_df.withColumn('Hour', func.hour(date_df.Time))
date_df = date_df.withColumn('Day_Of_Week', weekday_udf(date_df.Time))
date_df = date_df.withColumn('Day_Of_Year', func.dayofyear(date_df.Time))
date_df = date_df.withColumn('Is_Holiday', is_holiday_udf(date_df.Time))


# Aggregate events happening in last and next 3 hours for each hour
event_3h_df = event_df.withColumnRenamed('Venues', 'Venues_0h')
for i in range(-3, 4):
    if i != 0:
        # bind i at definition time so each UDF shifts by its own hour offset
        add_hours_udf = udf(lambda date_time, i=i: date_time + datetime.timedelta(hours=i), TimestampType())
        event_3h_df = event_3h_df.join(event_df.withColumn('Time', add_hours_udf(event_df.Time)).withColumnRenamed('Venues', 'Venues_%sh' % str(i)), 'Time')


# Join single feature groups
features_df = taxi_df.select(index_columns + [taxi_df.Pickup_Count]) \
Example #15
def process_log_data(spark, input_data, output_data):
    """
        In this function we are loading the song_data file and create tables for songplays,users and time tables.
        Input: Sparksession, 
               Input_data filepath for songs data 
               Output_data filepath for songs data
               
        Output: We produce parquet files for songplays,users and time tables.
    """
    # get filepath to log data file
    log_data = input_data

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(col("page") == "NextSong")

    # extract columns for users table
    users_table = df['userId', 'firstName', 'lastName', 'gender', 'level']

    # write users table to parquet files
    users_table = users_table.write.partitionBy('userId').parquet(
        os.path.join(output_data, 'users.parquet'), 'overwrite')
    print("users_table partitioned!")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: tstodatetime(x))
    df = df.withColumn('daytime', get_timestamp(col("ts")))

    # extract columns to create time table
    time_table = df.select(
        col("ts").alias('start_time'),
        year('daytime').alias('year'),
        month('daytime').alias('month'),
        dayofmonth('daytime').alias('day'),
        hour('daytime').alias('hour'),
        weekofyear('daytime').alias('weekofyear'))
    #We are going to partition later in the code!

    # read in song data to use for songplays table
    songs_table = spark.read.parquet('data/outputs/song_data/songs.parquet')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df['ts', 'userId', 'level', 'sessionId', 'location',
                         'userAgent', 'song']
    #add artists id and song id by joining with songs_table
    songplays_table = songplays_table.alias('s').join(songs_table.alias('e'),col('e.title') == col('s.song'))\
    .select(col('s.ts').alias('start_time'),
        col('s.userId'),
        col('s.level'),
        col('s.sessionId'),
        col('s.location'),
        col('s.userAgent'),
        col('s.song'),
        col('e.artist_id').alias('artist_id'),
        col('e.song_id').alias('song_id'))
    #add month and year for partitioning later based on those
    time_table_short = time_table['start_time', 'month', 'year']
    songplays_table = songplays_table.alias('s').join(time_table_short.alias('t'),col('t.start_time') == col('s.start_time'))\
    .select(col('s.start_time'),
        col('s.userId'),
        col('s.level'),
        col('s.sessionId'),
        col('s.location'),
        col('s.userAgent'),
        col('s.song'),
        col('s.artist_id'),
        col('s.song_id'),
        col('t.year'),
        col('t.month'),
       )
    # write time table to parquet files partitioned by year and month
    time_table = time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'times.parquet'), 'overwrite')
    print("time_table partitioned!")

    # write songplays table to parquet files partitioned by year and month
    songplays_table = songplays_table.write.partitionBy(
        'year',
        'month').parquet(os.path.join(output_data, 'songplays.parquet'),
                         'overwrite')
    print("songplays_table partitioned!")
Example #16
def main():
    """
    Get observations near locations from the SmartMet Server.

    The data start time, end time and timestep are taken from the data itself.
    The dataset is assumed to be coherent in time and location, i.e. the
    timestep is assumed to be constant between the start and end times.
    """
    log1 = logging.getLogger("driver")


    output_directory = 'gs://{}/hadoop/tmp/bigquery/pyspark_output'.format(bucket)
    output_files = output_directory + '/part-*'

    # The trains stations data in stations.json
    # The type of trains and their delay in gratu_a_b_2010-14.csv

    JSON_PATH="gs://trains-data/data/stations.json"
    CSV_PATH="gs://trains-data/data/full/gratu_a_b_2010-14.csv"

    train_stations_df = spark.read \
        .json(JSON_PATH)

    # parameters for weather data to be fetched from Smartmet server
    params, names = read_parameters('parameters_shorten.txt')

    # base URL for the surface data
    baseurl = 'http://data.fmi.fi/fmi-apikey/9fdf9977-5d8f-4a1f-9800-d80a007579c9/timeseries?format=ascii&separator=,&producer=fmi&tz=local&timeformat=xml&timestep=60&numberofstations=5&maxdistance=100000&param={params}'.format(params=','.join(params))


    urlist= train_stations_df.rdd.flatMap(lambda x : ['%s#%s&latlons=%s,%s' % (x.stationShortCode,baseurl,x.latitude,x.longitude)]).repartition(16)


    data = urlist.map(read_from_URL)\
                 .filter(lambda x: x != -1)\
                 .flatMap(lambda x:x.splitlines())\
                 .map(lambda x: x.split(','))

    newColumns=names+["trainstation"]
    schemaString = ' '.join(str(x) for x in newColumns)

    fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
    schema = StructType(fields)

    # Apply the schema to the RDD.
    station_weather_df = spark.createDataFrame(data, schema)
    station_weather_df = station_weather_df.withColumn("time", to_utc_timestamp(station_weather_df.time, "%Y-%m-%dT%H"))

    # calculate max_precipitation 3h and max_precipitation6h
    col="max_precipitation1h"

    # to change the "no precipiation values" -1.0 to 0.0
    station_weather_df = station_weather_df.withColumn(col, f.when(station_weather_df[col] == -1.0, 0.0).otherwise(station_weather_df[col]))


    # using window functions to calculate the precipitation for the
    # previous 3 hours and 6 hours
    w3 = w.partitionBy("trainstation")\
          .orderBy(station_weather_df["time"])\
          .rowsBetween(-2,0)
        
    station_weather_df =station_weather_df.withColumn("max_precipitation3h",f.sum("max_precipitation1h").over(w3))


    w6 = w.partitionBy("trainstation")\
          .orderBy(station_weather_df["time"])\
          .rowsBetween(-5,0)
    
    station_weather_df =station_weather_df.withColumn("max_precipitation6h",f.sum("max_precipitation1h").over(w6))

    # making the surface observation dataframe

    cols = station_weather_df.columns  # list of all columns
    for col in cols:
        station_weather_df = station_weather_df.fillna({col:"-99"})
        station_weather_df = station_weather_df.withColumn(col, f.when(station_weather_df[col].isin("null", "nan", "NaN", "NULL"),"-99").otherwise(station_weather_df[col]))

    log1.info("Retrieved surface data")

   
   
    ## Get flash data

    baseurl = 'http://data.fmi.fi/fmi-apikey/9fdf9977-5d8f-4a1f-9800-d80a007579c9/timeseries?param=time,peak_current&producer=flash&tz=local&timeformat=xml&format=ascii&separator=,'
    urlist= train_stations_df.rdd.flatMap(lambda x : ['%s#%s&latlon=%s,%s:30' % (x.stationShortCode,baseurl,x.latitude,x.longitude)])
   
    data = urlist.map(getFlash)\
            .filter(lambda x: x != -1)\
            .flatMap(lambda x:x.splitlines())\
            .map(lambda x: x.split(','))


    schemaString = 'time peakcurrent trainstation'

    fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
    schema = StructType(fields)

    flash_df =  spark.createDataFrame(data, schema)
    flash_df = flash_df.withColumn("time", to_utc_timestamp(flash_df.time, "%Y%m%dT%HMS"))

    # find the count of flashes in each hour
    extended = (flash_df
                .withColumn("date", f.col("time").cast("date"))
                .withColumn("hour", f.hour(f.col("time"))))

    flash_aggs = extended.groupBy("trainstation", "date", "hour").count()

    flash_aggs = flash_aggs.withColumn('time', f.concat(f.col("date"), f.lit("T"), f.col("hour")))

    flash =flash_aggs.withColumn('time',to_utc_timestamp(flash_aggs.time,"%Y-%m-%dT%H")).select("time", f.col("count").alias("flashcount"),"trainstation")

    log1.info("Retrieved flash data")
    
    # Combining surface and flash data

    cond = [flash.time == station_weather_df.time, flash.trainstation == station_weather_df.trainstation ]
    
    station_weather_flash_df = station_weather_df.alias('a').join(flash.alias('b'),cond, 'outer').select('a.*', 'b.flashcount').fillna({'flashcount':'0'}) 

    # Reading the train type and delay data
    df = spark.read \
            .csv(CSV_PATH)

    # combining the date and time columns and selecting the relevant columns
    df = df.withColumn('t', f.concat(f.col("_c0"), f.lit("T"), f.col("_c1"))).select("t","_c3", "_c4", "_c9", "_c7", "_c5")


    # converting the time to utc timestamp and adding 1 hour
    df = df.withColumn('t',to_utc_timestamp(df.t,"%Y-%m-%dT%H") + f.expr('INTERVAL 1 HOUR'))

    trains_df = df.select(f.col("t").alias("time"),f.col("_c3").alias("trainstation"), f.col("_c4").alias("train_type"), f.col("_c9").alias("train_count"), f.col("_c7").alias("total_delay"), f.col("_c5").alias("delay"))

    # Combining the weather data (both surface and flash) with
    # the train delay and type data
    cond = [trains_df.time == station_weather_flash_df.time, trains_df.trainstation == station_weather_flash_df.trainstation ]

    trains_station_weather_flash_delay_df = trains_df.join(station_weather_flash_df, cond).drop(station_weather_flash_df.time).drop(station_weather_flash_df.trainstation)

    log1.info("Created the dataframe with train delay and weather observations Finished!\n")

    # Saving the data to BigQuery

    (trains_station_weather_flash_delay_df
     .write.format('json').save(output_directory))

    # Shell out to bq CLI to perform BigQuery import.
    subprocess.check_call(
        'bq load --source_format NEWLINE_DELIMITED_JSON '
        '--replace '
        '--autodetect '
        '{dataset}.{table} {files}'.format(
            dataset=output_dataset, table=output_table, files=output_files
        ).split())

    # Manually clean up the staging_directories, otherwise BigQuery
    # files will remain indefinitely.
    output_path = spark._jvm.org.apache.hadoop.fs.Path(output_directory)
    output_path.getFileSystem(spark._jsc.hadoopConfiguration()).delete(
        output_path, True)

    
    elapsed_time = time.time() - start_time
    log1.info("Elapsed time to retreive train delay and observation data and save to bq {:10.3f}".format(elapsed_time))
Example #17
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = input_data + "log-data/*.json"

    # read log data file
    df = spark.read.json(log_data).dropDuplicates()

    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # extract columns for users table
    users_table = df.select(
        ["userId", "firstName", "lastName", "gender", "level"]).distinct()
    users_table.createOrReplaceTempView("users")

    # write users table to parquet files
    users_table.write.parquet(output_data + "users/users.parquet", "overwrite")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000),
                        TimestampType())

    df = df.withColumn("timestamp", get_timestamp("ts"))

    # create datetime column from original timestamp column
    df = df.withColumn("datetime", get_timestamp("ts"))

    # extract columns to create time table
    df = df.withColumn("start_time", get_timestamp("ts"))
    df = df.withColumn("hour", hour("timestamp"))
    df = df.withColumn("day", dayofmonth("timestamp"))
    df = df.withColumn("week", weekofyear("timestamp"))
    df = df.withColumn("month", month("timestamp"))
    df = df.withColumn("year", year("timestamp"))

    time_table = df.select(
        ["start_time", "hour", "day", "week", "month", "year"]).distinct()
    time_table.createOrReplaceTempView("time")

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        output_data + "time/time.parquet", "overwrite")

    # read in song data to use for songplays table
    # And create a log table
    # No need to create a new / separate 'song_df' var here, since we already have the 'songs' table created above
    df.createOrReplaceTempView("log_df")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = spark.sql("""
        SELECT monotonically_increasing_id() as songplay_id, 
                log_df.start_time as start_time,
                time.year as year,
                time.month as month,
                log_df.userId as user_id,
                log_df.level as level,
                songs.song_id as song_id,
                songs.artist_id as artist_id,
                log_df.sessionId as session_id,
                log_df.location as location,
                log_df.userAgent as user_agent
        FROM log_df
        JOIN songs
            ON log_df.song == songs.title
        JOIN time
            ON log_df.start_time == time.start_time
    """)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        output_data + "songplays/songplays.parquet", "overwrite")
Ejemplo n.º 18
0
    StructField('total_amount', DoubleType()),
    StructField('payment_type', IntegerType()),
    StructField('trip_type', IntegerType()),
    StructField('congestion_surcharge', DoubleType()),
    ])

trip_data = spark.read \
    .option("header", True) \
    .schema(trip_schema) \
    .csv("./data/green/*")
trip_data.printSchema()
# trip_data.write.mode("overwrite").parquet("./values/taxi_green")
# trip_data = spark.read.parquet("./values/taxi_green")
extended_trips = trip_data \
    .withColumn("pick_date", f.to_date(trip_data["lpep_pickup_datetime"])) \
    .withColumn("pick_hour", f.hour(trip_data["lpep_pickup_datetime"]))\
    .withColumn("drop_date", f.to_date(trip_data["lpep_dropoff_datetime"])) \
    .withColumn("drop_hour", f.hour(trip_data["lpep_dropoff_datetime"])) \
    .withColumn("duration", f.unix_timestamp(trip_data["lpep_dropoff_datetime"]) - f.unix_timestamp(trip_data["lpep_pickup_datetime"]))
extended_trips = extended_trips.filter((trip_data["lpep_pickup_datetime"] > '2020-01-01 00:00:00'))

hourly_taxi_trips = extended_trips \
    .groupBy("pick_date", "pick_hour").agg(
        f.count(extended_trips["fare_amount"]).alias("trip_count"),
        f.sum(extended_trips["passenger_count"]).alias("passenger_count"),
        f.sum(extended_trips["fare_amount"]).alias("fare_amount"),
        f.sum(extended_trips["tip_amount"]).alias("tip_amount"),
        f.sum(extended_trips["total_amount"]).alias("total_amount"),
        f.avg(extended_trips["duration"]).alias("avg_duration")
    )
# hourly_taxi_trips.write.mode("overwrite").parquet("./values/taxi-trips-hourly")
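The final write above is left commented out; a hedged sketch of persisting the hourly aggregates, where partitioning by pick_date is an illustrative choice rather than something taken from the source:

(hourly_taxi_trips
 .repartition("pick_date")            # one file group per date (assumption)
 .write.mode("overwrite")
 .partitionBy("pick_date")
 .parquet("./values/taxi-trips-hourly"))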
Ejemplo n.º 19
0
# 5g

# Top days for 404 errors

top_err_date_df = errors_by_date_sorted_df.sort('count', ascending=False)

print('Top Five Dates for 404 Requests:\n')
top_err_date_df.show(5)

# 5h

# Sort 404 requests by hour

from pyspark.sql.functions import hour

hour_records_sorted_df = not_found_df.groupBy(hour('time').alias('hour')).count().sort('hour', ascending=True).cache()

print('Top hours for 404 requests:\n')
hour_records_sorted_df.show(24)

# 5i

# Plot 404 errors by hour

hours_with_not_found = [(row[0]) for row in hour_records_sorted_df.collect()]
not_found_counts_per_hour = [(row[1]) for row in hour_records_sorted_df.collect()]

print(hours_with_not_found)
print(not_found_counts_per_hour)

fig, ax = prepareSubplot(np.arange(0, 25, 5), np.arange(0, 500, 50))
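# A hedged continuation sketch: plot the hourly 404 counts collected above,
# assuming matplotlib.pyplot is available as plt in this lab environment and
# that prepareSubplot (a lab helper) returned a standard (fig, ax) pair.
ax.plot(hours_with_not_found, not_found_counts_per_hour, marker='o')
ax.set_xlabel('Hour')
ax.set_ylabel('404 Errors')
plt.show()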
Ejemplo n.º 20
0
def process_log_data(spark, input_data, output_data):
    """
    Description:
            Process the event log file and extract data for the time, users and songplays tables from it.

    :param spark: a spark session instance
    :param input_data: input file path
    :param output_data: output file path
    """

    # get filepath to log data file
    log_data = os.path.join(input_data, "log-data/")

    # read log data file
    df = spark.read.json(
        log_data,
        mode='PERMISSIVE',
        columnNameOfCorruptRecord='corrupt_record').drop_duplicates()

    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # extract columns for users table
    users_table = df.select("userId", "firstName", "lastName", "gender",
                            "level").drop_duplicates()

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, "users/"),
                              mode="overwrite")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.utcfromtimestamp(int(x) / 1000),
                        TimestampType())
    df = df.withColumn("start_time", get_timestamp("ts"))

    # extract columns to create time table
    time_table = df.withColumn("hour",hour("start_time"))\
                    .withColumn("day",dayofmonth("start_time"))\
                    .withColumn("week",weekofyear("start_time"))\
                    .withColumn("month",month("start_time"))\
                    .withColumn("year",year("start_time"))\
                    .withColumn("weekday",dayofweek("start_time"))\
                    .select("ts","start_time","hour", "day", "week", "month", "year", "weekday").drop_duplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(os.path.join(output_data, "time_table/"),
                             mode='overwrite',
                             partitionBy=["year", "month"])

    # read in song data to use for songplays table
    song_df = spark.read\
                .format("parquet")\
                .option("basePath", os.path.join(output_data, "songs/"))\
                .load(os.path.join(output_data, "songs/*/*/"))

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, df.song == song_df.title, how='inner')\
                        .select(monotonically_increasing_id().alias("songplay_id"),col("start_time"),col("userId").alias("user_id"),"level","song_id","artist_id", col("sessionId").alias("session_id"), "location", col("userAgent").alias("user_agent"))

    songplays_table = songplays_table.join(time_table, songplays_table.start_time == time_table.start_time, how="inner")\
                        .select("songplay_id", songplays_table.start_time, "user_id", "level", "song_id", "artist_id", "session_id", "location", "user_agent", "year", "month")

    # write songplays table to parquet files partitioned by year and month
    songplays_table.drop_duplicates().write.parquet(
        os.path.join(output_data, "songplays/"),
        mode="overwrite",
        partitionBy=["year", "month"])
Ejemplo n.º 21
0
def process_log_data(spark, input_data, output_data):
    '''
    Process the log data from the file(s) specified in the parameters.
    
    Args:
        spark: the spark session
        input_data: path to the input log data json files
        output_data: path where the output parquet files are written
    
    Returns:
        modeled data from logs and songs json files that are written to parquet files back on S3
    '''
    # get filepath to log data file
    log_data = input_data + "log_data/*/*"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(df.page == 'NextSong')

    # extract columns for users table
    users_table = df.select(
        col('userId').alias('user_id'),
        col('firstName').alias('first_name'),
        col('lastName').alias('last_name'),
        col('gender').alias('gender'),
        col('level').alias('level')).distinct()

    # write users table to parquet files
    users_table.write.parquet(output_data + "users.parquet", mode="overwrite")

    # create timestamp column from original timestamp column
    df = df.withColumn(
        'timestamp',
        f.to_timestamp(
            f.from_unixtime((col('ts') / 1000),
                            'yyyy-MM-dd HH:mm:ss.SSS')).cast("Timestamp"))

    # create datetime column from original timestamp column
    df = df.withColumn('ts_datetime', f.to_date(col('timestamp')))

    # extract columns to create time table
    time_table = df.withColumn("hour", hour(col("timestamp"))) \
          .withColumn("day", dayofmonth(col("timestamp"))) \
          .withColumn("week", weekofyear(col("timestamp"))) \
          .withColumn("month", month(col("timestamp"))) \
          .withColumn("year", year(col("timestamp"))) \
          .withColumn("weekday", datetime.datetime(col("timestamp")).weekday()) \
          .select(
            col("timestamp").alias("start_time"),
            col("hour"),
            col("day"),
            col("week"),
            col("month"),
            col("year"),
            col("weekday")
          )

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(output_data + "time.parquet", mode="overwrite")

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + "songs.parquet")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.withColumn(
        'songplay_id', f.monotonically_increasing_id()).join(
            song_df, song_df.title == df.song).select(
                'songplay_id',
                col('timestamp').alias('start_time'),
                col('userId').alias('user_id'), 'level', 'song_id',
                'artist_id',
                col('sessionId').alias('session_id'), 'location',
                col('userAgent').alias('user_agent'))

    # write songplays table to parquet files partitioned by year and month
    songplays_table = songplays_table.withColumn('year', year('start_time')) \
                                     .withColumn('month', month('start_time'))
    songplays_table.write.partitionBy('year', 'month').parquet(
        output_data + "songplays.parquet", mode="overwrite")
Ejemplo n.º 22
0
def process_log_data(spark, input_data, output_data):
    """
        Description: This function loads log_data from S3, extracts the users, time and songplays tables after processing,
        and then writes those tables to S3 in parquet format. The song view created by the previous function is reused for the songplays join.
        
        Parameters:
            spark       : Spark Session
            input_data  : Location of log_data files
            output_data : S3 bucket where extracted tables are written in parquet format.
            
    """
    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    users_table = df.select('userId', 'firstName', 'lastName', 'gender', 'level').dropDuplicates()\
                    .where(df.userId.isNotNull())

    # write users table to parquet files
    users_table.write.parquet(output_data + 'users/')

    # create UDF for timestamp column from original timestamp column
    @udf(TimestampType())
    def conv_timestamp(ms):
        return datetime.fromtimestamp(ms / 1000.0)

    # Lets add one more column with correct usable time stamp format
    df = df.withColumn("start_time", conv_timestamp('ts'))

    # Create a dataframe which only has start_time
    log_time_data = df.select('start_time').dropDuplicates()\
                    .where(df.start_time.isNotNull())

    # extract columns to create time table
    time_table = log_time_data.withColumn('hour',hour('start_time'))\
                              .withColumn('day',dayofmonth('start_time'))\
                              .withColumn('week', weekofyear('start_time'))\
                              .withColumn('month', month('start_time'))\
                              .withColumn('year',year('start_time'))\
                              .withColumn("weekday", date_format("start_time", 'E'))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year",
                                 "month").parquet(output_data + 'times/')

    # create a view for the log_data and we already have the view for song_data as song created at the start
    df.createOrReplaceTempView('log_data_filtered_timeformatted')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = spark.sql(
        """SELECT monotonically_increasing_id() AS songplay_id,
                                  start_time,
                                  year(start_time) AS year,
                                  month(start_time) AS month,
                                  userId AS user_id,
                                  level,
                                  song_id,
                                  artist_id,
                                  sessionId AS session_id,
                                  location,
                                  userAgent AS user_agent
                                  FROM  log_data_filtered_timeformatted
                                  JOIN song
                                  ON artist = artist_name AND song = title """)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(output_data +
                                                               'songplays/')
Ejemplo n.º 23
0
def process_log_data(spark, input_data, output_data):
    """
    Reads logs data in a dataframe which is then used to create new dataframes for creating users and time tables.
    Reads songs data and join it with logs dataframe to create a data for songplays table.
    Drop duplicates, rename columns and finally saves all tables in parquet format.

    :param spark: Spark session object
    :param input_data: S3 or local dir containing song data
    :param output_data: Path for parquet output files
    """
    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"  # S3 dir structure
    # log_data = input_data + "log_data/*.json"           # local dir structure

    # read log data file
    logger.info('Reading log data json files')
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df[df['page'] == 'NextSong']

    # extract columns for users table
    users_table = df[['userId', 'firstName', 'lastName', 'gender', 'level']]
    users_table = users_table \
        .withColumnRenamed('userId', 'user_id') \
        .withColumnRenamed('firstName', 'first_name') \
        .withColumnRenamed('lastName', 'last_name') \
        .dropDuplicates()

    # write users table to parquet files
    logger.info('Writing users table in parquet format')
    users_table.write.parquet(output_data + '/tbl_users.parquet')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000.0),
                        TimestampType())
    df = df.withColumn('start_time', get_timestamp(df.ts))

    # create datetime columns from derived start_time column
    df = df.withColumn('hour', hour(df.start_time))
    df = df.withColumn('day', dayofmonth(df.start_time))
    df = df.withColumn('week', weekofyear(df.start_time))
    df = df.withColumn('month', month(df.start_time))
    df = df.withColumn('year', year(df.start_time))
    df = df.withColumn('weekday', dayofweek(df.start_time))

    # extract columns to create time table
    time_table = df[[
        'start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday'
    ]]
    time_table = time_table.dropDuplicates()

    # write time table to parquet files partitioned by year and month
    logger.info(
        'Writing time table partitioned by year and month in parquet format')
    time_table.write.partitionBy('year', 'month').parquet(output_data +
                                                          '/tbl_time.parquet')

    # read in song data to use for songplays table
    logger.info("Reading song data for join")
    song_df = spark.read.json(input_data + 'song_data/*/*/*/*.json')
    song_df = song_df.withColumnRenamed('year', 'song_year')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, song_df.artist_name == df.artist,
                              'inner')
    songplays_table = songplays_table.withColumn(
        "songplay_id", F.monotonically_increasing_id())
    songplays_table = songplays_table[[
        'songplay_id', 'start_time', 'userId', 'level', 'song_id', 'artist_id',
        'sessionId', 'location', 'userAgent', 'month', 'year'
    ]]
    songplays_table = songplays_table \
        .withColumnRenamed('userId', 'user_id') \
        .withColumnRenamed('sessionId', 'session_id') \
        .withColumnRenamed('userAgent', 'user_agent')

    # write songplays table to parquet files partitioned by year and month
    logger.info(
        'Writing songplays table partitioned by year and month in parquet format'
    )
    songplays_table.write.partitionBy(
        'year', 'month').parquet(output_data + '/tbl_songplays.parquet')
Ejemplo n.º 24
0
def process_log_data(spark, input_data, output_data):
    """
    Process log_data from input_data path and save users, time and songplays tables in paquet format in output_data path
    
    Parameters:
        spark: SparkSession object to process data
        input_data: path to input data
        output_data: path to output data
    """
    
    # get filepath to log data file
    log_data = input_data + 'log_data/*'

    # read log data file
    log_df = spark.read.json(log_data)
    
    # filter by actions for song plays 
    log_df = log_df.filter('page = "NextSong"') \
                   .withColumn('user_id', log_df['userId'].cast('integer')) \
                   .withColumn('session_id', log_df['sessionId'].cast('integer')) \
                   .withColumnRenamed('firstName', 'first_name') \
                   .withColumnRenamed('lastName', 'last_name')

    # extract columns for users table    
    users_table = log_df[['user_id', 'first_name', 'last_name', 'gender', 'level']]
    
    # write users table to parquet files
    users_table.where(users_table.user_id.isNotNull()).distinct().write.mode('overwrite').parquet(output_data + 'users/')

    # create timestamp column from original timestamp column
    time_df = log_df[['ts']]
    
    # create datetime column from original timestamp column
    time_df = time_df.withColumn('ts', to_timestamp(col('ts')/1000))
    
    # extract columns to create time table
    time_table = time_df.withColumnRenamed('ts', 'start_time') \
                        .withColumn('hour', hour(col('start_time'))) \
                        .withColumn('day', dayofmonth(col('start_time'))) \
                        .withColumn('week', weekofyear(col('start_time'))) \
                        .withColumn('month', month(col('start_time'))) \
                        .withColumn('year', year(col('start_time'))) \
                        .withColumn('weekday', date_format(col('start_time'), 'u').cast('integer'))
     
    # write time table to parquet files partitioned by year and month
    time_table.distinct().write.partitionBy('year', 'month').mode('overwrite').parquet(output_data + 'time/')

    # read in song data to use for songplays table
    song_df = spark.read.json(input_data + 'song_data/*/*/*')

    # extract columns from joined song and log datasets to create songplays table 
    songplays_table = log_df.join(song_df, [log_df.song == song_df.title, log_df.artist == song_df.artist_name]) \
                             .selectExpr('monotonically_increasing_id() as songplay_id', \
                                         'to_timestamp(ts/1000) as start_time', \
                                         'month(to_timestamp(ts/1000)) as month', \
                                         'year(to_timestamp(ts/1000)) as year', \
                                         'user_id as user_id', \
                                         'level as level', \
                                         'song_id as song_id', \
                                         'artist_id as artist_id', \
                                         'session_id as session_id', \
                                         'location as location', \
                                         'userAgent as user_agent') 

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.mode('overwrite').partitionBy('year', 'month').parquet(output_data+'songplays/')
Ejemplo n.º 25
0
def process_log_data(spark, input_data, output_data):
    """
    Function that reads and transforms log_data files to save
    user_table, time_table and songplays_table on S3 (in parquet format)

    """

    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"  #real path
    # log_data = input_data + "log_data/2018/11/2018-11-12-events.json"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where("page='NextSong'")

    # extract columns for users table
    user_table = df.select(col("userId").cast("int").alias("user_id"),\
                           col("firstName").alias("first_name"),\
                           col("lastName").alias("last_name"),"gender","level")

    user_table = user_table.dropDuplicates()

    # write users table to parquet files
    user_table.write.parquet(output_data + 'users/', 'overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: int(int(x) / 1000), IntegerType())
    df = df.withColumn("timestamp", get_timestamp("ts"))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x), TimestampType())
    df = df.withColumn("datetime", get_datetime("timestamp"))

    #     print(df.limit(5).toPandas().head())
    # extract columns to create time table
    time_table = df.select(col("timestamp").alias("start_time"),\
                           hour("datetime").alias("hour"),\
                           dayofmonth("datetime").alias("day"),\
                           weekofyear("datetime").alias("week"),\
                           month("datetime").alias("month"),\
                           year("datetime").alias("year"),\
                           date_format('datetime','E').alias('weekday')
                          )
    #     print(time_table.limit(5).toPandas().head())
    time_table = time_table.dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year",
                                 "month").parquet(output_data + 'time/',
                                                  'overwrite')

    # read in song data to use for songplays table
    #     song_df = spark.read.json(input_data + "song_data/A/B/C/TRABCEI128F424C983.json")
    song_df = spark.read.json(input_data + "song_data/*/*/*/*.json")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.alias("a").join(song_df.alias("b"),\
                                         (df.song == song_df.title) & (df.artist == song_df.artist_name) & (df.length == song_df.duration)).\
    select(col("a.ts").alias("start_time"),col("a.userId").cast("int").alias("a.user_id"),"level",\
           col("a.sessionId").alias("session_id"),"a.location","a.userAgent","b.song_id","b.artist_id")

    get_start_time = udf(lambda x: datetime.fromtimestamp(int(x) / 1000),
                         TimestampType())
    songplays_table = songplays_table.withColumn("start_time",
                                                 get_start_time("start_time"))
    songplays_table = songplays_table.withColumn("songplay_id",
                                                 monotonically_increasing_id())
    songplays_table = songplays_table.withColumn("year", year("start_time"))
    songplays_table = songplays_table.withColumn("month", month("start_time"))

    #     print(songplays_table.limit(5).toPandas().head())

    songplays_table = songplays_table.dropDuplicates()

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(
        output_data + 'songplays/', 'overwrite')
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = input_data + "log-data/*/*/*.json"

    # read log data file
    print('Input log data json file read started')
    df = spark.read.json(log_data,
                         mode='PERMISSIVE',
                         columnNameOfCorruptRecord='corrupt_record')
    print('Input log data json file read completed')

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    print('users_table data extraction started \n')
    users_table = df.select("userId", "firstName", "lastName", "gender",
                            "level").drop_duplicates()
    print('users_table data extraction completed \n')

    # write users table to parquet files
    print('users_table data write started \n')
    users_table.write.parquet(output_data + "users_table/", mode="overwrite")
    print('users_table write Completed')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(int(x) / 1000),
                        TimestampType())
    df = df.withColumn("start_time", get_timestamp("ts"))

    # create datetime column from original timestamp column
    #get_datetime = udf()
    #df =

    # extract columns to create time table
    print('time_table data extraction started \n')
    time_table=df.select('start_time').drop_duplicates() \
        .withColumn('hour', hour(col('start_time'))) \
        .withColumn('day', dayofmonth(col('start_time'))) \
        .withColumn('week', weekofyear(col('start_time'))) \
        .withColumn('month', month(col('start_time'))) \
        .withColumn('year', year(col('start_time'))) \
        .withColumn('weekday', dayofweek(col('start_time')))
    #time_table.show()
    print('time_table data extraction completed \n')

    # write time table to parquet files partitioned by year and month
    print('time_table data write started \n')
    time_table.write.partitionBy("year", "month").parquet(output_data + "time_table/", mode="overwrite")
    print('time_table data write Completed \n')

    # read in song data to use for songplays table
    song_df = spark.read.format("parquet").option(
        "basePath",
        os.path.join(output_data,
                     "songs/")).load(os.path.join(output_data, "songs/*/*/"))

    # extract columns from joined song and log datasets to create songplays table
    print('songplays_table data extraction started \n')
    songplays_table = df.join(song_df, df.song == song_df.title, how='inner')\
                        .select(monotonically_increasing_id().alias("songplay_id"),
                         col("start_time"),
                         col("userId").alias("user_id"),
                         col("level"),
                         col("song_id"),
                         col("artist_id"),
                         col("sessionId").alias("session_id"),
                         col("location"),
                         col("userAgent").alias("user_agent")
                        )
    print('songplays_table data extraction completed \n')
    # write songplays table to parquet files partitioned by year and month
    print('songplays_table data write started \n')
    songplays_table.write.parquet(output_data + "songplays/",
                                  mode="overwrite")
    print('songplays_table data write completed \n')
Ejemplo n.º 27
0
# Dates and Timestamps
from pyspark.sql import SparkSession
spark= SparkSession.builder.appName('dates').getOrCreate()

df= spark.read.csv('appl_stock.csv',header=True,inferSchema=True)
df.head(1)
df.show()
df.select(['Date','Open']).show() # Date format: year-month-day hour

from pyspark.sql.functions import (dayofmonth,hour,
                                  dayofyear,month,
                                  year,weekofyear,
                                  format_number,date_format)
                                  
df.select(dayofmonth(df['Date'])).show()
df.select(hour(df['Date'])).show()
df.select(month(df['Date'])).show()
df.select(year(df['Date'])).show()
df.withColumn("Year",year(df['Date'])).show() # Añade al final la columna Year
newdf=df.withColumn("Year",year(df['Date']))
newdf.groupBy("Year").mean().show() # Calcula un promedio de los valores de las columnas para cada año
newdf.groupBy("Year").mean().select(["Year","avg(Close)" ]).show()  # Calcula un promedio de los valores de las columnas para cada año mostrando solo las columnas Year y avg(Close)
result=newdf.groupBy("Year").mean().select(["Year","avg(Close)" ])
result.show()
result.withColumnRenamed("avg(Close)","Average Closing Price").show() # Rename avg(Close) to Average Closing Price
new= result.withColumnRenamed("avg(Close)","Average Closing Price")
new.select(['Year',format_number('Average Closing Price',2)]).show() # The column is now called (Average Closing Price) and its values have 2 decimal places
new.select(['Year',format_number('Average Closing Price',2).alias("avg(Close)")]).show() # Rename it back to avg(Close)


#--------#
# Total: 834648
# Unique: 11691
#totalCount = allAddId.select("advertisement_id").count()
#uniqueCount = allAddId.select("advertisement_id").distinct().count()

# Drop distance column
allAdId = allAdId.drop("distance")

# Create date column for indexing and aggregation
allAdIdDate = allAdId.select("*", col("date_time").cast("date").alias("date"))

# Create hour column for indexing and aggregation
allAdIdHour = allAdIdDate.select(
    "*",
    hour("date_time").cast("int").alias("broadcast_hour"))

#bukitBintang = allAdIdHour.toPandas()
#bukitBintang.to_csv("bukit_bintang.csv", index = False)

# Filter IDFAs (advertisement_id) that are from February 1st and horizontal
# accuracy are within 30.00m or not null
allAdIdFilter = allAdIdHour.filter((allAdIdHour.date == "2019-02-01") & (
    (allAdIdHour.horizontal_accuracy <= 30)
    & (allAdIdHour.horizontal_accuracy.isNotNull())))

# Drop horizontal accuracy since it's redundant for future analysis
allAdIdClean = allAdIdFilter.drop("horizontal_accuracy", "date", "latitude",
                                  "longitude")

# Aggregate advertisement_id by hour
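# A hedged sketch of the aggregation announced above, continuing from
# allAdIdClean; counting distinct advertisement_id per broadcast_hour is an
# illustrative assumption about what the truncated original did next.
from pyspark.sql.functions import countDistinct

hourlyAdId = (allAdIdClean
              .groupBy("broadcast_hour")
              .agg(countDistinct("advertisement_id").alias("unique_ad_ids"))
              .orderBy("broadcast_hour"))
hourlyAdId.show(24)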
Ejemplo n.º 29
0
def process_log_data(spark, input_data, output_data):
    """
        Description: This function loads log_data from S3 and processes it by extracting the users, time and songplays tables,
                     which are then written back to S3 in parquet format. The song data is read in again with spark.read.json for the songplays join.
        
        Parameters:
            spark       : Spark Session
            input_data  : location of log_data json files with the events data
            output_data : S3 bucket were dimensional tables in parquet format will be stored
            
    """
    # get filepath to log data file
    log_data = os.path.join(input_data, "log_data/*.json")

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    actions_df = df.filter(df.page == 'NextSong') \
                   .select('ts', 'userId', 'level', 'song', 'artist',
                           'sessionId', 'location', 'userAgent')

    # extract columns for users table
    users_table = df.select('userId', 'firstName', 'lastName', 'gender',
                            'level').dropDuplicates()
    users_table.createOrReplaceTempView('users')
    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users/users.parquet'),
                              'overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: str(int(int(x) / 1000)))
    actions_df = actions_df.withColumn('timestamp',
                                       get_timestamp(actions_df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000)))
    actions_df = actions_df.withColumn('datetime', get_datetime(actions_df.ts))

    # extract columns to create time table
    time_table = actions_df.select('datetime') \
                           .withColumn('start_time', actions_df.datetime) \
                           .withColumn('hour', hour('datetime')) \
                           .withColumn('day', dayofmonth('datetime')) \
                           .withColumn('week', weekofyear('datetime')) \
                           .withColumn('month', month('datetime')) \
                           .withColumn('year', year('datetime')) \
                           .withColumn('weekday', dayofweek('datetime')) \
                           .dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month') \
                    .parquet(os.path.join(output_data,
                                          'time/time.parquet'), 'overwrite')

    # read in song data to use for songplays table
    song_data = os.path.join(input_data, "song_data/*/*/*/*.json")
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    actions_df = actions_df.alias('log_df')
    song_df = song_df.alias('song_df')
    joined_df = actions_df.join(
        song_df,
        col('log_df.artist') == col('song_df.artist_name'), 'inner')
    songplays_table = joined_df.select(
        col('log_df.datetime').alias('start_time'),
        col('log_df.userId').alias('user_id'),
        col('log_df.level').alias('level'),
        col('song_df.song_id').alias('song_id'),
        col('song_df.artist_id').alias('artist_id'),
        col('log_df.sessionId').alias('session_id'),
        col('log_df.location').alias('location'),
        col('log_df.userAgent').alias('user_agent'),
        year('log_df.datetime').alias('year'),
        month('log_df.datetime').alias('month')) \
        .withColumn('songplay_id', monotonically_increasing_id())

    songplays_table.createOrReplaceTempView('songplays')
    # write songplays table to parquet files partitioned by year and month
    time_table = time_table.alias('timetable')

    songplays_table.write.partitionBy('year', 'month'). \
        parquet(os.path.join(output_data, 'songplays/songplays.parquet'), \
                'overwrite')
    print("--- songplays.parquet completed ---")
    print("*** process_log_data completed ***\n\nEND")
Ejemplo n.º 30
0
def main(base_path):
    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),
        StructField("CRSArrTime", TimestampType(), True),
        StructField("CRSDepTime", TimestampType(), True),
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Route", StringType(), True),
        StructField("TailNum", StringType(), True),
        StructField("EngineManufacturer", StringType(), True),
        StructField("EngineModel", StringType(), True),
        StructField("Manufacturer", StringType(), True),
        StructField("ManufacturerYear", StringType(), True),
        StructField("OwnerState", StringType(), True),
        StructField("FlightTime", IntegerType(), True),
    ])

    input_path = "{}/data/simple_flight_delay_features_flight_times.json".format(
        base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Add the hour of day of scheduled arrival/departure
    #
    from pyspark.sql.functions import hour
    features_with_hour = features.withColumn("CRSDepHourOfDay",
                                             hour(features.CRSDepTime))
    features_with_hour = features_with_hour.withColumn(
        "CRSArrHourOfDay", hour(features.CRSArrTime))
    features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime",
                              "CRSArrHourOfDay").show()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [
        (column,
         features_with_hour.where(features_with_hour[column].isNull()).count())
        for column in features_with_hour.columns
    ]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print("\nNull Value Report")
    print("-----------------")
    print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Set up the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the model
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the model
    ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Import the feature tools from pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    string_columns = ["Carrier", "Origin", "Dest", "Route", "TailNum"]
    for column in string_columns:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")

        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine continuous numeric fields with the indexes of the categorical fields into one feature vector
    numeric_columns = [
        "DepDelay", "Distance", "DayOfYear", "CRSDepHourOfDay",
        "CRSArrHourOfDay", "FlightTime"
    ]
    index_columns = [column + "_index" for column in string_columns]

    vector_assembler = VectorAssembler(inputCols=numeric_columns +
                                       index_columns,
                                       outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler_6.0.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    #
    # Cross-validate, train and evaluate the classification model: repeat the test/train split for 4 metrics
    #

    from collections import defaultdict
    scores = defaultdict(list)
    feature_importances = defaultdict(list)
    metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
    split_count = 3

    for i in range(1, split_count + 1):
        print("\nRun {} out of {} of test/train splits in cross validation...".
              format(
                  i,
                  split_count,
              ))

        #  Test/train split
        training_data, test_data = final_vectorized_features.randomSplit(
            [0.8, 0.2])

        # Instantiate and fit a random forest classification model on all the data
        from pyspark.ml.classification import RandomForestClassifier
        rfc = RandomForestClassifier(
            featuresCol="Features_vec",
            labelCol="ArrDelayBucket",
            predictionCol="Prediction",
            maxBins=4896,
        )
        model = rfc.fit(training_data)

        # Overwrite the old model with the new one
        model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.flight_time.bin".format(
            base_path)
        model.write().overwrite().save(model_output_path)

        # Evaluate the model on the test data
        predictions = model.transform(test_data)

        # Evaluate the results of this test/train split for each metric
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        for metric_name in metric_names:
            evaluator = MulticlassClassificationEvaluator(
                labelCol="ArrDelayBucket",
                predictionCol="Prediction",
                metricName=metric_name)
            score = evaluator.evaluate(predictions)

            scores[metric_name].append(score)
            print("{} = {}".format(metric_name, score))

        #
        # Collect feature importances
        #
        feature_names = vector_assembler.getInputCols()
        feature_importance_list = model.featureImportances
        for feature_name, feature_importance in zip(feature_names,
                                                    feature_importance_list):
            feature_importances[feature_name].append(feature_importance)

    #
    # Evaluate the average and standard deviation of each metric and print as a table
    #
    import numpy as np
    score_averages = defaultdict(float)

    # Compute the table data
    average_stds = []  # ha
    for metric_name in metric_names:
        metric_scores = scores[metric_name]

        average_accuracy = sum(metric_scores) / len(metric_scores)
        score_averages[metric_name] = average_accuracy

        std_accuracy = np.std(metric_scores)

        average_stds.append((metric_name, average_accuracy, std_accuracy))

    # Print the table
    print("\nExperiment Log")
    print("--------------")
    print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))

    #
    # Persist the scores to a score log that lives between runs
    #
    import pickle

    # Load the score log or initialize an empty one
    try:
        score_log_filename = "{}/models/score_log.pickle".format(base_path)
        score_log = pickle.load(open(score_log_filename, "rb"))
        if not isinstance(score_log, list):
            score_log = []
    except IOError:
        score_log = []

    # Compute the score log entry
    score_log_entry = {
        metric_name: score_averages[metric_name]
        for metric_name in metric_names
    }

    # Compute and display the change in score for each metric
    try:
        last_log = score_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_log = score_log_entry

    experiment_report = []
    for metric_name in metric_names:
        run_delta = score_log_entry[metric_name] - last_log[metric_name]
        experiment_report.append((metric_name, run_delta))

    print("\nExperiment Report")
    print("-----------------")
    print(tabulate(experiment_report, headers=["Metric", "Score"]))

    # Append the average scores to the log
    score_log.append(score_log_entry)

    # Persist the log for the next run
    pickle.dump(score_log, open(score_log_filename, "wb"))

    #
    # Analyze and report on the changes in feature importances
    #

    # Compute the average importance for each feature
    feature_importance_entry = defaultdict(float)
    for feature_name, value_list in feature_importances.items():
        average_importance = sum(value_list) / len(value_list)
        feature_importance_entry[feature_name] = average_importance

    # Sort the feature importances in descending order and print them
    import operator
    sorted_feature_importances = sorted(feature_importance_entry.items(),
                                        key=operator.itemgetter(1),
                                        reverse=True)

    print("\nFeature Importances")
    print("-------------------")
    print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))

    #
    # Compare this run's feature importances with those of the previous run
    #

    # Load the feature importance log or initialize an empty one
    try:
        feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
        feature_log = pickle.load(open(feature_log_filename, "rb"))
        if not isinstance(feature_log, list):
            feature_log = []
    except IOError:
        feature_log = []

    # Compute and display the change in importance for each feature
    try:
        last_feature_log = feature_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_feature_log = defaultdict(float)
        for feature_name, importance in feature_importance_entry.items():
            last_feature_log[feature_name] = importance

    # Compute the deltas
    feature_deltas = {}
    for feature_name in feature_importances.keys():
        run_delta = feature_importance_entry[feature_name] - last_feature_log[
            feature_name]
        feature_deltas[feature_name] = run_delta

    # Sort the feature deltas so the features with the biggest changes come first
    import operator
    sorted_feature_deltas = sorted(feature_deltas.items(),
                                   key=operator.itemgetter(1),
                                   reverse=True)

    # Display the sorted feature deltas
    print("\nFeature Importance Delta Report")
    print("-------------------------------")
    print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))

    # Append the current average importances to the log
    feature_log.append(feature_importance_entry)

    # Persist the log for the next run
    pickle.dump(feature_log, open(feature_log_filename, "wb"))
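Every fitted stage above is persisted under {base_path}/models, so a separate scoring job could reload them; a minimal hedged sketch using the standard pyspark.ml loaders, assuming the same base_path and the file names written above (the Carrier indexer is shown as one representative of the per-column indexers):

from pyspark.ml.feature import Bucketizer, StringIndexerModel, VectorAssembler
from pyspark.ml.classification import RandomForestClassificationModel

arrival_bucketizer = Bucketizer.load(
    "{}/models/arrival_bucketizer_2.0.bin".format(base_path))
carrier_indexer = StringIndexerModel.load(
    "{}/models/string_indexer_model_4.0.Carrier.bin".format(base_path))
vector_assembler = VectorAssembler.load(
    "{}/models/numeric_vector_assembler_6.0.bin".format(base_path))
rfc_model = RandomForestClassificationModel.load(
    "{}/models/spark_random_forest_classifier.flight_delays.flight_time.bin".format(
        base_path))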
Ejemplo n.º 31
0
def process_log_data(spark, input_data, output_data):
    
    """
    This function takes the log data from Udacity's S3 input file and processes it. This is done by 
    extracting the user, time and songplay tables and then loading them back to the S3 bucket I've created in AWS.
   
    Parameters:
            spark       : Spark Session
            input_data  : The S3 bucket location of song_data, think 'input'
            output_data : The S3 bucket location where the output tables are written, think 'output'
    """ 
    
    #Using print statement to understand where in spark statement we are
    print("\n Taking in log data as variable from S3's input location....")
    # get full filepath to song data file
    #log_data = input_data + 'log_data/*/*/*.json'
    #utilizing exact folder set of data set to speed up execution in WorkSpace (please use commented out log_data variable above to run full etl with wildcards)
    log_data = input_data + 'log_data/2018/11/*.json'
    
    
    #Using print statement to understand where in spark statement we are
    print("\n Defining log Schema....")
    log_schema = Struct([SFld("artist", Str()), SFld("auth", Str()),
                         SFld("firstName", Str()), SFld("gender", Str()),
                         SFld("itemInSession", Lng()), SFld("lastName", Str()),
                         SFld("length", Dbl()), SFld("level", Str()),
                         SFld("location", Str()), SFld("method", Str()),
                         SFld("page", Str()), SFld("registration", Dbl()),
                         SFld("sessionId", Lng()), SFld("song", Str()),
                         SFld("status", Str()), SFld("ts", Str()),
                         SFld("userAgent", Str()), SFld("userId", Str())])
    
    
    #Using print statement to understand where in spark statement we are
    print("\n Reading log data JSON files from S3's input location....")
    # read log data file
    df = spark.read.json(log_data, schema = log_schema, mode='PERMISSIVE', columnNameOfCorruptRecord='corruptRecord').drop_duplicates()
    
    
    #Using print statement to understand where in spark statement we are
    print("\n Filtering page by NextSong....")
    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong').drop_duplicates()

          
    #Using print statement to understand where in spark statement we are
    print("\n Creating select statement for users data creation....")     
    # extract columns for users table    
    users_table = df.select('userId', 'firstName', 'lastName', 'gender', 'level').where(df.userId.isNotNull()).drop_duplicates()
    
          
    #Using print statement to understand where in spark statement we are
    print("\n Writing parquet file for users table....")
    # write users table to parquet files
    users_table.write.mode('overwrite').parquet(output_data + 'users_table/')
          
          
    #Using print statement to understand where in spark statement we are
    print("\n Creating timeStamp variable....")
    # create timestamp column from original timestamp column
    df = df.withColumn("timestamp", to_timestamp(from_unixtime(col("ts") / 1000)))
      
    
    #Using print statement to understand where in spark statement we are
    print("\n Creating select statement for time data creation....")      
    # extract columns to create time table
    time_table = ( df.select("timestamp").withColumn("hour", hour("timestamp")).withColumn("day", dayofmonth("timestamp")) \
                    .withColumn("week", weekofyear("timestamp")).withColumn("weekday", dayofweek("timestamp")).withColumn("weekdayName", date_format("timestamp", "E")) \
                    .withColumn("month", month("timestamp")).withColumn("year", year("timestamp")).drop_duplicates()
                 )
    
    
    #Using print statement to understand where in spark statement we are
    print("\n Writing parquet file for time table and partitioned by year and month....")        
    # write time table to parquet files partitioned by year and month
    time_table.write.mode('overwrite').partitionBy('year', 'month').parquet(output_data + 'time_table/')

          
    #Using print statement to understand where in spark statement we are
    print("\n Reading song data JSON files from S3's input location....")      
    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + 'songs_table/')

          
    #Using print statement to understand where in spark statement we are
    print("\n Creating select statement for song play data creation....")       
    # extract columns from joined song and log datasets to create songplays table 
    songplays_table = df.withColumn('songplayId', F.monotonically_increasing_id()).join(song_df, song_df.title == df.song) \
                        .select('songplayId', col('timestamp').alias('start_time'), col('userId'),
                         'level', 'song_id', 'artist_id', col('sessionId'), 'location', col('userAgent'))
    
    
    songplays_table = songplays_table.join(time_table, songplays_table.start_time == time_table.timestamp, how="inner")\
                                     .select("songplayId", songplays_table.start_time, "userId", "level", "song_id", "artist_id", "sessionId", "location", "userAgent", "month", "year").drop_duplicates()

    
    
          
    #Using print statement to understand where in spark statement we are
    print("\n Writing parquet file for song paly table and partitioned by year and month....")       
    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.mode('overwrite').partitionBy("year", "month").parquet(output_data + 'songplays_table/')
Ejemplo n.º 32
0
def process_log_data(spark, input_data, output_data):
    '''
    Process log data to build the user, time and songsplays tables and write them to parquet files
    
    Inputs:
    spark: spark session
    input_data: path to data files to extract the data
    output_data: path where the created tables will be stored
    
    '''
    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    actions_df = df.filter(df.page == 'NextSong').select(
        'ts', 'userId', 'level', 'song', 'artist', 'sessionId', 'location',
        'userAgent')

    # extract columns for users table
    users_table = df.select('userId', 'firstName', 'lastName', 'gender',
                            'level').dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet((output_data + 'users/users.parquet'), 'overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: str(int(int(x) / 1000)))
    df = actions_df.withColumn('timestamp', get_timestamp(actions_df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000)))
    df = df.withColumn('start_time', get_datetime(df.ts))

    # extract columns to create time table
    df = df.withColumn('hour', hour('start_time'))
    df = df.withColumn('day', dayofmonth('start_time'))
    df = df.withColumn('month', month('start_time'))
    df = df.withColumn('year', year('start_time'))
    df = df.withColumn('week', weekofyear('start_time'))
    df = df.withColumn('weekday', dayofweek('start_time'))

    time_table = df.select('start_time', 'hour', 'day', 'week', 'month',
                           'year', 'weekday').dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        (output_data + 'time/time.parquet'), 'overwrite')

    # read in song data to use for songplays table
    # (rename the songs' year column so it does not clash with the log-derived year)
    song_df = spark.read.json(input_data + 'song_data/A/*/*/*.json') \
                   .withColumnRenamed('year', 'song_year')
    df = df.join(song_df, song_df.title == df.song)

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.select(
        'start_time', 'userId', 'level', 'song_id', 'artist_id', 'sessionId',
        'location', 'userAgent', 'year', 'month').withColumn(
            'songplay_id', monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        (output_data + 'songplays/songplays.parquet'), 'overwrite')
def main(base_path):
  APP_NAME = "train_spark_mllib_model.py"
  
  # If there is no SparkSession, create the environment
  try:
    sc and spark
  except NameError as e:
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql
    
    sc = pyspark.SparkContext()
    spark = pyspark.sql.SparkSession.builder.appName(APP_NAME).getOrCreate()
  
  #
  # {
  #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
  #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
  #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
  # }
  #
  from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
  from pyspark.sql.types import StructType, StructField
  from pyspark.sql.functions import udf
  from tabulate import tabulate  # needed for the report tables below (if not already imported at module level)
  
  schema = StructType([
    StructField("ArrDelay", DoubleType(), True),
    StructField("CRSArrTime", TimestampType(), True),
    StructField("CRSDepTime", TimestampType(), True),
    StructField("Carrier", StringType(), True),
    StructField("DayOfMonth", IntegerType(), True),
    StructField("DayOfWeek", IntegerType(), True),
    StructField("DayOfYear", IntegerType(), True),
    StructField("DepDelay", DoubleType(), True),
    StructField("Dest", StringType(), True),
    StructField("Distance", DoubleType(), True),
    StructField("FlightDate", DateType(), True),
    StructField("FlightNum", StringType(), True),
    StructField("Origin", StringType(), True),
    StructField("Route", StringType(), True),
    StructField("TailNum", StringType(), True),
    StructField("EngineManufacturer", StringType(), True),
    StructField("EngineModel", StringType(), True),
    StructField("Manufacturer", StringType(), True),
    StructField("ManufacturerYear", StringType(), True),
    StructField("OwnerState", StringType(), True),
  ])
  
  input_path = "{}/data/simple_flight_delay_features_airplanes.json".format(
    base_path
  )
  features = spark.read.json(input_path, schema=schema)
  features.first()
  
  #
  # Add the hour of day of scheduled arrival/departure
  #
  from pyspark.sql.functions import hour
  features_with_hour = features.withColumn(
    "CRSDepHourOfDay",
    hour(features.CRSDepTime)
  )
  features_with_hour = features_with_hour.withColumn(
    "CRSArrHourOfDay",
    hour(features.CRSArrTime)
  )
  features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime", "CRSArrHourOfDay").show()
  
  #
  # Check for nulls in features before using Spark ML
  #
  null_counts = [(column, features_with_hour.where(features_with_hour[column].isNull()).count()) for column in features_with_hour.columns]
  cols_with_nulls = list(filter(lambda x: x[1] > 0, null_counts))
  print("\nNull Value Report")
  print("-----------------")
  print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))
  
  #
  # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into early, on-time, slightly late, very late (0, 1, 2, 3)
  #
  from pyspark.ml.feature import Bucketizer
  
  # Setup the Bucketizer
  splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
  arrival_bucketizer = Bucketizer(
    splits=splits,
    inputCol="ArrDelay",
    outputCol="ArrDelayBucket"
  )
  
  # Save the model
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)
  
  # Apply the model
  ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
  ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()
  
  #
  # Extract features with the tools in pyspark.ml.feature
  #
  from pyspark.ml.feature import StringIndexer, VectorAssembler
  
  # Turn category fields into indexes
  string_columns = ["Carrier", "Origin", "Dest", "Route",
                    "TailNum"]
  for column in string_columns:
    string_indexer = StringIndexer(
      inputCol=column,
      outputCol=column + "_index"
    )
    
    string_indexer_model = string_indexer.fit(ml_bucketized_features)
    ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)
    
    # Save the pipeline model
    string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
      base_path,
      column
    )
    string_indexer_model.write().overwrite().save(string_indexer_output_path)
  
  # Combine continuous, numeric fields with indexes of nominal ones
  # ...into one feature vector
  numeric_columns = [
    "DepDelay", "Distance",
    "DayOfYear",
    "CRSDepHourOfDay",
    "CRSArrHourOfDay"]
  index_columns = [column + "_index" for column in string_columns]
  
  vector_assembler = VectorAssembler(
    inputCols=numeric_columns + index_columns,
    outputCol="Features_vec"
  )
  final_vectorized_features = vector_assembler.transform(ml_bucketized_features)
  
  # Save the numeric vector assembler
  vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(base_path)
  vector_assembler.write().overwrite().save(vector_assembler_path)
  
  # Drop the index columns
  for column in index_columns:
    final_vectorized_features = final_vectorized_features.drop(column)
  
  # Inspect the finalized features
  final_vectorized_features.show()
  
  #
  # Cross validate, train and evaluate classifier: loop over the test/train splits for 4 metrics
  #
  
  from collections import defaultdict
  scores = defaultdict(list)
  feature_importances = defaultdict(list)
  metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
  split_count = 3
  
  for i in range(1, split_count + 1):
    print("\nRun {} out of {} of test/train splits in cross validation...".format(
      i,
      split_count,
    ))
    
    # Test/train split
    training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2])
    
    # Instantiate and fit random forest classifier on all the data
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(
      featuresCol="Features_vec",
      labelCol="ArrDelayBucket",
      predictionCol="Prediction",
      maxBins=4896,
    )
    model = rfc.fit(training_data)
    
    # Save the new model over the old one
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
      base_path
    )
    model.write().overwrite().save(model_output_path)
    
    # Evaluate model using test data
    predictions = model.transform(test_data)
    
    # Evaluate this split's results for each metric
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    for metric_name in metric_names:
      evaluator = MulticlassClassificationEvaluator(
        labelCol="ArrDelayBucket",
        predictionCol="Prediction",
        metricName=metric_name
      )
      score = evaluator.evaluate(predictions)
      
      scores[metric_name].append(score)
      print("{} = {}".format(metric_name, score))
    
    #
    # Collect feature importances
    #
    feature_names = vector_assembler.getInputCols()
    feature_importance_list = model.featureImportances
    for feature_name, feature_importance in zip(feature_names, feature_importance_list):
      feature_importances[feature_name].append(feature_importance)
  
  #
  # Evaluate average and STD of each metric and print a table
  #
  import numpy as np
  score_averages = defaultdict(float)
  
  # Compute the table data
  average_stds = []  # ha
  for metric_name in metric_names:
    metric_scores = scores[metric_name]
    
    average_accuracy = sum(metric_scores) / len(metric_scores)
    score_averages[metric_name] = average_accuracy
    
    std_accuracy = np.std(metric_scores)
    
    average_stds.append((metric_name, average_accuracy, std_accuracy))
  
  # Print the table
  print("\nExperiment Log")
  print("--------------")
  print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))
  
  #
  # Persist the scores to a score log that persists between runs
  #
  import pickle
  
  # Load the score log or initialize an empty one
  try:
    score_log_filename = "{}/models/score_log.pickle".format(base_path)
    score_log = pickle.load(open(score_log_filename, "rb"))
    if not isinstance(score_log, list):
      score_log = []
  except IOError:
    score_log = []
  
  # Compute this run's score log entry
  score_log_entry = {
    metric_name: score_averages[metric_name] for metric_name in metric_names
  }
  
  # Compute and display the change in score for each metric
  try:
    last_log = score_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_log = score_log_entry
  
  experiment_report = []
  for metric_name in metric_names:
    run_delta = score_log_entry[metric_name] - last_log[metric_name]
    experiment_report.append((metric_name, run_delta))
  
  print("\nExperiment Report")
  print("-----------------")
  print(tabulate(experiment_report, headers=["Metric", "Score"]))
  
  # Append this run's average scores to the log
  score_log.append(score_log_entry)
  
  # Persist the log for next run
  pickle.dump(score_log, open(score_log_filename, "wb"))
  
  #
  # Analyze and report feature importance changes
  #
  
  # Compute averages for each feature
  feature_importance_entry = defaultdict(float)
  for feature_name, value_list in feature_importances.items():
    average_importance = sum(value_list) / len(value_list)
    feature_importance_entry[feature_name] = average_importance
  
  # Sort the feature importances in descending order and print
  import operator
  sorted_feature_importances = sorted(
    feature_importance_entry.items(),
    key=operator.itemgetter(1),
    reverse=True
  )
  
  print("\nFeature Importances")
  print("-------------------")
  print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))
  
  #
  # Compare this run's feature importances with the previous run's
  #
  
  # Load the feature importance log or initialize an empty one
  try:
    feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
    feature_log = pickle.load(open(feature_log_filename, "rb"))
    if not isinstance(feature_log, list):
      feature_log = []
  except IOError:
    feature_log = []
  
  # Compute and display the change in score for each feature
  try:
    last_feature_log = feature_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_feature_log = defaultdict(float)
    for feature_name, importance in feature_importance_entry.items():
      last_feature_log[feature_name] = importance
  
  # Compute the deltas
  feature_deltas = {}
  for feature_name in feature_importances.keys():
    run_delta = feature_importance_entry[feature_name] - last_feature_log[feature_name]
    feature_deltas[feature_name] = run_delta
  
  # Sort feature deltas, biggest change first
  import operator
  sorted_feature_deltas = sorted(
    feature_deltas.items(),
    key=operator.itemgetter(1),
    reverse=True
  )
  
  # Display sorted feature deltas
  print("\nFeature Importance Delta Report")
  print("-------------------------------")
  print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))
  
  # Append this run's average feature importances to the log
  feature_log.append(feature_importance_entry)
  
  # Persist the log for next run
  pickle.dump(feature_log, open(feature_log_filename, "wb"))
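
A minimal driver sketch for the function above; the original excerpt does not show how main() is invoked, so the command-line handling here is an assumption (e.g. spark-submit train_spark_mllib_model.py /path/to/project).

# Hypothetical entry point, not shown in the original excerpt.
if __name__ == "__main__":
  import sys
  main(sys.argv[1])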
Example No. 34
def process_log_data(spark, input_data, output_data):
    """Process user log data creating the tables user, time and songplays

    Args:
        spark (SparkSession): The spark session object
        input_data (str): The input files path
        output_data (str): The output files path
    """
    # read log data file
    LOGGER.info('read log data file')
    log_df = spark.read.json(input_data)

    # filter by actions for song plays
    LOGGER.info('filter by actions for song plays')
    log_df = log_df.where(F.col('page') == 'NextSong')

    # extract columns for users table
    LOGGER.info('extract columns for users table')
    user_table = log_df.select(
        ['userId', 'firstName', 'lastName', 'gender', 'level'])

    # write users table to parquet files
    LOGGER.info('write users table to parquet files')
    user_path = os.path.join(output_data, 'user')
    user_table.coalesce(1).write.mode('overwrite').parquet(user_path)

    # create datetime column from original timestamp column
    LOGGER.info('create datetime column from original timestamp column')
    get_timestamp = F.udf(lambda x: datetime.utcfromtimestamp(int(x) / 1000),
                          TimestampType())
    log_df = log_df.withColumn("start_time", get_timestamp("ts"))

    # extract columns to create time table
    LOGGER.info('extract columns to create time table')
    time_table = log_df.select(
        'start_time',
        F.hour('start_time').alias('hour'),
        F.dayofmonth('start_time').alias('day'),
        F.weekofyear('start_time').alias('weekofyear'),
        F.month('start_time').alias('month'),
        F.year('start_time').alias('year'),
        F.dayofweek('start_time').alias('weekday')).drop_duplicates(
            ['start_time'])

    # write time table to parquet partitioned by year and month
    LOGGER.info('write time table to parquet partitioned by year and month')
    time_table.coalesce(1).write.mode('overwrite')\
        .partitionBy('year', 'month')\
        .parquet(os.path.join(output_data, 'time'))

    # read in song data to use for songplays table
    LOGGER.info('read in song data to use for songplays table')
    song_df = spark.read.parquet(os.path.join(output_data, 'song'))
    artist_df = spark.read.parquet(os.path.join(output_data, 'artist'))

    # join artist and song data
    LOGGER.info('join artist and song data')
    song_df = artist_df.select(['artist_name', 'artist_id'])\
        .join(song_df, on='artist_id', how='inner')

    # extract columns from joined song and log datasets to create songplays
    LOGGER.info('extract columns from joined song and log datasets to create '
                'songplays')
    on_clause = \
        (song_df.title == log_df.song) \
        & (song_df.artist_name == log_df.artist) \
        & (song_df.duration == log_df.length)
    songplays_table = log_df.join(song_df, on_clause, how='inner')

    # select columns and create year and month columns
    LOGGER.info('select columns and create year and month columns')
    songplays_table = songplays_table.select(
        'start_time',
        F.col('userId').alias('user_id'), 'level', 'song_id', 'artist_id',
        F.col('sessionId').alias('session_id'), 'location',
        F.col('userAgent').alias('user_agent'),
        F.month('start_time').alias('month'),
        F.year('start_time').alias('year'))

    # create songplay_id and drop duplicates by this column
    LOGGER.info('create songplay_id and drop duplicates by this column')
    key_columns = [
        'start_time', 'user_id', 'song_id', 'artist_id', 'session_id'
    ]
    songplays_table = songplays_table.withColumn(
        'songplay_id', F.sha2(F.concat_ws("||", *key_columns),
                              256)).drop_duplicates(['songplay_id'])

    # write songplays table to parquet files partitioned by year and month
    LOGGER.info('write songplays table to parquet partitioned by year/month')
    songplays_table.coalesce(1).write.mode('overwrite')\
        .partitionBy('year', 'month')\
        .parquet(os.path.join(output_data, 'songplays'))
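
The sha2/concat_ws surrogate key used above is deterministic, so re-running the job reproduces the same songplay_id values for the same rows. A self-contained sketch on toy data (assumes an active spark session; the values are made up):

# Toy illustration of the hash-based surrogate key, not part of the original job.
from pyspark.sql import functions as F

toy = spark.createDataFrame(
    [("2018-11-01 00:00:00", "u1", "s1", "a1", "42")],
    ["start_time", "user_id", "song_id", "artist_id", "session_id"])
toy = toy.withColumn(
    "songplay_id",
    F.sha2(F.concat_ws("||", "start_time", "user_id", "song_id",
                       "artist_id", "session_id"), 256))
toy.select("songplay_id").show(truncate=False)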
Example No. 35
t4 = t3.groupby('df2.hashtag',
                'df1.hashtag').agg(f.count("df1.tweet_id").alias("count"))
w = Window.partitionBy('df2.hashtag')
t5 = t4.withColumn(
    'max',
    f.max('count').over(w)).where(f.col('count') == f.col('max')).selectExpr(
        'df2.hashtag as hashtag', 'df1.hashtag as other_combination_tag',
        'count as other_tag_count')
# writing data to db
t5.write.jdbc(url=url,
              table="popular_tags_popular_combination",
              mode="overwrite",
              properties=properties)
t5.show()

# query 3: find the per-hour frequency of popular hashtag tweets for each location
w = Window.partitionBy("place", "hashtag", "date", "hour")
per_hour_frequency = most_popular_tags.\
    withColumn("date", f.to_date(f.col("created_at"))).\
    withColumn("hour", f.hour(f.col("created_at"))).\
    withColumn("tag_count", f.count('id').over(w)).\
    select('place', 'date', 'hour', 'hashtag', 'tag_count').\
    distinct().\
    sort(f.asc('place'), f.asc('hashtag'), f.asc('date'), f.asc('hour'), f.desc('tag_count'))
# storing data to db
per_hour_frequency.write.jdbc(url=url,
                              table="tags_frequency",
                              mode="overwrite",
                              properties=properties)
per_hour_frequency.show()
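
The query above uses a common pattern: compute a windowed max and keep only the rows that reach it. A compact sketch on toy data (assumes an active spark session; the column names are made up):

# Toy illustration of the "rows equal to the group max" pattern used above.
from pyspark.sql import Window
from pyspark.sql import functions as f

toy = spark.createDataFrame(
    [("a", "x", 3), ("a", "y", 5), ("b", "z", 2)],
    ["group_key", "member", "count"])
w = Window.partitionBy("group_key")
top_per_group = (toy
                 .withColumn("max", f.max("count").over(w))
                 .where(f.col("count") == f.col("max"))
                 .drop("max"))
top_per_group.show()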
Example No. 36
def process_log_data(spark, input_data, output_data):
    '''
    Create the users, time and songplays tables.
    '''

    print("# extract log data")

    # get filepath to log data file
    log_data = input_data + "log-data/*/*/*.json"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    print("# process users")

    # extract columns for users table
    users_table = df.select("userId", "firstName", "lastName", "gender",
                            "level").dropDuplicates()

    # write users table to parquet files
    users_table.write.mode("overwrite").parquet(
        os.path.join(output_data, "users"))

    print("# process time")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda ts: datetime.fromtimestamp(int(ts) / 1000.0),
                        TimestampType())
    df = df.withColumn("timestamp", get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda ts: datetime.fromtimestamp(int(ts) / 1000.0),
                       DateType())
    df = df.withColumn("datetime", get_datetime(df.ts))

    # extract columns to create time table
    time_table = df.select("ts", "timestamp", "datetime",
                           hour(df.timestamp).alias("hour"),
                           dayofmonth(df.timestamp).alias("day"),
                           weekofyear(df.datetime).alias("week"),
                           month(df.datetime).alias("month"),
                           year(df.datetime).alias("year"),
                           date_format(df.timestamp,
                                       "E").alias("weekday")).dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").mode("overwrite").parquet(
        os.path.join(output_data, "time"))

    print("# extract song data")

    # read in song data to use for songplays table
    song_df = spark.read.json(input_data +
                              "song-data/A/A/A/TRAAAAK128F9318786.json")
    song_df.createOrReplaceTempView("song_data")

    print("# process songsplays")

    # extract columns from joined song and log datasets to create songplays
    # table
    df.createOrReplaceTempView("log_data")
    songplays_table = spark.sql("""
    SELECT DISTINCT
        row_number() OVER (PARTITION BY sd.song_id ORDER BY ld.userId DESC) as songplay_id,
        ts as start_time, month(timestamp) as month, year(timestamp) as year,
        ld.userId as user_id, ld.level, sd.song_id, sd.artist_id,
        ld.sessionId as session_id, ld.location, ld.userAgent as user_agent
    FROM log_data ld
    JOIN song_data sd ON
        ld.artist = sd.artist_name
        and ld.song = sd.title
        and ld.length = sd.duration
    """)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year",
                                      "month").mode("overwrite").parquet(
                                          os.path.join(output_data,
                                                       "songplays"))

    print("ETL done.")
Example No. 37
# MAGIC Using the DataFrame `not_found_df` you cached in part (5a), and sorting by hour of the day in increasing order, create a DataFrame containing the number of requests that had a 404 return code for each hour of the day (midnight starts at 0). Cache the resulting DataFrame `hour_records_sorted_df` and print it as a list.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import hour
hour_records_sorted_df = not_found_df.<FILL IN>

print('Top hours for 404 requests:\n')
hour_records_sorted_df.show(24)

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import hour
hour_records_sorted_df = not_found_df.groupBy(hour("time").alias("hr")).count().orderBy("hr").cache()

print('Top hours for 404 requests:\n')
hour_records_sorted_df.show(24)

# COMMAND ----------

# TEST Hourly 404 response codes (5h)

errs_by_hour = [(row[0], row[1]) for row in hour_records_sorted_df.collect()]

expected = [
  (0, 175),
  (1, 171),
  (2, 422),
  (3, 272),
Example No. 38
def spark_process(sqlContext, sc, validate, path_to_file):

	######################
	#
	# HDFS to DataFrame 
	#
	######################

	
	## all fields:
	#  ['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count', 'trip_distance', 
	#   'pickup_longitude', 'pickup_latitude', 'rate_code', 'store_and_fwd_flag', 'dropoff_longitude', 
	#   'dropoff_latitude', 'payment_type', 'fare_amount', 'surcharge', 'mta_tax', 'tip_amount', 
	#   'tolls_amount', 'total_amount']

	# columns to select
	feature_columns = [1,2,3,5,6,9,10]

	# read file and convert to DataFrame
	# dataframe = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load(path_to_file).cache()
	customSchema = StructType([
    							StructField("vendor_id", StringType(), True),
							    StructField("pickup_datetime", TimestampType(), True),
							    StructField("dropoff_datetime", TimestampType(), True),
							    StructField("passenger_count", StringType(), True),
							    StructField("trip_distance", StringType(), True),
							    StructField("pickup_longitude", DoubleType(), True),
							    StructField("pickup_latitude", DoubleType(), True),
							    StructField("rate_code", StringType(), True),
							    StructField("store_and_fwd_flag", StringType(), True),
							    StructField("dropoff_longitude", DoubleType(), True),
							    StructField("dropoff_latitude", DoubleType(), True),
							    StructField("payment_type", StringType(), True),
							    StructField("fare_amount", StringType(), True),
							    StructField("surcharge", StringType(), True),
							    StructField("mta_tax", StringType(), True),
							    StructField("tip_amount", StringType(), True),
							    StructField("tolls_amount", StringType(), True),
							    StructField("total_amount", StringType(), True)
							    ])

	dataframe = sqlContext.read.format('com.databricks.spark.csv').options(header='true').schema(customSchema).load(path_to_file)
	# create dataframe with selected columns
	dataframe = dataframe.select(*(dataframe.columns[n] for n in feature_columns))
	
	# this number does not include the header
	# number_of_trips = dataframe.count()

	sqlContext.clearCache()
	######################
	#
	# Preprocess data 
	#
	######################

	# filter rows with null fields
	# if passenger count is missing assign it a value of 1
	# filter invalid location: keep only areas near NYC
	dataframe = dataframe.na.drop(how='any',subset=['pickup_datetime','dropoff_datetime','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude']) \
						.fillna(1,subset=["passenger_count"])     \
						.filter(dataframe.pickup_latitude>40.0)   \
						.filter(dataframe.pickup_latitude<41.0)   \
						.filter(dataframe.pickup_longitude<-73.0) \
						.filter(dataframe.pickup_longitude>-74.0) \
						.filter(dataframe.dropoff_latitude>40.0)  \
						.filter(dataframe.dropoff_latitude<41.0)  \
						.filter(dataframe.dropoff_longitude<-73.0)\
						.filter(dataframe.dropoff_longitude>-74.0)


	######################
	#
	# features engineering
	#
	######################

	# create new column based on time-delta (minutes)
	# convert pickup-datetime column to hour
		
	time_delta_udf = udf(time_delta_minutes,FloatType())

	dataframe = dataframe.withColumn('time_delta', time_delta_udf(dataframe.pickup_datetime,dataframe.dropoff_datetime)) \
						 .withColumn('pick_up_hour', hour(dataframe.pickup_datetime))

	dataframe = dataframe.select(dataframe.pick_up_hour, \
								dataframe.passenger_count.cast("integer"), \
								dataframe.pickup_longitude.cast("double"), \
								dataframe.pickup_latitude.cast("double"), \
								dataframe.dropoff_longitude.cast("double"), \
								dataframe.dropoff_latitude.cast("double"), \
								dataframe.time_delta.cast("double"))

	dataframe = dataframe.filter(dataframe.time_delta > 1.0).cache()

	# split dataframe into feature and label vector
	# create feature vectors and labels for model training
	feature_assembler = VectorAssembler(inputCols = ['pick_up_hour','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'],outputCol = 'features')

	transformed = feature_assembler.transform(dataframe)
	vector_dataframe = transformed.select(col("time_delta").alias("label"),col("features")).cache()

	######################
	#
	# train model
	#
	######################

	if validate:

		################################
		#
		# validate model on 60/40 split
		#
		################################

		# split 
		training, test = vector_dataframe.randomSplit([0.6, 0.4], seed=0)

		decision_tree_reg = DecisionTreeRegressor(maxDepth=12,maxBins=25)
		model = decision_tree_reg.fit(training)

		train_pred = model.transform(training)
		test_pred = model.transform(test)

		evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
		r2_train = evaluator.evaluate(train_pred)

		evaluator_test = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
		r2_test = evaluator_test.evaluate(test_pred)

		output = test_pred.select("prediction", "label", "features")

		return output, r2_test, r2_train
	
	else:

		###################
		#
		# train on all data
		#
		###################

		decision_tree_reg = DecisionTreeRegressor(maxDepth=12,maxBins=25)
		model = decision_tree_reg.fit(vector_dataframe)

		predictions = model.transform(vector_dataframe)

		output = predictions.select("prediction", "label", "features")

		###########################
		#
		# process to send to Kafka
		#
		###########################

		schema = StructType([StructField("prediction_mins", FloatType(), True),
							StructField("pick_up_hour", IntegerType(), True),
							StructField("pickup_longitude", DoubleType(), True),
							StructField("pickup_latitude", DoubleType(), True),
							StructField("dropoff_longitude", DoubleType(), True),
							StructField("dropoff_latitude", DoubleType(), True)])

		features_from_predictions = output.rdd.map(lambda row: (float(row.prediction), int(row.features[0]), float(row.features[1]), float(row.features[2]), float(row.features[3]), float(row.features[4]))).collect()
		sqlContext.clearCache()
		dataframe_from_prediction_vector = sqlContext.createDataFrame(features_from_predictions,schema).cache()

		return dataframe_from_prediction_vector
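
A possible follow-on step, purely illustrative since the original function stops at returning the DataFrame: each prediction row can be serialized to a JSON string before publishing. The producer object and topic name below are assumptions (e.g. a kafka-python KafkaProducer configured elsewhere).

# Illustrative only: serialize predictions to JSON strings for a Kafka producer.
# `producer` and the topic name are assumed to be set up elsewhere.
for json_row in dataframe_from_prediction_vector.toJSON().collect():
    producer.send("trip_time_predictions", json_row.encode("utf-8"))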