Example #1
def schema_song_data():
    """
    Description:
        Schema design for song datasets.
    """
    try:
        print("schema_song_data fuction is statrting.")
        print("**************************************")

        schema = R([
            Fld("artist_id", Str()),
            Fld("artist_latitude", Dbl()),
            Fld("artist_location", Str()),
            Fld("artist_longitude", Dbl()),
            Fld("artist_name", Str()),
            Fld("duration", Dbl()),
            Fld("num_songs", Int()),
            Fld("song_id", Str()),
            Fld("title", Str()),
            Fld("year", Int()),
        ])

        print("schema_song_data is successfull created")
        print("***************************************")
        return schema

    except Exception as e:
        print("schema_song_data creation failed.")
        print(e)
        print("********************************")
Example #2
def create_log_data():
    """
    Create schema for log data.
    
    return: schema
    """
    log_schema = StructType([
        StructField("artist", Str()), 
        StructField('auth', Str()),
        StructField('firstName', Str()),
        StructField('gender', Str()),
        StructField('itemInSession', Int()),
        StructField('lastName', Str()),
        StructField('length', Dbl()),
        StructField('level', Str()),
        StructField('location', Str()),
        StructField('method', Str()),
        StructField('page', Str()),
        StructField('registration', Dec()),
        StructField('sessionId', Int()),
        StructField('song', Str()),
        StructField('status', Int()),
        StructField('ts', Long()),
        StructField('userAgent', Str()),
        StructField('userId', Int())
    ])
    return log_schema
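A brief usage sketch (the SparkSession spark and the S3 path are assumptions, not part of the example) showing how the returned schema is typically applied:

# Hypothetical usage: apply the log schema when reading the raw event files.
log_schema = create_log_data()
log_df = spark.read.json("s3a://example-bucket/log_data/*/*/*.json", schema=log_schema)
log_df.printSchema()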
Example #3
def process_song_data(spark, input_data, output_data):
    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'
    
    songSchema = R([
        Fld("artist_id",Str()),
        Fld("artist_latitude",Dbl()),
        Fld("artist_location",Str()),
        Fld("artist_longitude",Dbl()),
        Fld("artist_name",Str()),
        Fld("duration",Dbl()),
        Fld("num_songs",Int()),
        Fld("title",Str()),
        Fld("year",Int()),
    ])
    
    # read song data file
    df = spark.read.json(song_data, schema=songSchema)

    song_fields = ["title", "artist_id","year", "duration"]
    
    # extract columns to create songs table
    songs_table = df.select(song_fields).dropDuplicates().withColumn("song_id", monotonically_increasing_id())
    
    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").parquet(output_data + 'songs/')
    
    artists_fields = ["artist_id", "artist_name as name", "artist_location as location", "artist_latitude as latitude", "artist_longitude as longitude"]

    # extract columns to create artists table
    artists_table = df.selectExpr(artists_fields).dropDuplicates()
    
    # write artists table to parquet files
    artists_table.write.parquet(output_data + 'artists/')
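These process_song_data variants are normally invoked from a small driver script; a hedged sketch of such a main() (the bucket paths and app name are placeholders, not taken from the examples):

# Hypothetical driver wiring a SparkSession to process_song_data.
from pyspark.sql import SparkSession

def main():
    spark = SparkSession.builder.appName("sparkify-etl").getOrCreate()
    input_data = "s3a://example-input-bucket/"    # placeholder input location
    output_data = "s3a://example-output-bucket/"  # placeholder output location
    process_song_data(spark, input_data, output_data)
    spark.stop()

if __name__ == "__main__":
    main()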
Example #4
def get_log_src_schema():
    """
    Get the source spark schema definition
    :return: The schema definition
    """

    return R([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Int()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Str()),
        Fld("sessionId", Int()),
        Fld("song", Str()),
        Fld("status", Int()),
        Fld("ts", Str()),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])
Example #5
def i94_immigrations(df, f, cols):
    """
    Build the i94_immigrations DataFrame which is one of the dimensions
    :param df: A raw PySpark DataFrame
    :param f: A UDF (applied to the date string columns)
    :param cols: A list of columns to select
    Returns - A cleaned dimensional DataFrame
    """
    try:
        immigrations = (
            df.select(cols)
            .dropDuplicates()
            .withColumn('custom_client_id',
                        df['cicid'].cast(Int()).cast(Str())).drop('cicid')
            .withColumn('i94_year', df['i94yr'].cast(Int())).drop('i94yr')
            .withColumn('i94_month', df['i94mon'].cast(Int())).drop('i94mon')
            .withColumnRenamed('i94port', 'i94_port')
            .withColumn('mode_of_entry', df['i94mode'].cast(Int())).drop('i94mode')
            .withColumnRenamed('visapost', 'visa_post')
            .withColumnRenamed('entdepa', 'arrival_flag')
            .withColumnRenamed('entdepd', 'departure_flag')
            .withColumnRenamed('entdepu', 'update_flag')
            .withColumnRenamed('matflag', 'match_flag')
            # .withColumn('i94_entry_date', F.to_date('dtadfile', 'yyyyMMdd').cast(Date()))
            .withColumn('i94_entry_date', f('dtadfile')).drop(F.col('dtadfile'))
            .withColumn('i94_valid_till', f('dtaddto')).drop(F.col('dtaddto')))
    except Exception as e:
        logger.error('Failed to create i94_immigrations DataFrame...')
        logger.error(e)
        raise
    return immigrations
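The UDF f passed into i94_immigrations is not shown; a minimal sketch of what it might look like, assuming it parses yyyyMMdd date strings as suggested by the commented-out to_date call above:

# Hypothetical date-parsing UDF that could be passed in as `f`.
from datetime import datetime
from pyspark.sql import functions as F
from pyspark.sql.types import DateType

@F.udf(DateType())
def parse_i94_date(value):
    # Parse a 'yyyyMMdd' string such as '20160401' into a date; return None if malformed.
    try:
        return datetime.strptime(value, "%Y%m%d").date()
    except (TypeError, ValueError):
        return None

# immigrations = i94_immigrations(raw_df, parse_i94_date, selected_cols)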
Example #6
def process_song_data(spark, input_data, output_data):
    '''
        Description: This function can be used to load the song data from the input S3 bucket
                     and write the parquet files to the output S3 bucket.
        Arguments:
            spark: SparkSession
            input_data: location for the input data
            output_data: location for the output data
        Returns:
            None
    '''
    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data/*/*/*/*.json")
    print(song_data)

    # read song data file
    songsSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int())
    ])

    df = spark.read.json(song_data, schema=songsSchema).distinct()
    print(df.count())
    df.show(5, truncate=False)

    df.printSchema()

    # extract columns to create songs table

    songs_table = df.select("song_id", "title", "artist_id", "year",
                            "duration").distinct()
    songs_table.printSchema()
    songs_table.show(5)
    print('songs', songs_table.count())

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode('overwrite').partitionBy(
        "year", "artist_id").parquet(output_data + "songs")

    # extract columns to create artists table
    df.createOrReplaceTempView("df")
    artists_table = spark.sql(
        "select artist_id, artist_name as name, artist_location as location, artist_latitude as latitude, artist_longitude as longitude from df"
    ).distinct()
    artists_table.printSchema()
    artists_table.show(5)
    print('artists', artists_table.count())

    # write artists table to parquet files
    artists_table.repartitionByRange(
        3,
        "artist_id").write.mode('overwrite').parquet(output_data + "artists")
Example #7
def process_song_data(spark, input_data, output_data):
    """ Process song_data json files which located in S3
        Create table songs_table and artists_table
        Store the table in parque format in S3
        Return the table to be used in process_log_data function
    
    Args:
      spark                 : Spark Session
      input_data  (string)  : location json files (input)
      output_data (string)  : location parque files (output)
      
    Returns:
      songs_data    (Spark Dataframe) : Song Data tables
    
    """

    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'

    # set schema song data
    songSchema = StructType([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songSchema)

    # extract columns to create songs table
    songs_table = df.select("song_id", "title", "artist_id", "year", "duration") \
                    .where("song_id is not null") \
                    .dropDuplicates(['song_id'])

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").parquet(
        os.path.join(output_data, 'songs'), 'overwrite')

    # extract columns to create artists table
    artists_table = df.select(col("artist_id"),
                              col("artist_name").alias("name"),
                              col("artist_location").alias("location"),
                              col("artist_latitude").alias("latitude"),
                              col("artist_longitude").alias("longitude")) \
                      .where("artist_id is not null") \
                      .dropDuplicates(['artist_id'])

    # write artists table to parquet files
    artists_table.write.parquet(os.path.join(output_data, 'artists'),
                                'overwrite')

    # return song_data table to be used in process_log_data
    return df
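Example #7 returns the song DataFrame so it can be reused later; a hedged sketch (variable and column names assumed from the song and log schemas above) of the join that process_log_data would typically perform:

# Hypothetical songplays-style join between log events and the returned song DataFrame.
songplays = log_df.join(
    song_df,
    (log_df.song == song_df.title) & (log_df.artist == song_df.artist_name),
    "left",
).select(
    log_df.ts, log_df.userId, log_df.level,
    song_df.song_id, song_df.artist_id,
    log_df.sessionId, log_df.location, log_df.userAgent,
)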
Example #8
def process_song_data(spark, input_data, output_data):
    """This function loads song_data from S3 and processes it by extracting the songs and artist tables
        and then again loaded back to S3
    Args:
        spark(:obj:`pyspark.sql.session.SparkSession`): SparkSession
        input_data (str): S3 bucket where song files are stored
        output_data (str): S3 bucket file path to store resulting files

    Returns:
        None
    """
    print("**** Starting to process song data *****")
    # get filepath to song data file
    song_data = input_data+'song_data/*/*/*/*.json'
    
    # read song data file
    
    songSchema = R([
        Fld("artist_id",Str()),
        Fld("artist_latitude",Dbl()),
        Fld("artist_location",Str()),
        Fld("artist_longitude",Dbl()),
        Fld("artist_name",Str()),
        Fld("song_id",Str()),
        Fld("duration",Dbl()),
        Fld("num_songs",Int()),
        Fld("title",Str()),
        Fld("year",Int()),
    ])
    
    
    try:
        df = spark.read.json(song_data, schema=songSchema)
    except Exception as e:
        print(e)
        
    # extract columns to create songs table
    songs_fields = ["song_id", "title", "artist_id", "year", "duration"]
    songs_table = df.select(songs_fields).dropDuplicates(["song_id"])
    
    # write songs table to parquet files partitioned by year and artist
    try:
        songs_table.write.parquet(output_data + "songs.parquet", partitionBy=("year", "artist_id"), mode="overwrite")
    except Exception as e:
        print(e)
    
    print("**** songs table data load is complete *****")
    
    # extract columns to create artists table
    artists_fields = ["artist_id", "artist_name as name", "artist_location as location", "artist_latitude as lattitude", "artist_longitude as longitude"]
    artists_table = df.selectExpr(artists_fields).dropDuplicates(["artist_id"])
    
    # write artists table to parquet files
    try:
        artists_table.write.parquet(output_data + "artists.parquet",  mode="overwrite")
    except Exception as e:
        print(e)
    print("**** artists table data load is complete *****")
    
    print("**** song data processing is finished *****")
Example #9
def process_song_data(spark, input_data, output_data):
    """
    This function takes the song data from Udacity's S3 input file and processes it. This is done by 
    extracting the artist and songs tables and then loading it back to the S3 bucket I've created in AWS.

    Parameters:
            spark       : Spark Session
            input_data  : The S3 bucket location of song_data, think 'input'
            output_data : The S3 bucket location for the processed output tables, think 'output'
            
    """ 
    #Using print statement to understand where in spark statement we are
    print("\n Taking in song data as variable from S3's input location....")
    # get full filepath to song data file
    #song_data = input_data + 'song_data/*/*/*/*.json'
    # utilizing smaller data set to speed up execution in WorkSpace (please use commented out song_data variable above to run full etl)
    song_data = input_data + 'song_data/A/A/B/*.json'
    
    
    #Using print statement to understand where in spark statement we are
    print("\n Defining song Schema....")
    
    songSchema = Struct([SFld("artist_id",Str()), SFld("artist_latitude", Flt()),
                        SFld("artist_location",Str()), SFld("artist_longitude",Flt()),
                        SFld("artist_name",Str()), SFld("duration",Flt()),
                        SFld("num_songs",Int()), SFld("title",Str()),
                        SFld("year",Int())])
    
    
    #Using print statement to understand where in spark statement we are
    print("\n Reading song data JSON files from S3's input location....")
    # read song data file
    df = spark.read.json(song_data, schema= songSchema, mode= 'PERMISSIVE', columnNameOfCorruptRecord= 'corruptRecord').drop_duplicates()
    

    #Using print statement to understand where in spark statement we are
    print("\n Creating select statement for song data creation....")
    # extract columns to create songs table
    songs_table = df.select('title', 'artist_id', 'year', 'duration').drop_duplicates().withColumn("song_id", F.monotonically_increasing_id())
    
    
    #Using print statement to understand where in spark statement we are
    print("\n Writing parquet file for song table and partitioned by year and artist....")    
    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode('overwrite').partitionBy('year', 'artist_id').parquet(output_data + 'songs_table/')
    
    
    #Using print statement to understand where in spark statement we are
    print("\n Reading select statement for artist data creation....") 
    # extract columns to create artists table
    artists_table = df.select('artist_id', 'artist_name', 'artist_location', 'artist_latitude', 'artist_longitude').where(df.artist_id.isNotNull()).drop_duplicates()
    

    #Using print statement to understand where in spark statement we are
    print("\n Writing parquet file for artist table....")
    # write artists table to parquet files
    artists_table.write.mode('overwrite').parquet(output_data + 'artists_table/')
Example #10
def process_song_data(spark, input_data, output_data):
    """
    This function processes the song data of sparkify and creates
    facts/dimensions via spark and saves them to our data lake afterwards
    Arguments:
        spark {SparkSession}: Spark session to launch the program
        input_data {str}: location (local/s3) where the (root) input song data resides
        output_data {str}: location (local/s3) where the (root) output files should be written
    """
    # get filepath to song data file
    # song_data = f"{input_data}song_data/A/A/A/*.json"
    song_data = f"{input_data}song_data/*/*/*/*.json"

    # read song data file
    songSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Str()),
        Fld("artist_longitude", Str()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int()),
    ])

    # specify the schema up front instead of relying on automatic inference
    df_song = spark.read.json(song_data, schema=songSchema)
    df_song.cache()

    # extract columns to create songs table
    songs_table = df_song.filter(df_song.song_id != '') \
        .select(['song_id',
                 'title',
                 'artist_id',
                 'year',
                 'duration']) \
        .dropDuplicates(['song_id'])

    # write songs table to parquet files partitioned by year and artist
    output_song_data = f"{output_data}song_data/"
    songs_table.write.mode('overwrite').partitionBy(
        "year", "artist_id").parquet(output_song_data)

    # extract columns to create artists table
    artists_table = df_song.filter(df_song.artist_id != '') \
        .selectExpr(['artist_id',
                     'artist_name as name',
                     'artist_location as location',
                     'artist_latitude as latitude',
                     'artist_longitude as longitude']) \
        .dropDuplicates(['artist_id'])

    # write artists table to parquet files
    output_artist_data = f"{output_data}artist_data/"
    artists_table.write.mode('overwrite').parquet(output_artist_data)
Example #11
def process_song_data(spark, input_data, output_data):
    """
    This function loads the songs JSON dataset from S3, 
    then uses the data to create the songs and artists tables
    
    Input:
    spark = SparkSession object
    input_data = Start of path variable for input files
    output_data = Start of path variable for output files
    
    Output: None
    """

    # get filepath to song data file
    song_data = os.path.join(input_data, 'song_data/*/*/*/*.json')

    # Define schema
    SongSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])

    # read song data file
    df = spark.read.json(song_data, schema=SongSchema)

    # extract columns to create songs table
    songs_table = df.select("song_id", "title", "artist_id", "year",
                            "duration").dropduplicates()

    # write songs table to parquet files partitioned by year and artist
    output_path = os.path.join(output_data, 'songs_table.parquet')
    songs_table.write.partitionBy("year",
                                  "artist_id").parquet(output_path,
                                                       mode="overwrite")

    # extract columns to create artists table
    artists_table = df.select("artist_id", "artist_name", "artist_location",
                              "artist_latitude",
                              "artist_longitude").dropduplicates()

    # write artists table to parquet files
    output_path = os.path.join(output_data, 'artists_table.parquet')
    artists_table.write.parquet(output_path, mode="overwrite")

    #export whole songs data file to parquet
    output_path = os.path.join(output_data, 'songs_data_table.parquet')
    df.write.parquet(output_path, mode="overwrite")
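A small verification sketch (reusing the output paths above; spark, os, and output_data are assumed to be in scope) for reading one of the exported parquet datasets back:

# Hypothetical read-back of the songs table for a quick sanity check.
songs_check = spark.read.parquet(os.path.join(output_data, 'songs_table.parquet'))
songs_check.printSchema()
songs_check.show(5)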
Example #12
def process_song_data(spark, input_data, output_data):
    '''
    load song data in JSON format from the S3 bucket, process it by extracting the
    songs and artists tables, and save these tables back to the S3 bucket
    
    :param spark: spark session
    :param input_data: data location for input data
    :param output_data: data location for output data
    :return: no return value
    '''
    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'

    # create songs schema
    songSchema = R([
        Fld('artist_id', Str()),
        Fld('artist_latitude', Dbl()),
        Fld('artist_location', Str()),
        Fld('artist_longitude', Dbl()),
        Fld('artist_name', Str()),
        Fld('duration', Dbl()),
        Fld('num_songs', Int()),
        Fld('title', Str()),
        Fld('year', Int()),
    ])

    # load songs json files from S3
    df_songs = spark.read.json(song_data, schema=songSchema)

    # select columns for songs_table
    songs_attr = ['title', 'artist_id', 'year', 'duration']
    songs_table = df_songs.select(songs_attr)\
    .dropDuplicates()\
    .withColumn('song_id', monotonically_increasing_id())

    # write songs_table to S3
    songs_table.write.partitionBy('year',
                                  'artist_id').parquet(output_data + 'songs/')

    # select artists columns
    artists_attr = [
        'artist_id', 'artist_name', 'artist_location', 'artist_latitude',
        'artist_longitude'
    ]
    artists_table = df_songs.select(artists_attr)\
    .dropDuplicates()

    artists_table = artists_table\
    .withColumnRenamed('artist_name','name')\
    .withColumnRenamed('artist_location','location')\
    .withColumnRenamed('artist_latitude','latitude')\
    .withColumnRenamed('artist_longitude','longitude')

    # write artists_table to S3
    artists_table.write.parquet(output_data + 'artists/')
Example #13
def process_song_data(spark, input_data, output_data):
    """Process song data, transform the data into songs and artists tables
    and store it in parquet files on S3.

    Parameters
    ----------
    spark : SparkSession
        active Spark session used to read and write the sparkify data
    input_data : string
        input data prepend path
    output_data : string
        output data prepend path
    """
    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data/*/*/*/*.json")

    song_schema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str(), False),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str(), False),
        Fld("song_id", Str(), False),
        Fld("title", Str(), False),
        Fld("duration", Dbl(), False),
        Fld("year", Int())
    ])

    # read song data file
    df = spark.read.json(song_data, song_schema)

    # extract columns to create songs table
    songs_table = df.select(
        ["song_id", "title", "artist_id", "year", "duration"])

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode("overwrite").partitionBy('year', 'artist_id') \
        .parquet(os.path.join(output_data, 'analytics/songs'))

    # extract columns to create artists table
    artists_table = df.select([
        "artist_id", "artist_name", "artist_location", "artist_latitude",
        "artist_longitude"
    ])
    artists_table = artists_table.withColumnRenamed("artist_name", "name") \
        .withColumnRenamed("artist_location", "location") \
        .withColumnRenamed("artist_latitude", "latitude") \
        .withColumnRenamed("artist_longitude", "longitude")

    # write artists table to parquet files
    artists_table.write.mode("overwrite") \
        .parquet(os.path.join(output_data, 'analytics/artists'))
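Unlike most schemas in this collection, this one passes False as the third StructField argument to mark required columns; a minimal sketch (reusing song_schema from the function above) of inspecting that flag:

# List which fields of the schema are declared non-nullable (required).
for field in song_schema.fields:
    flag = "required" if not field.nullable else "nullable"
    print(field.name, field.dataType.simpleString(), flag)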
Example #14
def process_song_data(spark, input_data, output_data):
    """
    Processes song data and stores them as parquet files

    Loads song data into a spark DataFrame and transforms them into songs
    and artists DataFrames which are subsequently written as parquet files
    to songs and artists folders in the specified output path.

    Parameters:
    spark : SparkSession instance
    input_data (str) : Path of the directory of song_data
    output_data (str) : Path of the directory where the parquet files will be stored

    """

    # specify schema for song data
    songs_schema = Struct([
        Fld('num_songs', Int()),
        Fld('artist_id', Str()),
        Fld('artist_latitude', Double()),
        Fld('artist_longitude', Double()),
        Fld('artist_location', Str()),
        Fld('artist_name', Str()),
        Fld('song_id', Str()),
        Fld('title', Str()),
        Fld('duration', Double()),
        Fld('year', Int())
    ])

    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'

    # read song data file
    df = spark.read.json(song_data, songs_schema)

    # extract columns to create songs table
    songs_table = df[['song_id', 'title', 'artist_id', 'year', 'duration']] \
        .dropDuplicates(['song_id'])

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.parquet(os.path.join(output_data, 'songs'),
                              'overwrite',
                              partitionBy=['year', 'artist_id'])

    # extract columns to create artists table
    artists_table = df[[
        'artist_id', 'artist_name', 'artist_location', 'artist_latitude',
        'artist_longitude'
    ]].dropDuplicates(['artist_id'])

    # write artists table to parquet files
    artists_table.write.parquet(os.path.join(output_data, 'artists'),
                                'overwrite')
Example #15
def process_song_data(spark, input_data, output_data):

    print('%%%%% Starting up the SONG data process')

    # get filepath to song data file
    song_data = 'song_data/A/*/*/*.json'

    # setting up the schema for the data that we're about to pull
    songSchema = ST([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])

    # read song data file - the commented-out line below reads the same path without the explicit schema
    #df = spark.read.json(input_data + song_data)
    raw_song_df = spark.read.json(input_data + song_data, songSchema)

    # extract columns to create songs table
    songs_table = raw_song_df.select(raw_song_df.song_id, \
                                 raw_song_df.title, \
                                 raw_song_df.artist_id, \
                                 raw_song_df.year.cast(Int()), \
                                 raw_song_df.duration.cast(Dbl()))

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode('overwrite').partitionBy(
        'year', 'artist_id').parquet(output_data + 'songs')

    print('%%%%% Songs table has been created and written to the S3 Bucket')

    # extract columns to create artists table
    artists_table = raw_song_df.select(raw_song_df.artist_id , \
                                  raw_song_df.artist_latitude.alias('latitude'), \
                                  raw_song_df.artist_location.alias('location'), \
                                  raw_song_df.artist_longitude.alias('longitude'), \
                                  raw_song_df.artist_name.alias('name')).dropDuplicates(['artist_id','name'])

    # write artists table to parquet files
    artists_table.write.mode('overwrite').parquet(output_data + 'artist')

    print('%%%%% Artists table has been created and written to the S3 Bucket')
    print('%%%%% SONG data has been completed and returning the raw_song_df')
    return raw_song_df
Example #16
def process_song_data(spark, input_data, output_data):
    """
    read song data from s3 and then create the songs_table and artists_table. load them back to s3.
    
    parameters:
    spark: spark session
    input_data: path of song data
    output_data: path of output table
    
    """
    # get filepath to song data file
    # song_data = input_data + "song_data/*/*/*/*.json"
    song_data = input_data + "song_data/A/B/C/TRABCEI128F424C983.json"

    # create song table schema
    songSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songSchema)

    # extract columns to create songs table; keep rows where year and artist_id are present and year is not 0
    song_field = ["title", "duration", "year", "artist_id"]
    songs_table = df.select(song_field).dropDuplicates().withColumn("song_id",F.monotonically_increasing_id())\
    .filter(~col("year").isin([0]) & col("year").isNotNull() & col("artist_id").isNotNull())

    # extract columns to create artists table; drop rows where artist_id or artist_name is null
    artist_field = [
        "artist_id", "artist_name", "artist_location", "artist_latitude",
        "artist_longitude"
    ]
    artists_table = df.select(artist_field).dropDuplicates().dropna(
        subset=["artist_id", "artist_name"])

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year",
                                  "artist_id").parquet(output_data + "songs/",
                                                       mode="overwrite")

    # write artists table to parquet files
    artists_table.write.parquet(output_data + "artists/", mode="overwrite")
Example #17
def process_song_data(spark, input_data, output_data):
    """
    Loads the song_data from AWS S3 (input_data) and extracts the songs and artist tables
    and then loads the processed data back to S3 (output_data)
    
    :param spark: Spark Session object
    :param input_data: Location (AWS S3 path) of songs metadata (song_data) JSON files
    :param output_data: Location (AWS S3 path) where dimensional tables will be stored in parquet format 
    """

    # Get filepath to song data file
    song_data = input_data + "song_data/*/*/*/*.json"

    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # Read song data file
    print("Reading song_data JSON files from S3")
    df = spark.read.json(song_data, mode='PERMISSIVE', schema=songSchema, \
                         columnNameOfCorruptRecord='corrupt_record').dropDuplicates()
    print("Read completed")

    # Extract columns to create songs table
    songs_table = df.select("title", "artist_id", "year", "duration").dropDuplicates() \
                    .withColumn("song_id", monotonically_increasing_id())

    print("Writing Songs table to S3 after processing")
    # Write songs table to parquet files partitioned by year and artist
    songs_table.write.parquet(output_data + "songs/",
                              mode="overwrite",
                              partitionBy=["year", "artist_id"])
    print("Completed")

    # Extract columns to create artists table
    artists_table = df.select("artist_id", "artist_name", "artist_location", "artist_latitude", "artist_longitude") \
                        .dropDuplicates()

    print("Writing Artists table to S3 after processing")
    # Write artists table to parquet files
    artists_table.write.parquet(output_data + "artists/", mode="overwrite")
    print("Completed")
Example #18
def process_song_data(spark, input_data, output_data):
    """
    Method to process song data and create tables: songs, artists
    :param spark: Spark session
    :param input_data: S3 bucket
    :param output_data: S3 bucket
    :return: Data frame of song data
    """
    # get filepath to song data file
    song_data = input_data + '/song-data/A/A/B/*.json'

    songs_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int())
    ])

    # read song data file
    print('Reading song data.')
    df = spark.read.json(song_data, schema=songs_schema)

    song_columns = ['song_id', 'title', 'artist_id', 'year', 'duration']

    # extract columns to create songs table
    songs_table = df.selectExpr(song_columns).dropDuplicates()

    # write songs table to parquet files partitioned by year and artist
    print('Writing songs to parquet.')
    write_parquet(songs_table, output_data, 'songs', 'year', 'artist_id')

    artist_columns = [
        'artist_id', 'artist_name as name', 'artist_location as location',
        'artist_latitude as latitude', 'artist_longitude as longitude'
    ]

    # extract columns to create artists table
    artists_table = df.selectExpr(artist_columns).dropDuplicates()

    # write artists table to parquet files
    print('Writing artists to parquet.')
    write_parquet(artists_table, output_data, 'artists', None, None)

    return df
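The write_parquet helper called above is not shown in the snippet; a hypothetical implementation consistent with both call sites (partition columns optional):

# Hypothetical write_parquet helper matching the calls above.
def write_parquet(df, output_data, table_name, partition_col1=None, partition_col2=None):
    writer = df.write.mode('overwrite')
    if partition_col1 and partition_col2:
        writer = writer.partitionBy(partition_col1, partition_col2)
    writer.parquet(output_data + table_name + '/')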
Example #19
def process_song_data(spark, input_data, output_data):
    """
        Description: This function fetches song_data from S3 into a staging dataframe, 
        then extracts the songs and artist tables,
        and eventually exports data back to S3
        
        Parameters:
            spark       : object for Spark Session
            input_data  : location of song_data 
            output_data : location of target S3 bucket
            
    """

    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'
    
    # define schema
    songdata_schema = R([
    Fld("artist_id",Str()),
    Fld("artist_latitude",Dbl()),
    Fld("artist_location",Str()),
    Fld("artist_longitude",Dbl()),
    Fld("artist_name",Str()),
    Fld("duration",Dbl()),
    Fld("num_songs",Int()),
    Fld("title",Str()),
    Fld("year",Int()),
    ])
    
    # read song data file
    df = spark.read.json(song_data, schema=songdata_schema)

    # extract columns to create songs table
    songs_table = df.select(['title', 'artist_id', 'year', 'duration'])

    songs_table = songs_table.dropDuplicates().withColumn('song_id', monotonically_increasing_id()).\
    select(['song_id', 'title', 'artist_id', 'year', 'duration'])
    
    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy('year', 'artist_id').parquet(output_data + 'songs/')

    # extract columns to create artists table
    selection = ['artist_id', 'artist_name as name', \
                 'artist_location as location', 'artist_latitude as latitude', \
                 'artist_longitude as longitude']
    artists_table = df.selectExpr(selection).dropDuplicates()
    
    # write artists table to parquet files
    artists_table.write.parquet(output_data + 'artists/')
Example #20
def process_song_data(spark, input_data, output_data):
    # get filepath to song data file
    song_data = input_data + "./data/song_data/*/*/*/*.json"
    """Creating the song_data file schema that we are going to add to spark"""
    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # reading song data file json structure
    df = spark.read.json(song_data, schema=songSchema)
    """Filtering out only the needed columns for the songs table"""
    song_fields = ["title", "artist_id", "year", "duration"]

    print('Creating the songs table and dropping duplicates')
    songs_table = df.select(song_fields).dropDuplicates().withColumn(
        "song_id", monotonically_increasing_id())
    print(
        "--- All duplicate songs have been dropped and the songs table created ---"
    )
    print('Printing some rows from the songs_table')
    songs_table.show(15)
    print('Saving the songs table to the s3 bucket')
    songs_table.write.partitionBy('year',
                                  'artist_id').parquet(output_data + "songs")
    print("--- songs.parquet completed ---")
    """Filtering out only the needed columns for the artists table"""
    artists_data = [
        'artist_id', 'artist_name', 'artist_location', 'artist_latitude',
        'artist_longitude'
    ]

    print("--- Starting to drop duplicate artists....")
    artists_table = df.selectExpr(artists_data).dropDuplicates()
    print("All duplicate artists have been dropped......")

    print('Printing some rows from the artists_table')
    artists_table.show(15)
    """writing the artists table to the parquets file"""
    artists_table.write.parquet(output_data + "artists")
    print("--- artists.parquet completed ---")
    print("*** process_song_data completed ***\n\n")
Example #21
def process_song_data(spark, input_data, output_data):
    """
    process_song_data - Loads the song data files from S3, and saves the song information to a parquet file
    (partitioned by year and artist_id), and then extracts the distinct artists and saves them to a parquet file.
    """

    # Get filepath to song data file
    song_data = os.path.join(input_data, 'song_data/*/*/*/*.json')
    #    song_data = os.path.join(input_data,'song_data/A/A/A/TRAAAAK128F9318786.json')

    songSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])

    # Read song data file
    df = spark.read.json(song_data, schema=songSchema)

    # Extract columns to create songs table
    songs_table = df.select(
        ['song_id', 'title', 'artist_id', 'year', 'duration'])

    # Write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year",
                                  "artist_id").mode('overwrite').parquet(
                                      os.path.join(output_data,
                                                   'songs.parquet'))

    # Extract columns to create artists table, and find the distinct artists
    artists_table = df.select([
        'artist_id', 'artist_name', 'artist_location', 'artist_latitude',
        'artist_longitude'
    ]).withColumnRenamed('artist_name', 'name').withColumnRenamed(
        'artist_location', 'location').withColumnRenamed(
            'artist_latitude',
            'latitude').withColumnRenamed('artist_longitude',
                                          'longitude').distinct()

    # Write artists table to parquet files
    artists_table.write.mode('overwrite').parquet(
        os.path.join(output_data, 'artists.parquet'))
Example #22
def process_song_data(spark, input_data, output_data):
    """
		Loads song_data from S3, extracting needed columns for 'song_table' and 'artist_table' 
		and writting their parquet format on S3
		
		Parameters:
			spark       : Spark Session
			input_data  : Location of song_data json files with the songs metadata
			output_data : S3 bucket were tables in parquet format store
	"""

    # get filepath to song data file
    song_data = os.path.join(input_data, 'song_data/*/*/*/*.json')

    # Making right type for input json structure
    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songSchema)

    # extract columns to create songs table
    songs_table = df.selectExpr(
        "song_id", "title", "artist_id", "year",
        "duration").orderBy("song_id").drop_duplicates()

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy('year', 'artist_id').parquet(
        os.path.join(output_data, 'songs'))

    # extract columns to create artists table
    artists_table = df.selectExpr("artist_id", "artist_name as name",
                                  "artist_location as location",
                                  "artist_latitude as latitude",
                                  "artist_longitude as longitude").orderBy(
                                      "artist_id").drop_duplicates()

    # write artists table to parquet files
    artists_table.write.parquet(os.path.join(output_data, 'artists'))
Example #23
def read_and_process_airport_data(spark, filename, df_dimension_state_table):
    """ Load the airport codes join with state dimension data to get airports with state_key"""
    logging.info("Reading airport data")
    # load the airport codes so we can map them to states
    airport_schema = R([
        Fld("ident", Str()),
        Fld("type", Str()),
        Fld("name", Str()),
        Fld("elevation_ft", Int()),
        Fld("continent", Str()),
        Fld("iso_country", Str()),
        Fld("iso_region", Str()),
        Fld("municipality", Str()),
        Fld("gps_code", Str()),
        Fld("iata_code", Str()),
        Fld("local_code", Str()),
        Fld("coordinates", Str())
    ])

    df_airport = spark.read.options(header=True,
                                    delimiter=",").csv(filename, airport_schema)

    # cleanse: we only want the airports in the US which map to the states that we have in the states table

    df_airport = df_airport.filter(df_airport.iso_country == "US") \
        .join(df_dimension_state_table, F.substring(df_airport.iso_region, 4, 2) == df_dimension_state_table.state_key,
              "inner") \
        .select(df_airport.ident, df_airport.local_code, df_dimension_state_table.state_key)

    return df_airport
Example #24
def process_song_data(spark, input_data, output_data):
    """
    Reads from song files, 
    transforms them into songs and artists data, 
    and writes them in parquet format.
    
    params:
    - spark: spark session object
    - input_data: input data path
    - output_data: output data path
    """

    # get filepath to song data file
    song_data = input_data + "/song_data/*/*/*/*.json"

    # use schema when read json files
    song_schema = St([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])

    # read song data file
    df = spark.read.json(song_data, schema=song_schema)

    # extract columns to create songs table
    songs_table = df.select("song_id", "title", "artist_id", "year",
                            "duration").dropDuplicates()

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.parquet(output_data + "songs", mode="overwrite", \
                              partitionBy=["year", "artist_id"])

    # extract columns to create artists table
    artists_table = df.selectExpr("artist_id", "artist_name as name", "artist_location as location", \
                                  "artist_latitude as latitude", "artist_longitude as longitude") \
                                  .dropDuplicates()

    # write artists table to parquet files
    artists_table.write.parquet(output_data + "artists", mode="overwrite")
Example #25
def set_data_schema(schema):
    """ 
        This function returns the requested schema definition
        Param:
            schema name
        Output:
            schema
    """
    print("...Setting data schema")
    try:
        songSchema = R([
            Fld("artist_id", Str()), 
            Fld("artist_latitude", Str()), 
            Fld("artist_longitude", Str()), 
            Fld("artist_location", Str()),          
            Fld("artist_name", Str()), 
            Fld("song_id", Str()), 
            Fld("title", Str()), 
            Fld("duration", Dbl()),
            Fld("year", Int())
        ]) 

        eventSchema = R([  
            Fld("artists", Str()),
            Fld("auth", Str()),
            Fld("first_name", Str()),
            Fld("gender", Str()),
            Fld("item_in_session", Int()),
            Fld("last_name", Str()),
            Fld("lenght", Str()),
            Fld("level", Str()),
            Fld("location", Str()),
            Fld("method", Str()),
            Fld("page", Str()),
            Fld("registration", Str()),
            Fld("session_id", Int()),
            Fld("song", Str()),
            Fld("status", Int()),
            Fld("ts", Str()),
            Fld("user_agente", Str()),
            Fld("user_id", Int())
        ])
        create_schema = {'Song': songSchema, 'Event': eventSchema}
        return create_schema[schema]
    except KeyError:
        return "Error: schema does not exist"
Example #26
def get_song_src_schema():
    """
    Get the source spark schema definition
    :return: The schema definition
    """
    return R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])
Example #27
def process_song_data(spark, input_data, output_data):
    """
    Extract data from song_data and write songs and artists table
    
    Arguments:
    - spark : SparkSession object
    - input_data : input data root dir path
    - output_data : output data root dir path
    """
    # schema for song_data 
    songSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year",Int())
    ])
    
    # get filepath to song data file
    song_data = input_data + "song_data/*/*/*"
    
    # read song data file
    df = spark.read.json(song_data, schema=songSchema).dropDuplicates(["song_id"])

    # extract columns to create songs table
    df.createOrReplaceTempView("song_data")
    songs_table = spark.sql("""
        SELECT song_id, title, artist_id, year, duration FROM song_data
    """)
    
    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").parquet(output_data + "songs_table", 'overwrite')

    # extract columns to create artists table
    artists_table = spark.sql("""
        SELECT artist_id, artist_name as name, artist_location as location, artist_latitude as latitude, artist_longitude as longitude 
        FROM song_data
    """).dropDuplicates(["artist_id"])
    
    # write artists table to parquet files
    artists_table.write.parquet(output_data + "artists_table")
Example #28
def process_song_data(spark, input_data, output_data):
    # get filepath to song data file
    song_data = input_data + '*/*/*/*.json'

    # creating schema fo our song_data

    songs_model = St([
        Sfld("num_songs", Int()),
        Sfld("artist_id", Str()),
        Sfld("artist_latitude", Dbl()),
        Sfld("artist_longitude", Dbl()),
        Sfld("artist_location", Str()),
        Sfld("artist_name", Str()),
        Sfld("song_id", Str()),
        Sfld("title", Str()),
        Sfld("duration", Dbl()),
        Sfld("year", Int())
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songs_model)

    # extract columns to create songs table
    songs_table = df.select(["title", "artist_id", "year",
                             "duration"]).dropDuplicates().withColumn(
                                 "song_id", monotonically_increasing_id())

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").parquet(
        output_data + 'songs_table/', mode='overwrite')

    # extract columns to create artists table
    artists_fields = [
        "artist_id", "artist_name", "artist_location", "artist_latitude",
        "artist_longitude"
    ]
    artists_table = df.select(artists_fields).withColumnRenamed(
        'artist_name', 'name').withColumnRenamed(
            'artist_location', 'location').withColumnRenamed(
                'artist_latitude',
                'latitude').withColumnRenamed('artist_longitude',
                                              'longitude').dropDuplicates()

    # write artists table to parquet files
    artists_table.write.parquet(output_data + 'artists_table/',
                                mode='overwrite')
Example #29
def process_song_data(spark, input_data, output_data):
    """
    Description:
                Function that processes the raw data from the S3 bucket
    Parameters:
    
                :spark:       uses the earlier instantiated spark session
                :input_data:  path of the location where the files are residing
                :output_data: path of the location where the files will be saved after processing
                :return:      none
    
    """

    song_data = input_data + 'song_data/A/A/A/*.json'

    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    df = spark.read.json(song_data, schema=songSchema)

    song_fields = ["title", "artist_id", "year", "duration"]

    songs_table = df.select(song_fields).dropDuplicates().withColumn(
        "song_id", monotonically_increasing_id())

    songs_table.write.mode('overwrite').partitionBy(
        "year", "artist_id").parquet(output_data + 'songs/')

    artists_fields = [
        "artist_id", "artist_name as name", "artist_location as location",
        "artist_latitude as latitude", "artist_longitude as longitude"
    ]

    artists_table = df.selectExpr(artists_fields).dropDuplicates()

    artists_table.write.mode('overwrite').parquet(output_data + 'artists/')
Example #30
def process_song_data(spark, input_data, output_data):
    """
        The function processes song data using Spark on AWS.
        Input:
              song data in JSON format
        Output:
              Processed data in parquet format loaded back to S3
        args:
              Spark session, input_data, output_data
        Return:
              none
    """

    song_data = input_data + 'song_data/*/*/*/*.json'

    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    df = spark.read.json(song_data, schema=songSchema)

    song_fields = ["title", "artist_id", "year", "duration"]

    songs_table = df.select(song_fields).dropDuplicates().withColumn(
        "song_id", monotonically_increasing_id())

    songs_table.write.partitionBy("year",
                                  "artist_id").parquet(output_data + 'songs/')

    artists_fields = [
        "artist_id", "artist_name as name", "artist_location as location",
        "artist_latitude as latitude", "artist_longitude as longitude"
    ]

    artists_table = df.selectExpr(artists_fields).dropDuplicates()

    artists_table.write.parquet(output_data + 'artists/')