#----------------------------------------------------------------------------
## Main functionality
if __name__ == "__main__":

    main_config_file_filter = None
    errorCount = 0

    workflowStartTime = datetime.datetime.now()
    if len(sys.argv) > 1:
        main_config_file = sys.argv[1]
    if len(sys.argv) > 2:
        main_config_file_filter = sys.argv[2]

    spark.udf.register('udfConvertInt', convertInt, IntegerType())
    spark.udf.register('udfConvertDouble', convertDouble, DoubleType())
    spark.udf.register('udfConvertDatetime', convertDatetime, TimestampType())

    mainConfig = spark.read.load(main_config_file,
                                 format="csv",
                                 delimiter="|",
                                 header=True)

    #Operation|LoadType|threads|Server|Database|t|WhereClause|DeltaColumn|UniqueIdentifiers|PartitionColumn|TargetLocationRaw|TargetLocationCooked|TargetLocationTableSchema|HiveDatabase|HiveTable|Comments
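    # A hypothetical example row for the pipe-delimited config above (all values are
    # illustrative, not taken from the original project):
    # Load|Delta|4|sqlsrv01|SalesDB|dbo.Orders||ModifiedDate|OrderID|OrderDate|/raw/orders|/cooked/orders|/schemas/orders|sales|orders|nightly load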

    if (main_config_file_filter is not None):
        mainConfig = mainConfig.filter(main_config_file_filter)

    for row in mainConfig.collect():
        try:
            print(
                "===================================================================================================="
Example No. 2
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, udf
from pyspark.sql.types import (DateType, IntegerType, FloatType, StructField,
                               StructType, TimestampType)

spark = SparkSession.builder.appName("Read Transactions").getOrCreate()

csv_schema = StructType([
    StructField('customer_id', IntegerType()),
    StructField('amount', FloatType()),
    StructField('purchased_at', TimestampType()),
])

dataframe = spark.read.csv("transactions.csv", schema=csv_schema, header=True)

dataframe.show()

# Add a new column by formatting the original date
formatted_df = dataframe.withColumn(
    "date_string", date_format(col("purchased_at"), 'MM/dd/yyyy'))
formatted_df.show()

# Create a user defined function
string_to_date = \
    udf(lambda text_date: datetime.strptime(text_date, '%m/%d/%Y'),
        DateType())

typed_df = formatted_df.withColumn("date",
                                   string_to_date(formatted_df.date_string))
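
# Illustrative check (not part of the original snippet): 'date' should now appear as a
# DateType column alongside the original timestamp
typed_df.printSchema()
typed_df.show()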
Example No. 3
def process_log_data(spark, input_data, output_data):
    """
    Process log data from json files and create users, time, and songplays tables in parquet.
    """
    # get filepath to log data file
    log_data = input_data

    # read log data file
    df = spark.read.json("{}log_data/*/*/*.json".format(log_data))

    # filter by actions for song plays
    df = df.filter(df['page'] == "NextSong")

    # extract columns for users table
    users_table = df.withColumn("last_stamp", max_(col('ts')).over(Window.partitionBy("userId"))) \
        .filter(col('ts') == col('last_stamp')) \
        .select('userId', 'firstName',
                'lastName', 'gender', 'level')

    # write users table to parquet files
    users_table.write.parquet(os.path.join(
        output_data, 'users'), mode='overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(
        x/1000.0), TimestampType())
    df = df.withColumn("timestamp", get_timestamp(col('ts')))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x/1000.0), DateType())
    df = df.withColumn("datetime", get_datetime(col('ts')))

    # extract columns to create time table
    time_table = df.withColumn('hour', hour(df.timestamp)) \
        .withColumn('day', dayofmonth(df.timestamp)) \
        .withColumn('week', weekofyear(df.timestamp)) \
        .withColumn('month', month(df.timestamp)) \
        .withColumn('year', year(df.timestamp)) \
        .withColumn('weekday', date_format('timestamp', 'u')) \
        .select('timestamp', 'hour', 'day', 'week', 'month', 'year', 'weekday').distinct()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'time_tbl'), mode='overwrite')

    # read in song data to use for songplays table
    song_df = spark.read.parquet(os.path.join(output_data, 'songs'))

    artist_df = spark.read.parquet(os.path.join(output_data, 'artists'))

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, (df.song == song_df.title)
                              & (df.length == song_df.duration), 'left_outer') \
                        .join(artist_df, (song_df.artist_id == artist_df.artist_id)
                              & (df.artist == artist_df.artist_name), 'left_outer') \
                        .select(
                            df.timestamp.alias("start_time"),
                            df.userId.alias("user_id"),
                            df.level, song_df.song_id,
                            song_df.artist_id, df.sessionId.alias(
                                "session_id"),
                            df.location, df.userAgent.alias("user_agent")
    ).withColumn("songplay_id", monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month
    songplays_table.join(time_table, (songplays_table.start_time == time_table.timestamp)) \
                   .select(songplays_table["*"], time_table.year, time_table.month) \
                   .write.partitionBy('year', 'month').parquet(os.path.join(output_data, 'songplays'), mode='overwrite')
Example No. 4
    "struct": StructType,
    "array": ArrayType,
    "bigint": LongType,
    "date": DateType,
    "byte": ByteType,
    "short": ShortType,
    "datetime": TimestampType,
    "binary": BinaryType,
    "null": NullType,
    "vector": VectorUDT
}
SPARK_DTYPES_DICT_OBJECTS = \
    {"string": StringType(), "int": IntegerType(), "float": FloatType(),
     "double": DoubleType(), "boolean": BooleanType(), "struct": StructType(), "array": ArrayType(StringType()),
     "bigint": LongType(), "date": DateType(), "byte": ByteType(), "short": ShortType(),
     "datetime": TimestampType(), "binary": BinaryType(), "null": NullType()
     }
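# Illustrative lookup (not part of the original constants module): resolve an instantiated
# Spark type by its name string
assert isinstance(SPARK_DTYPES_DICT_OBJECTS["datetime"], TimestampType)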
PROFILER_COLUMN_TYPES = {
    "categorical", "numeric", "date", "null", "array", "binary"
}
PYTHON_TO_PROFILER = {
    "string": "categorical",
    "boolean": "categorical",
    "int": "numeric",
    "decimal": "numeric",
    "date": "date",
    "array": "array",
    "binaty": "binary",
    "null": "null"
}
SPARK_DTYPES_TO_PROFILER = {
Example No. 5
    def test_timestamp_microsecond(self):
        tst = TimestampType()
        self.assertEqual(
            tst.toInternal(datetime.datetime.max) % 1000000, 999999)
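
    # A companion sketch (not from the original test suite): toInternal/fromInternal should
    # round-trip a plain naive datetime, preserving microseconds.
    def test_timestamp_roundtrip_sketch(self):
        tst = TimestampType()
        dt = datetime.datetime(2021, 6, 1, 12, 30, 45, 123456)
        self.assertEqual(tst.fromInternal(tst.toInternal(dt)), dt)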
Example No. 6
#Before Spark 1.4
train = sqlContext.load(source="com.databricks.spark.csv", path = 'PATH/train.csv', header = True,inferSchema = True)
test = sqlContext.load(source="com.databricks.spark.csv", path = 'PATH/test-comb.csv', header = True,inferSchema = True)

#Current Spark 2.1 and ...
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("yarn").getOrCreate()
df = spark.read.csv('hdfs://hadoop-master:9000/index/train.csv',mode="DROPMALFORMED")

#Defining schema with ArrayType
schema = StructType([StructField('array_column', ArrayType(StructType([StructField('element_of_array', StringType(), True)])), True)])

#From local : The third parameter, i.e. the boolean True / False, denotes whether the corresponding field can be nullable
from pyspark.sql.types import StructType, StructField, LongType, StringType, ArrayType, TimestampType
from pyspark.sql.functions import udf
schema = StructType([
    StructField('col0', LongType(), True),
    StructField('col1', LongType(), True),
    StructField('col2', StringType(), True),
    StructField('col3', StringType(), True),
    StructField('col4', TimestampType(), True),
    StructField('col5', TimestampType(), True),
    StructField('col6', StringType(), True)
])
df = spark.read.csv('file:///index/data_extract_restart2_without_cert/data_refined.csv', mode="DROPMALFORMED", schema=schema)

#Creating UDF : replaces commas with pipes in column col2; the result goes into the column named new_column_name
def replace_commas(sk):
    new_sk = sk.replace(',', '|')
    return new_sk


udf_dict = udf(replace_commas, StringType())

df.withColumn('new_column_name', udf_dict("col2")).write.csv(path="/index/skill_clean_v3")#col2 is the column to be changed

df.write.csv('/data/file_csv/', mode="overwrite") # May also add:     mode="overwrite", sep="\t"

#TIP : If a dataframe has a list / array in one of its columns (like 'student_name_list'), it can't be written to a csv file directly
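#A sketch of one workaround (the dataframe and column names here are illustrative, not from
#the original tip): collapse the array column into a single delimited string before writing
from pyspark.sql.functions import concat_ws
df_flat = df_with_array.withColumn('student_names', concat_ws(';', 'student_name_list')) \
                       .drop('student_name_list')
df_flat.write.csv('/data/file_csv_flat/', mode="overwrite")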
Example No. 7
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, TimestampType
from pyspark.sql.functions import window
import time

spark = SparkSession \
    .builder \
    .appName("carStreaming") \
    .getOrCreate()

schema = StructType([
            StructField('type', StringType(), True),
            StructField('color', StringType(), True),
            StructField('timestamp', TimestampType(), True)])


# Create a streaming DataFrame that reads CSV files as they arrive in the data directory
fileStreamDf = spark \
    .readStream \
    .option("header", "true") \
    .schema(schema) \
    .option("inferSchema", "true") \
    .csv("/home/hadoop/spark-streaming/data/")

aggDF = fileStreamDf.groupBy("type").count()

# windowedCounts = fileStreamDf \
#     .withWatermark("timestamp", "2 minutes") \
#     .groupBy(window(fileStreamDf.timestamp, "10 minutes", "5 minutes"), fileStreamDf.type).count()
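
# A minimal sketch (not in the original snippet) of starting the aggregated stream on the
# console sink; a non-windowed streaming aggregation supports "complete" or "update" output
# modes, not "append".
query = aggDF.writeStream \
    .outputMode("complete") \
    .format("console") \
    .start()
query.awaitTermination()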
Example No. 8
def convert_extract_to_parquet(extract_loc, save_dir, spark=None):

    if not spark:
        spark = SparkSession \
            .builder \
            .appName("shared") \
            .getOrCreate()

    # https://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=read%20csv
    # ignore (escape) " if already within quotes to avoid splitting by , within the columnwise jsons

    # NullPointerException if you try to access something you promised would never be null

    # cannot read nested structs straight from CSV, sadly, so will parse columns individually
    metadata_struct = StructType([
        StructField('source', StringType(), False),
        StructField('session', StringType(), False),
        StructField(
            'viewport',
            StructType([
                StructField('width', StringType(), False),
                StructField('height', StringType(), False)
            ]), False),
        StructField('started_at', TimestampType(), False),
        StructField('user_agent', StringType(), False),
        StructField('utc_offset', StringType(), False),
        StructField('finished_at', TimestampType(), False),
        StructField('live_project', BooleanType(), False),
        StructField('interventions', StringType(), False),  # actually struct
        StructField('user_language', StringType(), False),
        StructField('source', StringType(), False),
        StructField('subject_dimensions', StringType(),
                    False),  # actually struct
        StructField('subject_selection_state', StringType(),
                    False),  # actually struct
        StructField('workflow_translation_id', StringType(),
                    True),  # actually struct, sometimes null
    ])

    # TODO answer (at the very least) is very occasionally null, and this causes either EOF/Null pointer (if not nullable) or raise error like
    # ValueError: Answer None of type <class 'NoneType'> not found in schema for question T0
    # should filter out tasks with missing keys for these
    annotations_struct = ArrayType(
        StructType([
            StructField('task', StringType(), True),
            StructField('task_id', StringType(), True),
            StructField('task_label', StringType(), True),
            StructField('value', StringType(), True),
            StructField('multiple_choice', BooleanType(), True),
        ]))

    # subject_data_internal_struct = StructType(
    #     # StructField('!iauname', StringType(), True),
    #     # StructField('iauname', StringType(), True)
    # )
    # subject_data_struct = ArrayType(MapType(StringType(), subject_data_internal_struct))

    schema = StructType([
        StructField('classification_id', StringType(), False),
        StructField('user_name', StringType(), True),
        StructField('user_id', StringType(), True),
        StructField('user_ip', StringType(), True),
        StructField('workflow_id', StringType(), False),
        StructField('workflow_name', StringType(), False),
        StructField('workflow_version', FloatType(), False),
        StructField('created_at', StringType(), False),
        StructField('gold_standard', StringType(), False),
        StructField('expert', StringType(), False),
        StructField('metadata', StringType(), False),
        StructField('annotations', StringType(), False),
        StructField('subject_data', StringType(), False),
        StructField('subject_ids', StringType(), False)
    ])

    # schema = StructType([
    #     StructField('name', StructType([
    #          StructField('firstname', StringType(), True),
    #          StructField('middlename', StringType(), True),
    #          StructField('lastname', StringType(), True)
    #          ])),
    #      StructField('id', StringType(), True),
    #      StructField('gender', StringType(), True),
    #      StructField('salary', IntegerType(), True)
    #      ])

    ds = spark.read.csv(extract_loc,
                        header=True,
                        quote='"',
                        escape='"',
                        schema=schema,
                        mode='FAILFAST')

    # for debugging
    # ds = ds.sample(withReplacement=False, fraction=.1, seed=42)
    # print(ds.head())

    # need to unpack metadata and subject data
    # print(ds.head()['metadata'])
    # print(ds.head()['annotations'])

    metadata_str_to_struct_udf = udf(metadata_str_to_struct,
                                     returnType=metadata_struct)
    annotations_str_to_struct_udf = udf(annotation_to_struct,
                                        returnType=annotations_struct)
    subject_data_str_to_iauname_udf = udf(subject_data_str_to_iauname,
                                          returnType=StringType())
    get_person_id_udf = udf(get_person_id, returnType=StringType())

    ds = ds.withColumn('metadata', metadata_str_to_struct_udf(ds['metadata']))
    ds = ds.withColumn('annotations',
                       annotations_str_to_struct_udf(ds['annotations']))
    ds = ds.withColumn('iauname',
                       subject_data_str_to_iauname_udf(ds['subject_data']))

    ds = ds.withColumn('person_id',
                       get_person_id_udf(ds['user_id'], ds['user_ip']))

    ds = ds.withColumnRenamed('subject_ids', 'subject_id')
    ds = ds.withColumn(
        'project_id', lit('5733')
    )  # TODO hardcoded for now as not in export. lit to make it a column, as Spark requires.

    flattened = flatten.api_df_to_responses(ds)

    flattened.write.parquet(save_dir, mode='overwrite')
Example No. 9
        & (song_df.title == logs_df.song)
    ).select(logs_df.ts, logs_df.userId.alias('user_id'), logs_df.level, song_df.song_id, song_df.artist_id, logs_df.sessionId.alias('session_id'), logs_df.location, logs_df.userAgent.alias('user_agent'))\
    .withColumn('songplay_id', F.monotonically_increasing_id())\
    .withColumn('start_time', get_datetime_from(logs_df.ts))\
    .withColumn('year', F.year('start_time'))\
    .withColumn('month', F.month('start_time'))

    # partition songplays_table by year and month
    songplays_table = songplays_table.repartition('year', 'month')

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').mode('overwrite').save(
        f"{S3_OUTPUT_PATH}/songplays_table.parquet")


@udf(TimestampType())
def get_datetime_from(long_value):
    """
    Converts timestamp of type Long to datetime
    """

    return datetime.fromtimestamp(long_value / 1000.0)


def main():
    """
    Orchestrates the ETL
    """

    spark = create_spark_session()
    input_data = "s3a://udacity-dend"
Example No. 10
from pyspark.sql.types import StructType, StructField, TimestampType, DecimalType, StringType, DoubleType
from pySparkManager import createSpark
from createTargetList import extractTarget
from createCorpus import createCorpusForUser
from generateResponse import generateTweet

# structure from tweet
dtypes = StructType([
    StructField("created_at", TimestampType(), True),
    StructField("tweet_id", StringType(), False),
    StructField("tweet", StringType(), False),
    StructField("likes", DecimalType(38, 0), False),
    StructField("retweet_count", DecimalType(38, 0), False),
    StructField("source", StringType(), True),
    StructField("user_id", DecimalType(38, 0), False),
    StructField("user_name", StringType(), True),
    StructField("user_screen_name", StringType(), False),
    StructField("user_description", StringType(), True),
    StructField("user_join_date", TimestampType(), True),
    StructField("user_followers_count", DecimalType(38, 0), False),
    StructField("user_location", StringType(), True),
    StructField("lat", DoubleType(), True),
    StructField("long", DoubleType(), True),
    StructField("city", StringType(), True),
    StructField("country", StringType(), True),
    StructField("continent", StringType(), True),
    StructField("state", StringType(), True),
    StructField("state_code", StringType(), True),
    StructField("collected_at", TimestampType(), False)
])
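
# A hypothetical usage sketch: the file name and the locally-built session below are
# assumptions for illustration; the original module obtains its session via createSpark
# from pySparkManager.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("read_tweets_example").getOrCreate()
tweets_df = spark.read.csv("tweets.csv", schema=dtypes, header=True)
tweets_df.printSchema()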
Example No. 11
    def read_data(self):

        userSchema = StructType([
                StructField('medallion', StringType()),
                StructField('pickup_time', TimestampType()),
                StructField('total_amount', DoubleType()),
                ])

        self.fare = self.spark \
            .readStream \
            .format("kafka") \
            .option("kafka.bootstrap.servers", "localhost:9092") \
            .option("subscribe", "nycfare1") \
            .option("startingOffsets", "earliest") \
            .option('failOnDataLoss','false') \
            .option("maxOffsetsPerTrigger", 1000) \
            .load()

        self.df_fare = self.fare.selectExpr("CAST(value as STRING) as json") \
                   .select(from_json("json", userSchema).alias('data'))\
                   .selectExpr(
                        "data.medallion as medallion_fare",
                        "cast (data.pickup_time as timestamp) as pickup_time_fare",
                        "cast (data.total_amount as float)",
                    )

        userSchema = StructType([
            StructField('medallion', StringType()),
            StructField('pickup_time', TimestampType()),
            StructField('dropoff_time', TimestampType()),
            StructField('passenger_count', IntegerType()),
            StructField('trip_time', IntegerType()),
            StructField('trip_distance', DoubleType()),
            StructField('pickup_loc', MapType(StringType(), DoubleType())),
            StructField('dropoff_loc', MapType(StringType(), DoubleType()))
        ])

        self.trip = self.spark \
            .readStream \
            .format("kafka") \
            .option("kafka.bootstrap.servers", "localhost:9092") \
            .option("subscribe", "nycspeed9") \
            .option("startingOffsets", "earliest") \
            .option('failOnDataLoss', 'false') \
            .option("maxOffsetsPerTrigger", 1000) \
            .load()

        self.df_trip = self.trip.selectExpr("CAST(value as STRING) as json") \
            .select(from_json("json", userSchema).alias('data')) \
            .selectExpr(
            "data.medallion as medallion_trip",
            "cast (data.pickup_time as timestamp) as pickup_time_trip",
            "cast (data.dropoff_time as timestamp)",
            "cast (data.passenger_count as integer)",
            "cast (data.trip_time as integer)",
            "cast (data.trip_distance as float)",
            "cast (data.pickup_loc.lat as float) as pickup_loc_lat",
            # "cast data.pickup_loc.lat as pickup_loc_lat"
            "cast (data.pickup_loc.lon as float) as pickup_loc_lon",
            # "cast data.pickup_loc.lon as pickup_loc_lon",
            "cast (data.dropoff_loc.lat as float) as dropoff_loc_lat",
            # "cast data.dropoff_loc.lat as dropoff_loc_lat",
            "cast (data.dropoff_loc.lon as float) as dropoff_loc_lon",
            # "cast data.dropoff_loc.lon as dropoff_loc_lon"
        )

        print(self.df_trip.printSchema())

        self.df = self.df_trip.join(
            self.df_fare,
            expr("""
            medallion_trip = medallion_fare AND
            pickup_time_trip >= pickup_time_fare - interval 1 hour AND
            pickup_time_trip <= pickup_time_fare + interval 1 hour
            """)
        )

        print((self.df \
              .writeStream \
              .outputMode("append") \
              .format("console") \
              .option('truncate','false')
              .option('numRows', 20)
              .start()
              .awaitTermination()
              ))

        query = self.windowedCounts.writeStream \
            .outputMode("append") \
            .queryName("writing_to_es") \
            .format("org.elasticsearch.spark.sql") \
            .option("checkpointLocation", "/tmp/1") \
            .option("es.nodes", "localhost") \
            .option("es.port", "9200") \
            .option("es.resource", "nycfare2/_doc") \

        query.start().awaitTermination()
Example No. 12
                                     str(row[5]), \
                                     int(row[6]), \
                                     int(row[7]), \
                                     int(row[8]), \
                                     int(row[9]), \
                                     row[10]))

    # rewritten for Python 3: the original used Python 2 tuple-parameter unpacking in the lambdas
    resultMap_FilterUlr = resultMap.map(lambda t: t[:10] + (regularExpression(t[10].split(",")),)) \
                                   .filter(lambda t: len(t[10]) > 1)

    #put on Json
    fields = StructType( \
                        [StructField("GSN", StringType(), False),  \
                        StructField("ChargingID", IntegerType(), False),  \
                        StructField("RecordSequence", IntegerType(), False),  \
                        StructField("RecordOpeningDate", TimestampType(), False),  \
                        StructField("rATType", IntegerType(), False),  \
                        StructField("UserLocation", StringType(), False),  \
                        StructField("Accuracy", IntegerType(), False),  \
                        StructField("BrowsingSession", IntegerType(), False),  \
                        StructField("Uplink", IntegerType(), False),  \
                        StructField("Downlink", IntegerType(), False), \
                        StructField("Urls", ArrayType(StringType(),False))])

    #The new Json Format
    newStructure = StructType( \
                        [StructField("GSN", StringType(), False),  \
                        StructField("ChargingID", IntegerType(), False),  \
                        StructField("RecordSequence", IntegerType(), False),  \
                        StructField("RecordOpeningDate", TimestampType(), False),  \
                        StructField("rATType", IntegerType(), False),  \
Example No. 13
    def test_as_spark_type_koalas_dtype(self):
        type_mapper = {
            # binary
            np.character: (np.character, BinaryType()),
            np.bytes_: (np.bytes_, BinaryType()),
            np.string_: (np.bytes_, BinaryType()),
            bytes: (np.bytes_, BinaryType()),
            # integer
            np.int8: (np.int8, ByteType()),
            np.byte: (np.int8, ByteType()),
            np.int16: (np.int16, ShortType()),
            np.int32: (np.int32, IntegerType()),
            np.int64: (np.int64, LongType()),
            np.int: (np.int64, LongType()),
            int: (np.int64, LongType()),
            # floating
            np.float32: (np.float32, FloatType()),
            np.float: (np.float64, DoubleType()),
            np.float64: (np.float64, DoubleType()),
            float: (np.float64, DoubleType()),
            # string
            np.str: (np.unicode_, StringType()),
            np.unicode_: (np.unicode_, StringType()),
            str: (np.unicode_, StringType()),
            # bool
            np.bool: (np.bool, BooleanType()),
            bool: (np.bool, BooleanType()),
            # datetime
            np.datetime64: (np.datetime64, TimestampType()),
            datetime.datetime: (np.dtype("datetime64[ns]"), TimestampType()),
            # DateType
            datetime.date: (np.dtype("object"), DateType()),
            # DecimalType
            decimal.Decimal: (np.dtype("object"), DecimalType(38, 18)),
            # ArrayType
            np.ndarray: (np.dtype("object"), ArrayType(StringType())),
            List[bytes]: (np.dtype("object"), ArrayType(BinaryType())),
            List[np.character]: (np.dtype("object"), ArrayType(BinaryType())),
            List[np.bytes_]: (np.dtype("object"), ArrayType(BinaryType())),
            List[np.string_]: (np.dtype("object"), ArrayType(BinaryType())),
            List[bool]: (np.dtype("object"), ArrayType(BooleanType())),
            List[np.bool]: (np.dtype("object"), ArrayType(BooleanType())),
            List[datetime.date]: (np.dtype("object"), ArrayType(DateType())),
            List[np.int8]: (np.dtype("object"), ArrayType(ByteType())),
            List[np.byte]: (np.dtype("object"), ArrayType(ByteType())),
            List[decimal.Decimal]:
            (np.dtype("object"), ArrayType(DecimalType(38, 18))),
            List[float]: (np.dtype("object"), ArrayType(DoubleType())),
            List[np.float]: (np.dtype("object"), ArrayType(DoubleType())),
            List[np.float64]: (np.dtype("object"), ArrayType(DoubleType())),
            List[np.float32]: (np.dtype("object"), ArrayType(FloatType())),
            List[np.int32]: (np.dtype("object"), ArrayType(IntegerType())),
            List[int]: (np.dtype("object"), ArrayType(LongType())),
            List[np.int]: (np.dtype("object"), ArrayType(LongType())),
            List[np.int64]: (np.dtype("object"), ArrayType(LongType())),
            List[np.int16]: (np.dtype("object"), ArrayType(ShortType())),
            List[str]: (np.dtype("object"), ArrayType(StringType())),
            List[np.unicode_]: (np.dtype("object"), ArrayType(StringType())),
            List[datetime.datetime]:
            (np.dtype("object"), ArrayType(TimestampType())),
            List[np.datetime64]:
            (np.dtype("object"), ArrayType(TimestampType())),
            # CategoricalDtype
            CategoricalDtype(categories=["a", "b", "c"]): (
                CategoricalDtype(categories=["a", "b", "c"]),
                LongType(),
            ),
        }

        for numpy_or_python_type, (dtype, spark_type) in type_mapper.items():
            self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
            self.assertEqual(koalas_dtype(numpy_or_python_type),
                             (dtype, spark_type))

        with self.assertRaisesRegex(TypeError,
                                    "Type uint64 was not understood."):
            as_spark_type(np.dtype("uint64"))

        with self.assertRaisesRegex(TypeError,
                                    "Type object was not understood."):
            as_spark_type(np.dtype("object"))

        with self.assertRaisesRegex(TypeError,
                                    "Type uint64 was not understood."):
            koalas_dtype(np.dtype("uint64"))

        with self.assertRaisesRegex(TypeError,
                                    "Type object was not understood."):
            koalas_dtype(np.dtype("object"))
Example No. 14
def process_log_data(spark, input_data, output_data):
    """
    Process the event log file and extract data for table time, users and songplays with Spark.
    --------
    Param:
        spark: A spark session instance.
        input_data: input data path.
        output_data: output data path.
    Return:
        None.
    """
    # get filepath to log data file
    log_data = os.path.join(input_data, 'log_data/')

    # read log data file
    df = spark.read.json(log_data)
    
    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table    
    users_table = df.selectExpr('userId as user_id', 'firstName as first_name', 'lastName as last_name', 'gender', 'level').drop_duplicates()
    
    # write users table to parquet files
    users_table.write.mode('overwrite').parquet(output_data + 'users/')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.utcfromtimestamp(int(x)/1000), TimestampType())
    df = df.withColumn('timestamp', get_timestamp('ts'))
    
    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(int(x) / 1000), TimestampType())
    df = df.withColumn('start_time', get_datetime('ts')) \
        .withColumn('hour', hour('start_time')) \
        .withColumn('day', dayofmonth('start_time')) \
        .withColumn('week', weekofyear('start_time')) \
        .withColumn('month', month('start_time')) \
        .withColumn('year', year('start_time')) \
        .withColumn('weekday', dayofweek('start_time'))
    
    # extract columns to create time table
    time_table = df.select('start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday').drop_duplicates()
    
    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').mode('overwrite').parquet(output_data + 'time/')

    # read in song data to use for songplays table
    song_df = spark.read.parquet(os.path.join(output_data, 'songs/*/*/*'))
    songs_logs_df = df.join(song_df, (df.song == song_df.title))
    artists_df = spark.read.parquet(os.path.join(output_data, 'artists'))
    songs_logs_artists_df = songs_logs_df.join(artists_df, (songs_logs_df.artist == artists_df.name))
    songplays_df = songs_logs_artists_df.join(time_table, (songs_logs_artists_df.start_time == time_table.start_time), 'left').drop(songs_logs_artists_df.year)
    
    # extract columns from joined song and log datasets to create songplays table 
    songplays_table = songplays_df.select(
        monotonically_increasing_id().alias('songplay_id'), 
        col('start_time'), 
        col('userId').alias('user_id'),
        col('level'),
        col('song_id'),
        col('artist_id'),
        col('sessionId').alias('session_id'),
        col('location'),
        col('userAgent').alias('user_agent'),
        col('year'),
        col('month')).drop_duplicates()

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').mode('overwrite').parquet(output_data + 'songplays/')
Example No. 15
def test_historical_feature_retrieval_from_local_spark_session(
    spark,
    client,
    driver_entity,
    customer_entity,
    bookings_feature_table,
    transactions_feature_table,
):
    schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
    ])
    df_data = [
        (
            1001,
            8001,
            datetime(year=2020, month=9, day=1),
        ),
        (
            2001,
            8001,
            datetime(year=2020, month=9, day=2),
        ),
        (
            2001,
            8002,
            datetime(year=2020, month=9, day=1),
        ),
        (
            1001,
            8001,
            datetime(year=2020, month=9, day=2),
        ),
        (
            1001,
            8001,
            datetime(year=2020, month=9, day=3),
        ),
        (
            1001,
            8001,
            datetime(year=2020, month=9, day=4),
        ),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark,
                                                  "customer_driver_pair",
                                                  schema, df_data)
    customer_driver_pairs_source = FileSource("event_timestamp",
                                              "created_timestamp", "parquet",
                                              file_uri)
    joined_df = client.get_historical_features_df(
        [
            "transactions:total_transactions",
            "bookings:total_completed_bookings"
        ],
        customer_driver_pairs_source,
    )
    expected_joined_df_schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("transactions__total_transactions", DoubleType()),
        StructField("bookings__total_completed_bookings", IntegerType()),
    ])
    expected_joined_df_data = [
        (1001, 8001, datetime(year=2020, month=9, day=1), 100.0, 100),
        (2001, 8001, datetime(year=2020, month=9, day=2), 400.0, 150),
        (2001, 8002, datetime(year=2020, month=9, day=1), 400.0, None),
        (1001, 8001, datetime(year=2020, month=9, day=2), 200.0, 150),
        (1001, 8001, datetime(year=2020, month=9, day=3), 200.0, 150),
        (1001, 8001, datetime(year=2020, month=9, day=4), 300.0, None),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_df_data),
        expected_joined_df_schema,
    )
    assert_dataframe_equal(joined_df, expected_joined_df)
    shutil.rmtree(temp_dir)
Example No. 16
def process_log_data(spark, input_data, output_data):
    """ Processing log data (users, time table, songplay) by the JSON given by S3,
        after data normalization and transformation
        these data are wrote as parquet files """
    """ Proving JSON structure to Spark """
    logdata_schema = StructType([
        StructField("artist", StringType(), True),
        StructField("auth", StringType(), True),
        StructField("firstName", StringType(), True),
        StructField("gender", StringType(), True),
        StructField("itemInSession", LongType(), True),
        StructField("lastName", StringType(), True),
        StructField("length", DoubleType(), True),
        StructField("level", StringType(), True),
        StructField("location", StringType(), True),
        StructField("method", StringType(), True),
        StructField("page", StringType(), True),
        StructField("registration", DoubleType(), True),
        StructField("sessionId", LongType(), True),
        StructField("song", StringType(), True),
        StructField("status", LongType(), True),
        StructField("ts", LongType(), True),
        StructField("userAgent", StringType(), True),
        StructField("userId", StringType(), True),
    ])

    # get filepath to log data file
    log_data = input_data + 'log-data'

    # read log data file, JSON structure
    df = spark.read.json(log_data, schema=logdata_schema)

    # filter by actions for song plays
    df = df.filter(col("page") == 'NextSong')

    # extract columns for users table
    users_table = df.select(
        col("userId").alias("user_id"),
        col("firstName").alias("first_name"),
        col("lastName").alias("last_name"), "gender", "level")

    # write users table to parquet files
    users_table.write.parquet(output_data + "users")

    tsFormat = "yyyy-MM-dd HH:mm:ss z"
    # Converting ts to a timestamp format
    time_table = df.withColumn(
        'ts',
        to_timestamp(
            date_format((df.ts / 1000).cast(dataType=TimestampType()),
                        tsFormat), tsFormat))

    # extract columns to create time table
    time_table = time_table.select(
        col("ts").alias("start_time"),
        hour(col("ts")).alias("hour"),
        dayofmonth(col("ts")).alias("day"),
        weekofyear(col("ts")).alias("week"),
        month(col("ts")).alias("month"),
        year(col("ts")).alias("year"))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(output_data + "time")

    # read in song data to use for songplays table
    song_data = input_data + "song-data/*/*/*/*.json"
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = song_df.join(df, song_df.artist_name==df.artist)\
    .withColumn("songplay_id", monotonically_increasing_id())\
    .withColumn('start_time', to_timestamp(date_format((col("ts") /1000).cast(dataType=TimestampType()), tsFormat),tsFormat))\
    .select("songplay_id",
           "start_time",
           col("userId").alias("user_id"),
           "level",
           "song_id",
           "artist_id",
           col("sessionId").alias("session_id"),
           col("artist_location").alias("location"),
           "userAgent",
           month(col("start_time")).alias("month"),
           year(col("start_time")).alias("year"))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(output_data +
                                                               "songplays")
Example No. 17
def process_log_data(spark, input_data, output_data):
    """
        Description: This function fetches log_data from S3 into a staging dataframe, 
        then extracts the time, users and songplays tables,
        and eventually exports data back to S3
        
        Parameters:
            spark       : object for Spark Session
            input_data  : location of log_data 
            output_data : location of target S3 bucket
            
    """
    
    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # define schema
    logdata_schema = R([
    Fld("artist",Str()),
    Fld("auth",Str()),
    Fld("firstName",Str()),
    Fld("gender", Str()),
    Fld("itemInSession", Lng()),
    Fld("lastName", Str()),
    Fld("length", Str()),
    Fld("level", Str()),
    Fld("location", Str()),
    Fld("method", Str()),
    Fld("page", Str()),
    Fld("registration", Dbl()),
    Fld("sessionId", Int()),
    Fld("song", Str()),
    Fld("status", Lng()),
    Fld("ts", Lng()),
    Fld("user_agent", Str()),
    Fld("userId", Str())
    ])
    
    # read log data file
    df = spark.read.json(log_data, schema=logdata_schema)
    
    # filter by actions for song plays
    df = df.filter(df['page'] == 'NextSong')

    # extract columns for users table    
    selection = ['userId as user_id', 'firstName as first_name', \
               'lastName as last_name', 'gender as gender', \
               'level as level']

    users_table = df.selectExpr(selection).dropDuplicates()

    
    # write users table to parquet files
    users_table.write.parquet(output_data + 'users/')

    # create timestamp column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp((x/1000.0)), TimestampType())
    df = df.withColumn("start_time", get_datetime('ts'))
    
    # extract columns to create time table
    time_table = df.select('start_time').dropDuplicates()

    time_table = time_table.\
        withColumn("hour", hour(time_table.start_time)).\
        withColumn("day", dayofmonth(time_table.start_time)).\
        withColumn("week", weekofyear(time_table.start_time)).\
        withColumn("month", month(time_table.start_time)).\
        withColumn("year", year(time_table.start_time)).\
        withColumn("weekday", dayofweek(time_table.start_time))
    
    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(output_data + 'time/')

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + 'songs/*/*/*')

    # extract columns from joined song and log datasets to create songplays table 
    selection = ['songplay_id', 'start_time', \
             'userId as user_id', 'level', 'song_id', \
             'artist_id', 'sessionId as session_id', \
             'location', 'user_agent', \
             'year', 'month']
    
    songplays_table = df.join(song_df, (df.song == song_df.title)).\
        withColumn('songplay_id', monotonically_increasing_id()).\
        withColumn("month", month("start_time")).\
        withColumn("year", year("start_time")).\
        selectExpr(selection)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(output_data + 'songplays/')
Example No. 18
    def _create_from_pandas_with_arrow(self, pdf: "PandasDataFrameLike",
                                       schema: Union[StructType, List[str]],
                                       timezone: str) -> "DataFrame":
        """
        Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
        to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
        data types will be used to coerce the data in Pandas to Arrow conversion.
        """
        from pyspark.sql import SparkSession
        from pyspark.sql.dataframe import DataFrame

        assert isinstance(self, SparkSession)

        from pyspark.sql.pandas.serializers import ArrowStreamPandasSerializer
        from pyspark.sql.types import TimestampType
        from pyspark.sql.pandas.types import from_arrow_type, to_arrow_type
        from pyspark.sql.pandas.utils import (
            require_minimum_pandas_version,
            require_minimum_pyarrow_version,
        )

        require_minimum_pandas_version()
        require_minimum_pyarrow_version()

        from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
        import pyarrow as pa

        # Create the Spark schema from list of names passed in with Arrow types
        if isinstance(schema, (list, tuple)):
            arrow_schema = pa.Schema.from_pandas(pdf, preserve_index=False)
            struct = StructType()
            prefer_timestamp_ntz = is_timestamp_ntz_preferred()
            for name, field in zip(schema, arrow_schema):
                struct.add(name,
                           from_arrow_type(field.type, prefer_timestamp_ntz),
                           nullable=field.nullable)
            schema = struct

        # Determine arrow types to coerce data when creating batches
        if isinstance(schema, StructType):
            arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
        elif isinstance(schema, DataType):
            raise ValueError(
                "Single data type %s is not supported with Arrow" %
                str(schema))
        else:
            # Any timestamps must be coerced to be compatible with Spark
            arrow_types = [
                to_arrow_type(TimestampType())
                if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                for t in pdf.dtypes
            ]

        # Slice the DataFrame to be batched
        step = -(-len(pdf) // self.sparkContext.defaultParallelism
                 )  # round int up
        pdf_slices = (pdf.iloc[start:start + step]
                      for start in range(0, len(pdf), step))

        # Create list of Arrow (columns, type) for serializer dump_stream
        arrow_data = [[(c, t)
                       for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)
                       ] for pdf_slice in pdf_slices]

        jsqlContext = self._wrapped._jsqlContext  # type: ignore[attr-defined]

        safecheck = self._wrapped._conf.arrowSafeTypeConversion(
        )  # type: ignore[attr-defined]
        col_by_name = True  # col by name only applies to StructType columns, can't happen here
        ser = ArrowStreamPandasSerializer(timezone, safecheck, col_by_name)

        @no_type_check
        def reader_func(temp_filename):
            return self._jvm.PythonSQLUtils.readArrowStreamFromFile(
                jsqlContext, temp_filename)

        @no_type_check
        def create_RDD_server():
            return self._jvm.ArrowRDDServer(jsqlContext)

        # Create Spark DataFrame from Arrow stream file, using one batch per partition
        jrdd = self._sc._serialize_to_jvm(  # type: ignore[attr-defined]
            arrow_data, ser, reader_func, create_RDD_server)
        jdf = self._jvm.PythonSQLUtils.toDataFrame(  # type: ignore[attr-defined]
            jrdd, schema.json(), jsqlContext)
        df = DataFrame(jdf, self._wrapped)
        df._schema = schema
        return df
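
# A hypothetical illustration (not part of the pyspark source above): with Arrow enabled,
# createDataFrame on a pandas frame containing a datetime64 column takes this code path and
# the column arrives as a Spark TimestampType.
from pyspark.sql import SparkSession
import pandas as pd

_spark = SparkSession.builder.appName("arrow_timestamp_demo") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true").getOrCreate()
_pdf = pd.DataFrame({"event_time": pd.to_datetime(["2021-01-01 10:00:00"]), "value": [1]})
_sdf = _spark.createDataFrame(_pdf)
_sdf.printSchema()  # event_time should show as timestamp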
Example No. 19
def process_log_data(spark, input_data, output_data):
    """
    Description: Load data from Log Dataset JSON files in S3, extract it into a DataFrame,
    then write DataFrame as parquet files back to S3

    """
    log_data = input_data + 'log_data/*.json'

    df = spark.read.json(log_data)

    df = df.filter(df.page == "NextSong")

    user_cols = [
        "userId as user_id", "firstName as first_name",
        "lastName as last_name", "gender", "level"
    ]
    users_table = df.selectExpr(user_cols).dropDuplicates()

    users_table.write.parquet(output_data + "users/", mode="overwrite")

    # fromtimestamp expects seconds, so the epoch value in milliseconds is divided by 1000
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000).isoformat())
    df = df.withColumn("start_time", get_timestamp("ts").cast(TimestampType()))

    time_table = df.select("start_time") \
        .withColumn("hour", F.hour("start_time")) \
        .withColumn("day", F.dayofmonth("start_time")) \
        .withColumn("week", F.weekofyear("start_time")) \
        .withColumn("month", F.month("start_time")) \
        .withColumn("year", F.year("start_time")) \
        .withColumn("weekday", F.dayofweek("start_time"))

    time_table.write.partitionBy("year",
                                 "month").parquet(output_data + "time/",
                                                  mode="overwrite")

    song_df = spark.read.json(input_data + 'song_data/*/*/*/*.json')

    df = df.orderBy("ts")
    df = df.withColumn("songplay_id", F.monotonically_increasing_id())

    song_df.createOrReplaceTempView("staging_songs")
    df.createOrReplaceTempView("staging_events")

    songplays_table = spark.sql("""
        SELECT
            se.songplay_id,
            se.start_time,
            se.userId as user_id,
            se.level,
            ss.song_id,
            se.sessionId as session_id,
            ss.artist_id,
            se.location,
            se.userAgent as user_agent,
            YEAR(se.start_time) as year,
            MONTH(se.start_time) as month
        FROM staging_events se
        LEFT JOIN staging_songs ss
        ON (se.song = ss.title 
        AND se.artist = ss.artist_name) 
    """)

    songplays_table.write.partitionBy("year", "month").parquet(
        output_data + "songplays/", mode="overwrite")
Example No. 20
def create_test_scalar_dataset(tmp_url, num_rows, num_files=4, spark=None):
    shutdown = False
    if not spark:
        spark_session = SparkSession \
            .builder \
            .appName('petastorm_end_to_end_test') \
            .master('local[*]')

        spark = spark_session.getOrCreate()
        shutdown = True

    expected_data = [{
        'id':
        np.int32(i),
        'int_fixed_size_list':
        np.arange(1 + i, 10 + i).astype(np.int32),
        'datetime':
        np.datetime64('2019-01-02'),
        'timestamp':
        np.datetime64('2005-02-25T03:30'),
        'string':
        np.unicode_('hello_{}'.format(i)),
        'string2':
        np.unicode_('world_{}'.format(i)),
        'float64':
        np.float64(i) * .66
    } for i in range(num_rows)]

    expected_data_as_scalars = [{
        k: np.asscalar(v) if isinstance(v, np.generic) else v
        for k, v in row.items()
    } for row in expected_data]

    # np.datetime64 is converted to a timezone unaware datetime instances. Working explicitly in UTC so we don't need
    # to think about local timezone in the tests
    for row in expected_data_as_scalars:
        row['timestamp'] = row['timestamp'].replace(tzinfo=pytz.UTC)
        row['int_fixed_size_list'] = row['int_fixed_size_list'].tolist()

    rows = [Row(**row) for row in expected_data_as_scalars]

    # WARNING: surprisingly, schema fields and row fields are matched only by order and not name.
    # We must maintain alphabetical order of the struct fields for the code to work!!!
    schema = StructType([
        StructField('datetime', DateType(), False),
        StructField('float64', DoubleType(), False),
        StructField('id', IntegerType(), False),
        StructField('int_fixed_size_list', ArrayType(IntegerType(), False),
                    False),
        StructField('string', StringType(), False),
        StructField('string2', StringType(), False),
        StructField('timestamp', TimestampType(), False),
    ])

    dataframe = spark.createDataFrame(rows, schema)
    dataframe. \
        coalesce(num_files). \
        write.option('compression', 'none'). \
        mode('overwrite'). \
        parquet(tmp_url)

    if shutdown:
        spark.stop()

    return expected_data
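
# Hypothetical usage sketch (the output URL is illustrative, not taken from the original tests)
if __name__ == '__main__':
    expected = create_test_scalar_dataset('file:///tmp/scalar_test_dataset', num_rows=10)
    print('wrote {} rows'.format(len(expected)))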
Example No. 21
def process_log_data(spark, input_data, output_data):
    """This function loads log_data from S3 and processes it by extracting the users and time dimension tables
        and songplays fact table then again loaded back to S3
    Args:
        spark(:obj:`pyspark.sql.session.SparkSession`): SparkSession
        input_data (str): S3 bucket where song files are stored
        output_data (str): S3 bucket file path to store resulting files

    Returns:
        None
    """
    print("**** Starting to process log data *****")
    # get filepath to log data file
    log_data = input_data+'log_data/*/*/*.json'

    # read log data file
    try:
        df =spark.read.json(log_data)
    except Exception as e:
        print(e)
    
    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # extract columns for users table    
    users_fields = ["userId as user_id", "firstName as first_name", "lastName as last_name", "gender", "level","ts"]
    users_table = df.selectExpr(users_fields).orderBy("ts",ascending=False).dropDuplicates(["userId"]).drop("ts")
    
    # write users table to parquet files
    try:
        users_table.write.parquet(output_data + "users.parquet",  mode="overwrite")
    except Exception as e:
        print(e)
        
    print("**** users table data load is complete *****")

    # create timestamp column from original timestamp column
    #get_timestamp = udf(date_convert, TimestampType())
    #df = df.withColumn("datetime",get_timestamp(df.ts))
    
    # create datetime column from original timestamp column
    get_datetime = udf(lambda ms: datetime.fromtimestamp(ms // 1000), TimestampType())
    df = df.withColumn("datetime", get_datetime(df.ts))
       
    # extract columns to create time table
    time_fields = ["datetime as start_time", "hour(datetime) as hour", "dayofmonth(datetime) as day",
                   "weekofyear(datetime) as week", "month(datetime) as month", "year(datetime) as year", 
                   "dayofweek(datetime) as weekday"]
    time_table = df.selectExpr(time_fields).dropDuplicates(["start_time"])
    
    # write time table to parquet files partitioned by year and month
    try:
        time_table.write.parquet(output_data + "time.parquet", partitionBy=("year", "month"), mode="overwrite")
    except Exception as e:
        print(e)
    
    print("**** time table data load is complete *****")

    # read in song data to use for songplays table
    songs_df = spark.read.parquet(output_data + "songs.parquet")
    
    artists_df = spark.read.parquet(output_data + "artists.parquet")
    
    song_df = songs_df.join(artists_df.alias("artists"),
                            songs_df.artist_id == artists_df.artist_id , 
                            "inner" ).select("title", "name", "duration", "song_id", "artists.artist_id")

    # extract columns from joined song and log datasets to create songplays table 
    songplays_table = df.join(song_df , (df.song == song_df.title) & (df.artist ==song_df.name) & (df.length == song_df.duration), "inner")
    songplays_table = songplays_table.withColumn("songplay_id",monotonically_increasing_id())
    songplays_table = songplays_table.selectExpr("songplay_id", "datetime as start_time", "userId as user_id",
                                                 "month(datetime) as month", "year(datetime) as year",
                                                 "level", "song_id", "artist_id","sessionId as session_id",
                                                 "location", "userAgent as user_agent").dropDuplicates()
    
    # write songplays table to parquet files partitioned by year and month
    try:
        songplays_table.write.parquet(output_data + "songplays.parquet", partitionBy=("year", "month"), mode="overwrite")
    except Exception as e:
        print(e)

    print("**** songplays table data load is complete *****")
    
    print("**** log data processing is finished *****")
Example No. 22
from listenbrainz_spark.constants import LAST_FM_FOUNDING_YEAR
from listenbrainz_spark.exceptions import HDFSException
from listenbrainz_spark.path import LISTENBRAINZ_DATA_DIRECTORY
from listenbrainz_spark.stats import (offset_days, offset_months, get_day_end,
                                      get_month_end, get_year_end,
                                      replace_days, replace_months, run_query)
from listenbrainz_spark.stats.utils import (filter_listens,
                                            get_last_monday,
                                            get_latest_listen_ts)
from listenbrainz_spark.utils import get_listens
from pyspark.sql.functions import collect_list, sort_array, struct, lit
from pyspark.sql.types import (StringType, StructField, StructType,
                               TimestampType)

time_range_schema = StructType((StructField('time_range', StringType()), StructField(
    'start', TimestampType()), StructField('end', TimestampType())))
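
# A minimal sketch (not from the original module) of how a 'time_range' table matching the
# schema above could be built and registered for the SQL below; the helper name and the
# example ranges are hypothetical.
from datetime import datetime


def _register_example_time_range(session):
    rows = [
        ("week_1", datetime(2021, 1, 4), datetime(2021, 1, 10, 23, 59, 59)),
        ("week_2", datetime(2021, 1, 11), datetime(2021, 1, 17, 23, 59, 59)),
    ]
    session.createDataFrame(rows, schema=time_range_schema) \
        .createOrReplaceTempView("time_range")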


def get_listening_activity():
    """ Calculate number of listens for each user in time ranges given in the 'time_range' table """
    # Calculate the number of listens in each time range for each user except the time ranges which have zero listens.
    result_without_zero_days = run_query("""
            SELECT listens.user_name
                 , time_range.time_range
                 , count(listens.user_name) as listen_count
              FROM listens
              JOIN time_range
                ON listens.listened_at >= time_range.start
               AND listens.listened_at <= time_range.end
          GROUP BY listens.user_name
                 , time_range.time_range
Example No. 23
def get_hrv_features(rr_data, acceptable_percentage=50, window_length=60):
    """

    Args:
        rr_data (DataStream):
        acceptable_percentage (int):
        window_length (int):

    Returns:

    """
    stream_name = 'org.md2k.autosense.ecg.features'

    def get_metadata():
        stream_metadata = Metadata()
        stream_metadata.set_name(stream_name).set_description("HRV Features from ECG RR interval") \
            .add_input_stream(rr_data.metadata.get_name()) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("var")
                .set_type("double")
                .set_attribute("description","variance")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("iqr")
                .set_type("double")
                .set_attribute("description","Inter Quartile Range")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("mean")
                .set_type("double")
                .set_attribute("description","Mean RR Interval")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("median")
                .set_type("double")
                .set_attribute("description","Median RR Interval")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("80th")
                .set_type("double")
                .set_attribute("description","80th percentile RR Interval")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("20th")
                .set_type("double")
                .set_attribute("description","20th percentile RR Interval")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("heartrate")
                .set_type("double")
                .set_attribute("description","Heart Rate in BPM")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("vlf")
                .set_type("double")
                .set_attribute("description","Very Low Frequency Energy")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("lf")
                .set_type("double")
                .set_attribute("description","Low Frequency Energy")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("hf")
                .set_type("double")
                .set_attribute("description","High Frequency Energy")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("lfhf")
                .set_type("double")
                .set_attribute("description","Low frequency to High Frequency energy ratio")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("window")
                .set_type("struct")
                .set_attribute("description","window start and end time in UTC")
                .set_attribute('start','start of window')
                .set_attribute('end','end of window')) \
            .add_module(
            ModuleMetadata().set_name("HRV Features from ECG RR Interval")
                .set_attribute("url", "http://md2k.org/")
                .set_attribute('algorithm','ecg feature computation')
                .set_attribute('unit','ms')
                .set_author("Md Azim Ullah", "*****@*****.**"))
        return stream_metadata

    def get_rr_features(a):
        return np.array([
            np.var(a),
            iqr(a),
            np.mean(a),
            np.median(a),
            np.percentile(a, 80),
            np.percentile(a, 20), 60000 / np.median(a)
        ])

    def frequencyDomain(RRints,
                        tmStamps,
                        band_type=None,
                        lf_bw=0.11,
                        hf_bw=0.1,
                        vlf=(0.003, 0.04),
                        lf=(0.04, 0.15),
                        hf=(0.15, 0.4)):
        """

        Args:
            RRints:
            tmStamps:
            band_type:
            lf_bw:
            hf_bw:
            vlf:
            lf:
            hf:

        Returns:

        """
        NNs = RRints
        tss = tmStamps
        frequency_range = np.linspace(0.001, 1, 10000)
        NNs = np.array(NNs)
        NNs = NNs - np.mean(NNs)
        result = signal.lombscargle(tss, NNs, frequency_range)

        # Lomb-Scargle periodogram evaluated over the frequency grid
        fxx = frequency_range
        pxx = result

        if band_type == 'adapted':

            vlf_peak = fxx[np.where(pxx == np.max(pxx[np.logical_and(
                fxx >= vlf[0], fxx < vlf[1])]))[0][0]]
            lf_peak = fxx[np.where(pxx == np.max(pxx[np.logical_and(
                fxx >= lf[0], fxx < lf[1])]))[0][0]]
            hf_peak = fxx[np.where(pxx == np.max(pxx[np.logical_and(
                fxx >= hf[0], fxx < hf[1])]))[0][0]]

            peak_freqs = (vlf_peak, lf_peak, hf_peak)

            hf = (peak_freqs[2] - hf_bw / 2, peak_freqs[2] + hf_bw / 2)
            lf = (peak_freqs[1] - lf_bw / 2, peak_freqs[1] + lf_bw / 2)
            vlf = (0.003, lf[0])

            if lf[0] < 0:
                print(
                    '***Warning***: Adapted LF band lower bound spills into negative frequency range'
                )
                print('Lower threshold of LF band has been set to zero')
                print('Adjust LF and HF bandwidths accordingly')
                lf = (0, lf[1])
                vlf = (0, 0)
            elif hf[0] < 0:
                print(
                    '***Warning***: Adapted HF band lower bound spills into negative frequency range'
                )
                print('Lower threshold of HF band has been set to zero')
                print('Adjust LF and HF bandwidths accordingly')
                hf = (0, hf[1])
                lf = (0, 0)
                vlf = (0, 0)

        df = fxx[1] - fxx[0]
        vlf_power = np.trapz(pxx[np.logical_and(fxx >= vlf[0], fxx < vlf[1])],
                             dx=df)
        lf_power = np.trapz(pxx[np.logical_and(fxx >= lf[0], fxx < lf[1])],
                            dx=df)
        hf_power = np.trapz(pxx[np.logical_and(fxx >= hf[0], fxx < hf[1])],
                            dx=df)
        totalPower = vlf_power + lf_power + hf_power

        #Normalize and take log
        vlf_NU_log = np.log((vlf_power / (totalPower - vlf_power)) + 1)
        lf_NU_log = np.log((lf_power / (totalPower - vlf_power)) + 1)
        hf_NU_log = np.log((hf_power / (totalPower - vlf_power)) + 1)
        lfhf_ratio_log = np.log((lf_power / hf_power) + 1)

        freqDomainFeats = {
            'VLF_Power': vlf_NU_log,
            'LF_Power': lf_NU_log,
            'HF_Power': hf_NU_log,
            'LF/HF': lfhf_ratio_log
        }

        return freqDomainFeats

    schema = StructType([
        StructField("timestamp", TimestampType()),
        StructField("start", TimestampType()),
        StructField("end", TimestampType()),
        StructField("localtime", TimestampType()),
        StructField("version", IntegerType()),
        StructField("user", StringType()),
        StructField("features", ArrayType(DoubleType()))
    ])
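    # GROUPED_MAP contract: ecg_r_peak below receives each group (here the key appears to carry
    # user, version and the window struct) as a pandas DataFrame and must return a DataFrame
    # whose columns match the schema above.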

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    @CC_MProvAgg('org.md2k.autosense.ecg.rr', 'get_hrv_features', stream_name,
                 ['user', 'timestamp'], ['user', 'timestamp'])
    def ecg_r_peak(key, data):
        """

        Args:
            key:
            data:

        Returns:

        """
        if data.shape[0] >= acceptable_percentage * window_length / 100:
            data = data.sort_values('time')
            data['time'] = 1000 * data['time']
            a = data['rr'].values
            features = [
                np.double(
                    np.array(
                        list(get_rr_features(a)) + list(
                            frequencyDomain(
                                np.array(a) / 1000,
                                np.cumsum(a) / 1000).values())))
            ]
            data = data[:1]
            data['features'] = features
            data['start'] = [key[2]['start']]
            data['end'] = [key[2]['end']]
            data = data[[
                'timestamp', 'localtime', 'version', 'user', 'start', 'end',
                'features'
            ]]
            return data
        else:
            return pd.DataFrame([],
                                columns=[
                                    'timestamp', 'localtime', 'version',
                                    'user', 'features', 'start', 'end'
                                ])

    rr_data = rr_data.withColumn('time', F.col('timestamp').cast('double'))
    ecg_features = rr_data.compute(ecg_r_peak,
                                   windowDuration=window_length,
                                   startTime='0 seconds')
    df = ecg_features.select('timestamp',
                             F.struct('start', 'end').alias('window'),
                             'localtime', 'features', 'user', 'version')
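    # Feature vector layout produced by get_rr_features + frequencyDomain:
    # [0] var, [1] iqr, [2] mean, [3] median, [4] 80th pct, [5] 20th pct,
    # [6] heart rate (bpm), [7] vlf, [8] lf, [9] hf, [10] lf/hf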
    df = df.withColumn('var', F.col('features').getItem(0))
    df = df.withColumn('iqr', F.col('features').getItem(1))
    df = df.withColumn('vlf', F.col('features').getItem(7))
    df = df.withColumn('lf', F.col('features').getItem(8))
    df = df.withColumn('hf', F.col('features').getItem(9))
    df = df.withColumn('lfhf', F.col('features').getItem(10))
    df = df.withColumn('mean', F.col('features').getItem(2))
    df = df.withColumn('median', F.col('features').getItem(3))
    df = df.withColumn('80th', F.col('features').getItem(4))
    df = df.withColumn('20th', F.col('features').getItem(5))
    ecg_features_final = df.withColumn('heartrate',
                                       F.col('features').getItem(6))
    ecg_features_final = ecg_features_final.drop('features')

    feature_names = [
        'var', 'iqr', 'mean', 'median', '80th', '20th', 'heartrate', 'vlf',
        'lf', 'hf', 'lfhf'
    ]
    stress_features = ecg_features_final.withColumn(
        'features', F.array([F.col(i) for i in feature_names]))
    stress_features.metadata = get_metadata()

    return stress_features
Ejemplo n.º 24
0
from pyspark.sql.types import (
    DateType,
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)

raw = StructType([StructField("value", StringType(), False)])

bronze = StructType([
    StructField("datasource", StringType(), False),
    StructField("ingesttime", TimestampType(), False),
    StructField("value", StringType(), True),
    StructField("p_ingestdate", DateType(), False),
])

silver = StructType([
    StructField("device_id", IntegerType(), True),
    StructField("device_type", StringType(), True),
    StructField("heartrate", DoubleType(), True),
    StructField("eventtime", TimestampType(), True),
    StructField("name", StringType(), True),
    StructField("p_eventdate", DateType(), True),
])
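
# A hedged sketch (assumed names, not part of the original snippet) of how the bronze 'value'
# column might be parsed with the silver schema, assuming the raw payload is JSON and that
# p_eventdate is derived from eventtime rather than carried in the payload.
from pyspark.sql.functions import col, from_json, to_date

def bronze_to_silver(bronze_df):
    parsed = bronze_df.select(from_json(col("value"), silver).alias("v")).select("v.*")
    return parsed.withColumn("p_eventdate", to_date(col("eventtime")))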
Ejemplo n.º 25
0
    def test_verify_type_not_nullable(self):
        import array
        import datetime
        import decimal

        schema = StructType([
            StructField('s', StringType(), nullable=False),
            StructField('i', IntegerType(), nullable=True)
        ])

        class MyObj:
            def __init__(self, **kwargs):
                for k, v in kwargs.items():
                    setattr(self, k, v)

        # obj, data_type
        success_spec = [
            # String
            ("", StringType()),
            (u"", StringType()),
            (1, StringType()),
            (1.0, StringType()),
            ([], StringType()),
            ({}, StringType()),

            # UDT
            (ExamplePoint(1.0, 2.0), ExamplePointUDT()),

            # Boolean
            (True, BooleanType()),

            # Byte
            (-(2**7), ByteType()),
            (2**7 - 1, ByteType()),

            # Short
            (-(2**15), ShortType()),
            (2**15 - 1, ShortType()),

            # Integer
            (-(2**31), IntegerType()),
            (2**31 - 1, IntegerType()),

            # Long
            (-(2**63), LongType()),
            (2**63 - 1, LongType()),

            # Float & Double
            (1.0, FloatType()),
            (1.0, DoubleType()),

            # Decimal
            (decimal.Decimal("1.0"), DecimalType()),

            # Binary
            (bytearray([1, 2]), BinaryType()),

            # Date/Timestamp
            (datetime.date(2000, 1, 2), DateType()),
            (datetime.datetime(2000, 1, 2, 3, 4), DateType()),
            (datetime.datetime(2000, 1, 2, 3, 4), TimestampType()),

            # Array
            ([], ArrayType(IntegerType())),
            (["1", None], ArrayType(StringType(), containsNull=True)),
            ([1, 2], ArrayType(IntegerType())),
            ((1, 2), ArrayType(IntegerType())),
            (array.array('h', [1, 2]), ArrayType(IntegerType())),

            # Map
            ({}, MapType(StringType(), IntegerType())),
            ({
                "a": 1
            }, MapType(StringType(), IntegerType())),
            ({
                "a": None
            }, MapType(StringType(), IntegerType(), valueContainsNull=True)),

            # Struct
            ({
                "s": "a",
                "i": 1
            }, schema),
            ({
                "s": "a",
                "i": None
            }, schema),
            ({
                "s": "a"
            }, schema),
            ({
                "s": "a",
                "f": 1.0
            }, schema),
            (Row(s="a", i=1), schema),
            (Row(s="a", i=None), schema),
            (["a", 1], schema),
            (["a", None], schema),
            (("a", 1), schema),
            (MyObj(s="a", i=1), schema),
            (MyObj(s="a", i=None), schema),
            (MyObj(s="a"), schema),
        ]

        # obj, data_type, exception class
        failure_spec = [
            # String (match anything but None)
            (None, StringType(), ValueError),

            # UDT
            (ExamplePoint(1.0, 2.0), PythonOnlyUDT(), ValueError),

            # Boolean
            (1, BooleanType(), TypeError),
            ("True", BooleanType(), TypeError),
            ([1], BooleanType(), TypeError),

            # Byte
            (-(2**7) - 1, ByteType(), ValueError),
            (2**7, ByteType(), ValueError),
            ("1", ByteType(), TypeError),
            (1.0, ByteType(), TypeError),

            # Short
            (-(2**15) - 1, ShortType(), ValueError),
            (2**15, ShortType(), ValueError),

            # Integer
            (-(2**31) - 1, IntegerType(), ValueError),
            (2**31, IntegerType(), ValueError),

            # Float & Double
            (1, FloatType(), TypeError),
            (1, DoubleType(), TypeError),

            # Decimal
            (1.0, DecimalType(), TypeError),
            (1, DecimalType(), TypeError),
            ("1.0", DecimalType(), TypeError),

            # Binary
            (1, BinaryType(), TypeError),

            # Date/Timestamp
            ("2000-01-02", DateType(), TypeError),
            (946811040, TimestampType(), TypeError),

            # Array
            (["1", None], ArrayType(StringType(),
                                    containsNull=False), ValueError),
            ([1, "2"], ArrayType(IntegerType()), TypeError),

            # Map
            ({
                "a": 1
            }, MapType(IntegerType(), IntegerType()), TypeError),
            ({
                "a": "1"
            }, MapType(StringType(), IntegerType()), TypeError),
            ({
                "a": None
            }, MapType(StringType(), IntegerType(),
                       valueContainsNull=False), ValueError),

            # Struct
            ({
                "s": "a",
                "i": "1"
            }, schema, TypeError),
            (Row(s="a"), schema, ValueError),  # Row can't have missing field
            (Row(s="a", i="1"), schema, TypeError),
            (["a"], schema, ValueError),
            (["a", "1"], schema, TypeError),
            (MyObj(s="a", i="1"), schema, TypeError),
            (MyObj(s=None, i="1"), schema, ValueError),
        ]

        # Check success cases
        for obj, data_type in success_spec:
            try:
                _make_type_verifier(data_type, nullable=False)(obj)
            except Exception:
                self.fail("verify_type(%s, %s, nullable=False)" %
                          (obj, data_type))

        # Check failure cases
        for obj, data_type, exp in failure_spec:
            msg = "verify_type(%s, %s, nullable=False) == %s" % (
                obj, data_type, exp)
            with self.assertRaises(exp, msg=msg):
                _make_type_verifier(data_type, nullable=False)(obj)
Ejemplo n.º 26
0
def process_log_data(spark, input_data, output_data):
    """
    Processes all log data JSON files in the given input folder and stores them in parquet format in the output folder.
    :param spark: spark session
    :param input_data: input data path
    :param output_data: output data path
    """
    # get filepath to log data file
    log_data = os.path.join(input_data, 'log_data/*/*/*.json')

    # read log data file
    log_df = spark.read.json(log_data)

    # filter by actions for song plays
    log_df = log_df.filter(log_df.page == 'NextSong')

    # extract columns for users table
    users_fields = [
        "userId as user_id", "firstName as first_name",
        "lastName as last_name", "gender", "level"
    ]
    users_table = log_df.selectExpr(users_fields).dropDuplicates()

    # write users table to parquet files
    users_table.write.mode("overwrite").parquet(output_data + 'users')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000.0), TimestampType())
    log_df = log_df.withColumn("timestamp", get_timestamp(log_df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x / 1000.0), TimestampType())
    log_df = log_df.withColumn("start_time", get_datetime(log_df.ts))

    # extract columns to create time table
    log_df = log_df.withColumn("hour", hour("start_time")) \
        .withColumn("day", dayofmonth("start_time")) \
        .withColumn("week", weekofyear("start_time")) \
        .withColumn("month", month("start_time")) \
        .withColumn("year", year("start_time")) \
        .withColumn("weekday", dayofweek("start_time"))

    time_table = log_df.select("start_time", "hour", "day", "week", "month",
                               "year", "weekday")

    # write time table to parquet files partitioned by year and month
    time_table.write.mode("overwrite").partitionBy(
        "year", "month").parquet(output_data + "time")

    # read in song data to use for songplays table
    songs_df = spark.read.parquet(os.path.join(output_data, "songs/*/*/*"))
    songs_logs = log_df.join(songs_df, (log_df.song == songs_df.title))

    # extract columns from joined song and log datasets to create songplays table
    artists_df = spark.read.parquet(os.path.join(output_data, "artists"))
    artists_songs_logs = songs_logs.join(
        artists_df, (songs_logs.artist == artists_df.name))

    songplays = artists_songs_logs.join(
        time_table,
        artists_songs_logs.start_time == time_table.start_time,
        'left').drop(artists_songs_logs.year)

    # write songplays table to parquet files partitioned by year and month
    songplays_table = songplays.select(
        col('start_time'),
        col('userId').alias('user_id'),
        col('level'),
        col('song_id'),
        col('artist_id'),
        col('sessionId').alias('session_id'),
        col('location'),
        col('userAgent').alias('user_agent'),
        col('year'),
        col('month'),
    ).repartition("year", "month")

    songplays_table.write.mode("overwrite").partitionBy(
        "year", "month").parquet(output_data + 'songplays')
Ejemplo n.º 27
0
def process_log_data(spark, input_data, output_data):
    """This function will get the data from Udacity s3 bucket available for this project. Extract data from the log_data path, select the columns that the project requires and create the output tables in parquet files for the artist and song table."""

    # get filepath to log data file
    log_data = input_data + "log-data"

    # read log data file
    df = spark.read.json(log_data)
    
    # filter by actions for song plays
    df = df.filter(col("page")=='NextSong').filter(df.userId.isNotNull())

    # extract columns for users table    
    users_table = df.select(col("userId").alias("user_id"), col("firstName").alias("first_name"), col("lastName").alias("last_name"), "gender", "level").dropDuplicates()
    
    # write users table to parquet files
    print("""##### [STARTING] Writing table to the parquet files: 
                   USERS #####
                   """)
    users_table.write.mode("overwrite").parquet(output_data+"users")
    print("""##### [FINISHED] Table USERS already loaded #####
    """)  

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: str(int(int(x) / 1000)))
    df = df.withColumn("timestamp",get_timestamp(col("ts"))) 
    
    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000.0)))
    df = df.withColumn("datetime", get_datetime(col("ts"))) 
    
    # extract columns to create time table
    time_table = df.select(
         'timestamp',
         hour('datetime').alias('hour'),
         dayofmonth('datetime').alias('day'),
         weekofyear('datetime').alias('week'),
         month('datetime').alias('month'),
         year('datetime').alias('year'),
         date_format('datetime', 'E').alias('weekday')
     )
    
    # write time table to parquet files partitioned by year and month
    print("""##### [STARTING] Writing table to the parquet files: 
                   TIME #####
                   """)
    time_table.write.mode("overwrite").partitionBy("year","month").parquet(output_data+"time")
    print("""##### [FINISHED] Table TIME already loaded #####
    """)  

    # read in song data to use for songplays table
    song_data = input_data + "song_data/*/*/*/*.json"
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    # timestamp format string used to round-trip ts into start_time
    tsFormatVar = "yyyy/MM/dd HH:mm:ss z"

    '''The songplays table is built by joining the song and log dataframes and then selecting the required columns, applying transformations where needed.'''
    songplays_table = song_df.join(df, (song_df.artist_name == df.artist) & (song_df.title == df.song)) \
        .withColumn("songplay_id", monotonically_increasing_id()) \
        .withColumn('start_time', to_timestamp(date_format((col("ts") / 1000).cast(dataType=TimestampType()), tsFormatVar), tsFormatVar)) \
        .select("songplay_id", "start_time", col("userId").alias("user_id"), "level",
                "song_id", "artist_id", col("sessionId").alias("session_id"),
                col("artist_location").alias("location"), "userAgent",
                month(col("start_time")).alias("month"), year(col("start_time")).alias("year"))


    # write songplays table to parquet files partitioned by year and month
    print("""##### [STARTING] Writing table to the parquet files: 
                   SONGPLAYS #####
                   """)
    songplays_table.write.mode("overwrite").partitionBy("year","month").parquet(output_data+"songplays")
    print("""##### [FINISHED] Table SONGPLAYS already loaded #####
    """)  
Ejemplo n.º 28
0
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    print("Reading logs Data Files")
    log_data = input_data + 'log_data/*/*/*.json'
    #
    # read log data file
    schema = StructType([
        StructField('artist', StringType()),
        StructField('auth', StringType()),
        StructField('firstName', StringType()),
        StructField('gender', StringType()),
        StructField('itemInSession', IntegerType()),
        StructField('lastName', StringType()),
        StructField('length', DoubleType()),
        StructField('level', StringType()),
        StructField('location', StringType()),
        StructField('method', StringType()),
        StructField('page', StringType()),
        StructField('registration', StringType()),
        StructField('sessionId', IntegerType()),
        StructField('song', StringType()),
        StructField('status', IntegerType()),
        StructField('ts', IntegerType()),
        StructField('userAgent', StringType()),
        StructField('userId', IntegerType())
    ])
    # Applying the schema above to the log_data files did not work (most likely because 'ts'
    # holds epoch milliseconds, which overflow IntegerType, and 'userId' arrives as a string
    # in the JSON), so the files are read without an explicit schema.
    #df = spark.read.schema(schema).json(log_data)
    df = spark.read.json(log_data)

    # extract columns for users table, dropping duplicates and blank user ids
    users_table = df.select('userId', 'firstName', 'lastName', 'gender',
                            'level').dropDuplicates()
    users_table = users_table.filter(users_table.userId != " ")
    users_table = users_table.withColumnRenamed('userId', 'user_id') \
                             .withColumnRenamed('firstName', 'first_name') \
                             .withColumnRenamed('lastName', 'last_name')

    # write users table to parquet files
    users_table.write.parquet(output_data + "/users.parquet",
                              mode='overwrite',
                              compression='snappy')

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x / 1000.0), TimestampType())
    df = df.withColumn("datetime", get_datetime(col("ts")))

    get_hour = udf(lambda x: datetime.fromtimestamp(x / 1000.0).hour)
    df = df.withColumn("hour", get_hour(df.ts))

    # create day column from datetime
    get_day = udf(lambda x: datetime.fromtimestamp(x / 1000.0).day)
    df = df.withColumn("day", get_day(df.ts))

    # create week column from datetime
    get_week = udf(
        lambda x: datetime.fromtimestamp(x / 1000.0).isocalendar()[1])
    df = df.withColumn("week", get_week(df.ts))

    # create month column from datetime
    get_month = udf(lambda x: datetime.fromtimestamp(x / 1000.0).month)
    df = df.withColumn("month", get_month(df.ts))

    # create year column from datetime
    get_year = udf(lambda x: datetime.fromtimestamp(x / 1000.0).year)
    df = df.withColumn("year", get_year(df.ts))

    # create weekday column from datetime
    get_weekday = udf(lambda x: datetime.fromtimestamp(x / 1000.0).weekday())
    df = df.withColumn("weekday", get_weekday(df.ts))

    # extract columns to create time table
    time_table = df.select(
        ["ts", "hour", "day", "week", "month", "year", "weekday"])

    print('--- Saving time_table')
    # write time table to parquet files partitioned by year and month
    time_table.write.mode('append').partitionBy(
        'year', 'month').parquet(output_data + "time_data")

    # read in song data to use for songplays table
    song_data = os.path.join(input_data, 'song_data/A/A/*/*.json')
    song_df = spark.read.json(song_data)

    print('--Preparing Songs Play Table--')

    #this UDF is used for conevrt ts into time stamp filed.
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000),
                        TimestampType())

    getsongplays_table = df.join(song_df, (df.artist == song_df.artist_name) & (df.song == song_df.title),'inner') \
        .withColumn('start_time', get_timestamp(df.ts))\
        .withColumn("songplay_id", monotonically_increasing_id())

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = getsongplays_table.selectExpr(
        ['songplay_id', 'start_time', 'userId as user_id', 'level', 'song_id', 'artist_id', 'sessionId as session_id',
         'location', 'userAgent as user_agent']) \
        .withColumn('year', year('start_time')) \
        .withColumn('month', month('start_time'))

    #write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(
        output_data + "/songplays.parquet",
        mode='overwrite',
        compression='snappy')
Ejemplo n.º 29
0
        "body_ic",
        "body_iv",
        "body_cu",
        ]

# End of column definition

# Start of schema definition
session_schema = StructType([
        StructField("fullVisitorId", StringType(), True),
        StructField("visitId", StringType(), True),
        StructField("userId", StringType(), True),
        StructField("visitNumber", IntegerType(), True), 
        StructField("visitStartTime", LongType(), True), 
        StructField("date", IntegerType(), True),
        StructField("timestamp", TimestampType(), True),
        StructField("trafficSource_campaign", StringType(), True),
        StructField("trafficSource_source", StringType(), True), 
        StructField("trafficSource_medium", StringType(), True),
        StructField("trafficSource_keyword", StringType(), True),
        StructField("trafficSource_ad_content", StringType(), True),
        StructField("totals_transactionRevenue", StringType(), True),
        StructField("landingPage", StringType(), True),
        StructField("hits_type", StringType(), True),
        StructField("touchpoints", ArrayType(StringType()), True),
        StructField("touchpoints_wo_direct", ArrayType(StringType()), True),
        StructField("first_touchpoint", StringType(), True),
        StructField("last_touchpoint", StringType(), True)
        ])

ga_fields = { 
Ejemplo n.º 30
0
#-*-coding:utf-8-*-
from pyspark.sql.types import IntegerType, TimestampType
from pyspark.sql.functions import *

from base import spark
from utils import uuidsha


columns = [
    col('docu_dk').alias('alrt_docu_dk'), 
    col('docu_nr_mp').alias('alrt_docu_nr_mp'), 
    col('dt_fim_prazo').cast(TimestampType()).alias('alrt_date_referencia'),
    col('docu_orgi_orga_dk_responsavel').alias('alrt_orgi_orga_dk'),
    col('elapsed').alias('alrt_dias_referencia'),
    col('nm_delegacia').alias('alrt_info_adicional'),
    col('alrt_key')
]

key_columns = [
    col('docu_dk'),
    col('dt_fim_prazo')
]

def alerta_bdpa(options):
    documento = spark.sql("from documento").filter('DOCU_TPST_DK = 3').filter('DOCU_FSDC_DK = 1')
    orga_externo = spark.table('%s.mprj_orgao_ext' % options['schema_exadata']).\
        withColumnRenamed('ORGE_NM_ORGAO', 'nm_delegacia')
    doc_origem = documento.join(
        orga_externo,
        documento.DOCU_ORGE_ORGA_DK_DELEG_ORIGEM == orga_externo.ORGE_ORGA_DK,
        'left'