def process_log_data(spark, input_data, output_data):
    '''
    Purpose: Process log data, create users, time, and songplays tables

    Reads the JSON event logs found under ``input_data``, keeps only the
    "NextSong" events and writes the users, time and songplays tables as
    parquet below ``output_data``.

    :param spark: Spark session
    :param input_data: Input data file path
    :param output_data: Output data file path
    :return: None
    '''
    # Local import: dayofmonth/weekofyear are the portable
    # pyspark.sql.functions names for the day-of-month and week-of-year
    # extractors used below.
    from pyspark.sql.functions import dayofmonth, weekofyear

    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # create schema
    schema = StructType([
        StructField('artist', StringType(), True),
        StructField('auth', StringType(), True),
        StructField('firstName', StringType(), True),
        StructField('gender', StringType(), True),
        StructField('itemInSession', IntegerType(), True),
        StructField('lastName', StringType(), True),
        StructField('length', DoubleType(), True),
        StructField('level', StringType(), True),
        StructField('location', StringType(), True),
        StructField('method', StringType(), True),
        StructField('page', StringType(), True),
        StructField('registration', StringType(), True),
        StructField('sessionId', IntegerType(), True),
        StructField('song', StringType(), True),
        StructField('status', IntegerType(), True),
        StructField('ts', TimestampType(), True),
        StructField('userAgent', StringType(), True),
        StructField('userId', IntegerType(), False),
    ])

    # read log data file
    df = spark.read.json(log_data, schema=schema)

    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # extract columns for users table
    # selectExpr (not select) is what parses SQL "x as y" alias expressions.
    users_table = df.selectExpr("userId as user_id",
                                "firstName as first_name",
                                "lastName as last_name",
                                "gender",
                                "level") \
        .dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + "users/")

    # create timestamp column from original timestamp column
    # NOTE(review): date_conversion is defined elsewhere in this module;
    # presumably it maps the raw ts value to a datetime - confirm.
    get_datetime = udf(date_conversion, TimestampType())
    df = df.withColumn("start_time", get_datetime('ts'))

    # extract columns to create time table
    time_table = df.select("start_time") \
        .dropDuplicates() \
        .withColumn("hour", hour(col("start_time"))) \
        .withColumn("day", dayofmonth(col("start_time"))) \
        .withColumn("week", weekofyear(col('start_time'))) \
        .withColumn("month", month(col('start_time'))) \
        .withColumn("year", year(col('start_time'))) \
        .withColumn("weekday", date_format(col('start_time'), 'E'))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(output_data + "time/")

    # read in song data to use for songplays table
    songs_df = spark.read.parquet(output_data + 'song_data/*/*/*')
    artists_df = spark.read.parquet(output_data + 'artists/*')

    # extract columns from joined song and log datasets to create songplays table
    songs_logs_joined_df = df.join(songs_df, df.song == songs_df.title)
    artists_songs_logs_joined_df = songs_logs_joined_df.join(
        artists_df, songs_logs_joined_df.artist == artists_df.name)

    # Join the time table by column name on start_time (the value it was
    # derived from) so the result keeps a single start_time column; the
    # song-level "year" is dropped so "year" refers to the play's year.
    # NOTE(review): assumes the songs data exposes a "year" column and that
    # "artist_id"/"location" are unambiguous after the joins - verify
    # against the schemas written by the song-data step.
    songplays_table = artists_songs_logs_joined_df \
        .join(time_table, on="start_time", how="left") \
        .drop(artists_songs_logs_joined_df.year)

    songplays_table = songplays_table.selectExpr(
        "start_time", "userId as user_id", "level", "song_id", "artist_id",
        "sessionId as session_id", "location", "userAgent as user_agent",
        "year", "month")

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month") \
        .parquet(output_data + "songplays/")
def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
    """
    Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
    to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
    data types will be used to coerce the data in Pandas to Arrow conversion.

    :param pdf: the pandas.DataFrame to convert
    :param schema: a list/tuple of column names (types are then taken from the Arrow
        schema of ``pdf``) or a StructType; its ``json()`` form is handed to the JVM
    :param timezone: forwarded to ArrowStreamPandasSerializer
    :return: the resulting Spark DataFrame, with ``_schema`` set to ``schema``
    :raises ValueError: if ``schema`` is a single DataType that is not a StructType
    """
    from pyspark.sql import SparkSession
    from pyspark.sql.dataframe import DataFrame

    assert isinstance(self, SparkSession)

    from pyspark.sql.pandas.serializers import ArrowStreamPandasSerializer
    from pyspark.sql.types import TimestampType
    from pyspark.sql.pandas.types import from_arrow_type, to_arrow_type
    from pyspark.sql.pandas.utils import require_minimum_pandas_version, \
        require_minimum_pyarrow_version

    require_minimum_pandas_version()
    require_minimum_pyarrow_version()

    from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
    import pyarrow as pa

    # Create the Spark schema from list of names passed in with Arrow types
    if isinstance(schema, (list, tuple)):
        arrow_schema = pa.Schema.from_pandas(pdf, preserve_index=False)
        struct = StructType()
        for name, field in zip(schema, arrow_schema):
            struct.add(name, from_arrow_type(field.type), nullable=field.nullable)
        schema = struct

    # Determine arrow types to coerce data when creating batches
    if isinstance(schema, StructType):
        arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
    elif isinstance(schema, DataType):
        raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
    else:
        # Any timestamps must be coerced to be compatible with Spark
        arrow_types = [to_arrow_type(TimestampType())
                       if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                       for t in pdf.dtypes]

    # Slice the DataFrame to be batched
    # NOTE(review): an empty pdf gives step == 0, which range() rejects;
    # presumably the caller only takes this path for non-empty frames - confirm.
    step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
    pdf_slices = (pdf.iloc[start:start + step] for start in range(0, len(pdf), step))

    # Create list of Arrow (columns, type) for serializer dump_stream.
    # DataFrame.items() replaces iteritems(), which pandas 2.0 removed; it
    # yields the same (column name, Series) pairs.
    arrow_data = [[(c, t) for (_, c), t in zip(pdf_slice.items(), arrow_types)]
                  for pdf_slice in pdf_slices]

    jsqlContext = self._wrapped._jsqlContext

    safecheck = self._wrapped._conf.arrowSafeTypeConversion()
    col_by_name = True  # col by name only applies to StructType columns, can't happen here
    ser = ArrowStreamPandasSerializer(timezone, safecheck, col_by_name)

    def reader_func(temp_filename):
        return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename)

    def create_RDD_server():
        return self._jvm.ArrowRDDServer(jsqlContext)

    # Create Spark DataFrame from Arrow stream file, using one batch per partition
    jrdd = self._sc._serialize_to_jvm(arrow_data, ser, reader_func, create_RDD_server)
    jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext)
    df = DataFrame(jdf, self._wrapped)
    df._schema = schema
    return df
def setUpClass(cls):
    """Prepare the shared fixtures for the Arrow tests.

    Pins the process and Spark session time zone to America/Los_Angeles,
    checks that the legacy ``spark.sql.execution.arrow.*`` keys are mirrored
    by their ``...arrow.pyspark.*`` counterparts, enables Arrow with fallback
    disabled, and builds the schemas and rows used by the tests, each in a
    variant with and without a trailing null-typed column.
    """
    from datetime import date, datetime
    from decimal import Decimal

    super(ArrowTests, cls).setUpClass()
    cls.warnings_lock = threading.Lock()

    # Synchronize default timezone between Python and Java
    cls.tz_prev = os.environ.get("TZ", None)  # save current tz if set
    tz = "America/Los_Angeles"
    os.environ["TZ"] = tz
    time.tzset()

    cls.spark.conf.set("spark.sql.session.timeZone", tz)

    # Test fallback
    cls.spark.conf.set("spark.sql.execution.arrow.enabled", "false")
    assert cls.spark.conf.get(
        "spark.sql.execution.arrow.pyspark.enabled") == "false"
    cls.spark.conf.set("spark.sql.execution.arrow.enabled", "true")
    assert cls.spark.conf.get(
        "spark.sql.execution.arrow.pyspark.enabled") == "true"

    cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "true")
    assert cls.spark.conf.get(
        "spark.sql.execution.arrow.pyspark.fallback.enabled") == "true"
    cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "false")
    assert cls.spark.conf.get(
        "spark.sql.execution.arrow.pyspark.fallback.enabled") == "false"

    # Enable Arrow optimization in this tests.
    cls.spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
    # Disable fallback by default to easily detect the failures.
    cls.spark.conf.set(
        "spark.sql.execution.arrow.pyspark.fallback.enabled", "false")

    cls.schema_wo_null = StructType([
        StructField("1_str_t", StringType(), True),
        StructField("2_int_t", IntegerType(), True),
        StructField("3_long_t", LongType(), True),
        StructField("4_float_t", FloatType(), True),
        StructField("5_double_t", DoubleType(), True),
        StructField("6_decimal_t", DecimalType(38, 18), True),
        StructField("7_date_t", DateType(), True),
        StructField("8_timestamp_t", TimestampType(), True),
        StructField("9_binary_t", BinaryType(), True),
    ])
    # StructType.add() appends to the receiver and returns it, so calling it
    # on schema_wo_null directly would give that schema the null column too.
    # Build the extended schema from a copy of the field list instead.
    cls.schema = StructType(list(cls.schema_wo_null.fields)).add(
        "10_null_t", NullType(), True)

    cls.data_wo_null = [
        (
            "a", 1, 10, 0.2, 2.0, Decimal("2.0"),
            date(1969, 1, 1), datetime(1969, 1, 1, 1, 1, 1),
            bytearray(b"a"),
        ),
        (
            "b", 2, 20, 0.4, 4.0, Decimal("4.0"),
            date(2012, 2, 2), datetime(2012, 2, 2, 2, 2, 2),
            bytearray(b"bb"),
        ),
        (
            "c", 3, 30, 0.8, 6.0, Decimal("6.0"),
            date(2100, 3, 3), datetime(2100, 3, 3, 3, 3, 3),
            bytearray(b"ccc"),
        ),
        (
            "d", 4, 40, 1.0, 8.0, Decimal("8.0"),
            date(2262, 4, 12), datetime(2262, 3, 3, 3, 3, 3),
            bytearray(b"dddd"),
        ),
    ]
    # Same rows with a trailing None for the null-typed column of cls.schema.
    cls.data = [tuple(list(d) + [None]) for d in cls.data_wo_null]
def process_log_data(spark, input_data, output_data):
    """Process user log data creating the tables user, time and songplays

    Only "NextSong" events are kept. The user and time tables are derived
    from the log alone; songplays is the inner join of the log with the
    previously written song and artist tables on title, artist name and
    duration. All three tables are written as parquet with mode
    "overwrite".

    Args:
        spark (SparkSession): The spark session object
        input_data (str): The input files path
        output_data (str): The output files path
    """

    # read log data file
    LOGGER.info('read log data file')
    log_df = spark.read.json(input_data)

    # filter by actions for song plays
    LOGGER.info('filter by actions for song plays')
    log_df = log_df.where(F.col('page') == 'NextSong')

    # extract columns for users table
    # Every play repeats the user's attributes, so collapse identical rows.
    LOGGER.info('extract columns for users table')
    user_table = log_df.select(
        ['userId', 'firstName', 'lastName', 'gender', 'level']
    ).drop_duplicates()

    # write users table to parquet files
    LOGGER.info('write users table to parquet files')
    user_path = os.path.join(output_data, 'user')
    user_table.coalesce(1).write.mode('overwrite').parquet(user_path)

    # create datetime column from original timestamp column
    # ts is divided by 1000 before conversion (treated as epoch milliseconds).
    LOGGER.info('create datetime column from original timestamp column')
    get_timestamp = F.udf(lambda x: datetime.utcfromtimestamp(int(x) / 1000),
                          TimestampType())
    log_df = log_df.withColumn("start_time", get_timestamp("ts"))

    # extract columns to create time table
    LOGGER.info('extract columns to create time table')
    time_table = log_df.select(
        'start_time',
        F.hour('start_time').alias('hour'),
        F.dayofmonth('start_time').alias('day'),
        F.weekofyear('start_time').alias('weekofyear'),
        F.month('start_time').alias('month'),
        F.year('start_time').alias('year'),
        F.dayofweek('start_time').alias('weekday')
    ).drop_duplicates(['start_time'])

    # write time table to parquet partitioned by year and month
    LOGGER.info('write time table to parquet partitioned by year and month')
    time_table.coalesce(1).write.mode('overwrite')\
        .partitionBy('year', 'month')\
        .parquet(os.path.join(output_data, 'time'))

    # read in song data to use for songplays table
    LOGGER.info('read in song data to use for songplays table')
    song_df = spark.read.parquet(os.path.join(output_data, 'song'))
    artist_df = spark.read.parquet(os.path.join(output_data, 'artist'))

    # join artist and song data
    LOGGER.info('join artist and song data')
    song_df = artist_df.select(['artist_name', 'artist_id'])\
        .join(song_df, on='artist_id', how='inner')

    # extract columns from joined song and log datasets to create songplays
    LOGGER.info('extract columns from joined song and log datasets to create '
                'songplays')
    on_clause = \
        (song_df.title == log_df.song) \
        & (song_df.artist_name == log_df.artist) \
        & (song_df.duration == log_df.length)
    songplays_table = log_df.join(song_df, on_clause, how='inner')

    # select columns and create year and month columns
    # session_id comes from sessionId; itemInSession is only the position of
    # an event inside a session and must not be used as the session key.
    LOGGER.info('select columns and create year and month columns')
    songplays_table = songplays_table.select(
        'start_time',
        F.col('userId').alias('user_id'),
        'level',
        'song_id',
        'artist_id',
        F.col('sessionId').alias('session_id'),
        'location',
        F.col('userAgent').alias('user_agent'),
        F.month('start_time').alias('month'),
        F.year('start_time').alias('year'))

    # create songplay_id and drop duplicates by this column
    LOGGER.info('create songplay_id and drop duplicates by this column')
    key_columns = [
        'start_time', 'user_id', 'song_id', 'artist_id', 'session_id'
    ]
    songplays_table = songplays_table.withColumn(
        'songplay_id',
        F.sha2(F.concat_ws("||", *key_columns), 256)
    ).drop_duplicates(['songplay_id'])

    # write songplays table to parquet files partitioned by year and month
    LOGGER.info('write songplays table to parquet partitioned by year/month')
    songplays_table.coalesce(1).write.mode('overwrite')\
        .partitionBy('year', 'month')\
        .parquet(os.path.join(output_data, 'songplays'))
from datetime import datetime from pyspark.sql import Row from pyspark.sql.types import StructField, StructType, ArrayType, StringType, TimestampType # NOTE: please keep this schema definition alphabetized listen_schema = [ StructField('artist_mbids', ArrayType(StringType()), nullable=True), StructField('artist_msid', StringType(), nullable=False), StructField('artist_name', StringType(), nullable=False), StructField('listened_at', TimestampType(), nullable=False), StructField('recording_mbid', StringType(), nullable=True), StructField('recording_msid', StringType(), nullable=False), StructField('release_mbid', StringType(), nullable=True), StructField('release_msid', StringType(), nullable=True), StructField('release_name', StringType(), nullable=True), StructField('tags', ArrayType(StringType()), nullable=True), StructField('track_name', StringType(), nullable=False), StructField('user_name', StringType(), nullable=False), ] # The field names of the schema need to be sorted, otherwise we get weird # errors due to type mismatches when creating DataFrames using the schema # Although, we try to keep it sorted in the actual definition itself, we # also sort it programmatically just in case listen_schema = StructType(sorted(listen_schema, key=lambda field: field.name)) def convert_listen_to_row(listen): """ Convert a listen to a pyspark.sql.Row object. Args: listen (dict): a single dictionary representing a listen
.load() trackingMessageSchema = StructType() \ .add("countryCode", StringType()) \ .add("timestamp", IntegerType()) \ .add("newCases", IntegerType()) \ .add("newCuredCases", IntegerType()) trackingMessages = kafkaMessages.select( from_json( col("value").cast("string"), trackingMessageSchema ).alias("json") ).select( from_unixtime(column('json.timestamp')) .cast(TimestampType()) .alias("parsed_timestamp"), column("json.*") ).withColumn("date", to_date(col("parsed_timestamp"))) \ .withColumnRenamed('json.countryCode', 'countryCode') \ .withColumnRenamed('json.newCases', 'countryCode') \ .withColumnRenamed('json.newCuredCases', 'countryCode') \ .withWatermark("parsed_timestamp", windowDuration) cases = trackingMessages.groupBy( window( column("parsed_timestamp"), windowDuration, slidingDuration ),
def test_as_spark_type_koalas_dtype(self):
    """Check the NumPy/Python type -> (pandas dtype, Spark type) mapping.

    Every key of ``type_mapper`` is passed to ``as_spark_type`` and
    ``pandas_on_spark_type`` and compared with the expected value; the
    ``uint64`` and ``object`` dtypes are expected to raise ``TypeError``.

    The removed NumPy aliases (``np.int``, ``np.float``, ``np.str``,
    ``np.bool``, ``np.string_``, ``np.unicode_``) are not referenced: where
    they existed they were the very same objects as ``int``, ``float``,
    ``str``, ``bool``, ``np.bytes_`` and ``np.str_``, i.e. duplicate keys of
    entries that are still listed, and touching them raises
    ``AttributeError`` on current NumPy releases.
    """
    type_mapper = {
        # binary
        np.character: (np.character, BinaryType()),
        np.bytes_: (np.bytes_, BinaryType()),
        bytes: (np.bytes_, BinaryType()),
        # integer
        np.int8: (np.int8, ByteType()),
        np.byte: (np.int8, ByteType()),
        np.int16: (np.int16, ShortType()),
        np.int32: (np.int32, IntegerType()),
        np.int64: (np.int64, LongType()),
        int: (np.int64, LongType()),
        # floating
        np.float32: (np.float32, FloatType()),
        np.float64: (np.float64, DoubleType()),
        float: (np.float64, DoubleType()),
        # string
        np.str_: (np.str_, StringType()),
        str: (np.str_, StringType()),
        # bool
        bool: (bool, BooleanType()),
        # datetime
        np.datetime64: (np.datetime64, TimestampType()),
        datetime.datetime: (np.dtype("datetime64[ns]"), TimestampType()),
        # DateType
        datetime.date: (np.dtype("object"), DateType()),
        # DecimalType
        decimal.Decimal: (np.dtype("object"), DecimalType(38, 18)),
        # ArrayType
        np.ndarray: (np.dtype("object"), ArrayType(StringType())),
        List[bytes]: (np.dtype("object"), ArrayType(BinaryType())),
        List[np.character]: (np.dtype("object"), ArrayType(BinaryType())),
        List[np.bytes_]: (np.dtype("object"), ArrayType(BinaryType())),
        List[bool]: (np.dtype("object"), ArrayType(BooleanType())),
        List[datetime.date]: (np.dtype("object"), ArrayType(DateType())),
        List[np.int8]: (np.dtype("object"), ArrayType(ByteType())),
        List[np.byte]: (np.dtype("object"), ArrayType(ByteType())),
        List[decimal.Decimal]: (np.dtype("object"), ArrayType(DecimalType(38, 18))),
        List[float]: (np.dtype("object"), ArrayType(DoubleType())),
        List[np.float64]: (np.dtype("object"), ArrayType(DoubleType())),
        List[np.float32]: (np.dtype("object"), ArrayType(FloatType())),
        List[np.int32]: (np.dtype("object"), ArrayType(IntegerType())),
        List[int]: (np.dtype("object"), ArrayType(LongType())),
        List[np.int64]: (np.dtype("object"), ArrayType(LongType())),
        List[np.int16]: (np.dtype("object"), ArrayType(ShortType())),
        List[str]: (np.dtype("object"), ArrayType(StringType())),
        List[np.str_]: (np.dtype("object"), ArrayType(StringType())),
        List[datetime.datetime]: (np.dtype("object"), ArrayType(TimestampType())),
        List[np.datetime64]: (np.dtype("object"), ArrayType(TimestampType())),
        # CategoricalDtype
        CategoricalDtype(categories=["a", "b", "c"]): (
            CategoricalDtype(categories=["a", "b", "c"]),
            LongType(),
        ),
    }

    for numpy_or_python_type, (dtype, spark_type) in type_mapper.items():
        self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
        self.assertEqual(pandas_on_spark_type(numpy_or_python_type), (dtype, spark_type))

    with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."):
        as_spark_type(np.dtype("uint64"))

    with self.assertRaisesRegex(TypeError, "Type object was not understood."):
        as_spark_type(np.dtype("object"))

    with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."):
        pandas_on_spark_type(np.dtype("uint64"))

    with self.assertRaisesRegex(TypeError, "Type object was not understood."):
        pandas_on_spark_type(np.dtype("object"))
def process_log_data(spark, input_data, output_data):
    """
    Description:
        - Extract log data from JSON files stored in S3 bucket
        - Transforms log data into three separate DataFrames;
          users_table, time_table and songplays_table
        - Loads them back into s3 as parquet files stored in a
          separate s3-bucket for analytical purposes

    Arguments:
        - Parameter spark: the instantiated SparkSession
        - Parameter input_data: input path
        - Parameter output_data: output path

    Returns:
        - None
    """
    # get filepath to log data file
    log_data = input_data + "log_data/*.json"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(df["page"] == "NextSong")

    # extract columns for users table
    users_table = df.selectExpr(["userId as user_id",
                                 "firstName as first_name",
                                 "lastName as last_name",
                                 "gender",
                                 "level"]).dropDuplicates()

    # write users table to parquet files
    users_table.write.mode("overwrite").parquet(output_data + "users.parquet")

    # create timestamp column from original timestamp column
    # ts is divided by 1000 before conversion (treated as epoch milliseconds).
    get_timestamp = udf(
        lambda epoch_time: datetime.fromtimestamp(epoch_time / 1000),
        TimestampType())
    df = df.withColumn('timestamp', get_timestamp(col('ts')))

    # create datetime column from original timestamp column
    get_datetime = udf(
        lambda epoch_time: datetime.fromtimestamp(epoch_time / 1000),
        DateType())
    df = df.withColumn('datetime', get_datetime(col('ts')))

    # extract columns to create time table
    # start_time is carried along as the table's key; without it the derived
    # parts cannot be joined back to songplays.start_time.
    time_table = df.select([col("timestamp").alias("start_time"),
                            hour("timestamp").alias("hour"),
                            dayofmonth("timestamp").alias("day"),
                            weekofyear("timestamp").alias("week"),
                            month("timestamp").alias("month"),
                            year("timestamp").alias("year"),
                            date_format("timestamp", 'E').alias("weekday")]
                           ).dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.mode("overwrite").partitionBy(
        "year", "month").parquet(output_data + "time.parquet")

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + "songs.parquet")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = song_df.join(df, df.song == song_df.title)\
        .selectExpr(["timestamp as start_time",
                     "userid as user_id",
                     "level",
                     "song_id",
                     "artist_id",
                     "sessionid as session_id",
                     "location",
                     "useragent as user_agent"]) \
        .withColumn("year", year("start_time")) \
        .withColumn("month", month("start_time")).dropDuplicates()

    # write songplays table to parquet files partitioned by year and month
    # songplay_id is a running number assigned in start_time order.
    songplays_table.createOrReplaceTempView("songplays")
    spark.sql("""
        SELECT row_number() over (order by start_time asc) as songplay_id,
               start_time,
               user_id,
               level,
               song_id,
               artist_id,
               session_id,
               location,
               user_agent,
               year,
               month
        FROM songplays
    """).write.mode("overwrite").partitionBy(
        "year", "month").parquet(output_data + "songplays.parquet")
#df_Update.show() #===== get all the time stamps for each user ======================== #test=df_Update.groupBy(['UserID']) new = df_Update.groupBy(['UserID']).agg(collect_list("time_date")) #test.show() #==========sort time stamps for each user=========================== #func=udf(lambda x:sorted(x.tolist())) def sorter(l): res = sorted(l) return [item for item in res] sort_udf = udf(sorter, ArrayType(TimestampType())) sorted_df = new.withColumn('sorted', sort_udf(new['collect_list(time_date)'])) print(sorted_df) #sorted_df.show () #====== get the difference between each two successive time stamps. #step 1 get a column of previous value #0 if y[0] else (y[i]-y[i-1] for i in def diff(l): currsession = 1 output = list() for i in range(len(l)): if (i == 0): output.append(currsession)
# Pull the source table from SQL Server through JDBC.
data_sqlServer = spark.read.jdbc(url,
                                 "CMP.data_prueba_SqlS",
                                 properties=properties)

# Discard every row that is missing one of the mandatory columns.
data_clean = (
    data_sqlServer
    .where(col("id").isNotNull())
    .where(col("company_id").isNotNull())
    .where(col("amount").isNotNull())
    .where(col("status").isNotNull())
    .where(col("created_at").isNotNull())
)
data_clean.createOrReplaceTempView("data_clean")

# Trim and cast the cleaned rows into the target column layout.
df_clean = spark.sql(
    "Select substr(id,1,24) as id, "
    "substr(name,1,130) as name, "
    "substr(company_id,1,24) as company_id, "
    "cast(amount as decimal(16,2)) as amount, "
    "cast(status as varchar(30)) as status, "
    "cast(created_at as timestamp) as created_at, "
    "cast(paid_at as timestamp) as paid_at "
    "from data_clean"
)

# Final schema; it is applied to the RDD rows by position, so the "name" and
# "paid_at" values end up in "company_name" and "updated_at".
data_schema = StructType([
    StructField("id", StringType(), False),
    StructField("company_name", StringType(), True),
    StructField("company_id", StringType(), False),
    StructField("amount", DecimalType(18, 2), False),
    StructField("status", StringType(), False),
    StructField("created_at", TimestampType(), False),
    StructField("updated_at", TimestampType(), True),
])

data_rdd = df_clean.rdd
data_trasformed = spark.createDataFrame(data=data_rdd, schema=data_schema)

# Append the result to the Hive table.
data_trasformed.write.mode("append").saveAsTable("final_data_table")
def process_log_data(spark, input_data, output_data):
    """Reads data file containing the logs from user activity and loads into parquet files for
    time_table, users_table and songplays_table in S3.

    Only "NextSong" events are used. The users table keeps one row per
    user, taken from that user's most recent event.

    Keyword arguments:
    spark -- the spark session
    input_data -- the input file directory in S3
    output_data -- the output parquet file directory in S3
    """
    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*"

    # read log data file
    print("Reading log data file..")
    df = spark.read.format("json").load(log_data)

    # filter by actions for song plays
    df = df.where(df.page == "NextSong")

    # create timestamp column from original timestamp column
    # ts is divided by 1000 before conversion (treated as epoch milliseconds).
    from pyspark.sql.types import TimestampType
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000),
                        TimestampType())
    df = df.withColumn('start_time', get_timestamp('ts'))

    # create schema-on-read table for log data
    df.createOrReplaceTempView("log_data")

    # extract columns for users table
    # ROW_NUMBER (rather than RANK) guarantees exactly one row per user even
    # when several of that user's events share the latest start_time.
    users_table = spark.sql("""
        SELECT qry.userid, qry.firstname, qry.lastname, qry.gender, qry.level
        FROM (
            SELECT start_time, userid, firstname, lastname, gender, level,
                   ROW_NUMBER() OVER (PARTITION BY userid ORDER BY start_time DESC) AS row_num
            FROM log_data
        ) AS qry
        WHERE qry.row_num = 1
    """)

    # write users table to parquet files
    print("Creating users table parquet file..")
    users_table.write.parquet(output_data + "users_table")

    # extract columns to create time table
    # NOTE(review): "u" is a week-based pattern letter; Spark 3's datetime
    # formatter is documented to reject those - confirm against the Spark
    # version this job targets.
    time_table = df.select("start_time",
                           hour("start_time").alias('hour'),
                           dayofmonth("start_time").alias('day'),
                           weekofyear("start_time").alias('week'),
                           month("start_time").alias('month'),
                           year("start_time").alias('year'),
                           date_format("start_time", "u").alias('weekday')).distinct()

    # write time table to parquet files partitioned by year and month
    print("Creating time table parquet file..")
    time_table.write.partitionBy("year", "month").parquet(output_data + "time_table")

    # read in song, artist and time data to use for songplays table
    # NOTE(review): the song and artist tables are read from a fixed bucket
    # rather than from output_data - confirm this is intended.
    print("Reading song parquet file..")
    song_df = spark.read.parquet(
        "s3a://gfkw-dend-udacity/project4/songs_table")
    song_df.createOrReplaceTempView("songs_table")

    print("Reading artist parquet file..")
    artist_df = spark.read.parquet(
        "s3a://gfkw-dend-udacity/project4/artists_table")
    artist_df.createOrReplaceTempView("artists_table")

    print("Reading time parquet file..")
    time_table.createOrReplaceTempView("time_table")

    # extract columns from joined song and log datasets to create songplays table
    # The LEFT JOIN keeps plays whose song/artist has no match (NULL ids).
    songplays_table = spark.sql("""
        SELECT l.start_time, t.year, t.month, l.userid, l.level,
               q.song_id, q.artist_id, l.sessionid, l.location, l.useragent
        FROM log_data l
        JOIN time_table t ON (l.start_time = t.start_time)
        LEFT JOIN (
            SELECT s.song_id, s.title, a.artist_id, a.artist_name
            FROM songs_table s
            JOIN artists_table a ON (s.artist_id = a.artist_id)
        ) AS q ON (l.song = q.title AND l.artist = q.artist_name)
    """)

    # write songplays table to parquet files partitioned by year and month
    print("Creating songplays table parquet file..")
    songplays_table.write.partitionBy(
        "year", "month").parquet(output_data + "songplays_table")
def test_verify_type_not_nullable(self):
    """Exercise ``_make_type_verifier(..., nullable=False)`` over many types.

    Each accepted (value, data type) pair must verify without raising;
    each rejected (value, data type, exception) triple must raise exactly
    the listed exception class.
    """
    import array
    import datetime
    import decimal

    struct_schema = StructType([
        StructField('s', StringType(), nullable=False),
        StructField('i', IntegerType(), nullable=True),
    ])

    class MyObj:
        """Plain attribute bag built from keyword arguments."""

        def __init__(self, **kwargs):
            for attr_name in kwargs:
                setattr(self, attr_name, kwargs[attr_name])

    # (value, data type) pairs that must be accepted
    accepted = [
        # String
        ("", StringType()),
        (u"", StringType()),
        (1, StringType()),
        (1.0, StringType()),
        ([], StringType()),
        ({}, StringType()),

        # UDT
        (ExamplePoint(1.0, 2.0), ExamplePointUDT()),

        # Boolean
        (True, BooleanType()),

        # Byte
        (-(2**7), ByteType()),
        (2**7 - 1, ByteType()),

        # Short
        (-(2**15), ShortType()),
        (2**15 - 1, ShortType()),

        # Integer
        (-(2**31), IntegerType()),
        (2**31 - 1, IntegerType()),

        # Long
        (-(2**63), LongType()),
        (2**63 - 1, LongType()),

        # Float & Double
        (1.0, FloatType()),
        (1.0, DoubleType()),

        # Decimal
        (decimal.Decimal("1.0"), DecimalType()),

        # Binary
        (bytearray([1, 2]), BinaryType()),

        # Date/Timestamp
        (datetime.date(2000, 1, 2), DateType()),
        (datetime.datetime(2000, 1, 2, 3, 4), DateType()),
        (datetime.datetime(2000, 1, 2, 3, 4), TimestampType()),

        # Array
        ([], ArrayType(IntegerType())),
        (["1", None], ArrayType(StringType(), containsNull=True)),
        ([1, 2], ArrayType(IntegerType())),
        ((1, 2), ArrayType(IntegerType())),
        (array.array('h', [1, 2]), ArrayType(IntegerType())),

        # Map
        ({}, MapType(StringType(), IntegerType())),
        ({"a": 1}, MapType(StringType(), IntegerType())),
        ({"a": None},
         MapType(StringType(), IntegerType(), valueContainsNull=True)),

        # Struct
        ({"s": "a", "i": 1}, struct_schema),
        ({"s": "a", "i": None}, struct_schema),
        ({"s": "a"}, struct_schema),
        ({"s": "a", "f": 1.0}, struct_schema),
        (Row(s="a", i=1), struct_schema),
        (Row(s="a", i=None), struct_schema),
        (["a", 1], struct_schema),
        (["a", None], struct_schema),
        (("a", 1), struct_schema),
        (MyObj(s="a", i=1), struct_schema),
        (MyObj(s="a", i=None), struct_schema),
        (MyObj(s="a"), struct_schema),
    ]

    # (value, data type, expected exception class) triples that must be rejected
    rejected = [
        # String (match anything but None)
        (None, StringType(), ValueError),

        # UDT
        (ExamplePoint(1.0, 2.0), PythonOnlyUDT(), ValueError),

        # Boolean
        (1, BooleanType(), TypeError),
        ("True", BooleanType(), TypeError),
        ([1], BooleanType(), TypeError),

        # Byte
        (-(2**7) - 1, ByteType(), ValueError),
        (2**7, ByteType(), ValueError),
        ("1", ByteType(), TypeError),
        (1.0, ByteType(), TypeError),

        # Short
        (-(2**15) - 1, ShortType(), ValueError),
        (2**15, ShortType(), ValueError),

        # Integer
        (-(2**31) - 1, IntegerType(), ValueError),
        (2**31, IntegerType(), ValueError),

        # Float & Double
        (1, FloatType(), TypeError),
        (1, DoubleType(), TypeError),

        # Decimal
        (1.0, DecimalType(), TypeError),
        (1, DecimalType(), TypeError),
        ("1.0", DecimalType(), TypeError),

        # Binary
        (1, BinaryType(), TypeError),

        # Date/Timestamp
        ("2000-01-02", DateType(), TypeError),
        (946811040, TimestampType(), TypeError),

        # Array
        (["1", None], ArrayType(StringType(), containsNull=False), ValueError),
        ([1, "2"], ArrayType(IntegerType()), TypeError),

        # Map
        ({"a": 1}, MapType(IntegerType(), IntegerType()), TypeError),
        ({"a": "1"}, MapType(StringType(), IntegerType()), TypeError),
        ({"a": None},
         MapType(StringType(), IntegerType(), valueContainsNull=False),
         ValueError),

        # Struct
        ({"s": "a", "i": "1"}, struct_schema, TypeError),
        (Row(s="a"), struct_schema, ValueError),  # Row can't have missing field
        (Row(s="a", i="1"), struct_schema, TypeError),
        (["a"], struct_schema, ValueError),
        (["a", "1"], struct_schema, TypeError),
        (MyObj(s="a", i="1"), struct_schema, TypeError),
        (MyObj(s=None, i="1"), struct_schema, ValueError),
    ]

    # Accepted pairs: any exception is reported as a test failure.
    for value, spark_type in accepted:
        try:
            _make_type_verifier(spark_type, nullable=False)(value)
        except Exception:
            self.fail("verify_type(%s, %s, nullable=False)" % (value, spark_type))

    # Rejected triples: the listed exception class must be raised.
    for value, spark_type, expected_exc in rejected:
        failure_note = "verify_type(%s, %s, nullable=False) == %s" % (
            value, spark_type, expected_exc)
        with self.assertRaises(expected_exc, msg=failure_note):
            _make_type_verifier(spark_type, nullable=False)(value)
def test_timestamp_microsecond(self):
    """The internal form of ``datetime.max`` must keep its 999999 microseconds."""
    internal_value = TimestampType().toInternal(datetime.datetime.max)
    microsecond_part = internal_value % 1000000
    self.assertEqual(microsecond_part, 999999)
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, udf
from pyspark.sql.types import (DateType, IntegerType, FloatType, StringType,
                               StructField, StructType, TimestampType)

spark = SparkSession.builder.appName("Read Transactions").getOrCreate()

# Explicit schema for transactions.csv (the file's first line is a header).
csv_schema = StructType([
    StructField('customer_id', IntegerType()),
    StructField('amount', FloatType()),
    StructField('purchased_at', TimestampType()),
])

dataframe = spark.read.csv("transactions.csv",
                           schema=csv_schema,
                           header=True)
dataframe.show()

# Add a new column by formatting the original date
formatted_df = dataframe.withColumn(
    "date_string", date_format(col("purchased_at"), 'MM/dd/yyyy'))
formatted_df.show()


def _parse_us_date(text_date):
    """Parse an 'MM/dd/yyyy' string into a date; pass None through.

    date_string is null whenever purchased_at is null, and strptime()
    raises TypeError on None, which would abort the whole job.
    """
    if text_date is None:
        return None
    # .date() so the value matches the declared DateType exactly.
    return datetime.strptime(text_date, '%m/%d/%Y').date()


# Create a user defined function
string_to_date = udf(_parse_us_date, DateType())

typed_df = formatted_df.withColumn("date",
                                   string_to_date(formatted_df.date_string))
def process_log_data(spark, input_data, output_data):
    """Build the users, time and songplays tables from the JSON event logs.

    Reads ``input_data + "log-data/*/*/*.json"``, keeps the ``NextSong``
    events and writes three parquet data sets (overwrite mode) under
    ``output_data``: ``users_table/``, ``time_table/`` and ``songplays/``.
    The songs table is read back from ``output_data/songs/`` for the join.

    :param spark: active SparkSession
    :param input_data: prefix of the input location (must end with a separator)
    :param output_data: prefix of the output location (must end with a separator)
    :return: None
    """
    # get filepath to log data file
    log_data = input_data + "log-data/*/*/*.json"

    # read log data file
    print('Input log data json file read started')
    df = spark.read.json(log_data, mode='PERMISSIVE',
                         columnNameOfCorruptRecord='corrupt_record')
    print('Input log data json file read completed')

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    print('users_table data exteaction started \n')
    users_table = df.select("userId", "firstName", "lastName", "gender",
                            "level").drop_duplicates()
    print('users_table data exteaction completed \n')

    # write users table to parquet files
    print('users_table data write started \n')
    users_table.write.parquet(output_data + "users_table/", mode="overwrite")
    print('users_table write Completed')

    # create timestamp column from original timestamp column
    # (ts is divided by 1000 before conversion, i.e. treated as milliseconds)
    get_timestamp = udf(lambda x: datetime.fromtimestamp(int(x) / 1000),
                        TimestampType())
    df = df.withColumn("start_time", get_timestamp("ts"))

    # extract columns to create time table
    print('time_table data extraction started \n')
    time_table = df.select('start_time').drop_duplicates() \
        .withColumn('hour', hour(col('start_time'))) \
        .withColumn('day', dayofmonth(col('start_time'))) \
        .withColumn('week', weekofyear(col('start_time'))) \
        .withColumn('month', month(col('start_time'))) \
        .withColumn('year', year(col('start_time'))) \
        .withColumn('weekday', dayofweek(col('start_time')))
    print('time_table data extraction completed \n')

    # write time table to parquet files partitioned by year and month
    # (the original write omitted the partitioning this comment asks for)
    print('time_table data write started \n')
    time_table.write.partitionBy("year", "month") \
        .parquet(output_data + "time_table/", mode="overwrite")
    print('time_table data write Completed \n')

    # read in song data to use for songplays table; basePath keeps the
    # partition directories of the songs data set available as columns
    song_df = spark.read.format("parquet") \
        .option("basePath", os.path.join(output_data, "songs/")) \
        .load(os.path.join(output_data, "songs/*/*/"))

    # extract columns from joined song and log datasets to create songplays table
    print('songplays_table data extraction started \n')
    songplays_table = df.join(song_df, df.song == song_df.title, how='inner') \
        .select(monotonically_increasing_id().alias("songplay_id"),
                col("start_time"),
                col("userId").alias("user_id"),
                col("level"),
                col("song_id"),
                col("artist_id"),
                col("sessionId").alias("session_id"),
                col("location"),
                col("userAgent").alias("user_agent"),
                # partition keys, derived from the play time rather than
                # from any year column the songs data set may carry
                year(col("start_time")).alias("year"),
                month(col("start_time")).alias("month"))
    print('songplays_table data extraction completed \n')

    # write songplays table to parquet files partitioned by year and month
    print('songplays_table data write started \n')
    # .parquet() returns None, so its result is deliberately not assigned
    songplays_table.write.partitionBy("year", "month") \
        .parquet(output_data + "songplays/", mode="overwrite")
    print('songsplay_table data write completed \n')
def start_stream(args):
    """Stream stock ticks from Kafka to partitioned parquet and the console.

    ``args`` is unpacked as ``(_, brokers, topic)`` after ``validate_params``
    has accepted it. Two streaming queries are started: one appends the
    parsed records to ``/dataset/streaming.parquet`` partitioned by
    year/month/day/hour/minute, the other prints the running average price
    per symbol. The call blocks until the console query terminates.

    :param args: argument sequence; element 1 is the Kafka bootstrap server
        list and element 2 the topic to subscribe to
    """
    validate_params(args)
    _, brokers, topic = args

    spark = create_spark_session()

    # startingOffsets is an option of the Kafka *source*; the original set it
    # on the parquet writer, where it has no effect.
    raw_stream = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", brokers) \
        .option("subscribe", topic) \
        .option("startingOffsets", "earliest") \
        .load()

    raw_stream.printSchema()

    # Explicitly set schema
    schema = StructType([StructField("symbol", StringType(), False),
                         StructField("timestamp", TimestampType(), False),
                         StructField("price", DoubleType(), False)])

    json_options = {"timestampFormat": "yyyy-MM-dd'T'HH:mm'Z'"}
    stocks_json = raw_stream \
        .select(from_json(F.col("value").cast("string"), schema, json_options)
                .alias("content"))

    # The original referenced the method without calling it, printing nothing.
    stocks_json.printSchema()

    stocks = stocks_json.select("content.*")

    ####################################
    # Stream to Parquet
    ####################################
    query = stocks \
        .withColumn('year', year(F.col('timestamp'))) \
        .withColumn('month', month(F.col('timestamp'))) \
        .withColumn('day', dayofmonth(F.col('timestamp'))) \
        .withColumn('hour', hour(F.col('timestamp'))) \
        .withColumn('minute', minute(F.col('timestamp'))) \
        .writeStream \
        .format('parquet') \
        .partitionBy('year', 'month', 'day', 'hour', 'minute') \
        .option('checkpointLocation', '/dataset/checkpoint') \
        .option('path', '/dataset/streaming.parquet') \
        .trigger(processingTime='30 seconds') \
        .start()

    avg_pricing = stocks \
        .groupBy(F.col("symbol")) \
        .agg(F.avg(F.col("price")).alias("avg_price"))

    ####################################
    # Console Output
    ####################################
    query2 = avg_pricing.writeStream \
        .outputMode('complete') \
        .format("console") \
        .trigger(processingTime="10 seconds") \
        .start()

    ####################################
    # Table in Memory
    ####################################
    # query3 = avg_pricing \
    #     .writeStream \
    #     .queryName("avgPricing") \
    #     .outputMode("complete") \
    #     .format("memory") \
    #     .trigger(processingTime="10 seconds") \
    #     .start()
    #
    # while True:
    #     print('\n' + '_' * 30)
    #     # interactively query in-memory table
    #     spark.sql('SELECT * FROM avgPricing').show()
    #     print(query3.lastProgress)
    #     sleep(10)

    # Only the console query is awaited, as in the original; the parquet
    # query keeps running in the background while this call blocks.
    query2.awaitTermination()
# MAGIC <img alt="Hint" title="Hint" style="vertical-align: text-bottom; position: relative; height:1.75em; top:0.3em" src="https://files.training.databricks.com/static/images/icon-light-bulb.svg"/> **Hint:** Use `alias` to alias the name of your columns to the final name you want for them.
# MAGIC <img alt="Hint" title="Hint" style="vertical-align: text-bottom; position: relative; height:1.75em; top:0.3em" src="https://files.training.databricks.com/static/images/icon-light-bulb.svg"/> **Hint:** `id` corresponds to `tweet_id` and `user.id` corresponds to `user_id`.

# COMMAND ----------

from pyspark.sql.functions import unix_timestamp
from pyspark.sql.types import TimestampType

timestampFormat = "EEE MMM dd HH:mm:ss ZZZZZ yyyy"

# Parse created_at with the format above into epoch seconds, then cast the
# result to a timestamp column named createdAt.
createdAtCol = unix_timestamp("created_at", timestampFormat)
createdAtCol = createdAtCol.cast(TimestampType()).alias("createdAt")

# Project the tweet fields under their final column names.
tweetDF = fullTweetFilteredDF.select(
    col("id").alias("tweetID"),
    col("user.id").alias("userID"),
    col("lang").alias("language"),
    col("text"),
    createdAtCol,
)

display(tweetDF)

# COMMAND ----------

# TEST - Run this cell to test your solution
from pyspark.sql.types import TimestampType

# The first (and only) field of the one-column projection is createdAt.
t = tweetDF.select("createdAt").schema[0]
dbTest("ET1-P-08-07-01", TimestampType(), t.dataType)

print("Tests passed!")

# COMMAND ----------
def process_log_data(spark, input_data, output_data):
    """
    Loads the log files, extracts the data for the users, time and songplays
    tables, then saves each of them as parquet files (overwrite mode).

    Parameters:
        spark: spark session
        input_data: path (or glob) of the log JSON files; used as given
        output_data: output path prefix; table directory names are appended

    The songplays join reads from the temporary view ``df_songs_table``,
    which must already be registered on ``spark`` when this is called.
    """
    # get filepath to log data file
    log_data = input_data

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # extract columns for users table
    users_table = df.select("userId", "firstName", "lastName", "gender",
                            "level").drop_duplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + "users/", mode="overwrite")

    # create timestamp column from original timestamp column
    # (ts is divided by 1000 before conversion, i.e. treated as milliseconds)
    get_timestamp = udf(lambda ts: datetime.utcfromtimestamp(int(ts) / 1000),
                        TimestampType())
    df = df.withColumn("timestamp", get_timestamp(col("ts")))

    # start_time carries the same value as timestamp; reuse the column rather
    # than evaluating the Python UDF a second time. (The original also built
    # an unused, invalid get_datetime UDF here; it has been removed.)
    df = df.withColumn("start_time", col("timestamp"))

    # extract columns to create time table
    df = df.withColumn("hour", hour("start_time")) \
        .withColumn("day", dayofmonth("start_time")) \
        .withColumn("week", weekofyear("start_time")) \
        .withColumn("month", month("start_time")) \
        .withColumn("year", year("start_time")) \
        .withColumn("weekday", dayofweek("start_time"))

    time_table = df.select("start_time", "hour", "day", "week", "month",
                           "year", "weekday").distinct()

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(output_data + "time_table/",
                             mode='overwrite',
                             partitionBy=["year", "month"])

    # read in song data to use for songplays table
    song_df = spark.sql(
        "SELECT DISTINCT song_id, artist_id, artist_name FROM df_songs_table")

    # extract columns from joined song and log datasets to create songplays table
    # NOTE(review): the join matches on artist name only because the query
    # above does not select the song title - confirm this is intended.
    songplays_table = df.join(song_df, song_df.artist_name == df.artist, "inner") \
        .distinct() \
        .select("start_time", "userId", "level", "sessionId", "location",
                "userAgent", "song_id", "artist_id", "month", "year") \
        .withColumn("songplay_id", monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month
    # (uses the output_data parameter like the other writes; the original
    # referenced a module-level OUTPUT_DATA here)
    songplays_table.write.parquet(output_data + "songplays_table/",
                                  mode="overwrite",
                                  partitionBy=["year", "month"])
def main(base_path):
    """Train, evaluate and persist a random forest flight-delay classifier.

    Reads ``{base_path}/data/simple_flight_delay_features_flight_times.json``,
    derives hour-of-day features, buckets ``ArrDelay``, indexes the string
    columns, assembles a feature vector and runs ``split_count`` random
    train/test rounds. The fitted transformers and the model are saved under
    ``{base_path}/models``. Metric averages and feature importances are
    printed and appended to two pickle logs in that same directory so that
    successive runs can be compared.

    :param base_path: project root containing the ``data`` and ``models``
        directories
    """
    APP_NAME = "train_spark_mllib_model.py"

    # Create the environment if there is no SparkSession.
    # NOTE: sc and spark are assigned below, which makes them function locals;
    # the lookup therefore raises UnboundLocalError (a NameError subclass) on
    # every call and the except branch always runs.
    try:
        sc and spark
    except NameError:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),
        StructField("CRSArrTime", TimestampType(), True),
        StructField("CRSDepTime", TimestampType(), True),
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Route", StringType(), True),
        StructField("TailNum", StringType(), True),
        StructField("EngineManufacturer", StringType(), True),
        StructField("EngineModel", StringType(), True),
        StructField("Manufacturer", StringType(), True),
        StructField("ManufacturerYear", StringType(), True),
        StructField("OwnerState", StringType(), True),
        StructField("FlightTime", IntegerType(), True),
    ])

    input_path = "{}/data/simple_flight_delay_features_flight_times.json".format(
        base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Add the scheduled departure/arrival hour of day
    #
    from pyspark.sql.functions import hour
    features_with_hour = features.withColumn("CRSDepHourOfDay",
                                             hour(features.CRSDepTime))
    features_with_hour = features_with_hour.withColumn(
        "CRSArrHourOfDay", hour(features.CRSArrTime))
    features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime",
                              "CRSArrHourOfDay").show()

    #
    # Check the features for null values before using Spark ML
    #
    null_counts = [
        (column,
         features_with_hour.where(features_with_hour[column].isNull()).count())
        for column in features_with_hour.columns
    ]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print("\nNull Value Report")
    print("-----------------")
    print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into
    # on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Set up the bucketizer model
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the model
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the model
    ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Import the feature tools from pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    string_columns = ["Carrier", "Origin", "Dest", "Route", "TailNum"]
    for column in string_columns:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")

        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine the continuous numeric fields with the indexes of the
    # categorical fields into one feature vector
    numeric_columns = [
        "DepDelay", "Distance", "DayOfYear", "CRSDepHourOfDay",
        "CRSArrHourOfDay", "FlightTime"
    ]
    index_columns = [column + "_index" for column in string_columns]

    vector_assembler = VectorAssembler(inputCols=numeric_columns +
                                       index_columns,
                                       outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler_6.0.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    #
    # Cross validate, train and evaluate the classifier:
    # split_count rounds, four metrics per round
    #
    from collections import defaultdict
    scores = defaultdict(list)
    feature_importances = defaultdict(list)
    metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
    split_count = 3

    for i in range(1, split_count + 1):
        print("\nRun {} out of {} of test/train splits in cross validation...".
              format(
                  i,
                  split_count,
              ))

        # Test/train split
        training_data, test_data = final_vectorized_features.randomSplit(
            [0.8, 0.2])

        # Instantiate and fit a random forest classifier on the training split
        from pyspark.ml.classification import RandomForestClassifier
        rfc = RandomForestClassifier(
            featuresCol="Features_vec",
            labelCol="ArrDelayBucket",
            predictionCol="Prediction",
            maxBins=4896,
        )
        model = rfc.fit(training_data)

        # Save the new model over the previous one
        model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.flight_time.bin".format(
            base_path)
        model.write().overwrite().save(model_output_path)

        # Evaluate the model using the test data
        predictions = model.transform(test_data)

        # Evaluate this split's results for each metric
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        for metric_name in metric_names:
            evaluator = MulticlassClassificationEvaluator(
                labelCol="ArrDelayBucket",
                predictionCol="Prediction",
                metricName=metric_name)
            score = evaluator.evaluate(predictions)

            scores[metric_name].append(score)
            print("{} = {}".format(metric_name, score))

        #
        # Collect feature importances (one value per feature per split)
        #
        feature_names = vector_assembler.getInputCols()
        feature_importance_list = model.featureImportances
        for feature_name, feature_importance in zip(feature_names,
                                                    feature_importance_list):
            feature_importances[feature_name].append(feature_importance)

    #
    # Evaluate the average and standard deviation of each metric and
    # print a table
    #
    import numpy as np
    score_averages = defaultdict(float)

    # Compute the table data
    average_stds = []
    for metric_name in metric_names:
        metric_scores = scores[metric_name]

        average_accuracy = sum(metric_scores) / len(metric_scores)
        score_averages[metric_name] = average_accuracy

        std_accuracy = np.std(metric_scores)

        average_stds.append((metric_name, average_accuracy, std_accuracy))

    # Print the table
    print("\nExperiment Log")
    print("--------------")
    print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))

    #
    # Persist the scores to a score log that exists between runs
    #
    import pickle

    # Load the score log or initialize an empty one.
    # NOTE: pickle is only safe on files this program wrote itself.
    score_log_filename = "{}/models/score_log.pickle".format(base_path)
    try:
        # Context manager closes the handle the original left open.
        with open(score_log_filename, "rb") as score_log_file:
            score_log = pickle.load(score_log_file)
        if not isinstance(score_log, list):
            score_log = []
    except IOError:
        score_log = []

    # Compute this run's score log entry
    score_log_entry = {
        metric_name: score_averages[metric_name]
        for metric_name in metric_names
    }

    # Compute and display the change in score for each metric; on the first
    # run the entry is compared with itself, so every delta is zero
    try:
        last_log = score_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_log = score_log_entry

    experiment_report = []
    for metric_name in metric_names:
        run_delta = score_log_entry[metric_name] - last_log[metric_name]
        experiment_report.append((metric_name, run_delta))

    print("\nExperiment Report")
    print("-----------------")
    print(tabulate(experiment_report, headers=["Metric", "Score"]))

    # Append this run's average scores to the log
    score_log.append(score_log_entry)

    # Persist the log for the next run
    with open(score_log_filename, "wb") as score_log_file:
        pickle.dump(score_log, score_log_file)

    #
    # Analyze and report feature importance changes
    #

    # Compute the average importance of each feature
    feature_importance_entry = defaultdict(float)
    for feature_name, value_list in feature_importances.items():
        average_importance = sum(value_list) / len(value_list)
        feature_importance_entry[feature_name] = average_importance

    # Sort the feature importances in descending order and print them
    import operator
    sorted_feature_importances = sorted(feature_importance_entry.items(),
                                        key=operator.itemgetter(1),
                                        reverse=True)

    print("\nFeature Importances")
    print("-------------------")
    print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))

    #
    # Compare this run's feature importances with the previous run's
    #

    # Load the feature importance log or initialize an empty one
    feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
    try:
        with open(feature_log_filename, "rb") as feature_log_file:
            feature_log = pickle.load(feature_log_file)
        if not isinstance(feature_log, list):
            feature_log = []
    except IOError:
        feature_log = []

    # Compute and display the change in importance for each feature; on the
    # first run the baseline is a copy of this run, so every delta is zero
    try:
        last_feature_log = feature_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_feature_log = defaultdict(float)
        for feature_name, importance in feature_importance_entry.items():
            last_feature_log[feature_name] = importance

    # Compute the deltas
    feature_deltas = {}
    for feature_name in feature_importances.keys():
        run_delta = feature_importance_entry[feature_name] - last_feature_log[
            feature_name]
        feature_deltas[feature_name] = run_delta

    # Sort the feature deltas so the biggest change comes first
    sorted_feature_deltas = sorted(feature_deltas.items(),
                                   key=operator.itemgetter(1),
                                   reverse=True)

    # Display the sorted feature deltas
    print("\nFeature Importance Delta Report")
    print("-------------------------------")
    print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))

    # Append this run's average importances to the log
    feature_log.append(feature_importance_entry)

    # Persist the log for the next run
    with open(feature_log_filename, "wb") as feature_log_file:
        pickle.dump(feature_log, feature_log_file)
#now need the most accurate outage time possible for outage event
#now find all the exact outage and restore times using millis
def timeCorrect(time, millis, unplugMillis):
    """Shift ``time`` back by the gap between ``millis`` and ``unplugMillis``.

    Returns ``time`` unchanged when any input is missing (None or NaN), when
    ``unplugMillis`` is 0, or when ``unplugMillis`` is later than ``millis``.
    Otherwise returns ``time`` minus ``millis - unplugMillis`` milliseconds.
    """
    # A null timestamp cannot be shifted; the original raised TypeError here
    # whenever both counters were present.
    if time is None or millis is None or unplugMillis is None:
        return time
    if unplugMillis == 0 or isnan(millis) or isnan(unplugMillis):
        return time
    if unplugMillis > millis:
        return time
    return time - timedelta(milliseconds=int(millis) - int(unplugMillis))


udftimeCorrect = udf(timeCorrect, TimestampType())

# Corrected outage time, converted to unix seconds.
pw_df = pw_df.withColumn(
    "outage_time", udftimeCorrect("time", "millis", "last_unplug_millis"))
pw_df = pw_df.withColumn("outage_time", F.unix_timestamp("outage_time"))

# Corrected time relative to the last plug event, converted to unix seconds.
pw_df = pw_df.withColumn("r_time",
                         udftimeCorrect("time", "millis", "last_plug_millis"))
pw_df = pw_df.withColumn("r_time", F.unix_timestamp("r_time"))

#now denote the end time of the outage for saidi reasons
# restore_time is the r_time of the next row within window w
time_lead = lead("r_time", 1).over(w)
pw_df = pw_df.withColumn("restore_time", time_lead)

#now filter out everything that is not an outage. We should have a time and end_time for every outage
pw_df = pw_df.filter("outage != 0")

# Okay now that we have the outages and times we should join it with the number of sensors reporting above
# Create DataFrame representing the stream of input lines
# from connection to localhost:9999
# lines = spark \
#     .readStream \
#     .format("socket") \
#     .option("host", "localhost") \
#     .option("port", 9999) \
#     .load()

# Read all the csv files written atomically in a directory
# Columns, in file order: userA, userB, timestamp, interaction
userSchema = (
    StructType()
    .add("userA", "integer")
    .add("userB", "integer")
    .add("timestamp", TimestampType())
    .add("interaction", "string")
)

# Streaming reader over every *.csv file in the staging directory.
activity = (
    spark.readStream
    .option("sep", ",")
    .schema(userSchema)
    .csv(staging_dir+"/*.csv")  # Equivalent to format("csv").load("/path/to/directory")
)
#.option("inferSchema", "true")

# Split the lines into words
# words = lines.select(
#     explode(
#         split(lines.value, " ")
#     ).alias("word")
# Rename the columns: each entry of fields is (source expression, new name).
exprs = ["{} as {}".format(field[0], field[1]) for field in fields]
df = df.selectExpr(*exprs)

# extract columns for users table
user_fields = ['user_id', 'first_name', 'last_name', 'gender', 'level']
users_table = df.select(user_fields).dropDuplicates()

# write users table to parquet files
users_table.write.parquet(output_data + 'users/')

# create timestamp column from original timestamp column
# (ts is divided by 1000 so that from_unixtime receives seconds)
get_timestamp = udf(lambda x: x/1000, Dbl())
df = df.withColumn('ts2', get_timestamp('ts'))

# create datetime column from original timestamp column
df = df.withColumn('start_time',
                   from_unixtime('ts2').cast(dataType=TimestampType()))

# extract columns to create time table
# (the original chain was missing the closing parenthesis of the final
# withColumn call, which made the whole fragment a syntax error)
time_table = df.select('start_time')\
    .dropDuplicates()\
    .withColumn('hour', hour(col('start_time')))\
    .withColumn('day', dayofmonth(col('start_time')))\
    .withColumn('week', weekofyear(col('start_time')))\
    .withColumn('month', month(col('start_time')))\
    .withColumn('year', year(col('start_time')))\
    .withColumn('weekday', date_format(col('start_time'), 'E'))

# write time table to parquet files partitioned by year and month
time_table.write.partitionBy('year', 'month').parquet(output_data + 'time/')

# read in song data to use for songplays table
def from_arrow_type(at: "pa.DataType", prefer_timestamp_ntz: bool = False) -> DataType:
    """Convert pyarrow type to Spark data type.

    :param at: the pyarrow data type to convert
    :param prefer_timestamp_ntz: when True, a timestamp without a time zone
        maps to ``TimestampNTZType`` instead of ``TimestampType``; the flag
        is applied to nested element, key, value and field types as well
    :return: the corresponding Spark ``DataType``
    :raises TypeError: for unsupported types, for timestamps nested directly
        in a list or map, for nested structs, and for maps when pyarrow is
        older than 2.0.0
    """
    import pyarrow as pa
    import pyarrow.types as types

    spark_type: DataType
    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at) and prefer_timestamp_ntz and at.tz is None:
        spark_type = TimestampNTZType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_duration(at):
        spark_type = DayTimeIntervalType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(
            from_arrow_type(at.value_type, prefer_timestamp_ntz))
    elif types.is_map(at):
        # Imported only where needed: distutils is absent from Python 3.12+,
        # and every non-map conversion should keep working there.
        from distutils.version import LooseVersion

        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError(
                "MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = MapType(
            from_arrow_type(at.key_type, prefer_timestamp_ntz),
            from_arrow_type(at.item_type, prefer_timestamp_ntz))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError(
                "Nested StructType not supported in conversion from Arrow: " +
                str(at))
        # Pass the flag down so timestamp fields honour the caller's choice;
        # the original dropped it and always produced TimestampType here.
        spark_type = StructType([
            StructField(field.name,
                        from_arrow_type(field.type, prefer_timestamp_ntz),
                        nullable=field.nullable) for field in at
        ])
    elif types.is_dictionary(at):
        # A dictionary-encoded column converts as its value type.
        spark_type = from_arrow_type(at.value_type, prefer_timestamp_ntz)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, IntegerType
from pyspark.sql.types import TimestampType

HOME = '/usr/local/airflow/spark'

spark = SparkSession.builder.appName('Example').getOrCreate()

# Column layout applied to hitlog.csv; every column is nullable.
schema = (
    StructType()
    .add('uid', IntegerType(), True)
    .add('page_name', StringType(), True)
    .add('page_url', StringType(), True)
    .add('time', TimestampType(), True)
)

# Load the hit log with the explicit schema and expose it to SQL as "hitlog".
csv_reader = spark.read.format('csv').schema(schema)
df = csv_reader.load(f'{HOME}/hitlog.csv')
df.createOrReplaceTempView("hitlog")

# Count hits on article URLs made by a user no later than one of that user's
# '/register' hits, and keep the three pages with the most hits.
result_data = spark.sql(
    """With registered as ( select uid, time from hitlog where page_name == '/register') select t.page_name, t.page_url, count(*) as hits from hitlog t join registered r on t.uid==r.uid where t.page_url like '%/article%' and t.time <= r.time group by t.page_name, t.page_url order by hits desc LIMIT 3 """
)

# Write the result as comma-separated CSV, replacing any earlier output.
csv_writer = result_data.write.format('csv').mode('overwrite')
csv_writer = csv_writer.option('sep', ',')
csv_writer.save(f'{HOME}/output.csv')
def process_log_data(spark, input_data, output_data):
    """
    Reads logs data in a dataframe which is then used to create new
    dataframes for creating users and time tables. Reads songs data and joins
    it with the logs dataframe to create the data for the songplays table.
    Drops duplicates, renames columns and finally saves all tables in parquet
    format.

    :param spark: Spark session object
    :param input_data: S3 or local dir containing the log_data and song_data
        directories (the prefix must end with a separator)
    :param output_data: Path for parquet output files
    """
    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"  # S3 dir structure
    # log_data = input_data + "log_data/*.json"  # local dir structure

    # read log data file
    logger.info('Reading log data json files')
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df[df['page'] == 'NextSong']

    # extract columns for users table
    users_table = df[['userId', 'firstName', 'lastName', 'gender', 'level']]
    users_table = users_table \
        .withColumnRenamed('userId', 'user_id') \
        .withColumnRenamed('firstName', 'first_name') \
        .withColumnRenamed('lastName', 'last_name') \
        .dropDuplicates()

    # write users table to parquet files
    logger.info('Writing users table in parquet format')
    users_table.write.parquet(output_data + '/tbl_users.parquet')

    # create timestamp column from original timestamp column
    # (ts is divided by 1000.0 before conversion, i.e. treated as milliseconds)
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000.0),
                        TimestampType())
    df = df.withColumn('start_time', get_timestamp(df.ts))

    # create datetime columns from derived start_time column
    df = df.withColumn('hour', hour(df.start_time))
    df = df.withColumn('day', dayofmonth(df.start_time))
    df = df.withColumn('week', weekofyear(df.start_time))
    df = df.withColumn('month', month(df.start_time))
    df = df.withColumn('year', year(df.start_time))
    df = df.withColumn('weekday', dayofweek(df.start_time))

    # extract columns to create time table
    time_table = df[[
        'start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday'
    ]]
    time_table = time_table.dropDuplicates()

    # write time table to parquet files partitioned by year and month
    logger.info(
        'Writing time table partitioned by year and month in parquet format')
    time_table.write.partitionBy('year',
                                 'month').parquet(output_data +
                                                  '/tbl_time.parquet')

    # read in song data to use for songplays table
    logger.info("Reading song data for join")
    song_df = spark.read.json(input_data + 'song_data/*/*/*/*.json')
    # the song's own year would clash with the play year derived above
    song_df = song_df.withColumnRenamed('year', 'song_year')

    # extract columns from joined song and log datasets to create songplays table
    # Match on artist AND song title: joining on the artist alone pairs every
    # play with every song that artist has in the catalogue.
    join_condition = ((song_df.artist_name == df.artist) &
                      (song_df.title == df.song))
    songplays_table = df.join(song_df, join_condition, 'inner')
    songplays_table = songplays_table.withColumn(
        "songplay_id", F.monotonically_increasing_id())
    songplays_table = songplays_table[[
        'songplay_id', 'start_time', 'userId', 'level', 'song_id',
        'artist_id', 'sessionId', 'location', 'userAgent', 'month', 'year'
    ]]
    songplays_table = songplays_table \
        .withColumnRenamed('userId', 'user_id') \
        .withColumnRenamed('sessionId', 'session_id') \
        .withColumnRenamed('userAgent', 'user_agent')

    # write songplays table to parquet files partitioned by year and month
    logger.info(
        'Writing songplays table partitioned by year and month in parquet format'
    )
    songplays_table.write.partitionBy(
        'year', 'month').parquet(output_data + '/tbl_songplays.parquet')
#For databricks related packages
#./bin/pyspark --packages com.databricks:spark-csv_2.10:1.3.0

#Before Spark 1.4
train = sqlContext.load(source="com.databricks.spark.csv", path='PATH/train.csv', header=True, inferSchema=True)
test = sqlContext.load(source="com.databricks.spark.csv", path='PATH/test-comb.csv', header=True, inferSchema=True)

#Current Spark 2.1 and ...
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf  # used below; was not imported by the original snippet

spark = SparkSession.builder.master("yarn").getOrCreate()
df = spark.read.csv('hdfs://hadoop-master:9000/index/train.csv', mode="DROPMALFORMED")

#From local
from pyspark.sql.types import StructType, StructField, LongType, StringType, TimestampType

schema = StructType([
    StructField('', LongType(), True),
    StructField('col1', LongType(), True),
    StructField('col2', StringType(), True),
    StructField('col3', StringType(), True),
    StructField('col4', TimestampType(), True),
    StructField('col5', TimestampType(), True),
    StructField('col6', StringType(), True),
])
# The original call had a doubled comma and a stray closing parenthesis,
# which made the whole script a syntax error.
df = spark.read.csv('file:///index/data_extract_restart2_without_cert/data_refined.csv',
                    mode="DROPMALFORMED",
                    schema=schema)


#Creating UDF
# NOTE(review): this name shadows the builtin dict; it is kept only so that
# existing references keep working - rename when callers can be updated.
def dict(sk):
    """Return sk with every comma replaced by a pipe; None is passed through."""
    if sk is None:
        # col2 is nullable; calling .replace on None would fail the task
        return None
    return sk.replace(',', '|')


udf_dict = udf(dict, StringType())

# col2 is the column to be changed; the result goes to new_column_name
df.withColumn('new_column_name', udf_dict("col2")).write.csv(path="/index/skill_clean_v3")

#Executing SQL queries
df.createOrReplaceTempView("data")
sqlDF = spark.sql("SELECT * FROM data")
def process_log_data(spark, input_data, output_data):
    """Build the users, times and songplays tables from the raw event logs.

    Log JSON is read from ``input_data + "log_data/*/*/*.json"`` and song
    JSON from ``input_data + "song_data/*/*/*/*.json"``. Each table is
    written as parquet in overwrite mode under ``output_data``; times and
    songplays are partitioned by year and month. Progress messages are
    printed when the module-level ``DEBUG`` flag is truthy.
    """
    log_data = input_data + "log_data/*/*/*.json"
    song_data = input_data + "song_data/*/*/*/*.json"
    #song_data = input_data + "song_data/A/A/A/*.json"

    if DEBUG:
        print("Reading log data files from", log_data)

    raw_logs = spark.read.json(log_data)

    # Keep song plays that carry a timestamp, with integer user/session ids.
    df = raw_logs.filter(raw_logs.page == 'NextSong')
    df = df.where(raw_logs.ts.isNotNull())
    df = df.withColumn("userId", raw_logs["userId"].cast(IntegerType()))
    df = df.withColumn("sessionId", raw_logs["sessionId"].cast(IntegerType()))

    if DEBUG:
        print("Preparing users table")

    # Users table: one row per non-null userId.
    user_columns = ["userId", "firstName", "lastName", "gender", "level"]
    users_table = df.select(*user_columns)
    users_table = users_table.where(col("userId").isNotNull())
    users_table = users_table.dropDuplicates(['userId'])

    if DEBUG:
        print("Creating and persisting users table")

    users_table.write.parquet(output_data + "users/", mode='overwrite')

    if DEBUG:
        print("Creating and persisting time table")

    def _to_datetime(ts):
        # ts is divided by 1000 before conversion, i.e. treated as milliseconds
        return datetime.fromtimestamp(ts / 1000)

    get_timestamp = udf(_to_datetime, TimestampType())
    df = df.withColumn("start_time", get_timestamp(df.ts))

    # Time table: one row per distinct start_time with its calendar parts.
    calendar_parts = (
        ("hour", hour),
        ("day", dayofmonth),
        ("week", weekofyear),
        ("month", month),
        ("year", year),
        ("weekday", dayofweek),
    )
    time_table = df
    for part_name, extract in calendar_parts:
        time_table = time_table.withColumn(part_name, extract(df.start_time))
    time_table = time_table.select("start_time", "hour", "day", "week",
                                   "month", "year", "weekday")
    time_table = time_table.dropDuplicates(["start_time"])

    # Persist the time table partitioned by year and month.
    time_writer = time_table.write.partitionBy("year", "month")
    time_writer.parquet(output_data + "times/", mode='overwrite')

    if DEBUG:
        print("Creating and persisting songplays table")

    # Song metadata needed to resolve song_id and artist_id.
    song_df = spark.read.json(song_data)
    song_df = song_df.select("song_id", "title", "artist_id", "artist_name")

    action_df = df.select("start_time", "userId", "level", "sessionId",
                          "location", "userAgent", "artist", "song")

    # A play matches a song when both the artist name and the title agree.
    same_artist = action_df.artist == song_df.artist_name
    same_title = action_df.song == song_df.title
    songplays_table = action_df.join(song_df, same_artist & same_title)
    songplays_table = songplays_table.select(
        monotonically_increasing_id().alias("songplay_id"),
        "start_time", "userId", "level", "song_id", "artist_id",
        "sessionId", "location", "userAgent")
    songplays_table = songplays_table.withColumn("month",
                                                 month(df.start_time))
    songplays_table = songplays_table.withColumn("year", year(df.start_time))

    # Persist the songplays table partitioned by year and month.
    songplays_writer = songplays_table.write.partitionBy("year", "month")
    songplays_writer.parquet(output_data + "songplays/", mode='overwrite')
def process_log_data(spark, input_data, output_data):
    """
    Description: Load the event logs found under input_data, derive the
    users, time and songplays tables from them, and write each table to
    output_data in parquet format.

    The songplays query joins against a temp view named ``song`` that this
    function does not create; it must already be registered on the Spark
    session (exposing song_id, artist_id, artist_name and title) before
    this function is called.

    Parameters:
        spark       : Spark Session
        input_data  : Location of log_data files
        output_data : Location where the extracted tables are written in
                      parquet format.
    """
    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    # ('lastName' spelled as in the log schema rather than 'lastname', so the
    # select does not depend on case-insensitive column resolution)
    users_table = df.select('userId', 'firstName', 'lastName', 'gender', 'level') \
        .dropDuplicates() \
        .where(df.userId.isNotNull())

    # write users table to parquet files
    # (was `output + 'users/'`, an undefined name that raised NameError)
    users_table.write.parquet(output_data + 'users/')

    # create UDF for timestamp column from original timestamp column
    @udf(TimestampType())
    def conv_timestamp(ms):
        # `ms` is divided by 1000.0, i.e. treated as epoch milliseconds
        return datetime.fromtimestamp(ms / 1000.0)

    # add a column holding the timestamp in a directly usable format
    df = df.withColumn("start_time", conv_timestamp('ts'))

    # dataframe holding only the distinct, non-null start_time values
    log_time_data = df.select('start_time').dropDuplicates() \
        .where(df.start_time.isNotNull())

    # extract columns to create time table
    time_table = log_time_data.withColumn('hour', hour('start_time')) \
        .withColumn('day', dayofmonth('start_time')) \
        .withColumn('week', weekofyear('start_time')) \
        .withColumn('month', month('start_time')) \
        .withColumn('year', year('start_time')) \
        .withColumn("weekday", date_format("start_time", 'E'))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(output_data + 'times/')

    # register the filtered, time-formatted log data as a view for the SQL join
    df.createOrReplaceTempView('log_data_filtered_timeformatted')

    # extract columns from joined song and log datasets to create songplays
    # table; year and month are derived here because the writer below
    # partitions by them and they were previously missing from the result
    songplays_table = spark.sql(
        """SELECT monotonically_increasing_id() AS songplay_id,
                  start_time,
                  userId AS user_id,
                  level,
                  song_id,
                  artist_id,
                  sessionId AS session_id,
                  location,
                  userAgent AS user_agent,
                  year(start_time) AS year,
                  month(start_time) AS month
           FROM log_data_filtered_timeformatted
           JOIN song
             ON artist = artist_name AND song = title
        """)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(output_data + 'songplays/')
def process_log_data(spark, input_data, output_data):
    """
    Read and transform the log_data files, then save user_table, time_table
    and songplays_table as parquet under output_data (overwrite mode).

    Parameters:
        spark       : Spark session used for all reads
        input_data  : Root path holding the log_data/ and song_data/ JSON files
        output_data : Root path under which users/, time/ and songplays/
                      are written
    """
    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"  # real path
    # log_data = input_data + "log_data/2018/11/2018-11-12-events.json"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where("page='NextSong'")

    # extract columns for users table
    user_table = df.select(
        col("userId").cast("int").alias("user_id"),
        col("firstName").alias("first_name"),
        col("lastName").alias("last_name"),
        "gender",
        "level",
    )
    user_table = user_table.dropDuplicates()

    # write users table to parquet files
    user_table.write.parquet(output_data + 'users/', 'overwrite')

    # create timestamp column (whole epoch seconds) from the original
    # millisecond timestamp column. Floor division keeps the result an int:
    # with true division Python 3 yields a float, which does not match the
    # declared IntegerType and would come back from the UDF as null.
    get_timestamp = udf(lambda x: int(x) // 1000, IntegerType())
    df = df.withColumn("timestamp", get_timestamp("ts"))

    # create datetime column from the epoch-seconds timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x), TimestampType())
    df = df.withColumn("datetime", get_datetime("timestamp"))

    # extract columns to create time table
    # NOTE(review): start_time here is the integer epoch-seconds value, while
    # songplays.start_time below is a TimestampType - confirm this is intended.
    time_table = df.select(
        col("timestamp").alias("start_time"),
        hour("datetime").alias("hour"),
        dayofmonth("datetime").alias("day"),
        weekofyear("datetime").alias("week"),
        month("datetime").alias("month"),
        year("datetime").alias("year"),
        date_format('datetime', 'E').alias('weekday'),
    )
    time_table = time_table.dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(
        output_data + 'time/', 'overwrite')

    # read in song data to use for songplays table
    # song_df = spark.read.json(input_data + "song_data/A/B/C/TRABCEI128F424C983.json")
    song_df = spark.read.json(input_data + "song_data/*/*/*/*.json")

    # extract columns from joined song and log datasets to create songplays
    # table; an event matches a song on title, artist name and duration
    songplays_table = df.alias("a").join(
        song_df.alias("b"),
        (df.song == song_df.title)
        & (df.artist == song_df.artist_name)
        & (df.length == song_df.duration),
    ).select(
        col("a.ts").alias("start_time"),
        # was alias("a.user_id"), which produced a column literally named
        # "a.user_id" (dot included) instead of "user_id"
        col("a.userId").cast("int").alias("user_id"),
        "level",
        col("a.sessionId").alias("session_id"),
        "a.location",
        "a.userAgent",
        "b.song_id",
        "b.artist_id",
    )

    # convert the raw millisecond value carried in start_time to a timestamp
    get_start_time = udf(
        lambda x: datetime.fromtimestamp(int(x) / 1000), TimestampType())
    songplays_table = songplays_table.withColumn(
        "start_time", get_start_time("start_time"))
    songplays_table = songplays_table.withColumn(
        "songplay_id", monotonically_increasing_id())
    songplays_table = songplays_table.withColumn("year", year("start_time"))
    songplays_table = songplays_table.withColumn("month", month("start_time"))
    songplays_table = songplays_table.dropDuplicates()

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(
        output_data + 'songplays/', 'overwrite')
def cal_performance(date, period, input_batch, output_batch):
    """
    Compute per-fund performance measures with Spark and optionally write
    them out as CSV.

    The function reads a returns CSV, keeps the rows up to `date`, collects
    each fund's (`sec_id`) return series into list columns on the fund's last
    row, and applies UDFs that wrap those lists in pandas objects and delegate
    to pandas / `Measure`.

    - period == 'all': descriptive statistics plus "since first row" measures
      (columns suffixed `_sf`), together with the fund's `nav` on `date`
      (column `nt_val`).
    - any other period: the series is sliced to start at the look-back date
      taken from `fetch_com_dt_hist(date)`, and the measure columns are
      suffixed with `period`.

    Relies on names defined outside this function: `fetch_com_dt_hist`,
    `Measure`, `app_name`, `is_debug`, `is_write_file`,
    `data_source_csv_path`, `output_csv_path`.

    :param date: as-of date as a '%Y%m%d' string (parsed with
        datetime.strptime throughout)
    :param period: 'all' or one of '1w', '1m', '3m', '6m', '1y', '3y', '5y'.
        For any other value `start` is never bound, so the non-'all' branch
        fails when it parses `start`.
    :param input_batch: only referenced by the commented-out read path below;
        unused by the active code
    :param output_batch: path component of the CSV output directory
    :return: None; the result is written to CSV when `is_write_file` is truthy
    """
    # Get the historical trading dates
    hist_dt = fetch_com_dt_hist(date)
    pef_horizions = {
        '1w': hist_dt.loc['B1W'].strftime('%Y%m%d'),
        '1m': hist_dt.loc['B1M'].strftime('%Y%m%d'),
        '3m': hist_dt.loc['B3M'].strftime('%Y%m%d'),
        '6m': hist_dt.loc['B6M'].strftime('%Y%m%d'),
        '1y': hist_dt.loc['B1Y'].strftime('%Y%m%d'),
        '3y': hist_dt.loc['B3Y'].strftime('%Y%m%d'),
        '5y': hist_dt.loc['B5Y'].strftime('%Y%m%d')
    }
    # Every non-'all' branch does the same lookup; an unrecognised period
    # leaves `start` unassigned.
    if period == 'all':
        start = None
    elif period == '1w':
        start = pef_horizions[period]
    elif period == '1m':
        start = pef_horizions[period]
    elif period == '3m':
        start = pef_horizions[period]
    elif period == '6m':
        start = pef_horizions[period]
    elif period == '1y':
        start = pef_horizions[period]
    elif period == '3y':
        start = pef_horizions[period]
    elif period == '5y':
        start = pef_horizions[period]
    ss = SparkSession \
        .builder \
        .appName(app_name + '_' + str(date) + '_' + period + '_' + str(is_debug)) \
        .getOrCreate()
    ss.sparkContext.setLogLevel('WARN')
    # Read the data from CSV and convert the formats (explicit schema)
    schema = StructType([
        StructField('date', TimestampType(), True),
        StructField('sec_id', StringType(), True),
        StructField('nav', FloatType(), True),
        StructField('ret', FloatType(), True),
        StructField('stock', FloatType(), True),
        StructField('treasury', FloatType(), True),
        StructField('credit', FloatType(), True),
        StructField('bench_ret', FloatType(), True),
        StructField('fnd_category', IntegerType(), True),
    ])
    # NOTE(review): the active read uses a fixed '20200320/1/' path and
    # ignores `date` / `input_batch`; the commented-out call below builds the
    # path from them. Confirm which one is intended.
    # ret_all_spark_df = ss.read.csv(data_source_csv_path + date + '/' + str(input_batch) + '/ret_all.csv', header=True,
    #                                schema=schema)
    ret_all_spark_df = ss.read.csv(data_source_csv_path +
                                   '20200320/1/ret_all.csv',
                                   header=True,
                                   schema=schema)
    # In debug mode only a subset of the funds is used
    if is_debug:
        logging.info('use debug')
        # sec_id_list = ['000006JK', '000028JK', '000134JK', '000135JK']
        # sec_id_list = ['005503JK', '005368JK', '004892JK', '150066JK',
        #                '000189JK', '000270JK', '000327JK']
        # sec_id_list = ['150066JK']
        # sec_id_list = ['006382JK']
        # Take the first 100 sec_ids (ordered by sec_id) that have a row on
        # `date`, and keep only those funds.
        today_spark_df = ret_all_spark_df.filter(
            ret_all_spark_df.date == datetime.strptime(date, '%Y%m%d'))
        rank_w = Window.orderBy('sec_id')
        today_spark_df = today_spark_df.withColumn(
            'row_no', func.row_number().over(rank_w))
        today_spark_df = today_spark_df.filter(
            today_spark_df.row_no <= 100).select('sec_id')
        ret_all_spark_df = ret_all_spark_df.join(today_spark_df,
                                                 on='sec_id',
                                                 how='inner')
        # ret_all_spark_df = ret_all_spark_df[ret_all_spark_df.sec_id.isin(sec_id_list)]
    else:
        logging.info('use release')
    # Keep only the data up to `date` (slice). `date` and `start` are
    # '%Y%m%d' strings, so they are converted before comparing with the
    # timestamp column.
    ret_all_spark_df = ret_all_spark_df[
        ret_all_spark_df.date <= datetime.strptime(date, '%Y%m%d')]
    # Keep only funds whose last row is not earlier than `date`.
    # The window frame covers the whole sec_id partition ordered by date, so
    # every row sees the same first/last/count/collect_list values.
    w = Window.partitionBy('sec_id').orderBy('date').rowsBetween(
        Window.unboundedPreceding, Window.unboundedFollowing)
    ret_all_spark_df = ret_all_spark_df.withColumn('the_last_date',
                                                   func.last('date').over(w))
    ret_all_spark_df = ret_all_spark_df.where(
        ret_all_spark_df.the_last_date >= datetime.strptime(date, '%Y%m%d'))
    if period == 'all':
        # User-defined functions: each wraps the collected list(s) in pandas
        # objects and returns a float (or a '%Y%m%d' string for start/end).
        udf_mean = func.udf(lambda x: float(pd.Series(x).mean()), FloatType())
        udf_std = func.udf(lambda x: float(pd.Series(x).std()), FloatType())
        udf_min = func.udf(lambda x: float(pd.Series(x).min()), FloatType())
        udf_max = func.udf(lambda x: float(pd.Series(x).max()), FloatType())
        udf_p25 = func.udf(lambda x: float(pd.Series(x).quantile(0.25)),
                           FloatType())
        udf_median = func.udf(lambda x: float(pd.Series(x).median()),
                              FloatType())
        udf_p75 = func.udf(lambda x: float(pd.Series(x).quantile(0.75)),
                           FloatType())
        udf_skew = func.udf(lambda x: float(pd.Series(x).skew()), FloatType())
        udf_kurt = func.udf(lambda x: float(pd.Series(x).kurt()), FloatType())
        udf_start = func.udf(lambda x: str(x[0].strftime('%Y%m%d')),
                             StringType())
        # NOTE(review): udf_end is defined but never applied in this branch.
        udf_end = func.udf(lambda x: str(x[-1].strftime('%Y%m%d')),
                           StringType())
        udf_cagr = func.udf(lambda x: float(Measure.cal_cagr(pd.Series(x))),
                            FloatType())
        udf_cumret = func.udf(
            lambda x: float(Measure.cal_cumret(pd.Series(x))), FloatType())
        udf_standard_deviation = func.udf(
            lambda x: float(Measure.cal_standard_deviation(pd.Series(x))),
            FloatType())
        udf_max_drawdown = func.udf(
            lambda x, y: float(Measure.cal_max_drawdown(pd.Series(x, index=y))
                               ), FloatType())
        udf_sharpe = func.udf(
            lambda x: float(Measure.cal_sharpe(pd.Series(x))), FloatType())
        udf_downside_deviation = func.udf(
            lambda x: float(Measure.cal_downside_deviation(pd.Series(x))),
            FloatType())
        udf_alpha = func.udf(
            lambda x, y, z, w, f: float(
                Measure.cal_alpha(
                    pd.Series(x),
                    pd.DataFrame({
                        'stock': y,
                        'treasury': z,
                        'credit': w
                    }), f)), FloatType())
        udf_marketbeta = func.udf(
            lambda x, y: float(
                Measure.cal_marketbeta(pd.Series(x), pd.Series(y))),
            FloatType())
        udf_information = func.udf(
            lambda x, y: float(
                Measure.cal_information(pd.Series(x), pd.Series(y))),
            FloatType())
        udf_treynor = func.udf(
            lambda x, y: float(Measure.cal_treynor(pd.Series(x), pd.Series(y))
                               ), FloatType())
        # Filter out funds that do not have enough rows (fewer than 2)
        ret_all_spark_df = ret_all_spark_df.withColumn(
            'fund_length',
            func.count('date').over(w))
        ret_all_spark_df = ret_all_spark_df[
            ret_all_spark_df['fund_length'] >= 2]
        # The fund's nav on `date`, renamed to nt_val for the join below
        nt_val_spark_df = ret_all_spark_df[
            ret_all_spark_df.date == datetime.strptime(date, '%Y%m%d')].select(
                'sec_id', 'nav').withColumnRenamed('nav', 'nt_val')
        # Sort so that ret is ordered by date; otherwise the values are not
        # in date order after collect_list. Only the row on the fund's last
        # date gets the full list; every other row gets a one-element
        # placeholder array and is filtered out right after.
        ret_all_spark_df = ret_all_spark_df.withColumn('ret_list',func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('ret').over(w)))\
            .withColumn('stock_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('stock').over(w)))\
            .withColumn('treasury_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('treasury').over(w)))\
            .withColumn('credit_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('credit').over(w)))\
            .withColumn('date_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(datetime.strptime('2020-03-06','%Y-%m-%d')))).otherwise(func.collect_list('date').over(w)))
        nav_agg_part_1 = ret_all_spark_df[
            ret_all_spark_df.date == ret_all_spark_df.the_last_date].select(
                'sec_id', 'ret_list', 'stock_ret_list', 'treasury_ret_list',
                'credit_ret_list', 'date_list', 'fnd_category')
        if is_debug:
            nav_agg_part_1.show()
        # ret_all_spark_df is not needed afterwards; drop all of its columns
        ret_all_spark_df = ret_all_spark_df.drop(
            'sec_id', 'date', 'nav', 'ret', 'stock', 'treasury', 'credit',
            'bench_ret', 'fnd_category', 'the_last_date', 'fund_length',
            'ret_list', 'stock_ret_list', 'treasury_ret_list',
            'credit_ret_list', 'date_list')
        if is_debug:
            ret_all_spark_df.show()
        # Attach the net asset value of the current day
        nav_agg_part_1 = nav_agg_part_1.join(nt_val_spark_df,
                                             on=['sec_id'],
                                             how='left')
        nav_agg_part_1 = nav_agg_part_1.withColumn('ret_mean', udf_mean('ret_list')) \
            .withColumn('ret_std', udf_std('ret_list')) \
            .withColumn('ret_min', udf_min('ret_list')) \
            .withColumn('ret_max', udf_max('ret_list')) \
            .withColumn('ret_p25', udf_p25('ret_list')) \
            .withColumn('ret_median', udf_median('ret_list')) \
            .withColumn('ret_p75', udf_p75('ret_list')) \
            .withColumn('ret_skew', udf_skew('ret_list')) \
            .withColumn('ret_kurtosis', udf_kurt('ret_list')) \
            .withColumn('ret_start', udf_start('date_list')) \
            .withColumn('cagr_sf', udf_cagr('ret_list'))\
            .withColumn('cumret_sf', udf_cumret('ret_list'))\
            .withColumn('vol_sf', udf_standard_deviation('ret_list'))\
            .withColumn('md_sf', udf_max_drawdown('ret_list','date_list'))\
            .withColumn('sharpe_sf', udf_sharpe('ret_list'))\
            .withColumn('dvol_sf', udf_downside_deviation('ret_list'))\
            .withColumn('alpha_sf', udf_alpha('ret_list','stock_ret_list','treasury_ret_list','credit_ret_list','fnd_category'))\
            .withColumn('beta_sf', udf_marketbeta('ret_list','stock_ret_list'))\
            .withColumn('ir_sf', udf_information('ret_list','stock_ret_list'))\
            .withColumn('treynor_sf', udf_treynor('ret_list','stock_ret_list'))
        # Drop the intermediate columns
        nav_agg_part_1 = nav_agg_part_1.drop('ret_list', 'stock_ret_list',
                                             'treasury_ret_list',
                                             'credit_ret_list', 'date_list',
                                             'fnd_category')
        if is_debug:
            nav_agg_part_1.show()
        if is_write_file:
            nav_agg_part_1.write.option(
                'header',
                'true').mode('overwrite').csv(output_csv_path + str(date) +
                                              "/" + str(output_batch) + "/" +
                                              period)
    else:
        # User-defined functions: each wraps the collected list(s) in pandas
        # objects, delegates to Measure and returns a float.
        udf_cagr = func.udf(lambda x: float(Measure.cal_cagr(pd.Series(x))),
                            FloatType())
        udf_cumret = func.udf(
            lambda x: float(Measure.cal_cumret(pd.Series(x))), FloatType())
        udf_aar = func.udf(lambda x: float(Measure.cal_aar(pd.Series(x))),
                           FloatType())
        udf_alpha = func.udf(
            lambda x, y, z, w, f: float(
                Measure.cal_alpha(
                    pd.Series(x),
                    pd.DataFrame({
                        'stock': y,
                        'treasury': z,
                        'credit': w
                    }), f)), FloatType())
        udf_standard_deviation = func.udf(
            lambda x: float(Measure.cal_standard_deviation(pd.Series(x))),
            FloatType())
        udf_downside_deviation = func.udf(
            lambda x: float(Measure.cal_downside_deviation(pd.Series(x))),
            FloatType())
        udf_max_drawdown = func.udf(
            lambda x, y: float(Measure.cal_max_drawdown(pd.Series(x, index=y))
                               ), FloatType())
        udf_marketbeta = func.udf(
            lambda x, y: float(
                Measure.cal_marketbeta(pd.Series(x), pd.Series(y))),
            FloatType())
        udf_var = func.udf(lambda x: float(Measure.cal_var(pd.Series(x))),
                           FloatType())
        udf_sharpe = func.udf(
            lambda x: float(Measure.cal_sharpe(pd.Series(x))), FloatType())
        udf_sortino = func.udf(
            lambda x: float(Measure.cal_sortino(pd.Series(x))), FloatType())
        udf_calmar = func.udf(
            lambda x: float(Measure.cal_calmar(pd.Series(x))), FloatType())
        udf_omega = func.udf(lambda x: float(Measure.cal_omega(pd.Series(x))),
                             FloatType())
        udf_information = func.udf(
            lambda x, y: float(
                Measure.cal_information(pd.Series(x), pd.Series(y))),
            FloatType())
        udf_treynor = func.udf(
            lambda x, y: float(Measure.cal_treynor(pd.Series(x), pd.Series(y))
                               ), FloatType())
        udf_m_square = func.udf(
            lambda x, y: float(Measure.cal_m_square(pd.Series(x), pd.Series(y))
                               ), FloatType())
        udf_sterling = func.udf(
            lambda x: float(Measure.cal_sterling(pd.Series(x))), FloatType())
        udf_burke = func.udf(lambda x: float(Measure.cal_burke(pd.Series(x))),
                             FloatType())
        udf_tail = func.udf(lambda x: float(Measure.cal_tail(pd.Series(x))),
                            FloatType())
        udf_rachev = func.udf(
            lambda x: float(Measure.cal_rachev(pd.Series(x))), FloatType())
        udf_stability = func.udf(
            lambda x: float(Measure.cal_stability(pd.Series(x))), FloatType())
        udf_min_monthly_return = func.udf(
            lambda x, y: float(
                Measure.cal_min_monthly_return(pd.Series(x, index=y))),
            FloatType())
        udf_max_monthly_return = func.udf(
            lambda x, y: float(
                Measure.cal_max_monthly_return(pd.Series(x, index=y))),
            FloatType())
        udf_monthly_odds = func.udf(
            lambda x, y: float(Measure.cal_monthly_odds(pd.Series(x, index=y))
                               ), FloatType())
        udf_picking = func.udf(
            lambda x, y: float(
                Measure.cal_picking(pd.Series(x), pd.Series(y, name='stock'))),
            FloatType())
        udf_timing = func.udf(
            lambda x, y: float(
                Measure.cal_timing(pd.Series(x), pd.Series(y, name='stock'))),
            FloatType())
        udf_trackerror = func.udf(
            lambda x, y, z: float(
                Measure.cal_trackerror(pd.Series(x), pd.Series(y), z)),
            FloatType())
        # Keep only funds whose first row is not later than `start`
        ret_all_spark_df = ret_all_spark_df.withColumn(
            'the_first_date',
            func.first('date').over(w))
        ret_all_spark_df = ret_all_spark_df[ret_all_spark_df.the_first_date <=
                                            datetime.strptime(start, '%Y%m%d')]
        # Slice: keep the rows from `start` onward
        ret_all_spark_df = ret_all_spark_df[
            ret_all_spark_df.date >= datetime.strptime(start, '%Y%m%d')]
        # Filter out funds that do not have enough rows; the data was sliced
        # again above, so the per-fund length has to be recounted
        ret_all_spark_df = ret_all_spark_df.withColumn(
            'fund_length',
            func.count('date').over(w))
        ret_all_spark_df = ret_all_spark_df[
            ret_all_spark_df['fund_length'] >= 2]
        # Sort so that ret is ordered by date; otherwise the values are not
        # in date order after collect_list. Only the row on the fund's last
        # date gets the full list; every other row gets a one-element
        # placeholder array and is filtered out right after.
        ret_all_spark_df = ret_all_spark_df.withColumn('ret_list',func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('ret').over(w)))\
            .withColumn('stock_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('stock').over(w)))\
            .withColumn('treasury_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('treasury').over(w)))\
            .withColumn('credit_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('credit').over(w))) \
            .withColumn('bench_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('bench_ret').over(w))) \
            .withColumn('date_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(datetime.strptime('2020-03-06','%Y-%m-%d')))).otherwise(func.collect_list('date').over(w)))
        nav_agg_part_2 = ret_all_spark_df[
            ret_all_spark_df.date == ret_all_spark_df.the_last_date].select(
                'sec_id', 'ret_list', 'stock_ret_list', 'treasury_ret_list',
                'credit_ret_list', 'bench_ret_list', 'date_list',
                'fnd_category')
        if is_debug:
            nav_agg_part_2.show()
        # ret_all_spark_df is not needed afterwards; drop all of its columns
        ret_all_spark_df = ret_all_spark_df.drop(
            'sec_id', 'date', 'nav', 'ret', 'stock', 'treasury', 'credit',
            'bench_ret', 'fnd_category', 'the_last_date', 'the_first_date',
            'fund_length', 'ret_list', 'stock_ret_list', 'treasury_ret_list',
            'credit_ret_list', 'bench_ret_list', 'date_list')
        if is_debug:
            ret_all_spark_df.show()
        # Measures computed for every look-back period
        nav_agg_part_2 = nav_agg_part_2.withColumn('cagr_' + period, udf_cagr('ret_list'))\
            .withColumn('cumret_' + period, udf_cumret('ret_list'))\
            .withColumn('aar_' + period, udf_aar('ret_list'))\
            .withColumn('alpha_' + period, udf_alpha('ret_list','stock_ret_list','treasury_ret_list','credit_ret_list', 'fnd_category'))\
            .withColumn('vol_' + period, udf_standard_deviation('ret_list'))\
            .withColumn('dvol_' + period, udf_downside_deviation('ret_list'))\
            .withColumn('md_' + period, udf_max_drawdown('ret_list', 'date_list'))\
            .withColumn('beta_' + period, udf_marketbeta('ret_list', 'stock_ret_list'))\
            .withColumn('var_' + period, udf_var('ret_list'))\
            .withColumn('sharpe_' + period, udf_sharpe('ret_list'))\
            .withColumn('sortino_' + period, udf_sortino('ret_list'))\
            .withColumn('calmar_' + period, udf_calmar('ret_list'))\
            .withColumn('omega_' + period, udf_omega('ret_list'))\
            .withColumn('ir_' + period, udf_information('ret_list','stock_ret_list'))\
            .withColumn('treynor_' + period, udf_treynor('ret_list','stock_ret_list'))\
            .withColumn('m_square_' + period, udf_m_square('ret_list','stock_ret_list'))\
            .withColumn('sterling_' + period, udf_sterling('ret_list'))\
            .withColumn('burke_' + period, udf_burke('ret_list'))\
            .withColumn('tail_' + period, udf_tail('ret_list'))\
            .withColumn('rachev_' + period, udf_rachev('ret_list'))\
            .withColumn('stability_' + period, udf_stability('ret_list'))
        # Monthly measures are only added for periods of 3 months or longer
        if period in ['3m', '6m', '1y', '3y', '5y']:
            nav_agg_part_2 = nav_agg_part_2.withColumn('min_monthly_ret_' + period, udf_min_monthly_return('ret_list','date_list'))\
                .withColumn('max_monthly_ret_' + period, udf_max_monthly_return('ret_list','date_list'))\
                .withColumn('monthly_odds_' + period, udf_monthly_odds('ret_list', 'date_list'))
        # Picking / timing / tracking error are skipped for the 1-week period
        if period in ['1m', '3m', '6m', '1y', '3y', '5y']:
            nav_agg_part_2 = nav_agg_part_2.withColumn('picking_' + period, udf_picking('ret_list', 'stock_ret_list'))\
                .withColumn('timing_' + period, udf_timing('ret_list', 'stock_ret_list'))\
                .withColumn('te_' + period, udf_trackerror('ret_list', 'bench_ret_list', 'fnd_category'))
        # Drop the intermediate list columns
        nav_agg_part_2 = nav_agg_part_2.drop('ret_list', 'stock_ret_list',
                                             'treasury_ret_list',
                                             'credit_ret_list',
                                             'bench_ret_list', 'date_list',
                                             'fnd_category')
        if is_debug:
            nav_agg_part_2.show()
        if is_write_file:
            nav_agg_part_2.write.option(
                'header',
                'true').mode('overwrite').csv(output_csv_path + str(date) +
                                              "/" + str(output_batch) + '/' +
                                              period)